Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 38
-rw-r--r-- | mm/Makefile | 3
-rw-r--r-- | mm/compaction.c | 175
-rw-r--r-- | mm/dmapool.c | 16
-rw-r--r-- | mm/filemap.c | 25
-rw-r--r-- | mm/huge_memory.c | 2346
-rw-r--r-- | mm/hugetlb.c | 114
-rw-r--r-- | mm/internal.h | 16
-rw-r--r-- | mm/ksm.c | 88
-rw-r--r-- | mm/madvise.c | 10
-rw-r--r-- | mm/memcontrol.c | 277
-rw-r--r-- | mm/memory-failure.c | 30
-rw-r--r-- | mm/memory.c | 336
-rw-r--r-- | mm/memory_hotplug.c | 48
-rw-r--r-- | mm/mempolicy.c | 26
-rw-r--r-- | mm/migrate.c | 125
-rw-r--r-- | mm/mincore.c | 7
-rw-r--r-- | mm/mlock.c | 163
-rw-r--r-- | mm/mmap.c | 33
-rw-r--r-- | mm/mmu_notifier.c | 20
-rw-r--r-- | mm/mmzone.c | 21
-rw-r--r-- | mm/mprotect.c | 20
-rw-r--r-- | mm/mremap.c | 9
-rw-r--r-- | mm/nommu.c | 34
-rw-r--r-- | mm/page-writeback.c | 11
-rw-r--r-- | mm/page_alloc.c | 184
-rw-r--r-- | mm/pagewalk.c | 1
-rw-r--r-- | mm/percpu-vm.c | 2
-rw-r--r-- | mm/percpu.c | 12
-rw-r--r-- | mm/pgtable-generic.c | 123
-rw-r--r-- | mm/rmap.c | 93
-rw-r--r-- | mm/shmem.c | 9
-rw-r--r-- | mm/slab.c | 76
-rw-r--r-- | mm/slob.c | 5
-rw-r--r-- | mm/slub.c | 85
-rw-r--r-- | mm/sparse-vmemmap.c | 2
-rw-r--r-- | mm/sparse.c | 4
-rw-r--r-- | mm/swap.c | 322
-rw-r--r-- | mm/swap_state.c | 6
-rw-r--r-- | mm/swapfile.c | 9
-rw-r--r-- | mm/truncate.c | 4
-rw-r--r-- | mm/util.c | 21
-rw-r--r-- | mm/vmalloc.c | 118
-rw-r--r-- | mm/vmscan.c | 439
-rw-r--r-- | mm/vmstat.c | 206
45 files changed, 4649 insertions, 1063 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..3ad483bdf505 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS | |||
| 302 | 302 | ||
| 303 | See Documentation/nommu-mmap.txt for more information. | 303 | See Documentation/nommu-mmap.txt for more information. |
| 304 | 304 | ||
| 305 | config TRANSPARENT_HUGEPAGE | ||
| 306 | bool "Transparent Hugepage Support" | ||
| 307 | depends on X86 && MMU | ||
| 308 | select COMPACTION | ||
| 309 | help | ||
| 310 | Transparent Hugepages allow the kernel to use huge pages and | ||
| 311 | huge TLB entries transparently for applications whenever possible. | ||
| 312 | This feature can improve computing performance for certain | ||
| 313 | applications by speeding up page faults during memory | ||
| 314 | allocation, by reducing the number of TLB misses and by speeding | ||
| 315 | up page table walks. | ||
| 316 | |||
| 317 | If memory is constrained, as on an embedded system, you may want to say N. | ||
| 318 | |||
| 319 | choice | ||
| 320 | prompt "Transparent Hugepage Support sysfs defaults" | ||
| 321 | depends on TRANSPARENT_HUGEPAGE | ||
| 322 | default TRANSPARENT_HUGEPAGE_ALWAYS | ||
| 323 | help | ||
| 324 | Selects the sysfs defaults for Transparent Hugepage Support. | ||
| 325 | |||
| 326 | config TRANSPARENT_HUGEPAGE_ALWAYS | ||
| 327 | bool "always" | ||
| 328 | help | ||
| 329 | Enabling Transparent Hugepage always can increase the | ||
| 330 | memory footprint of applications without a guaranteed | ||
| 331 | benefit, but it will work automatically for all applications. | ||
| 332 | |||
| 333 | config TRANSPARENT_HUGEPAGE_MADVISE | ||
| 334 | bool "madvise" | ||
| 335 | help | ||
| 336 | Enabling Transparent Hugepage madvise will only provide a | ||
| 337 | performance benefit to the applications using | ||
| 338 | madvise(MADV_HUGEPAGE), but it won't risk increasing the | ||
| 339 | memory footprint of applications without a guaranteed | ||
| 340 | benefit. | ||
| 341 | endchoice | ||
| 342 | |||
| 305 | # | 343 | # |
| 306 | # UP and nommu archs use km based percpu allocator | 344 | # UP and nommu archs use km based percpu allocator |
| 307 | # | 345 | # |
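The "madvise" default described in the help text above relies on applications opting in per mapping with madvise(MADV_HUGEPAGE). A minimal user-space sketch of that opt-in follows; it is illustrative and not part of this patch, and it assumes a kernel built with CONFIG_TRANSPARENT_HUGEPAGE plus headers that define MADV_HUGEPAGE.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14 /* kernel ABI value, in case older libc headers lack it */
#endif

int main(void)
{
	size_t len = 16 * 1024 * 1024; /* a multiple of the 2MB PMD size */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* ask the kernel to back this range with transparent hugepages */
	if (madvise(buf, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)"); /* may fail on kernels without THP */
	memset(buf, 0, len); /* faulting the range in can now use 2MB pages */
	munmap(buf, len);
	return 0;
}

With the sysfs default set to "madvise", only regions marked this way are eligible for huge pages; with "always", the hint is unnecessary.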
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -5,7 +5,7 @@ | |||
| 5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
| 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
| 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
| 8 | vmalloc.o pagewalk.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
| 9 | 9 | ||
| 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
| 11 | maccess.o page_alloc.o page-writeback.o \ | 11 | maccess.o page_alloc.o page-writeback.o \ |
| @@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | |||
| 37 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 37 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
| 38 | obj-$(CONFIG_MIGRATION) += migrate.o | 38 | obj-$(CONFIG_MIGRATION) += migrate.o |
| 39 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 39 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
| 40 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | ||
| 40 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 41 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
| 41 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 42 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
| 42 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 43 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee59013..6d592a021072 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
| @@ -16,6 +16,9 @@ | |||
| 16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
| 17 | #include "internal.h" | 17 | #include "internal.h" |
| 18 | 18 | ||
| 19 | #define CREATE_TRACE_POINTS | ||
| 20 | #include <trace/events/compaction.h> | ||
| 21 | |||
| 19 | /* | 22 | /* |
| 20 | * compact_control is used to track pages being migrated and the free pages | 23 | * compact_control is used to track pages being migrated and the free pages |
| 21 | * they are being migrated to during memory compaction. The free_pfn starts | 24 | * they are being migrated to during memory compaction. The free_pfn starts |
| @@ -30,6 +33,7 @@ struct compact_control { | |||
| 30 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 33 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
| 31 | unsigned long free_pfn; /* isolate_freepages search base */ | 34 | unsigned long free_pfn; /* isolate_freepages search base */ |
| 32 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
| 36 | bool sync; /* Synchronous migration */ | ||
| 33 | 37 | ||
| 34 | /* Account for isolated anon and file pages */ | 38 | /* Account for isolated anon and file pages */ |
| 35 | unsigned long nr_anon; | 39 | unsigned long nr_anon; |
| @@ -38,6 +42,8 @@ struct compact_control { | |||
| 38 | unsigned int order; /* order a direct compactor needs */ | 42 | unsigned int order; /* order a direct compactor needs */ |
| 39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 43 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
| 40 | struct zone *zone; | 44 | struct zone *zone; |
| 45 | |||
| 46 | int compact_mode; | ||
| 41 | }; | 47 | }; |
| 42 | 48 | ||
| 43 | static unsigned long release_freepages(struct list_head *freelist) | 49 | static unsigned long release_freepages(struct list_head *freelist) |
| @@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
| 60 | struct list_head *freelist) | 66 | struct list_head *freelist) |
| 61 | { | 67 | { |
| 62 | unsigned long zone_end_pfn, end_pfn; | 68 | unsigned long zone_end_pfn, end_pfn; |
| 63 | int total_isolated = 0; | 69 | int nr_scanned = 0, total_isolated = 0; |
| 64 | struct page *cursor; | 70 | struct page *cursor; |
| 65 | 71 | ||
| 66 | /* Get the last PFN we should scan for free pages at */ | 72 | /* Get the last PFN we should scan for free pages at */ |
| @@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
| 81 | 87 | ||
| 82 | if (!pfn_valid_within(blockpfn)) | 88 | if (!pfn_valid_within(blockpfn)) |
| 83 | continue; | 89 | continue; |
| 90 | nr_scanned++; | ||
| 84 | 91 | ||
| 85 | if (!PageBuddy(page)) | 92 | if (!PageBuddy(page)) |
| 86 | continue; | 93 | continue; |
| @@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
| 100 | } | 107 | } |
| 101 | } | 108 | } |
| 102 | 109 | ||
| 110 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | ||
| 103 | return total_isolated; | 111 | return total_isolated; |
| 104 | } | 112 | } |
| 105 | 113 | ||
| @@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
| 234 | struct compact_control *cc) | 242 | struct compact_control *cc) |
| 235 | { | 243 | { |
| 236 | unsigned long low_pfn, end_pfn; | 244 | unsigned long low_pfn, end_pfn; |
| 245 | unsigned long last_pageblock_nr = 0, pageblock_nr; | ||
| 246 | unsigned long nr_scanned = 0, nr_isolated = 0; | ||
| 237 | struct list_head *migratelist = &cc->migratepages; | 247 | struct list_head *migratelist = &cc->migratepages; |
| 238 | 248 | ||
| 239 | /* Do not scan outside zone boundaries */ | 249 | /* Do not scan outside zone boundaries */ |
| @@ -266,21 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
| 266 | struct page *page; | 276 | struct page *page; |
| 267 | if (!pfn_valid_within(low_pfn)) | 277 | if (!pfn_valid_within(low_pfn)) |
| 268 | continue; | 278 | continue; |
| 279 | nr_scanned++; | ||
| 269 | 280 | ||
| 270 | /* Get the page and skip if free */ | 281 | /* Get the page and skip if free */ |
| 271 | page = pfn_to_page(low_pfn); | 282 | page = pfn_to_page(low_pfn); |
| 272 | if (PageBuddy(page)) | 283 | if (PageBuddy(page)) |
| 273 | continue; | 284 | continue; |
| 274 | 285 | ||
| 286 | /* | ||
| 287 | * For async migration, also only scan in MOVABLE blocks. Async | ||
| 288 | * migration is optimistic to see if the minimum amount of work | ||
| 289 | * satisfies the allocation | ||
| 290 | */ | ||
| 291 | pageblock_nr = low_pfn >> pageblock_order; | ||
| 292 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | ||
| 293 | get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { | ||
| 294 | low_pfn += pageblock_nr_pages; | ||
| 295 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
| 296 | last_pageblock_nr = pageblock_nr; | ||
| 297 | continue; | ||
| 298 | } | ||
| 299 | |||
| 300 | if (!PageLRU(page)) | ||
| 301 | continue; | ||
| 302 | |||
| 303 | /* | ||
| 304 | * PageLRU is set, and lru_lock excludes isolation, | ||
| 305 | * splitting and collapsing (collapsing has already | ||
| 306 | * happened if PageLRU is set). | ||
| 307 | */ | ||
| 308 | if (PageTransHuge(page)) { | ||
| 309 | low_pfn += (1 << compound_order(page)) - 1; | ||
| 310 | continue; | ||
| 311 | } | ||
| 312 | |||
| 275 | /* Try isolate the page */ | 313 | /* Try isolate the page */ |
| 276 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) | 314 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) |
| 277 | continue; | 315 | continue; |
| 278 | 316 | ||
| 317 | VM_BUG_ON(PageTransCompound(page)); | ||
| 318 | |||
| 279 | /* Successfully isolated */ | 319 | /* Successfully isolated */ |
| 280 | del_page_from_lru_list(zone, page, page_lru(page)); | 320 | del_page_from_lru_list(zone, page, page_lru(page)); |
| 281 | list_add(&page->lru, migratelist); | 321 | list_add(&page->lru, migratelist); |
| 282 | mem_cgroup_del_lru(page); | ||
| 283 | cc->nr_migratepages++; | 322 | cc->nr_migratepages++; |
| 323 | nr_isolated++; | ||
| 284 | 324 | ||
| 285 | /* Avoid isolating too much */ | 325 | /* Avoid isolating too much */ |
| 286 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) | 326 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) |
| @@ -292,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
| 292 | spin_unlock_irq(&zone->lru_lock); | 332 | spin_unlock_irq(&zone->lru_lock); |
| 293 | cc->migrate_pfn = low_pfn; | 333 | cc->migrate_pfn = low_pfn; |
| 294 | 334 | ||
| 335 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | ||
| 336 | |||
| 295 | return cc->nr_migratepages; | 337 | return cc->nr_migratepages; |
| 296 | } | 338 | } |
| 297 | 339 | ||
| @@ -342,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) | |||
| 342 | } | 384 | } |
| 343 | 385 | ||
| 344 | static int compact_finished(struct zone *zone, | 386 | static int compact_finished(struct zone *zone, |
| 345 | struct compact_control *cc) | 387 | struct compact_control *cc) |
| 346 | { | 388 | { |
| 347 | unsigned int order; | 389 | unsigned int order; |
| 348 | unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); | 390 | unsigned long watermark; |
| 349 | 391 | ||
| 350 | if (fatal_signal_pending(current)) | 392 | if (fatal_signal_pending(current)) |
| 351 | return COMPACT_PARTIAL; | 393 | return COMPACT_PARTIAL; |
| @@ -355,12 +397,27 @@ static int compact_finished(struct zone *zone, | |||
| 355 | return COMPACT_COMPLETE; | 397 | return COMPACT_COMPLETE; |
| 356 | 398 | ||
| 357 | /* Compaction run is not finished if the watermark is not met */ | 399 | /* Compaction run is not finished if the watermark is not met */ |
| 400 | if (cc->compact_mode != COMPACT_MODE_KSWAPD) | ||
| 401 | watermark = low_wmark_pages(zone); | ||
| 402 | else | ||
| 403 | watermark = high_wmark_pages(zone); | ||
| 404 | watermark += (1 << cc->order); | ||
| 405 | |||
| 358 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | 406 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) |
| 359 | return COMPACT_CONTINUE; | 407 | return COMPACT_CONTINUE; |
| 360 | 408 | ||
| 361 | if (cc->order == -1) | 409 | if (cc->order == -1) |
| 362 | return COMPACT_CONTINUE; | 410 | return COMPACT_CONTINUE; |
| 363 | 411 | ||
| 412 | /* | ||
| 413 | * Generating only one page of the right order is not enough | ||
| 414 | * for kswapd; we must continue until we're above the high | ||
| 415 | * watermark as a pool for high order GFP_ATOMIC allocations | ||
| 416 | * too. | ||
| 417 | */ | ||
| 418 | if (cc->compact_mode == COMPACT_MODE_KSWAPD) | ||
| 419 | return COMPACT_CONTINUE; | ||
| 420 | |||
| 364 | /* Direct compactor: Is a suitable page free? */ | 421 | /* Direct compactor: Is a suitable page free? */ |
| 365 | for (order = cc->order; order < MAX_ORDER; order++) { | 422 | for (order = cc->order; order < MAX_ORDER; order++) { |
| 366 | /* Job done if page is free of the right migratetype */ | 423 | /* Job done if page is free of the right migratetype */ |
| @@ -375,10 +432,62 @@ static int compact_finished(struct zone *zone, | |||
| 375 | return COMPACT_CONTINUE; | 432 | return COMPACT_CONTINUE; |
| 376 | } | 433 | } |
| 377 | 434 | ||
| 435 | /* | ||
| 436 | * compaction_suitable: Is this suitable to run compaction on this zone now? | ||
| 437 | * Returns | ||
| 438 | * COMPACT_SKIPPED - If there are too few free pages for compaction | ||
| 439 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | ||
| 440 | * COMPACT_CONTINUE - If compaction should run now | ||
| 441 | */ | ||
| 442 | unsigned long compaction_suitable(struct zone *zone, int order) | ||
| 443 | { | ||
| 444 | int fragindex; | ||
| 445 | unsigned long watermark; | ||
| 446 | |||
| 447 | /* | ||
| 448 | * Watermarks for order-0 must be met for compaction. Note the 2UL. | ||
| 449 | * This is because during migration, copies of pages need to be | ||
| 450 | * allocated and for a short time, the footprint is higher | ||
| 451 | */ | ||
| 452 | watermark = low_wmark_pages(zone) + (2UL << order); | ||
| 453 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | ||
| 454 | return COMPACT_SKIPPED; | ||
| 455 | |||
| 456 | /* | ||
| 457 | * fragmentation index determines if allocation failures are due to | ||
| 458 | * low memory or external fragmentation | ||
| 459 | * | ||
| 460 | * index of -1 implies allocations might succeed depending on watermarks | ||
| 461 | * index towards 0 implies failure is due to lack of memory | ||
| 462 | * index towards 1000 implies failure is due to fragmentation | ||
| 463 | * | ||
| 464 | * Only compact if a failure would be due to fragmentation. | ||
| 465 | */ | ||
| 466 | fragindex = fragmentation_index(zone, order); | ||
| 467 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | ||
| 468 | return COMPACT_SKIPPED; | ||
| 469 | |||
| 470 | if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) | ||
| 471 | return COMPACT_PARTIAL; | ||
| 472 | |||
| 473 | return COMPACT_CONTINUE; | ||
| 474 | } | ||
| 475 | |||
| 378 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 476 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
| 379 | { | 477 | { |
| 380 | int ret; | 478 | int ret; |
| 381 | 479 | ||
| 480 | ret = compaction_suitable(zone, cc->order); | ||
| 481 | switch (ret) { | ||
| 482 | case COMPACT_PARTIAL: | ||
| 483 | case COMPACT_SKIPPED: | ||
| 484 | /* Compaction is likely to fail */ | ||
| 485 | return ret; | ||
| 486 | case COMPACT_CONTINUE: | ||
| 487 | /* Fall through to compaction */ | ||
| 488 | ; | ||
| 489 | } | ||
| 490 | |||
| 382 | /* Setup to move all movable pages to the end of the zone */ | 491 | /* Setup to move all movable pages to the end of the zone */ |
| 383 | cc->migrate_pfn = zone->zone_start_pfn; | 492 | cc->migrate_pfn = zone->zone_start_pfn; |
| 384 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | 493 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; |
| @@ -394,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
| 394 | 503 | ||
| 395 | nr_migrate = cc->nr_migratepages; | 504 | nr_migrate = cc->nr_migratepages; |
| 396 | migrate_pages(&cc->migratepages, compaction_alloc, | 505 | migrate_pages(&cc->migratepages, compaction_alloc, |
| 397 | (unsigned long)cc, 0); | 506 | (unsigned long)cc, false, |
| 507 | cc->sync); | ||
| 398 | update_nr_listpages(cc); | 508 | update_nr_listpages(cc); |
| 399 | nr_remaining = cc->nr_migratepages; | 509 | nr_remaining = cc->nr_migratepages; |
| 400 | 510 | ||
| @@ -402,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
| 402 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | 512 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); |
| 403 | if (nr_remaining) | 513 | if (nr_remaining) |
| 404 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | 514 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); |
| 515 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | ||
| 516 | nr_remaining); | ||
| 405 | 517 | ||
| 406 | /* Release LRU pages not migrated */ | 518 | /* Release LRU pages not migrated */ |
| 407 | if (!list_empty(&cc->migratepages)) { | 519 | if (!list_empty(&cc->migratepages)) { |
| @@ -418,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
| 418 | return ret; | 530 | return ret; |
| 419 | } | 531 | } |
| 420 | 532 | ||
| 421 | static unsigned long compact_zone_order(struct zone *zone, | 533 | unsigned long compact_zone_order(struct zone *zone, |
| 422 | int order, gfp_t gfp_mask) | 534 | int order, gfp_t gfp_mask, |
| 535 | bool sync, | ||
| 536 | int compact_mode) | ||
| 423 | { | 537 | { |
| 424 | struct compact_control cc = { | 538 | struct compact_control cc = { |
| 425 | .nr_freepages = 0, | 539 | .nr_freepages = 0, |
| @@ -427,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
| 427 | .order = order, | 541 | .order = order, |
| 428 | .migratetype = allocflags_to_migratetype(gfp_mask), | 542 | .migratetype = allocflags_to_migratetype(gfp_mask), |
| 429 | .zone = zone, | 543 | .zone = zone, |
| 544 | .sync = sync, | ||
| 545 | .compact_mode = compact_mode, | ||
| 430 | }; | 546 | }; |
| 431 | INIT_LIST_HEAD(&cc.freepages); | 547 | INIT_LIST_HEAD(&cc.freepages); |
| 432 | INIT_LIST_HEAD(&cc.migratepages); | 548 | INIT_LIST_HEAD(&cc.migratepages); |
| @@ -442,16 +558,17 @@ int sysctl_extfrag_threshold = 500; | |||
| 442 | * @order: The order of the current allocation | 558 | * @order: The order of the current allocation |
| 443 | * @gfp_mask: The GFP mask of the current allocation | 559 | * @gfp_mask: The GFP mask of the current allocation |
| 444 | * @nodemask: The allowed nodes to allocate from | 560 | * @nodemask: The allowed nodes to allocate from |
| 561 | * @sync: Whether migration is synchronous or not | ||
| 445 | * | 562 | * |
| 446 | * This is the main entry point for direct page compaction. | 563 | * This is the main entry point for direct page compaction. |
| 447 | */ | 564 | */ |
| 448 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 565 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
| 449 | int order, gfp_t gfp_mask, nodemask_t *nodemask) | 566 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
| 567 | bool sync) | ||
| 450 | { | 568 | { |
| 451 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 569 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
| 452 | int may_enter_fs = gfp_mask & __GFP_FS; | 570 | int may_enter_fs = gfp_mask & __GFP_FS; |
| 453 | int may_perform_io = gfp_mask & __GFP_IO; | 571 | int may_perform_io = gfp_mask & __GFP_IO; |
| 454 | unsigned long watermark; | ||
| 455 | struct zoneref *z; | 572 | struct zoneref *z; |
| 456 | struct zone *zone; | 573 | struct zone *zone; |
| 457 | int rc = COMPACT_SKIPPED; | 574 | int rc = COMPACT_SKIPPED; |
| @@ -461,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 461 | * made because an assumption is made that the page allocator can satisfy | 578 | * made because an assumption is made that the page allocator can satisfy |
| 462 | * the "cheaper" orders without taking special steps | 579 | * the "cheaper" orders without taking special steps |
| 463 | */ | 580 | */ |
| 464 | if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) | 581 | if (!order || !may_enter_fs || !may_perform_io) |
| 465 | return rc; | 582 | return rc; |
| 466 | 583 | ||
| 467 | count_vm_event(COMPACTSTALL); | 584 | count_vm_event(COMPACTSTALL); |
| @@ -469,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 469 | /* Compact each zone in the list */ | 586 | /* Compact each zone in the list */ |
| 470 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 587 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
| 471 | nodemask) { | 588 | nodemask) { |
| 472 | int fragindex; | ||
| 473 | int status; | 589 | int status; |
| 474 | 590 | ||
| 475 | /* | 591 | status = compact_zone_order(zone, order, gfp_mask, sync, |
| 476 | * Watermarks for order-0 must be met for compaction. Note | 592 | COMPACT_MODE_DIRECT_RECLAIM); |
| 477 | * the 2UL. This is because during migration, copies of | ||
| 478 | * pages need to be allocated and for a short time, the | ||
| 479 | * footprint is higher | ||
| 480 | */ | ||
| 481 | watermark = low_wmark_pages(zone) + (2UL << order); | ||
| 482 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | ||
| 483 | continue; | ||
| 484 | |||
| 485 | /* | ||
| 486 | * fragmentation index determines if allocation failures are | ||
| 487 | * due to low memory or external fragmentation | ||
| 488 | * | ||
| 489 | * index of -1 implies allocations might succeed depending | ||
| 490 | * on watermarks | ||
| 491 | * index towards 0 implies failure is due to lack of memory | ||
| 492 | * index towards 1000 implies failure is due to fragmentation | ||
| 493 | * | ||
| 494 | * Only compact if a failure would be due to fragmentation. | ||
| 495 | */ | ||
| 496 | fragindex = fragmentation_index(zone, order); | ||
| 497 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | ||
| 498 | continue; | ||
| 499 | |||
| 500 | if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { | ||
| 501 | rc = COMPACT_PARTIAL; | ||
| 502 | break; | ||
| 503 | } | ||
| 504 | |||
| 505 | status = compact_zone_order(zone, order, gfp_mask); | ||
| 506 | rc = max(status, rc); | 593 | rc = max(status, rc); |
| 507 | 594 | ||
| 508 | if (zone_watermark_ok(zone, order, watermark, 0, 0)) | 595 | /* If a normal allocation would succeed, stop compacting */ |
| 596 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
| 509 | break; | 597 | break; |
| 510 | } | 598 | } |
| 511 | 599 | ||
| @@ -532,6 +620,7 @@ static int compact_node(int nid) | |||
| 532 | .nr_freepages = 0, | 620 | .nr_freepages = 0, |
| 533 | .nr_migratepages = 0, | 621 | .nr_migratepages = 0, |
| 534 | .order = -1, | 622 | .order = -1, |
| 623 | .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, | ||
| 535 | }; | 624 | }; |
| 536 | 625 | ||
| 537 | zone = &pgdat->node_zones[zoneid]; | 626 | zone = &pgdat->node_zones[zoneid]; |
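The compaction_suitable() check added above reduces to a small decision table over the order-0 watermark test and the fragmentation index, gated by sysctl_extfrag_threshold (default 500, visible in one of the hunk headers above). A stand-alone sketch of that decision logic follows, with made-up sample inputs; the return-code names mirror the kernel's, everything else is purely an illustration of the comment about index values near -1, 0 and 1000.

#include <stdio.h>

/* Restates the checks in compaction_suitable() outside the kernel. */
enum { COMPACT_SKIPPED, COMPACT_CONTINUE, COMPACT_PARTIAL };

static int suitable(int order0_watermark_ok, int order_watermark_ok,
		    int fragindex, int extfrag_threshold)
{
	if (!order0_watermark_ok)
		return COMPACT_SKIPPED;   /* not enough free pages to migrate into */
	if (fragindex >= 0 && fragindex <= extfrag_threshold)
		return COMPACT_SKIPPED;   /* failure would be from low memory, not fragmentation */
	if (fragindex == -1 && order_watermark_ok)
		return COMPACT_PARTIAL;   /* the allocation should already succeed */
	return COMPACT_CONTINUE;          /* fragmentation is the problem: compact */
}

int main(void)
{
	/* index near 1000: badly fragmented, compaction is worth running */
	printf("%d\n", suitable(1, 0, 980, 500)); /* COMPACT_CONTINUE */
	/* index below the threshold: genuinely short on memory, skip */
	printf("%d\n", suitable(1, 0, 120, 500)); /* COMPACT_SKIPPED */
	return 0;
}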
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
| @@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
| 324 | if (mem_flags & __GFP_WAIT) { | 324 | if (mem_flags & __GFP_WAIT) { |
| 325 | DECLARE_WAITQUEUE(wait, current); | 325 | DECLARE_WAITQUEUE(wait, current); |
| 326 | 326 | ||
| 327 | __set_current_state(TASK_INTERRUPTIBLE); | 327 | __set_current_state(TASK_UNINTERRUPTIBLE); |
| 328 | __add_wait_queue(&pool->waitq, &wait); | 328 | __add_wait_queue(&pool->waitq, &wait); |
| 329 | spin_unlock_irqrestore(&pool->lock, flags); | 329 | spin_unlock_irqrestore(&pool->lock, flags); |
| 330 | 330 | ||
| @@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc); | |||
| 355 | 355 | ||
| 356 | static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) | 356 | static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) |
| 357 | { | 357 | { |
| 358 | unsigned long flags; | ||
| 359 | struct dma_page *page; | 358 | struct dma_page *page; |
| 360 | 359 | ||
| 361 | spin_lock_irqsave(&pool->lock, flags); | ||
| 362 | list_for_each_entry(page, &pool->page_list, page_list) { | 360 | list_for_each_entry(page, &pool->page_list, page_list) { |
| 363 | if (dma < page->dma) | 361 | if (dma < page->dma) |
| 364 | continue; | 362 | continue; |
| 365 | if (dma < (page->dma + pool->allocation)) | 363 | if (dma < (page->dma + pool->allocation)) |
| 366 | goto done; | 364 | return page; |
| 367 | } | 365 | } |
| 368 | page = NULL; | 366 | return NULL; |
| 369 | done: | ||
| 370 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 371 | return page; | ||
| 372 | } | 367 | } |
| 373 | 368 | ||
| 374 | /** | 369 | /** |
| @@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
| 386 | unsigned long flags; | 381 | unsigned long flags; |
| 387 | unsigned int offset; | 382 | unsigned int offset; |
| 388 | 383 | ||
| 384 | spin_lock_irqsave(&pool->lock, flags); | ||
| 389 | page = pool_find_page(pool, dma); | 385 | page = pool_find_page(pool, dma); |
| 390 | if (!page) { | 386 | if (!page) { |
| 387 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 391 | if (pool->dev) | 388 | if (pool->dev) |
| 392 | dev_err(pool->dev, | 389 | dev_err(pool->dev, |
| 393 | "dma_pool_free %s, %p/%lx (bad dma)\n", | 390 | "dma_pool_free %s, %p/%lx (bad dma)\n", |
| @@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
| 401 | offset = vaddr - page->vaddr; | 398 | offset = vaddr - page->vaddr; |
| 402 | #ifdef DMAPOOL_DEBUG | 399 | #ifdef DMAPOOL_DEBUG |
| 403 | if ((dma - page->dma) != offset) { | 400 | if ((dma - page->dma) != offset) { |
| 401 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 404 | if (pool->dev) | 402 | if (pool->dev) |
| 405 | dev_err(pool->dev, | 403 | dev_err(pool->dev, |
| 406 | "dma_pool_free %s, %p (bad vaddr)/%Lx\n", | 404 | "dma_pool_free %s, %p (bad vaddr)/%Lx\n", |
| @@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
| 418 | chain = *(int *)(page->vaddr + chain); | 416 | chain = *(int *)(page->vaddr + chain); |
| 419 | continue; | 417 | continue; |
| 420 | } | 418 | } |
| 419 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 421 | if (pool->dev) | 420 | if (pool->dev) |
| 422 | dev_err(pool->dev, "dma_pool_free %s, dma %Lx " | 421 | dev_err(pool->dev, "dma_pool_free %s, dma %Lx " |
| 423 | "already free\n", pool->name, | 422 | "already free\n", pool->name, |
| @@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
| 432 | memset(vaddr, POOL_POISON_FREED, pool->size); | 431 | memset(vaddr, POOL_POISON_FREED, pool->size); |
| 433 | #endif | 432 | #endif |
| 434 | 433 | ||
| 435 | spin_lock_irqsave(&pool->lock, flags); | ||
| 436 | page->in_use--; | 434 | page->in_use--; |
| 437 | *(int *)vaddr = page->offset; | 435 | *(int *)vaddr = page->offset; |
| 438 | page->offset = offset; | 436 | page->offset = offset; |
diff --git a/mm/filemap.c b/mm/filemap.c
index ea89840fc65f..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -102,9 +102,6 @@ | |||
| 102 | * ->inode_lock (zap_pte_range->set_page_dirty) | 102 | * ->inode_lock (zap_pte_range->set_page_dirty) |
| 103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
| 104 | * | 104 | * |
| 105 | * ->task->proc_lock | ||
| 106 | * ->dcache_lock (proc_pid_lookup) | ||
| 107 | * | ||
| 108 | * (code doesn't rely on that order, so you could switch it around) | 105 | * (code doesn't rely on that order, so you could switch it around) |
| 109 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 106 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
| 110 | * ->i_mmap_lock | 107 | * ->i_mmap_lock |
| @@ -143,13 +140,18 @@ void __remove_from_page_cache(struct page *page) | |||
| 143 | void remove_from_page_cache(struct page *page) | 140 | void remove_from_page_cache(struct page *page) |
| 144 | { | 141 | { |
| 145 | struct address_space *mapping = page->mapping; | 142 | struct address_space *mapping = page->mapping; |
| 143 | void (*freepage)(struct page *); | ||
| 146 | 144 | ||
| 147 | BUG_ON(!PageLocked(page)); | 145 | BUG_ON(!PageLocked(page)); |
| 148 | 146 | ||
| 147 | freepage = mapping->a_ops->freepage; | ||
| 149 | spin_lock_irq(&mapping->tree_lock); | 148 | spin_lock_irq(&mapping->tree_lock); |
| 150 | __remove_from_page_cache(page); | 149 | __remove_from_page_cache(page); |
| 151 | spin_unlock_irq(&mapping->tree_lock); | 150 | spin_unlock_irq(&mapping->tree_lock); |
| 152 | mem_cgroup_uncharge_cache_page(page); | 151 | mem_cgroup_uncharge_cache_page(page); |
| 152 | |||
| 153 | if (freepage) | ||
| 154 | freepage(page); | ||
| 153 | } | 155 | } |
| 154 | EXPORT_SYMBOL(remove_from_page_cache); | 156 | EXPORT_SYMBOL(remove_from_page_cache); |
| 155 | 157 | ||
| @@ -296,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, | |||
| 296 | continue; | 298 | continue; |
| 297 | 299 | ||
| 298 | wait_on_page_writeback(page); | 300 | wait_on_page_writeback(page); |
| 299 | if (PageError(page)) | 301 | if (TestClearPageError(page)) |
| 300 | ret = -EIO; | 302 | ret = -EIO; |
| 301 | } | 303 | } |
| 302 | pagevec_release(&pvec); | 304 | pagevec_release(&pvec); |
| @@ -835,9 +837,6 @@ repeat: | |||
| 835 | if (radix_tree_deref_retry(page)) | 837 | if (radix_tree_deref_retry(page)) |
| 836 | goto restart; | 838 | goto restart; |
| 837 | 839 | ||
| 838 | if (page->mapping == NULL || page->index != index) | ||
| 839 | break; | ||
| 840 | |||
| 841 | if (!page_cache_get_speculative(page)) | 840 | if (!page_cache_get_speculative(page)) |
| 842 | goto repeat; | 841 | goto repeat; |
| 843 | 842 | ||
| @@ -847,6 +846,16 @@ repeat: | |||
| 847 | goto repeat; | 846 | goto repeat; |
| 848 | } | 847 | } |
| 849 | 848 | ||
| 849 | /* | ||
| 850 | * Must check mapping and index after taking the ref; | ||
| 851 | * otherwise we can get both false positives and false | ||
| 852 | * negatives, which is just confusing to the caller. | ||
| 853 | */ | ||
| 854 | if (page->mapping == NULL || page->index != index) { | ||
| 855 | page_cache_release(page); | ||
| 856 | break; | ||
| 857 | } | ||
| 858 | |||
| 850 | pages[ret] = page; | 859 | pages[ret] = page; |
| 851 | ret++; | 860 | ret++; |
| 852 | index++; | 861 | index++; |
| @@ -2218,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, | |||
| 2218 | gfp_notmask = __GFP_FS; | 2227 | gfp_notmask = __GFP_FS; |
| 2219 | repeat: | 2228 | repeat: |
| 2220 | page = find_lock_page(mapping, index); | 2229 | page = find_lock_page(mapping, index); |
| 2221 | if (likely(page)) | 2230 | if (page) |
| 2222 | return page; | 2231 | return page; |
| 2223 | 2232 | ||
| 2224 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); | 2233 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); |
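For context on the freepage hook used above: remove_from_page_cache() now invokes the optional freepage address_space operation once the page is off the page cache, after tree_lock has been dropped. A minimal sketch of how a filesystem might wire it up follows; the filesystem name and the body of the callback are hypothetical, only the .freepage field and its signature come from the in-tree struct address_space_operations.

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical cleanup: drop fs-private state that must not outlive the
 * page's presence in the page cache. Runs after removal, without
 * mapping->tree_lock held. */
static void examplefs_freepage(struct page *page)
{
	/* e.g. release a per-page reference stashed by the filesystem */
}

static const struct address_space_operations examplefs_aops = {
	.readpage	= simple_readpage,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
	.freepage	= examplefs_freepage,
};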
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..004c9c2aac78
--- /dev/null
+++ b/mm/huge_memory.c
| @@ -0,0 +1,2346 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 5 | * the COPYING file in the top-level directory. | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/mm.h> | ||
| 9 | #include <linux/sched.h> | ||
| 10 | #include <linux/highmem.h> | ||
| 11 | #include <linux/hugetlb.h> | ||
| 12 | #include <linux/mmu_notifier.h> | ||
| 13 | #include <linux/rmap.h> | ||
| 14 | #include <linux/swap.h> | ||
| 15 | #include <linux/mm_inline.h> | ||
| 16 | #include <linux/kthread.h> | ||
| 17 | #include <linux/khugepaged.h> | ||
| 18 | #include <linux/freezer.h> | ||
| 19 | #include <linux/mman.h> | ||
| 20 | #include <asm/tlb.h> | ||
| 21 | #include <asm/pgalloc.h> | ||
| 22 | #include "internal.h" | ||
| 23 | |||
| 24 | /* | ||
| 25 | * By default transparent hugepage support is enabled for all mappings | ||
| 26 | * and khugepaged scans all mappings. Defrag is only invoked by | ||
| 27 | * khugepaged hugepage allocations and by page faults inside | ||
| 28 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived | ||
| 29 | * allocations. | ||
| 30 | */ | ||
| 31 | unsigned long transparent_hugepage_flags __read_mostly = | ||
| 32 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS | ||
| 33 | (1<<TRANSPARENT_HUGEPAGE_FLAG)| | ||
| 34 | #endif | ||
| 35 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE | ||
| 36 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| | ||
| 37 | #endif | ||
| 38 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| | ||
| 39 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
| 40 | |||
| 41 | /* by default scan 8*512 ptes (or vmas) every 10 seconds */ | ||
| 42 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | ||
| 43 | static unsigned int khugepaged_pages_collapsed; | ||
| 44 | static unsigned int khugepaged_full_scans; | ||
| 45 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; | ||
| 46 | /* during fragmentation poll the hugepage allocator once every minute */ | ||
| 47 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; | ||
| 48 | static struct task_struct *khugepaged_thread __read_mostly; | ||
| 49 | static DEFINE_MUTEX(khugepaged_mutex); | ||
| 50 | static DEFINE_SPINLOCK(khugepaged_mm_lock); | ||
| 51 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | ||
| 52 | /* | ||
| 53 | * By default, collapse hugepages if there is at least one pte mapped as | ||
| 54 | * it would have been if the vma had been large enough during page | ||
| 55 | * fault. | ||
| 56 | */ | ||
| 57 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | ||
| 58 | |||
| 59 | static int khugepaged(void *none); | ||
| 60 | static int mm_slots_hash_init(void); | ||
| 61 | static int khugepaged_slab_init(void); | ||
| 62 | static void khugepaged_slab_free(void); | ||
| 63 | |||
| 64 | #define MM_SLOTS_HASH_HEADS 1024 | ||
| 65 | static struct hlist_head *mm_slots_hash __read_mostly; | ||
| 66 | static struct kmem_cache *mm_slot_cache __read_mostly; | ||
| 67 | |||
| 68 | /** | ||
| 69 | * struct mm_slot - hash lookup from mm to mm_slot | ||
| 70 | * @hash: hash collision list | ||
| 71 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head | ||
| 72 | * @mm: the mm that this information is valid for | ||
| 73 | */ | ||
| 74 | struct mm_slot { | ||
| 75 | struct hlist_node hash; | ||
| 76 | struct list_head mm_node; | ||
| 77 | struct mm_struct *mm; | ||
| 78 | }; | ||
| 79 | |||
| 80 | /** | ||
| 81 | * struct khugepaged_scan - cursor for scanning | ||
| 82 | * @mm_head: the head of the mm list to scan | ||
| 83 | * @mm_slot: the current mm_slot we are scanning | ||
| 84 | * @address: the next address inside that mm to be scanned | ||
| 85 | * | ||
| 86 | * There is only the one khugepaged_scan instance of this cursor structure. | ||
| 87 | */ | ||
| 88 | struct khugepaged_scan { | ||
| 89 | struct list_head mm_head; | ||
| 90 | struct mm_slot *mm_slot; | ||
| 91 | unsigned long address; | ||
| 92 | } khugepaged_scan = { | ||
| 93 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | ||
| 94 | }; | ||
| 95 | |||
| 96 | |||
| 97 | static int set_recommended_min_free_kbytes(void) | ||
| 98 | { | ||
| 99 | struct zone *zone; | ||
| 100 | int nr_zones = 0; | ||
| 101 | unsigned long recommended_min; | ||
| 102 | extern int min_free_kbytes; | ||
| 103 | |||
| 104 | if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
| 105 | &transparent_hugepage_flags) && | ||
| 106 | !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
| 107 | &transparent_hugepage_flags)) | ||
| 108 | return 0; | ||
| 109 | |||
| 110 | for_each_populated_zone(zone) | ||
| 111 | nr_zones++; | ||
| 112 | |||
| 113 | /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ | ||
| 114 | recommended_min = pageblock_nr_pages * nr_zones * 2; | ||
| 115 | |||
| 116 | /* | ||
| 117 | * Make sure that on average at least two pageblocks are almost free | ||
| 118 | * of another type, one for a migratetype to fall back to and a | ||
| 119 | * second to avoid subsequent fallbacks of other types. There are 3 | ||
| 120 | * MIGRATE_TYPES we care about. | ||
| 121 | */ | ||
| 122 | recommended_min += pageblock_nr_pages * nr_zones * | ||
| 123 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; | ||
| 124 | |||
| 125 | /* don't ever allow reserving more than 5% of the lowmem */ | ||
| 126 | recommended_min = min(recommended_min, | ||
| 127 | (unsigned long) nr_free_buffer_pages() / 20); | ||
| 128 | recommended_min <<= (PAGE_SHIFT-10); | ||
| 129 | |||
| 130 | if (recommended_min > min_free_kbytes) | ||
| 131 | min_free_kbytes = recommended_min; | ||
| 132 | setup_per_zone_wmarks(); | ||
| 133 | return 0; | ||
| 134 | } | ||
| 135 | late_initcall(set_recommended_min_free_kbytes); | ||
| 136 | |||
| 137 | static int start_khugepaged(void) | ||
| 138 | { | ||
| 139 | int err = 0; | ||
| 140 | if (khugepaged_enabled()) { | ||
| 141 | int wakeup; | ||
| 142 | if (unlikely(!mm_slot_cache || !mm_slots_hash)) { | ||
| 143 | err = -ENOMEM; | ||
| 144 | goto out; | ||
| 145 | } | ||
| 146 | mutex_lock(&khugepaged_mutex); | ||
| 147 | if (!khugepaged_thread) | ||
| 148 | khugepaged_thread = kthread_run(khugepaged, NULL, | ||
| 149 | "khugepaged"); | ||
| 150 | if (unlikely(IS_ERR(khugepaged_thread))) { | ||
| 151 | printk(KERN_ERR | ||
| 152 | "khugepaged: kthread_run(khugepaged) failed\n"); | ||
| 153 | err = PTR_ERR(khugepaged_thread); | ||
| 154 | khugepaged_thread = NULL; | ||
| 155 | } | ||
| 156 | wakeup = !list_empty(&khugepaged_scan.mm_head); | ||
| 157 | mutex_unlock(&khugepaged_mutex); | ||
| 158 | if (wakeup) | ||
| 159 | wake_up_interruptible(&khugepaged_wait); | ||
| 160 | |||
| 161 | set_recommended_min_free_kbytes(); | ||
| 162 | } else | ||
| 163 | /* wakeup to exit */ | ||
| 164 | wake_up_interruptible(&khugepaged_wait); | ||
| 165 | out: | ||
| 166 | return err; | ||
| 167 | } | ||
| 168 | |||
| 169 | #ifdef CONFIG_SYSFS | ||
| 170 | |||
| 171 | static ssize_t double_flag_show(struct kobject *kobj, | ||
| 172 | struct kobj_attribute *attr, char *buf, | ||
| 173 | enum transparent_hugepage_flag enabled, | ||
| 174 | enum transparent_hugepage_flag req_madv) | ||
| 175 | { | ||
| 176 | if (test_bit(enabled, &transparent_hugepage_flags)) { | ||
| 177 | VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); | ||
| 178 | return sprintf(buf, "[always] madvise never\n"); | ||
| 179 | } else if (test_bit(req_madv, &transparent_hugepage_flags)) | ||
| 180 | return sprintf(buf, "always [madvise] never\n"); | ||
| 181 | else | ||
| 182 | return sprintf(buf, "always madvise [never]\n"); | ||
| 183 | } | ||
| 184 | static ssize_t double_flag_store(struct kobject *kobj, | ||
| 185 | struct kobj_attribute *attr, | ||
| 186 | const char *buf, size_t count, | ||
| 187 | enum transparent_hugepage_flag enabled, | ||
| 188 | enum transparent_hugepage_flag req_madv) | ||
| 189 | { | ||
| 190 | if (!memcmp("always", buf, | ||
| 191 | min(sizeof("always")-1, count))) { | ||
| 192 | set_bit(enabled, &transparent_hugepage_flags); | ||
| 193 | clear_bit(req_madv, &transparent_hugepage_flags); | ||
| 194 | } else if (!memcmp("madvise", buf, | ||
| 195 | min(sizeof("madvise")-1, count))) { | ||
| 196 | clear_bit(enabled, &transparent_hugepage_flags); | ||
| 197 | set_bit(req_madv, &transparent_hugepage_flags); | ||
| 198 | } else if (!memcmp("never", buf, | ||
| 199 | min(sizeof("never")-1, count))) { | ||
| 200 | clear_bit(enabled, &transparent_hugepage_flags); | ||
| 201 | clear_bit(req_madv, &transparent_hugepage_flags); | ||
| 202 | } else | ||
| 203 | return -EINVAL; | ||
| 204 | |||
| 205 | return count; | ||
| 206 | } | ||
| 207 | |||
| 208 | static ssize_t enabled_show(struct kobject *kobj, | ||
| 209 | struct kobj_attribute *attr, char *buf) | ||
| 210 | { | ||
| 211 | return double_flag_show(kobj, attr, buf, | ||
| 212 | TRANSPARENT_HUGEPAGE_FLAG, | ||
| 213 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | ||
| 214 | } | ||
| 215 | static ssize_t enabled_store(struct kobject *kobj, | ||
| 216 | struct kobj_attribute *attr, | ||
| 217 | const char *buf, size_t count) | ||
| 218 | { | ||
| 219 | ssize_t ret; | ||
| 220 | |||
| 221 | ret = double_flag_store(kobj, attr, buf, count, | ||
| 222 | TRANSPARENT_HUGEPAGE_FLAG, | ||
| 223 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | ||
| 224 | |||
| 225 | if (ret > 0) { | ||
| 226 | int err = start_khugepaged(); | ||
| 227 | if (err) | ||
| 228 | ret = err; | ||
| 229 | } | ||
| 230 | |||
| 231 | if (ret > 0 && | ||
| 232 | (test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
| 233 | &transparent_hugepage_flags) || | ||
| 234 | test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
| 235 | &transparent_hugepage_flags))) | ||
| 236 | set_recommended_min_free_kbytes(); | ||
| 237 | |||
| 238 | return ret; | ||
| 239 | } | ||
| 240 | static struct kobj_attribute enabled_attr = | ||
| 241 | __ATTR(enabled, 0644, enabled_show, enabled_store); | ||
| 242 | |||
| 243 | static ssize_t single_flag_show(struct kobject *kobj, | ||
| 244 | struct kobj_attribute *attr, char *buf, | ||
| 245 | enum transparent_hugepage_flag flag) | ||
| 246 | { | ||
| 247 | if (test_bit(flag, &transparent_hugepage_flags)) | ||
| 248 | return sprintf(buf, "[yes] no\n"); | ||
| 249 | else | ||
| 250 | return sprintf(buf, "yes [no]\n"); | ||
| 251 | } | ||
| 252 | static ssize_t single_flag_store(struct kobject *kobj, | ||
| 253 | struct kobj_attribute *attr, | ||
| 254 | const char *buf, size_t count, | ||
| 255 | enum transparent_hugepage_flag flag) | ||
| 256 | { | ||
| 257 | if (!memcmp("yes", buf, | ||
| 258 | min(sizeof("yes")-1, count))) { | ||
| 259 | set_bit(flag, &transparent_hugepage_flags); | ||
| 260 | } else if (!memcmp("no", buf, | ||
| 261 | min(sizeof("no")-1, count))) { | ||
| 262 | clear_bit(flag, &transparent_hugepage_flags); | ||
| 263 | } else | ||
| 264 | return -EINVAL; | ||
| 265 | |||
| 266 | return count; | ||
| 267 | } | ||
| 268 | |||
| 269 | /* | ||
| 270 | * Currently defrag only enables __GFP_WAIT for the allocation. A blind | ||
| 271 | * __GFP_REPEAT would be too aggressive; it's never worth swapping tons of | ||
| 272 | * memory just to allocate one more hugepage. | ||
| 273 | */ | ||
| 274 | static ssize_t defrag_show(struct kobject *kobj, | ||
| 275 | struct kobj_attribute *attr, char *buf) | ||
| 276 | { | ||
| 277 | return double_flag_show(kobj, attr, buf, | ||
| 278 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | ||
| 279 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | ||
| 280 | } | ||
| 281 | static ssize_t defrag_store(struct kobject *kobj, | ||
| 282 | struct kobj_attribute *attr, | ||
| 283 | const char *buf, size_t count) | ||
| 284 | { | ||
| 285 | return double_flag_store(kobj, attr, buf, count, | ||
| 286 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | ||
| 287 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | ||
| 288 | } | ||
| 289 | static struct kobj_attribute defrag_attr = | ||
| 290 | __ATTR(defrag, 0644, defrag_show, defrag_store); | ||
| 291 | |||
| 292 | #ifdef CONFIG_DEBUG_VM | ||
| 293 | static ssize_t debug_cow_show(struct kobject *kobj, | ||
| 294 | struct kobj_attribute *attr, char *buf) | ||
| 295 | { | ||
| 296 | return single_flag_show(kobj, attr, buf, | ||
| 297 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | ||
| 298 | } | ||
| 299 | static ssize_t debug_cow_store(struct kobject *kobj, | ||
| 300 | struct kobj_attribute *attr, | ||
| 301 | const char *buf, size_t count) | ||
| 302 | { | ||
| 303 | return single_flag_store(kobj, attr, buf, count, | ||
| 304 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | ||
| 305 | } | ||
| 306 | static struct kobj_attribute debug_cow_attr = | ||
| 307 | __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); | ||
| 308 | #endif /* CONFIG_DEBUG_VM */ | ||
| 309 | |||
| 310 | static struct attribute *hugepage_attr[] = { | ||
| 311 | &enabled_attr.attr, | ||
| 312 | &defrag_attr.attr, | ||
| 313 | #ifdef CONFIG_DEBUG_VM | ||
| 314 | &debug_cow_attr.attr, | ||
| 315 | #endif | ||
| 316 | NULL, | ||
| 317 | }; | ||
| 318 | |||
| 319 | static struct attribute_group hugepage_attr_group = { | ||
| 320 | .attrs = hugepage_attr, | ||
| 321 | }; | ||
| 322 | |||
| 323 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, | ||
| 324 | struct kobj_attribute *attr, | ||
| 325 | char *buf) | ||
| 326 | { | ||
| 327 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); | ||
| 328 | } | ||
| 329 | |||
| 330 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, | ||
| 331 | struct kobj_attribute *attr, | ||
| 332 | const char *buf, size_t count) | ||
| 333 | { | ||
| 334 | unsigned long msecs; | ||
| 335 | int err; | ||
| 336 | |||
| 337 | err = strict_strtoul(buf, 10, &msecs); | ||
| 338 | if (err || msecs > UINT_MAX) | ||
| 339 | return -EINVAL; | ||
| 340 | |||
| 341 | khugepaged_scan_sleep_millisecs = msecs; | ||
| 342 | wake_up_interruptible(&khugepaged_wait); | ||
| 343 | |||
| 344 | return count; | ||
| 345 | } | ||
| 346 | static struct kobj_attribute scan_sleep_millisecs_attr = | ||
| 347 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, | ||
| 348 | scan_sleep_millisecs_store); | ||
| 349 | |||
| 350 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, | ||
| 351 | struct kobj_attribute *attr, | ||
| 352 | char *buf) | ||
| 353 | { | ||
| 354 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); | ||
| 355 | } | ||
| 356 | |||
| 357 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, | ||
| 358 | struct kobj_attribute *attr, | ||
| 359 | const char *buf, size_t count) | ||
| 360 | { | ||
| 361 | unsigned long msecs; | ||
| 362 | int err; | ||
| 363 | |||
| 364 | err = strict_strtoul(buf, 10, &msecs); | ||
| 365 | if (err || msecs > UINT_MAX) | ||
| 366 | return -EINVAL; | ||
| 367 | |||
| 368 | khugepaged_alloc_sleep_millisecs = msecs; | ||
| 369 | wake_up_interruptible(&khugepaged_wait); | ||
| 370 | |||
| 371 | return count; | ||
| 372 | } | ||
| 373 | static struct kobj_attribute alloc_sleep_millisecs_attr = | ||
| 374 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, | ||
| 375 | alloc_sleep_millisecs_store); | ||
| 376 | |||
| 377 | static ssize_t pages_to_scan_show(struct kobject *kobj, | ||
| 378 | struct kobj_attribute *attr, | ||
| 379 | char *buf) | ||
| 380 | { | ||
| 381 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); | ||
| 382 | } | ||
| 383 | static ssize_t pages_to_scan_store(struct kobject *kobj, | ||
| 384 | struct kobj_attribute *attr, | ||
| 385 | const char *buf, size_t count) | ||
| 386 | { | ||
| 387 | int err; | ||
| 388 | unsigned long pages; | ||
| 389 | |||
| 390 | err = strict_strtoul(buf, 10, &pages); | ||
| 391 | if (err || !pages || pages > UINT_MAX) | ||
| 392 | return -EINVAL; | ||
| 393 | |||
| 394 | khugepaged_pages_to_scan = pages; | ||
| 395 | |||
| 396 | return count; | ||
| 397 | } | ||
| 398 | static struct kobj_attribute pages_to_scan_attr = | ||
| 399 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, | ||
| 400 | pages_to_scan_store); | ||
| 401 | |||
| 402 | static ssize_t pages_collapsed_show(struct kobject *kobj, | ||
| 403 | struct kobj_attribute *attr, | ||
| 404 | char *buf) | ||
| 405 | { | ||
| 406 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); | ||
| 407 | } | ||
| 408 | static struct kobj_attribute pages_collapsed_attr = | ||
| 409 | __ATTR_RO(pages_collapsed); | ||
| 410 | |||
| 411 | static ssize_t full_scans_show(struct kobject *kobj, | ||
| 412 | struct kobj_attribute *attr, | ||
| 413 | char *buf) | ||
| 414 | { | ||
| 415 | return sprintf(buf, "%u\n", khugepaged_full_scans); | ||
| 416 | } | ||
| 417 | static struct kobj_attribute full_scans_attr = | ||
| 418 | __ATTR_RO(full_scans); | ||
| 419 | |||
| 420 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, | ||
| 421 | struct kobj_attribute *attr, char *buf) | ||
| 422 | { | ||
| 423 | return single_flag_show(kobj, attr, buf, | ||
| 424 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
| 425 | } | ||
| 426 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, | ||
| 427 | struct kobj_attribute *attr, | ||
| 428 | const char *buf, size_t count) | ||
| 429 | { | ||
| 430 | return single_flag_store(kobj, attr, buf, count, | ||
| 431 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
| 432 | } | ||
| 433 | static struct kobj_attribute khugepaged_defrag_attr = | ||
| 434 | __ATTR(defrag, 0644, khugepaged_defrag_show, | ||
| 435 | khugepaged_defrag_store); | ||
| 436 | |||
| 437 | /* | ||
| 438 | * max_ptes_none controls if khugepaged should collapse hugepages over | ||
| 439 | * any unmapped ptes, in turn potentially increasing the memory | ||
| 440 | * footprint of the vmas. When max_ptes_none is 0, khugepaged will not | ||
| 441 | * reduce the available free memory in the system as it | ||
| 442 | * runs. Increasing max_ptes_none will instead potentially reduce the | ||
| 443 | * free memory in the system during the khugepaged scan. | ||
| 444 | */ | ||
| 445 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, | ||
| 446 | struct kobj_attribute *attr, | ||
| 447 | char *buf) | ||
| 448 | { | ||
| 449 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); | ||
| 450 | } | ||
| 451 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, | ||
| 452 | struct kobj_attribute *attr, | ||
| 453 | const char *buf, size_t count) | ||
| 454 | { | ||
| 455 | int err; | ||
| 456 | unsigned long max_ptes_none; | ||
| 457 | |||
| 458 | err = strict_strtoul(buf, 10, &max_ptes_none); | ||
| 459 | if (err || max_ptes_none > HPAGE_PMD_NR-1) | ||
| 460 | return -EINVAL; | ||
| 461 | |||
| 462 | khugepaged_max_ptes_none = max_ptes_none; | ||
| 463 | |||
| 464 | return count; | ||
| 465 | } | ||
| 466 | static struct kobj_attribute khugepaged_max_ptes_none_attr = | ||
| 467 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, | ||
| 468 | khugepaged_max_ptes_none_store); | ||
| 469 | |||
| 470 | static struct attribute *khugepaged_attr[] = { | ||
| 471 | &khugepaged_defrag_attr.attr, | ||
| 472 | &khugepaged_max_ptes_none_attr.attr, | ||
| 473 | &pages_to_scan_attr.attr, | ||
| 474 | &pages_collapsed_attr.attr, | ||
| 475 | &full_scans_attr.attr, | ||
| 476 | &scan_sleep_millisecs_attr.attr, | ||
| 477 | &alloc_sleep_millisecs_attr.attr, | ||
| 478 | NULL, | ||
| 479 | }; | ||
| 480 | |||
| 481 | static struct attribute_group khugepaged_attr_group = { | ||
| 482 | .attrs = khugepaged_attr, | ||
| 483 | .name = "khugepaged", | ||
| 484 | }; | ||
| 485 | #endif /* CONFIG_SYSFS */ | ||
| 486 | |||
| 487 | static int __init hugepage_init(void) | ||
| 488 | { | ||
| 489 | int err; | ||
| 490 | #ifdef CONFIG_SYSFS | ||
| 491 | static struct kobject *hugepage_kobj; | ||
| 492 | #endif | ||
| 493 | |||
| 494 | err = -EINVAL; | ||
| 495 | if (!has_transparent_hugepage()) { | ||
| 496 | transparent_hugepage_flags = 0; | ||
| 497 | goto out; | ||
| 498 | } | ||
| 499 | |||
| 500 | #ifdef CONFIG_SYSFS | ||
| 501 | err = -ENOMEM; | ||
| 502 | hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | ||
| 503 | if (unlikely(!hugepage_kobj)) { | ||
| 504 | printk(KERN_ERR "hugepage: failed kobject create\n"); | ||
| 505 | goto out; | ||
| 506 | } | ||
| 507 | |||
| 508 | err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); | ||
| 509 | if (err) { | ||
| 510 | printk(KERN_ERR "hugepage: failed to register hugepage group\n"); | ||
| 511 | goto out; | ||
| 512 | } | ||
| 513 | |||
| 514 | err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); | ||
| 515 | if (err) { | ||
| 516 | printk(KERN_ERR "hugepage: failed to register khugepaged group\n"); | ||
| 517 | goto out; | ||
| 518 | } | ||
| 519 | #endif | ||
| 520 | |||
| 521 | err = khugepaged_slab_init(); | ||
| 522 | if (err) | ||
| 523 | goto out; | ||
| 524 | |||
| 525 | err = mm_slots_hash_init(); | ||
| 526 | if (err) { | ||
| 527 | khugepaged_slab_free(); | ||
| 528 | goto out; | ||
| 529 | } | ||
| 530 | |||
| 531 | /* | ||
| 532 | * By default disable transparent hugepages on smaller systems, | ||
| 533 | * where the extra memory used could hurt more than TLB overhead | ||
| 534 | * is likely to save. The admin can still enable it through /sys. | ||
| 535 | */ | ||
| 536 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) | ||
| 537 | transparent_hugepage_flags = 0; | ||
| 538 | |||
| 539 | start_khugepaged(); | ||
| 540 | |||
| 541 | set_recommended_min_free_kbytes(); | ||
| 542 | |||
| 543 | out: | ||
| 544 | return err; | ||
| 545 | } | ||
| 546 | module_init(hugepage_init) | ||
| 547 | |||
| 548 | static int __init setup_transparent_hugepage(char *str) | ||
| 549 | { | ||
| 550 | int ret = 0; | ||
| 551 | if (!str) | ||
| 552 | goto out; | ||
| 553 | if (!strcmp(str, "always")) { | ||
| 554 | set_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
| 555 | &transparent_hugepage_flags); | ||
| 556 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
| 557 | &transparent_hugepage_flags); | ||
| 558 | ret = 1; | ||
| 559 | } else if (!strcmp(str, "madvise")) { | ||
| 560 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
| 561 | &transparent_hugepage_flags); | ||
| 562 | set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
| 563 | &transparent_hugepage_flags); | ||
| 564 | ret = 1; | ||
| 565 | } else if (!strcmp(str, "never")) { | ||
| 566 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
| 567 | &transparent_hugepage_flags); | ||
| 568 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
| 569 | &transparent_hugepage_flags); | ||
| 570 | ret = 1; | ||
| 571 | } | ||
| 572 | out: | ||
| 573 | if (!ret) | ||
| 574 | printk(KERN_WARNING | ||
| 575 | "transparent_hugepage= cannot parse, ignored\n"); | ||
| 576 | return ret; | ||
| 577 | } | ||
| 578 | __setup("transparent_hugepage=", setup_transparent_hugepage); | ||
| 579 | |||
| 580 | static void prepare_pmd_huge_pte(pgtable_t pgtable, | ||
| 581 | struct mm_struct *mm) | ||
| 582 | { | ||
| 583 | assert_spin_locked(&mm->page_table_lock); | ||
| 584 | |||
| 585 | /* FIFO */ | ||
| 586 | if (!mm->pmd_huge_pte) | ||
| 587 | INIT_LIST_HEAD(&pgtable->lru); | ||
| 588 | else | ||
| 589 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
| 590 | mm->pmd_huge_pte = pgtable; | ||
| 591 | } | ||
| 592 | |||
| 593 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | ||
| 594 | { | ||
| 595 | if (likely(vma->vm_flags & VM_WRITE)) | ||
| 596 | pmd = pmd_mkwrite(pmd); | ||
| 597 | return pmd; | ||
| 598 | } | ||
| 599 | |||
| 600 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | ||
| 601 | struct vm_area_struct *vma, | ||
| 602 | unsigned long haddr, pmd_t *pmd, | ||
| 603 | struct page *page) | ||
| 604 | { | ||
| 605 | int ret = 0; | ||
| 606 | pgtable_t pgtable; | ||
| 607 | |||
| 608 | VM_BUG_ON(!PageCompound(page)); | ||
| 609 | pgtable = pte_alloc_one(mm, haddr); | ||
| 610 | if (unlikely(!pgtable)) { | ||
| 611 | mem_cgroup_uncharge_page(page); | ||
| 612 | put_page(page); | ||
| 613 | return VM_FAULT_OOM; | ||
| 614 | } | ||
| 615 | |||
| 616 | clear_huge_page(page, haddr, HPAGE_PMD_NR); | ||
| 617 | __SetPageUptodate(page); | ||
| 618 | |||
| 619 | spin_lock(&mm->page_table_lock); | ||
| 620 | if (unlikely(!pmd_none(*pmd))) { | ||
| 621 | spin_unlock(&mm->page_table_lock); | ||
| 622 | mem_cgroup_uncharge_page(page); | ||
| 623 | put_page(page); | ||
| 624 | pte_free(mm, pgtable); | ||
| 625 | } else { | ||
| 626 | pmd_t entry; | ||
| 627 | entry = mk_pmd(page, vma->vm_page_prot); | ||
| 628 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
| 629 | entry = pmd_mkhuge(entry); | ||
| 630 | /* | ||
| 631 | * The spinlocking to take the lru_lock inside | ||
| 632 | * page_add_new_anon_rmap() acts as a full memory | ||
| 633 | * barrier to be sure clear_huge_page writes become | ||
| 634 | * visible before the set_pmd_at() write. | ||
| 635 | */ | ||
| 636 | page_add_new_anon_rmap(page, vma, haddr); | ||
| 637 | set_pmd_at(mm, haddr, pmd, entry); | ||
| 638 | prepare_pmd_huge_pte(pgtable, mm); | ||
| 639 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | ||
| 640 | spin_unlock(&mm->page_table_lock); | ||
| 641 | } | ||
| 642 | |||
| 643 | return ret; | ||
| 644 | } | ||
| 645 | |||
| 646 | static inline gfp_t alloc_hugepage_gfpmask(int defrag) | ||
| 647 | { | ||
| 648 | return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); | ||
| 649 | } | ||
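
Spelled out, the gfp expression above keeps __GFP_WAIT only when defragmentation is requested; an equivalent expanded form, shown purely as a readability sketch rather than part of the patch:

/* Expanded equivalent of alloc_hugepage_gfpmask() above (sketch only): */
static inline gfp_t alloc_hugepage_gfpmask_expanded(int defrag)
{
        if (defrag)
                return GFP_TRANSHUGE;              /* may sleep/compact for an HPAGE_PMD_ORDER page */
        return GFP_TRANSHUGE & ~__GFP_WAIT;        /* fail fast instead, falling back to 4k ptes */
}
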
| 650 | |||
| 651 | static inline struct page *alloc_hugepage_vma(int defrag, | ||
| 652 | struct vm_area_struct *vma, | ||
| 653 | unsigned long haddr) | ||
| 654 | { | ||
| 655 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), | ||
| 656 | HPAGE_PMD_ORDER, vma, haddr); | ||
| 657 | } | ||
| 658 | |||
| 659 | #ifndef CONFIG_NUMA | ||
| 660 | static inline struct page *alloc_hugepage(int defrag) | ||
| 661 | { | ||
| 662 | return alloc_pages(alloc_hugepage_gfpmask(defrag), | ||
| 663 | HPAGE_PMD_ORDER); | ||
| 664 | } | ||
| 665 | #endif | ||
| 666 | |||
| 667 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 668 | unsigned long address, pmd_t *pmd, | ||
| 669 | unsigned int flags) | ||
| 670 | { | ||
| 671 | struct page *page; | ||
| 672 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
| 673 | pte_t *pte; | ||
| 674 | |||
| 675 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { | ||
| 676 | if (unlikely(anon_vma_prepare(vma))) | ||
| 677 | return VM_FAULT_OOM; | ||
| 678 | if (unlikely(khugepaged_enter(vma))) | ||
| 679 | return VM_FAULT_OOM; | ||
| 680 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | ||
| 681 | vma, haddr); | ||
| 682 | if (unlikely(!page)) | ||
| 683 | goto out; | ||
| 684 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | ||
| 685 | put_page(page); | ||
| 686 | goto out; | ||
| 687 | } | ||
| 688 | |||
| 689 | return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); | ||
| 690 | } | ||
| 691 | out: | ||
| 692 | /* | ||
| 693 | * Use __pte_alloc instead of pte_alloc_map, because we can't | ||
| 694 | * run pte_offset_map on the pmd if a huge pmd could | ||
| 695 | * materialize from under us from a different thread. | ||
| 696 | */ | ||
| 697 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
| 698 | return VM_FAULT_OOM; | ||
| 699 | /* if a huge pmd materialized from under us, just retry later */ | ||
| 700 | if (unlikely(pmd_trans_huge(*pmd))) | ||
| 701 | return 0; | ||
| 702 | /* | ||
| 703 | * A regular pmd is established and it can't morph into a huge pmd | ||
| 704 | * from under us anymore at this point because we hold the mmap_sem | ||
| 705 | * read mode and khugepaged takes it in write mode. So now it's | ||
| 706 | * safe to run pte_offset_map(). | ||
| 707 | */ | ||
| 708 | pte = pte_offset_map(pmd, address); | ||
| 709 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | ||
| 710 | } | ||
| 711 | |||
| 712 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||
| 713 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | ||
| 714 | struct vm_area_struct *vma) | ||
| 715 | { | ||
| 716 | struct page *src_page; | ||
| 717 | pmd_t pmd; | ||
| 718 | pgtable_t pgtable; | ||
| 719 | int ret; | ||
| 720 | |||
| 721 | ret = -ENOMEM; | ||
| 722 | pgtable = pte_alloc_one(dst_mm, addr); | ||
| 723 | if (unlikely(!pgtable)) | ||
| 724 | goto out; | ||
| 725 | |||
| 726 | spin_lock(&dst_mm->page_table_lock); | ||
| 727 | spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); | ||
| 728 | |||
| 729 | ret = -EAGAIN; | ||
| 730 | pmd = *src_pmd; | ||
| 731 | if (unlikely(!pmd_trans_huge(pmd))) { | ||
| 732 | pte_free(dst_mm, pgtable); | ||
| 733 | goto out_unlock; | ||
| 734 | } | ||
| 735 | if (unlikely(pmd_trans_splitting(pmd))) { | ||
| 736 | /* split huge page running from under us */ | ||
| 737 | spin_unlock(&src_mm->page_table_lock); | ||
| 738 | spin_unlock(&dst_mm->page_table_lock); | ||
| 739 | pte_free(dst_mm, pgtable); | ||
| 740 | |||
| 741 | wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ | ||
| 742 | goto out; | ||
| 743 | } | ||
| 744 | src_page = pmd_page(pmd); | ||
| 745 | VM_BUG_ON(!PageHead(src_page)); | ||
| 746 | get_page(src_page); | ||
| 747 | page_dup_rmap(src_page); | ||
| 748 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); | ||
| 749 | |||
| 750 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | ||
| 751 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | ||
| 752 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | ||
| 753 | prepare_pmd_huge_pte(pgtable, dst_mm); | ||
| 754 | |||
| 755 | ret = 0; | ||
| 756 | out_unlock: | ||
| 757 | spin_unlock(&src_mm->page_table_lock); | ||
| 758 | spin_unlock(&dst_mm->page_table_lock); | ||
| 759 | out: | ||
| 760 | return ret; | ||
| 761 | } | ||
| 762 | |||
| 763 | /* no "address" argument so destroys page coloring of some arch */ | ||
| 764 | pgtable_t get_pmd_huge_pte(struct mm_struct *mm) | ||
| 765 | { | ||
| 766 | pgtable_t pgtable; | ||
| 767 | |||
| 768 | assert_spin_locked(&mm->page_table_lock); | ||
| 769 | |||
| 770 | /* FIFO */ | ||
| 771 | pgtable = mm->pmd_huge_pte; | ||
| 772 | if (list_empty(&pgtable->lru)) | ||
| 773 | mm->pmd_huge_pte = NULL; | ||
| 774 | else { | ||
| 775 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
| 776 | struct page, lru); | ||
| 777 | list_del(&pgtable->lru); | ||
| 778 | } | ||
| 779 | return pgtable; | ||
| 780 | } | ||
| 781 | |||
| 782 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | ||
| 783 | struct vm_area_struct *vma, | ||
| 784 | unsigned long address, | ||
| 785 | pmd_t *pmd, pmd_t orig_pmd, | ||
| 786 | struct page *page, | ||
| 787 | unsigned long haddr) | ||
| 788 | { | ||
| 789 | pgtable_t pgtable; | ||
| 790 | pmd_t _pmd; | ||
| 791 | int ret = 0, i; | ||
| 792 | struct page **pages; | ||
| 793 | |||
| 794 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, | ||
| 795 | GFP_KERNEL); | ||
| 796 | if (unlikely(!pages)) { | ||
| 797 | ret |= VM_FAULT_OOM; | ||
| 798 | goto out; | ||
| 799 | } | ||
| 800 | |||
| 801 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
| 802 | pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, | ||
| 803 | vma, address); | ||
| 804 | if (unlikely(!pages[i] || | ||
| 805 | mem_cgroup_newpage_charge(pages[i], mm, | ||
| 806 | GFP_KERNEL))) { | ||
| 807 | if (pages[i]) | ||
| 808 | put_page(pages[i]); | ||
| 809 | mem_cgroup_uncharge_start(); | ||
| 810 | while (--i >= 0) { | ||
| 811 | mem_cgroup_uncharge_page(pages[i]); | ||
| 812 | put_page(pages[i]); | ||
| 813 | } | ||
| 814 | mem_cgroup_uncharge_end(); | ||
| 815 | kfree(pages); | ||
| 816 | ret |= VM_FAULT_OOM; | ||
| 817 | goto out; | ||
| 818 | } | ||
| 819 | } | ||
| 820 | |||
| 821 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
| 822 | copy_user_highpage(pages[i], page + i, | ||
| 823 | haddr + PAGE_SIZE*i, vma); | ||
| 824 | __SetPageUptodate(pages[i]); | ||
| 825 | cond_resched(); | ||
| 826 | } | ||
| 827 | |||
| 828 | spin_lock(&mm->page_table_lock); | ||
| 829 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
| 830 | goto out_free_pages; | ||
| 831 | VM_BUG_ON(!PageHead(page)); | ||
| 832 | |||
| 833 | pmdp_clear_flush_notify(vma, haddr, pmd); | ||
| 834 | /* leave pmd empty until pte is filled */ | ||
| 835 | |||
| 836 | pgtable = get_pmd_huge_pte(mm); | ||
| 837 | pmd_populate(mm, &_pmd, pgtable); | ||
| 838 | |||
| 839 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
| 840 | pte_t *pte, entry; | ||
| 841 | entry = mk_pte(pages[i], vma->vm_page_prot); | ||
| 842 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 843 | page_add_new_anon_rmap(pages[i], vma, haddr); | ||
| 844 | pte = pte_offset_map(&_pmd, haddr); | ||
| 845 | VM_BUG_ON(!pte_none(*pte)); | ||
| 846 | set_pte_at(mm, haddr, pte, entry); | ||
| 847 | pte_unmap(pte); | ||
| 848 | } | ||
| 849 | kfree(pages); | ||
| 850 | |||
| 851 | mm->nr_ptes++; | ||
| 852 | smp_wmb(); /* make pte visible before pmd */ | ||
| 853 | pmd_populate(mm, pmd, pgtable); | ||
| 854 | page_remove_rmap(page); | ||
| 855 | spin_unlock(&mm->page_table_lock); | ||
| 856 | |||
| 857 | ret |= VM_FAULT_WRITE; | ||
| 858 | put_page(page); | ||
| 859 | |||
| 860 | out: | ||
| 861 | return ret; | ||
| 862 | |||
| 863 | out_free_pages: | ||
| 864 | spin_unlock(&mm->page_table_lock); | ||
| 865 | mem_cgroup_uncharge_start(); | ||
| 866 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
| 867 | mem_cgroup_uncharge_page(pages[i]); | ||
| 868 | put_page(pages[i]); | ||
| 869 | } | ||
| 870 | mem_cgroup_uncharge_end(); | ||
| 871 | kfree(pages); | ||
| 872 | goto out; | ||
| 873 | } | ||
| 874 | |||
| 875 | int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 876 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | ||
| 877 | { | ||
| 878 | int ret = 0; | ||
| 879 | struct page *page, *new_page; | ||
| 880 | unsigned long haddr; | ||
| 881 | |||
| 882 | VM_BUG_ON(!vma->anon_vma); | ||
| 883 | spin_lock(&mm->page_table_lock); | ||
| 884 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
| 885 | goto out_unlock; | ||
| 886 | |||
| 887 | page = pmd_page(orig_pmd); | ||
| 888 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); | ||
| 889 | haddr = address & HPAGE_PMD_MASK; | ||
| 890 | if (page_mapcount(page) == 1) { | ||
| 891 | pmd_t entry; | ||
| 892 | entry = pmd_mkyoung(orig_pmd); | ||
| 893 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
| 894 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | ||
| 895 | update_mmu_cache(vma, address, entry); | ||
| 896 | ret |= VM_FAULT_WRITE; | ||
| 897 | goto out_unlock; | ||
| 898 | } | ||
| 899 | get_page(page); | ||
| 900 | spin_unlock(&mm->page_table_lock); | ||
| 901 | |||
| 902 | if (transparent_hugepage_enabled(vma) && | ||
| 903 | !transparent_hugepage_debug_cow()) | ||
| 904 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | ||
| 905 | vma, haddr); | ||
| 906 | else | ||
| 907 | new_page = NULL; | ||
| 908 | |||
| 909 | if (unlikely(!new_page)) { | ||
| 910 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | ||
| 911 | pmd, orig_pmd, page, haddr); | ||
| 912 | put_page(page); | ||
| 913 | goto out; | ||
| 914 | } | ||
| 915 | |||
| 916 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | ||
| 917 | put_page(new_page); | ||
| 918 | put_page(page); | ||
| 919 | ret |= VM_FAULT_OOM; | ||
| 920 | goto out; | ||
| 921 | } | ||
| 922 | |||
| 923 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | ||
| 924 | __SetPageUptodate(new_page); | ||
| 925 | |||
| 926 | spin_lock(&mm->page_table_lock); | ||
| 927 | put_page(page); | ||
| 928 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | ||
| 929 | mem_cgroup_uncharge_page(new_page); | ||
| 930 | put_page(new_page); | ||
| 931 | } else { | ||
| 932 | pmd_t entry; | ||
| 933 | VM_BUG_ON(!PageHead(page)); | ||
| 934 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
| 935 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
| 936 | entry = pmd_mkhuge(entry); | ||
| 937 | pmdp_clear_flush_notify(vma, haddr, pmd); | ||
| 938 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
| 939 | set_pmd_at(mm, haddr, pmd, entry); | ||
| 940 | update_mmu_cache(vma, address, entry); | ||
| 941 | page_remove_rmap(page); | ||
| 942 | put_page(page); | ||
| 943 | ret |= VM_FAULT_WRITE; | ||
| 944 | } | ||
| 945 | out_unlock: | ||
| 946 | spin_unlock(&mm->page_table_lock); | ||
| 947 | out: | ||
| 948 | return ret; | ||
| 949 | } | ||
| 950 | |||
| 951 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, | ||
| 952 | unsigned long addr, | ||
| 953 | pmd_t *pmd, | ||
| 954 | unsigned int flags) | ||
| 955 | { | ||
| 956 | struct page *page = NULL; | ||
| 957 | |||
| 958 | assert_spin_locked(&mm->page_table_lock); | ||
| 959 | |||
| 960 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) | ||
| 961 | goto out; | ||
| 962 | |||
| 963 | page = pmd_page(*pmd); | ||
| 964 | VM_BUG_ON(!PageHead(page)); | ||
| 965 | if (flags & FOLL_TOUCH) { | ||
| 966 | pmd_t _pmd; | ||
| 967 | /* | ||
| 968 | * We should set the dirty bit only for FOLL_WRITE but | ||
| 969 | * for now the dirty bit in the pmd is meaningless. | ||
| 970 | * And if the dirty bit ever becomes meaningful and | ||
| 971 | * we only set it with FOLL_WRITE, an atomic | ||
| 972 | * set_bit will be required on the pmd to set the | ||
| 973 | * young bit, instead of the current set_pmd_at. | ||
| 974 | */ | ||
| 975 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | ||
| 976 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | ||
| 977 | } | ||
| 978 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | ||
| 979 | VM_BUG_ON(!PageCompound(page)); | ||
| 980 | if (flags & FOLL_GET) | ||
| 981 | get_page(page); | ||
| 982 | |||
| 983 | out: | ||
| 984 | return page; | ||
| 985 | } | ||
| 986 | |||
| 987 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | ||
| 988 | pmd_t *pmd) | ||
| 989 | { | ||
| 990 | int ret = 0; | ||
| 991 | |||
| 992 | spin_lock(&tlb->mm->page_table_lock); | ||
| 993 | if (likely(pmd_trans_huge(*pmd))) { | ||
| 994 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
| 995 | spin_unlock(&tlb->mm->page_table_lock); | ||
| 996 | wait_split_huge_page(vma->anon_vma, | ||
| 997 | pmd); | ||
| 998 | } else { | ||
| 999 | struct page *page; | ||
| 1000 | pgtable_t pgtable; | ||
| 1001 | pgtable = get_pmd_huge_pte(tlb->mm); | ||
| 1002 | page = pmd_page(*pmd); | ||
| 1003 | pmd_clear(pmd); | ||
| 1004 | page_remove_rmap(page); | ||
| 1005 | VM_BUG_ON(page_mapcount(page) < 0); | ||
| 1006 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
| 1007 | VM_BUG_ON(!PageHead(page)); | ||
| 1008 | spin_unlock(&tlb->mm->page_table_lock); | ||
| 1009 | tlb_remove_page(tlb, page); | ||
| 1010 | pte_free(tlb->mm, pgtable); | ||
| 1011 | ret = 1; | ||
| 1012 | } | ||
| 1013 | } else | ||
| 1014 | spin_unlock(&tlb->mm->page_table_lock); | ||
| 1015 | |||
| 1016 | return ret; | ||
| 1017 | } | ||
| 1018 | |||
| 1019 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
| 1020 | unsigned long addr, unsigned long end, | ||
| 1021 | unsigned char *vec) | ||
| 1022 | { | ||
| 1023 | int ret = 0; | ||
| 1024 | |||
| 1025 | spin_lock(&vma->vm_mm->page_table_lock); | ||
| 1026 | if (likely(pmd_trans_huge(*pmd))) { | ||
| 1027 | ret = !pmd_trans_splitting(*pmd); | ||
| 1028 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
| 1029 | if (unlikely(!ret)) | ||
| 1030 | wait_split_huge_page(vma->anon_vma, pmd); | ||
| 1031 | else { | ||
| 1032 | /* | ||
| 1033 | * All logical pages in the range are present | ||
| 1034 | * if backed by a huge page. | ||
| 1035 | */ | ||
| 1036 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
| 1037 | } | ||
| 1038 | } else | ||
| 1039 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
| 1040 | |||
| 1041 | return ret; | ||
| 1042 | } | ||
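
Seen from user space, the huge-pmd path above simply reports every 4k page of a mapped huge page as resident. A small illustrative program, a sketch assuming a 4k base page size, an x86-style 2MB pmd, and that the range actually ends up backed by a huge pmd:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 2UL << 20;         /* one pmd-sized range */
        unsigned char vec[512];         /* one byte per 4k page */
        int i, resident = 0;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        memset(p, 0, len);              /* fault the range in */
        if (mincore(p, len, vec))
                return 1;
        for (i = 0; i < 512; i++)
                resident += vec[i] & 1;
        printf("resident pages: %d/512\n", resident);
        return 0;
}
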
| 1043 | |||
| 1044 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
| 1045 | unsigned long addr, pgprot_t newprot) | ||
| 1046 | { | ||
| 1047 | struct mm_struct *mm = vma->vm_mm; | ||
| 1048 | int ret = 0; | ||
| 1049 | |||
| 1050 | spin_lock(&mm->page_table_lock); | ||
| 1051 | if (likely(pmd_trans_huge(*pmd))) { | ||
| 1052 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
| 1053 | spin_unlock(&mm->page_table_lock); | ||
| 1054 | wait_split_huge_page(vma->anon_vma, pmd); | ||
| 1055 | } else { | ||
| 1056 | pmd_t entry; | ||
| 1057 | |||
| 1058 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
| 1059 | entry = pmd_modify(entry, newprot); | ||
| 1060 | set_pmd_at(mm, addr, pmd, entry); | ||
| 1061 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
| 1062 | flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); | ||
| 1063 | ret = 1; | ||
| 1064 | } | ||
| 1065 | } else | ||
| 1066 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
| 1067 | |||
| 1068 | return ret; | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | pmd_t *page_check_address_pmd(struct page *page, | ||
| 1072 | struct mm_struct *mm, | ||
| 1073 | unsigned long address, | ||
| 1074 | enum page_check_address_pmd_flag flag) | ||
| 1075 | { | ||
| 1076 | pgd_t *pgd; | ||
| 1077 | pud_t *pud; | ||
| 1078 | pmd_t *pmd, *ret = NULL; | ||
| 1079 | |||
| 1080 | if (address & ~HPAGE_PMD_MASK) | ||
| 1081 | goto out; | ||
| 1082 | |||
| 1083 | pgd = pgd_offset(mm, address); | ||
| 1084 | if (!pgd_present(*pgd)) | ||
| 1085 | goto out; | ||
| 1086 | |||
| 1087 | pud = pud_offset(pgd, address); | ||
| 1088 | if (!pud_present(*pud)) | ||
| 1089 | goto out; | ||
| 1090 | |||
| 1091 | pmd = pmd_offset(pud, address); | ||
| 1092 | if (pmd_none(*pmd)) | ||
| 1093 | goto out; | ||
| 1094 | if (pmd_page(*pmd) != page) | ||
| 1095 | goto out; | ||
| 1096 | /* | ||
| 1097 | * split_vma() may create temporary aliased mappings. There is | ||
| 1098 | * no risk as long as all huge pmds are found and have their | ||
| 1099 | * splitting bit set before __split_huge_page_refcount | ||
| 1100 | * runs. Finding the same huge pmd more than once during the | ||
| 1101 | * same rmap walk is not a problem. | ||
| 1102 | */ | ||
| 1103 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && | ||
| 1104 | pmd_trans_splitting(*pmd)) | ||
| 1105 | goto out; | ||
| 1106 | if (pmd_trans_huge(*pmd)) { | ||
| 1107 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && | ||
| 1108 | !pmd_trans_splitting(*pmd)); | ||
| 1109 | ret = pmd; | ||
| 1110 | } | ||
| 1111 | out: | ||
| 1112 | return ret; | ||
| 1113 | } | ||
| 1114 | |||
| 1115 | static int __split_huge_page_splitting(struct page *page, | ||
| 1116 | struct vm_area_struct *vma, | ||
| 1117 | unsigned long address) | ||
| 1118 | { | ||
| 1119 | struct mm_struct *mm = vma->vm_mm; | ||
| 1120 | pmd_t *pmd; | ||
| 1121 | int ret = 0; | ||
| 1122 | |||
| 1123 | spin_lock(&mm->page_table_lock); | ||
| 1124 | pmd = page_check_address_pmd(page, mm, address, | ||
| 1125 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | ||
| 1126 | if (pmd) { | ||
| 1127 | /* | ||
| 1128 | * We can't temporarily set the pmd to null in order | ||
| 1129 | * to split it; the pmd must remain marked huge at all | ||
| 1130 | * times or the VM won't take the pmd_trans_huge paths | ||
| 1131 | * and it won't wait on the anon_vma->root->lock to | ||
| 1132 | * serialize against split_huge_page*. | ||
| 1133 | */ | ||
| 1134 | pmdp_splitting_flush_notify(vma, address, pmd); | ||
| 1135 | ret = 1; | ||
| 1136 | } | ||
| 1137 | spin_unlock(&mm->page_table_lock); | ||
| 1138 | |||
| 1139 | return ret; | ||
| 1140 | } | ||
| 1141 | |||
| 1142 | static void __split_huge_page_refcount(struct page *page) | ||
| 1143 | { | ||
| 1144 | int i; | ||
| 1145 | unsigned long head_index = page->index; | ||
| 1146 | struct zone *zone = page_zone(page); | ||
| 1147 | int zonestat; | ||
| 1148 | |||
| 1149 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | ||
| 1150 | spin_lock_irq(&zone->lru_lock); | ||
| 1151 | compound_lock(page); | ||
| 1152 | |||
| 1153 | for (i = 1; i < HPAGE_PMD_NR; i++) { | ||
| 1154 | struct page *page_tail = page + i; | ||
| 1155 | |||
| 1156 | /* tail_page->_count cannot change */ | ||
| 1157 | atomic_sub(atomic_read(&page_tail->_count), &page->_count); | ||
| 1158 | BUG_ON(page_count(page) <= 0); | ||
| 1159 | atomic_add(page_mapcount(page) + 1, &page_tail->_count); | ||
| 1160 | BUG_ON(atomic_read(&page_tail->_count) <= 0); | ||
| 1161 | |||
| 1162 | /* after clearing PageTail the gup refcount can be released */ | ||
| 1163 | smp_mb(); | ||
| 1164 | |||
| 1165 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | ||
| 1166 | page_tail->flags |= (page->flags & | ||
| 1167 | ((1L << PG_referenced) | | ||
| 1168 | (1L << PG_swapbacked) | | ||
| 1169 | (1L << PG_mlocked) | | ||
| 1170 | (1L << PG_uptodate))); | ||
| 1171 | page_tail->flags |= (1L << PG_dirty); | ||
| 1172 | |||
| 1173 | /* | ||
| 1174 | * 1) clear PageTail before overwriting first_page | ||
| 1175 | * 2) clear PageTail before clearing PageHead for VM_BUG_ON | ||
| 1176 | */ | ||
| 1177 | smp_wmb(); | ||
| 1178 | |||
| 1179 | /* | ||
| 1180 | * __split_huge_page_splitting() already set the | ||
| 1181 | * splitting bit in all pmd that could map this | ||
| 1182 | * hugepage, that will ensure no CPU can alter the | ||
| 1183 | * mapcount on the head page. The mapcount is only | ||
| 1184 | * accounted in the head page and it has to be | ||
| 1185 | * transferred to all tail pages in the below code. So | ||
| 1186 | * for this code to be safe, the mapcount can't change | ||
| 1187 | * during the split. But that doesn't mean userland can't | ||
| 1188 | * keep changing and reading the page contents while | ||
| 1189 | * we transfer the mapcount, so the pmd splitting | ||
| 1190 | * status is achieved by setting a reserved bit in the | ||
| 1191 | * pmd, not by clearing the present bit. | ||
| 1192 | */ | ||
| 1193 | BUG_ON(page_mapcount(page_tail)); | ||
| 1194 | page_tail->_mapcount = page->_mapcount; | ||
| 1195 | |||
| 1196 | BUG_ON(page_tail->mapping); | ||
| 1197 | page_tail->mapping = page->mapping; | ||
| 1198 | |||
| 1199 | page_tail->index = ++head_index; | ||
| 1200 | |||
| 1201 | BUG_ON(!PageAnon(page_tail)); | ||
| 1202 | BUG_ON(!PageUptodate(page_tail)); | ||
| 1203 | BUG_ON(!PageDirty(page_tail)); | ||
| 1204 | BUG_ON(!PageSwapBacked(page_tail)); | ||
| 1205 | |||
| 1206 | lru_add_page_tail(zone, page, page_tail); | ||
| 1207 | } | ||
| 1208 | |||
| 1209 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
| 1210 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | ||
| 1211 | |||
| 1212 | /* | ||
| 1213 | * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, | ||
| 1214 | * so adjust those appropriately if this page is on the LRU. | ||
| 1215 | */ | ||
| 1216 | if (PageLRU(page)) { | ||
| 1217 | zonestat = NR_LRU_BASE + page_lru(page); | ||
| 1218 | __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); | ||
| 1219 | } | ||
| 1220 | |||
| 1221 | ClearPageCompound(page); | ||
| 1222 | compound_unlock(page); | ||
| 1223 | spin_unlock_irq(&zone->lru_lock); | ||
| 1224 | |||
| 1225 | for (i = 1; i < HPAGE_PMD_NR; i++) { | ||
| 1226 | struct page *page_tail = page + i; | ||
| 1227 | BUG_ON(page_count(page_tail) <= 0); | ||
| 1228 | /* | ||
| 1229 | * Tail pages may be freed if there wasn't any mapping, | ||
| 1230 | * for example if add_to_swap() is running on an lru page that | ||
| 1231 | * had its mapping zapped. And freeing these pages | ||
| 1232 | * requires taking the lru_lock so we do the put_page | ||
| 1233 | * of the tail pages after the split is complete. | ||
| 1234 | */ | ||
| 1235 | put_page(page_tail); | ||
| 1236 | } | ||
| 1237 | |||
| 1238 | /* | ||
| 1239 | * Only the head page (now a regular page) is required | ||
| 1240 | * to be pinned by the caller. | ||
| 1241 | */ | ||
| 1242 | BUG_ON(page_count(page) <= 0); | ||
| 1243 | } | ||
| 1244 | |||
| 1245 | static int __split_huge_page_map(struct page *page, | ||
| 1246 | struct vm_area_struct *vma, | ||
| 1247 | unsigned long address) | ||
| 1248 | { | ||
| 1249 | struct mm_struct *mm = vma->vm_mm; | ||
| 1250 | pmd_t *pmd, _pmd; | ||
| 1251 | int ret = 0, i; | ||
| 1252 | pgtable_t pgtable; | ||
| 1253 | unsigned long haddr; | ||
| 1254 | |||
| 1255 | spin_lock(&mm->page_table_lock); | ||
| 1256 | pmd = page_check_address_pmd(page, mm, address, | ||
| 1257 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | ||
| 1258 | if (pmd) { | ||
| 1259 | pgtable = get_pmd_huge_pte(mm); | ||
| 1260 | pmd_populate(mm, &_pmd, pgtable); | ||
| 1261 | |||
| 1262 | for (i = 0, haddr = address; i < HPAGE_PMD_NR; | ||
| 1263 | i++, haddr += PAGE_SIZE) { | ||
| 1264 | pte_t *pte, entry; | ||
| 1265 | BUG_ON(PageCompound(page+i)); | ||
| 1266 | entry = mk_pte(page + i, vma->vm_page_prot); | ||
| 1267 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 1268 | if (!pmd_write(*pmd)) | ||
| 1269 | entry = pte_wrprotect(entry); | ||
| 1270 | else | ||
| 1271 | BUG_ON(page_mapcount(page) != 1); | ||
| 1272 | if (!pmd_young(*pmd)) | ||
| 1273 | entry = pte_mkold(entry); | ||
| 1274 | pte = pte_offset_map(&_pmd, haddr); | ||
| 1275 | BUG_ON(!pte_none(*pte)); | ||
| 1276 | set_pte_at(mm, haddr, pte, entry); | ||
| 1277 | pte_unmap(pte); | ||
| 1278 | } | ||
| 1279 | |||
| 1280 | mm->nr_ptes++; | ||
| 1281 | smp_wmb(); /* make pte visible before pmd */ | ||
| 1282 | /* | ||
| 1283 | * Up to this point the pmd is present and huge and | ||
| 1284 | * userland has the whole access to the hugepage | ||
| 1285 | * during the split (which happens in place). If we | ||
| 1286 | * overwrite the pmd with the not-huge version | ||
| 1287 | * pointing to the pte here (which of course we could | ||
| 1288 | * if all CPUs were bug free), userland could trigger | ||
| 1289 | * a small page size TLB miss on the small sized TLB | ||
| 1290 | * while the hugepage TLB entry is still established | ||
| 1291 | * in the huge TLB. Some CPUs don't like that. See | ||
| 1292 | * http://support.amd.com/us/Processor_TechDocs/41322.pdf, | ||
| 1293 | * Erratum 383 on page 93. Intel should be safe but | ||
| 1294 | * also warns that it's only safe if the permission | ||
| 1295 | * and cache attributes of the two entries loaded in | ||
| 1296 | * the two TLBs are identical (which should be the case | ||
| 1297 | * here). But it is generally safer to never allow | ||
| 1298 | * small and huge TLB entries for the same virtual | ||
| 1299 | * address to be loaded simultaneously. So instead of | ||
| 1300 | * doing "pmd_populate(); flush_tlb_range();" we first | ||
| 1301 | * mark the current pmd notpresent (atomically because | ||
| 1302 | * here the pmd_trans_huge and pmd_trans_splitting | ||
| 1303 | * must remain set at all times on the pmd until the | ||
| 1304 | * split is complete for this pmd), then we flush the | ||
| 1305 | * SMP TLB and finally we write the non-huge version | ||
| 1306 | * of the pmd entry with pmd_populate. | ||
| 1307 | */ | ||
| 1308 | set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); | ||
| 1309 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
| 1310 | pmd_populate(mm, pmd, pgtable); | ||
| 1311 | ret = 1; | ||
| 1312 | } | ||
| 1313 | spin_unlock(&mm->page_table_lock); | ||
| 1314 | |||
| 1315 | return ret; | ||
| 1316 | } | ||
| 1317 | |||
| 1318 | /* must be called with anon_vma->root->lock held */ | ||
| 1319 | static void __split_huge_page(struct page *page, | ||
| 1320 | struct anon_vma *anon_vma) | ||
| 1321 | { | ||
| 1322 | int mapcount, mapcount2; | ||
| 1323 | struct anon_vma_chain *avc; | ||
| 1324 | |||
| 1325 | BUG_ON(!PageHead(page)); | ||
| 1326 | BUG_ON(PageTail(page)); | ||
| 1327 | |||
| 1328 | mapcount = 0; | ||
| 1329 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
| 1330 | struct vm_area_struct *vma = avc->vma; | ||
| 1331 | unsigned long addr = vma_address(page, vma); | ||
| 1332 | BUG_ON(is_vma_temporary_stack(vma)); | ||
| 1333 | if (addr == -EFAULT) | ||
| 1334 | continue; | ||
| 1335 | mapcount += __split_huge_page_splitting(page, vma, addr); | ||
| 1336 | } | ||
| 1337 | /* | ||
| 1338 | * It is critical that new vmas are added to the tail of the | ||
| 1339 | * anon_vma list. This guarantees that if copy_huge_pmd() runs | ||
| 1340 | * and establishes a child pmd before | ||
| 1341 | * __split_huge_page_splitting() freezes the parent pmd (so if | ||
| 1342 | * we fail to prevent copy_huge_pmd() from running until the | ||
| 1343 | * whole __split_huge_page() is complete), we will still see | ||
| 1344 | * the newly established pmd of the child later during the | ||
| 1345 | * walk, to be able to set it as pmd_trans_splitting too. | ||
| 1346 | */ | ||
| 1347 | if (mapcount != page_mapcount(page)) | ||
| 1348 | printk(KERN_ERR "mapcount %d page_mapcount %d\n", | ||
| 1349 | mapcount, page_mapcount(page)); | ||
| 1350 | BUG_ON(mapcount != page_mapcount(page)); | ||
| 1351 | |||
| 1352 | __split_huge_page_refcount(page); | ||
| 1353 | |||
| 1354 | mapcount2 = 0; | ||
| 1355 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
| 1356 | struct vm_area_struct *vma = avc->vma; | ||
| 1357 | unsigned long addr = vma_address(page, vma); | ||
| 1358 | BUG_ON(is_vma_temporary_stack(vma)); | ||
| 1359 | if (addr == -EFAULT) | ||
| 1360 | continue; | ||
| 1361 | mapcount2 += __split_huge_page_map(page, vma, addr); | ||
| 1362 | } | ||
| 1363 | if (mapcount != mapcount2) | ||
| 1364 | printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", | ||
| 1365 | mapcount, mapcount2, page_mapcount(page)); | ||
| 1366 | BUG_ON(mapcount != mapcount2); | ||
| 1367 | } | ||
| 1368 | |||
| 1369 | int split_huge_page(struct page *page) | ||
| 1370 | { | ||
| 1371 | struct anon_vma *anon_vma; | ||
| 1372 | int ret = 1; | ||
| 1373 | |||
| 1374 | BUG_ON(!PageAnon(page)); | ||
| 1375 | anon_vma = page_lock_anon_vma(page); | ||
| 1376 | if (!anon_vma) | ||
| 1377 | goto out; | ||
| 1378 | ret = 0; | ||
| 1379 | if (!PageCompound(page)) | ||
| 1380 | goto out_unlock; | ||
| 1381 | |||
| 1382 | BUG_ON(!PageSwapBacked(page)); | ||
| 1383 | __split_huge_page(page, anon_vma); | ||
| 1384 | |||
| 1385 | BUG_ON(PageCompound(page)); | ||
| 1386 | out_unlock: | ||
| 1387 | page_unlock_anon_vma(anon_vma); | ||
| 1388 | out: | ||
| 1389 | return ret; | ||
| 1390 | } | ||
| 1391 | |||
| 1392 | int hugepage_madvise(struct vm_area_struct *vma, | ||
| 1393 | unsigned long *vm_flags, int advice) | ||
| 1394 | { | ||
| 1395 | switch (advice) { | ||
| 1396 | case MADV_HUGEPAGE: | ||
| 1397 | /* | ||
| 1398 | * Be somewhat over-protective like KSM for now! | ||
| 1399 | */ | ||
| 1400 | if (*vm_flags & (VM_HUGEPAGE | | ||
| 1401 | VM_SHARED | VM_MAYSHARE | | ||
| 1402 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | ||
| 1403 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | ||
| 1404 | VM_MIXEDMAP | VM_SAO)) | ||
| 1405 | return -EINVAL; | ||
| 1406 | *vm_flags &= ~VM_NOHUGEPAGE; | ||
| 1407 | *vm_flags |= VM_HUGEPAGE; | ||
| 1408 | /* | ||
| 1409 | * If the vma becomes good for khugepaged to scan, | ||
| 1410 | * register it here without waiting for a page fault that | ||
| 1411 | * may not happen any time soon. | ||
| 1412 | */ | ||
| 1413 | if (unlikely(khugepaged_enter_vma_merge(vma))) | ||
| 1414 | return -ENOMEM; | ||
| 1415 | break; | ||
| 1416 | case MADV_NOHUGEPAGE: | ||
| 1417 | /* | ||
| 1418 | * Be somewhat over-protective like KSM for now! | ||
| 1419 | */ | ||
| 1420 | if (*vm_flags & (VM_NOHUGEPAGE | | ||
| 1421 | VM_SHARED | VM_MAYSHARE | | ||
| 1422 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | ||
| 1423 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | ||
| 1424 | VM_MIXEDMAP | VM_SAO)) | ||
| 1425 | return -EINVAL; | ||
| 1426 | *vm_flags &= ~VM_HUGEPAGE; | ||
| 1427 | *vm_flags |= VM_NOHUGEPAGE; | ||
| 1428 | /* | ||
| 1429 | * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning | ||
| 1430 | * this vma even if the mm stays registered in khugepaged, as | ||
| 1431 | * it may have been registered before VM_NOHUGEPAGE was set. | ||
| 1432 | */ | ||
| 1433 | break; | ||
| 1434 | } | ||
| 1435 | |||
| 1436 | return 0; | ||
| 1437 | } | ||
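
In user space the MADV_HUGEPAGE side of this hook is exercised with a plain madvise() call on an anonymous mapping. A minimal sketch; only the 2MB-aligned portion of the region is eligible for huge pmds, and the MADV_HUGEPAGE definition is supplied locally in case the installed headers predate this series:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14        /* value from the THP series; only needed with older headers */
#endif

int main(void)
{
        size_t len = 4UL << 20;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        if (madvise(p, len, MADV_HUGEPAGE))     /* sets VM_HUGEPAGE on the vma */
                perror("madvise");
        memset(p, 0, len);                      /* fault in; huge pmds where aligned */
        return 0;
}
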
| 1438 | |||
| 1439 | static int __init khugepaged_slab_init(void) | ||
| 1440 | { | ||
| 1441 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", | ||
| 1442 | sizeof(struct mm_slot), | ||
| 1443 | __alignof__(struct mm_slot), 0, NULL); | ||
| 1444 | if (!mm_slot_cache) | ||
| 1445 | return -ENOMEM; | ||
| 1446 | |||
| 1447 | return 0; | ||
| 1448 | } | ||
| 1449 | |||
| 1450 | static void __init khugepaged_slab_free(void) | ||
| 1451 | { | ||
| 1452 | kmem_cache_destroy(mm_slot_cache); | ||
| 1453 | mm_slot_cache = NULL; | ||
| 1454 | } | ||
| 1455 | |||
| 1456 | static inline struct mm_slot *alloc_mm_slot(void) | ||
| 1457 | { | ||
| 1458 | if (!mm_slot_cache) /* initialization failed */ | ||
| 1459 | return NULL; | ||
| 1460 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); | ||
| 1461 | } | ||
| 1462 | |||
| 1463 | static inline void free_mm_slot(struct mm_slot *mm_slot) | ||
| 1464 | { | ||
| 1465 | kmem_cache_free(mm_slot_cache, mm_slot); | ||
| 1466 | } | ||
| 1467 | |||
| 1468 | static int __init mm_slots_hash_init(void) | ||
| 1469 | { | ||
| 1470 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), | ||
| 1471 | GFP_KERNEL); | ||
| 1472 | if (!mm_slots_hash) | ||
| 1473 | return -ENOMEM; | ||
| 1474 | return 0; | ||
| 1475 | } | ||
| 1476 | |||
| 1477 | #if 0 | ||
| 1478 | static void __init mm_slots_hash_free(void) | ||
| 1479 | { | ||
| 1480 | kfree(mm_slots_hash); | ||
| 1481 | mm_slots_hash = NULL; | ||
| 1482 | } | ||
| 1483 | #endif | ||
| 1484 | |||
| 1485 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | ||
| 1486 | { | ||
| 1487 | struct mm_slot *mm_slot; | ||
| 1488 | struct hlist_head *bucket; | ||
| 1489 | struct hlist_node *node; | ||
| 1490 | |||
| 1491 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
| 1492 | % MM_SLOTS_HASH_HEADS]; | ||
| 1493 | hlist_for_each_entry(mm_slot, node, bucket, hash) { | ||
| 1494 | if (mm == mm_slot->mm) | ||
| 1495 | return mm_slot; | ||
| 1496 | } | ||
| 1497 | return NULL; | ||
| 1498 | } | ||
| 1499 | |||
| 1500 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | ||
| 1501 | struct mm_slot *mm_slot) | ||
| 1502 | { | ||
| 1503 | struct hlist_head *bucket; | ||
| 1504 | |||
| 1505 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
| 1506 | % MM_SLOTS_HASH_HEADS]; | ||
| 1507 | mm_slot->mm = mm; | ||
| 1508 | hlist_add_head(&mm_slot->hash, bucket); | ||
| 1509 | } | ||
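
The bucket computation shared by the two helpers above indexes mm_structs by their position in the slab rather than by raw pointer bits. A stand-alone sketch of the arithmetic; MM_SLOTS_HASH_HEADS and the mm_struct size below are illustrative placeholders, not the patch's values:

#include <stdio.h>

#define MM_SLOTS_HASH_HEADS     1024
#define MM_STRUCT_SIZE          896     /* stand-in for sizeof(struct mm_struct) */

static unsigned long mm_slots_hash_bucket(unsigned long mm)
{
        return (mm / MM_STRUCT_SIZE) % MM_SLOTS_HASH_HEADS;
}

int main(void)
{
        /* two mm_structs allocated back to back land in neighbouring buckets */
        unsigned long base = 0x12340000UL;
        printf("%lu %lu\n", mm_slots_hash_bucket(base),
               mm_slots_hash_bucket(base + MM_STRUCT_SIZE));
        return 0;
}
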
| 1510 | |||
| 1511 | static inline int khugepaged_test_exit(struct mm_struct *mm) | ||
| 1512 | { | ||
| 1513 | return atomic_read(&mm->mm_users) == 0; | ||
| 1514 | } | ||
| 1515 | |||
| 1516 | int __khugepaged_enter(struct mm_struct *mm) | ||
| 1517 | { | ||
| 1518 | struct mm_slot *mm_slot; | ||
| 1519 | int wakeup; | ||
| 1520 | |||
| 1521 | mm_slot = alloc_mm_slot(); | ||
| 1522 | if (!mm_slot) | ||
| 1523 | return -ENOMEM; | ||
| 1524 | |||
| 1525 | /* __khugepaged_exit() must not run from under us */ | ||
| 1526 | VM_BUG_ON(khugepaged_test_exit(mm)); | ||
| 1527 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | ||
| 1528 | free_mm_slot(mm_slot); | ||
| 1529 | return 0; | ||
| 1530 | } | ||
| 1531 | |||
| 1532 | spin_lock(&khugepaged_mm_lock); | ||
| 1533 | insert_to_mm_slots_hash(mm, mm_slot); | ||
| 1534 | /* | ||
| 1535 | * Insert just behind the scanning cursor, to let the area settle | ||
| 1536 | * down a little. | ||
| 1537 | */ | ||
| 1538 | wakeup = list_empty(&khugepaged_scan.mm_head); | ||
| 1539 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); | ||
| 1540 | spin_unlock(&khugepaged_mm_lock); | ||
| 1541 | |||
| 1542 | atomic_inc(&mm->mm_count); | ||
| 1543 | if (wakeup) | ||
| 1544 | wake_up_interruptible(&khugepaged_wait); | ||
| 1545 | |||
| 1546 | return 0; | ||
| 1547 | } | ||
| 1548 | |||
| 1549 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | ||
| 1550 | { | ||
| 1551 | unsigned long hstart, hend; | ||
| 1552 | if (!vma->anon_vma) | ||
| 1553 | /* | ||
| 1554 | * Not yet faulted in so we will register later in the | ||
| 1555 | * page fault if needed. | ||
| 1556 | */ | ||
| 1557 | return 0; | ||
| 1558 | if (vma->vm_file || vma->vm_ops) | ||
| 1559 | /* khugepaged not yet working on file or special mappings */ | ||
| 1560 | return 0; | ||
| 1561 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
| 1562 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
| 1563 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
| 1564 | if (hstart < hend) | ||
| 1565 | return khugepaged_enter(vma); | ||
| 1566 | return 0; | ||
| 1567 | } | ||
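
The hstart/hend arithmetic above rounds vm_start up and vm_end down to the huge-page boundary, so only fully covered ranges are considered. A stand-alone sketch with concrete numbers, assuming the x86 2MB HPAGE_PMD_SIZE:

#include <stdio.h>

#define HPAGE_PMD_SIZE  (2UL << 20)
#define HPAGE_PMD_MASK  (~(HPAGE_PMD_SIZE - 1))

int main(void)
{
        unsigned long vm_start = 0x00601000UL, vm_end = 0x00a01000UL;
        unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        unsigned long hend = vm_end & HPAGE_PMD_MASK;

        /* prints 0x800000 0xa00000: only that single fully covered
         * 2MB range is a candidate for collapsing */
        printf("%#lx %#lx\n", hstart, hend);
        return 0;
}
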
| 1568 | |||
| 1569 | void __khugepaged_exit(struct mm_struct *mm) | ||
| 1570 | { | ||
| 1571 | struct mm_slot *mm_slot; | ||
| 1572 | int free = 0; | ||
| 1573 | |||
| 1574 | spin_lock(&khugepaged_mm_lock); | ||
| 1575 | mm_slot = get_mm_slot(mm); | ||
| 1576 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | ||
| 1577 | hlist_del(&mm_slot->hash); | ||
| 1578 | list_del(&mm_slot->mm_node); | ||
| 1579 | free = 1; | ||
| 1580 | } | ||
| 1581 | |||
| 1582 | if (free) { | ||
| 1583 | spin_unlock(&khugepaged_mm_lock); | ||
| 1584 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
| 1585 | free_mm_slot(mm_slot); | ||
| 1586 | mmdrop(mm); | ||
| 1587 | } else if (mm_slot) { | ||
| 1588 | spin_unlock(&khugepaged_mm_lock); | ||
| 1589 | /* | ||
| 1590 | * This is required to serialize against | ||
| 1591 | * khugepaged_test_exit() (which is guaranteed to run | ||
| 1592 | * under mmap sem read mode). Stop here (after we | ||
| 1593 | * return all pagetables will be destroyed) until | ||
| 1594 | * khugepaged has finished working on the pagetables | ||
| 1595 | * under the mmap_sem. | ||
| 1596 | */ | ||
| 1597 | down_write(&mm->mmap_sem); | ||
| 1598 | up_write(&mm->mmap_sem); | ||
| 1599 | } else | ||
| 1600 | spin_unlock(&khugepaged_mm_lock); | ||
| 1601 | } | ||
| 1602 | |||
| 1603 | static void release_pte_page(struct page *page) | ||
| 1604 | { | ||
| 1605 | /* 0 stands for page_is_file_cache(page) == false */ | ||
| 1606 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
| 1607 | unlock_page(page); | ||
| 1608 | putback_lru_page(page); | ||
| 1609 | } | ||
| 1610 | |||
| 1611 | static void release_pte_pages(pte_t *pte, pte_t *_pte) | ||
| 1612 | { | ||
| 1613 | while (--_pte >= pte) { | ||
| 1614 | pte_t pteval = *_pte; | ||
| 1615 | if (!pte_none(pteval)) | ||
| 1616 | release_pte_page(pte_page(pteval)); | ||
| 1617 | } | ||
| 1618 | } | ||
| 1619 | |||
| 1620 | static void release_all_pte_pages(pte_t *pte) | ||
| 1621 | { | ||
| 1622 | release_pte_pages(pte, pte + HPAGE_PMD_NR); | ||
| 1623 | } | ||
| 1624 | |||
| 1625 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | ||
| 1626 | unsigned long address, | ||
| 1627 | pte_t *pte) | ||
| 1628 | { | ||
| 1629 | struct page *page; | ||
| 1630 | pte_t *_pte; | ||
| 1631 | int referenced = 0, isolated = 0, none = 0; | ||
| 1632 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
| 1633 | _pte++, address += PAGE_SIZE) { | ||
| 1634 | pte_t pteval = *_pte; | ||
| 1635 | if (pte_none(pteval)) { | ||
| 1636 | if (++none <= khugepaged_max_ptes_none) | ||
| 1637 | continue; | ||
| 1638 | else { | ||
| 1639 | release_pte_pages(pte, _pte); | ||
| 1640 | goto out; | ||
| 1641 | } | ||
| 1642 | } | ||
| 1643 | if (!pte_present(pteval) || !pte_write(pteval)) { | ||
| 1644 | release_pte_pages(pte, _pte); | ||
| 1645 | goto out; | ||
| 1646 | } | ||
| 1647 | page = vm_normal_page(vma, address, pteval); | ||
| 1648 | if (unlikely(!page)) { | ||
| 1649 | release_pte_pages(pte, _pte); | ||
| 1650 | goto out; | ||
| 1651 | } | ||
| 1652 | VM_BUG_ON(PageCompound(page)); | ||
| 1653 | BUG_ON(!PageAnon(page)); | ||
| 1654 | VM_BUG_ON(!PageSwapBacked(page)); | ||
| 1655 | |||
| 1656 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
| 1657 | if (page_count(page) != 1) { | ||
| 1658 | release_pte_pages(pte, _pte); | ||
| 1659 | goto out; | ||
| 1660 | } | ||
| 1661 | /* | ||
| 1662 | * We can do it before isolate_lru_page because the | ||
| 1663 | * page can't be freed from under us. NOTE: PG_lock | ||
| 1664 | * is needed to serialize against split_huge_page | ||
| 1665 | * when invoked from the VM. | ||
| 1666 | */ | ||
| 1667 | if (!trylock_page(page)) { | ||
| 1668 | release_pte_pages(pte, _pte); | ||
| 1669 | goto out; | ||
| 1670 | } | ||
| 1671 | /* | ||
| 1672 | * Isolate the page to avoid collapsing a hugepage | ||
| 1673 | * currently in use by the VM. | ||
| 1674 | */ | ||
| 1675 | if (isolate_lru_page(page)) { | ||
| 1676 | unlock_page(page); | ||
| 1677 | release_pte_pages(pte, _pte); | ||
| 1678 | goto out; | ||
| 1679 | } | ||
| 1680 | /* 0 stands for page_is_file_cache(page) == false */ | ||
| 1681 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
| 1682 | VM_BUG_ON(!PageLocked(page)); | ||
| 1683 | VM_BUG_ON(PageLRU(page)); | ||
| 1684 | |||
| 1685 | /* If no mapped pte is young, don't collapse the page */ | ||
| 1686 | if (pte_young(pteval) || PageReferenced(page) || | ||
| 1687 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
| 1688 | referenced = 1; | ||
| 1689 | } | ||
| 1690 | if (unlikely(!referenced)) | ||
| 1691 | release_all_pte_pages(pte); | ||
| 1692 | else | ||
| 1693 | isolated = 1; | ||
| 1694 | out: | ||
| 1695 | return isolated; | ||
| 1696 | } | ||
| 1697 | |||
| 1698 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | ||
| 1699 | struct vm_area_struct *vma, | ||
| 1700 | unsigned long address, | ||
| 1701 | spinlock_t *ptl) | ||
| 1702 | { | ||
| 1703 | pte_t *_pte; | ||
| 1704 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { | ||
| 1705 | pte_t pteval = *_pte; | ||
| 1706 | struct page *src_page; | ||
| 1707 | |||
| 1708 | if (pte_none(pteval)) { | ||
| 1709 | clear_user_highpage(page, address); | ||
| 1710 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | ||
| 1711 | } else { | ||
| 1712 | src_page = pte_page(pteval); | ||
| 1713 | copy_user_highpage(page, src_page, address, vma); | ||
| 1714 | VM_BUG_ON(page_mapcount(src_page) != 1); | ||
| 1715 | VM_BUG_ON(page_count(src_page) != 2); | ||
| 1716 | release_pte_page(src_page); | ||
| 1717 | /* | ||
| 1718 | * ptl mostly unnecessary, but preempt has to | ||
| 1719 | * be disabled to update the per-cpu stats | ||
| 1720 | * inside page_remove_rmap(). | ||
| 1721 | */ | ||
| 1722 | spin_lock(ptl); | ||
| 1723 | /* | ||
| 1724 | * paravirt calls inside pte_clear here are | ||
| 1725 | * superfluous. | ||
| 1726 | */ | ||
| 1727 | pte_clear(vma->vm_mm, address, _pte); | ||
| 1728 | page_remove_rmap(src_page); | ||
| 1729 | spin_unlock(ptl); | ||
| 1730 | free_page_and_swap_cache(src_page); | ||
| 1731 | } | ||
| 1732 | |||
| 1733 | address += PAGE_SIZE; | ||
| 1734 | page++; | ||
| 1735 | } | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | static void collapse_huge_page(struct mm_struct *mm, | ||
| 1739 | unsigned long address, | ||
| 1740 | struct page **hpage, | ||
| 1741 | struct vm_area_struct *vma) | ||
| 1742 | { | ||
| 1743 | pgd_t *pgd; | ||
| 1744 | pud_t *pud; | ||
| 1745 | pmd_t *pmd, _pmd; | ||
| 1746 | pte_t *pte; | ||
| 1747 | pgtable_t pgtable; | ||
| 1748 | struct page *new_page; | ||
| 1749 | spinlock_t *ptl; | ||
| 1750 | int isolated; | ||
| 1751 | unsigned long hstart, hend; | ||
| 1752 | |||
| 1753 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
| 1754 | #ifndef CONFIG_NUMA | ||
| 1755 | VM_BUG_ON(!*hpage); | ||
| 1756 | new_page = *hpage; | ||
| 1757 | #else | ||
| 1758 | VM_BUG_ON(*hpage); | ||
| 1759 | /* | ||
| 1760 | * Allocate the page while the vma is still valid and under | ||
| 1761 | * the mmap_sem read mode so there is no memory allocation | ||
| 1762 | * later when we take the mmap_sem in write mode. This is more | ||
| 1763 | * friendly behavior (OTOH it may actually hide bugs) to | ||
| 1764 | * filesystems in userland with daemons allocating memory in | ||
| 1765 | * the userland I/O paths. Allocating memory with the | ||
| 1766 | * mmap_sem in read mode is also a good idea to allow greater | ||
| 1767 | * scalability. | ||
| 1768 | */ | ||
| 1769 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); | ||
| 1770 | if (unlikely(!new_page)) { | ||
| 1771 | up_read(&mm->mmap_sem); | ||
| 1772 | *hpage = ERR_PTR(-ENOMEM); | ||
| 1773 | return; | ||
| 1774 | } | ||
| 1775 | #endif | ||
| 1776 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | ||
| 1777 | up_read(&mm->mmap_sem); | ||
| 1778 | put_page(new_page); | ||
| 1779 | return; | ||
| 1780 | } | ||
| 1781 | |||
| 1782 | /* after allocating the hugepage upgrade to mmap_sem write mode */ | ||
| 1783 | up_read(&mm->mmap_sem); | ||
| 1784 | |||
| 1785 | /* | ||
| 1786 | * Prevent all access to pagetables with the exception of | ||
| 1787 | * gup_fast later handled by the ptep_clear_flush and the VM | ||
| 1788 | * handled by the anon_vma lock + PG_lock. | ||
| 1789 | */ | ||
| 1790 | down_write(&mm->mmap_sem); | ||
| 1791 | if (unlikely(khugepaged_test_exit(mm))) | ||
| 1792 | goto out; | ||
| 1793 | |||
| 1794 | vma = find_vma(mm, address); | ||
| 1795 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
| 1796 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
| 1797 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | ||
| 1798 | goto out; | ||
| 1799 | |||
| 1800 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
| 1801 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
| 1802 | goto out; | ||
| 1803 | |||
| 1804 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | ||
| 1805 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) | ||
| 1806 | goto out; | ||
| 1807 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
| 1808 | |||
| 1809 | pgd = pgd_offset(mm, address); | ||
| 1810 | if (!pgd_present(*pgd)) | ||
| 1811 | goto out; | ||
| 1812 | |||
| 1813 | pud = pud_offset(pgd, address); | ||
| 1814 | if (!pud_present(*pud)) | ||
| 1815 | goto out; | ||
| 1816 | |||
| 1817 | pmd = pmd_offset(pud, address); | ||
| 1818 | /* pmd can't go away or become huge under us */ | ||
| 1819 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
| 1820 | goto out; | ||
| 1821 | |||
| 1822 | anon_vma_lock(vma->anon_vma); | ||
| 1823 | |||
| 1824 | pte = pte_offset_map(pmd, address); | ||
| 1825 | ptl = pte_lockptr(mm, pmd); | ||
| 1826 | |||
| 1827 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | ||
| 1828 | /* | ||
| 1829 | * After this gup_fast can't run anymore. This also removes | ||
| 1830 | * any huge TLB entry from the CPU so we won't allow | ||
| 1831 | * huge and small TLB entries for the same virtual address | ||
| 1832 | * to avoid the risk of CPU bugs in that area. | ||
| 1833 | */ | ||
| 1834 | _pmd = pmdp_clear_flush_notify(vma, address, pmd); | ||
| 1835 | spin_unlock(&mm->page_table_lock); | ||
| 1836 | |||
| 1837 | spin_lock(ptl); | ||
| 1838 | isolated = __collapse_huge_page_isolate(vma, address, pte); | ||
| 1839 | spin_unlock(ptl); | ||
| 1840 | pte_unmap(pte); | ||
| 1841 | |||
| 1842 | if (unlikely(!isolated)) { | ||
| 1843 | spin_lock(&mm->page_table_lock); | ||
| 1844 | BUG_ON(!pmd_none(*pmd)); | ||
| 1845 | set_pmd_at(mm, address, pmd, _pmd); | ||
| 1846 | spin_unlock(&mm->page_table_lock); | ||
| 1847 | anon_vma_unlock(vma->anon_vma); | ||
| 1848 | mem_cgroup_uncharge_page(new_page); | ||
| 1849 | goto out; | ||
| 1850 | } | ||
| 1851 | |||
| 1852 | /* | ||
| 1853 | * All pages are isolated and locked so anon_vma rmap | ||
| 1854 | * can't run anymore. | ||
| 1855 | */ | ||
| 1856 | anon_vma_unlock(vma->anon_vma); | ||
| 1857 | |||
| 1858 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | ||
| 1859 | __SetPageUptodate(new_page); | ||
| 1860 | pgtable = pmd_pgtable(_pmd); | ||
| 1861 | VM_BUG_ON(page_count(pgtable) != 1); | ||
| 1862 | VM_BUG_ON(page_mapcount(pgtable) != 0); | ||
| 1863 | |||
| 1864 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | ||
| 1865 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
| 1866 | _pmd = pmd_mkhuge(_pmd); | ||
| 1867 | |||
| 1868 | /* | ||
| 1869 | * spin_lock() below is not the equivalent of smp_wmb(), so | ||
| 1870 | * this is needed to avoid the copy_huge_page writes becoming | ||
| 1871 | * visible after the set_pmd_at() write. | ||
| 1872 | */ | ||
| 1873 | smp_wmb(); | ||
| 1874 | |||
| 1875 | spin_lock(&mm->page_table_lock); | ||
| 1876 | BUG_ON(!pmd_none(*pmd)); | ||
| 1877 | page_add_new_anon_rmap(new_page, vma, address); | ||
| 1878 | set_pmd_at(mm, address, pmd, _pmd); | ||
| 1879 | update_mmu_cache(vma, address, _pmd); | ||
| 1880 | prepare_pmd_huge_pte(pgtable, mm); | ||
| 1881 | mm->nr_ptes--; | ||
| 1882 | spin_unlock(&mm->page_table_lock); | ||
| 1883 | |||
| 1884 | #ifndef CONFIG_NUMA | ||
| 1885 | *hpage = NULL; | ||
| 1886 | #endif | ||
| 1887 | khugepaged_pages_collapsed++; | ||
| 1888 | out_up_write: | ||
| 1889 | up_write(&mm->mmap_sem); | ||
| 1890 | return; | ||
| 1891 | |||
| 1892 | out: | ||
| 1893 | #ifdef CONFIG_NUMA | ||
| 1894 | put_page(new_page); | ||
| 1895 | #endif | ||
| 1896 | goto out_up_write; | ||
| 1897 | } | ||
| 1898 | |||
| 1899 | static int khugepaged_scan_pmd(struct mm_struct *mm, | ||
| 1900 | struct vm_area_struct *vma, | ||
| 1901 | unsigned long address, | ||
| 1902 | struct page **hpage) | ||
| 1903 | { | ||
| 1904 | pgd_t *pgd; | ||
| 1905 | pud_t *pud; | ||
| 1906 | pmd_t *pmd; | ||
| 1907 | pte_t *pte, *_pte; | ||
| 1908 | int ret = 0, referenced = 0, none = 0; | ||
| 1909 | struct page *page; | ||
| 1910 | unsigned long _address; | ||
| 1911 | spinlock_t *ptl; | ||
| 1912 | |||
| 1913 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
| 1914 | |||
| 1915 | pgd = pgd_offset(mm, address); | ||
| 1916 | if (!pgd_present(*pgd)) | ||
| 1917 | goto out; | ||
| 1918 | |||
| 1919 | pud = pud_offset(pgd, address); | ||
| 1920 | if (!pud_present(*pud)) | ||
| 1921 | goto out; | ||
| 1922 | |||
| 1923 | pmd = pmd_offset(pud, address); | ||
| 1924 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
| 1925 | goto out; | ||
| 1926 | |||
| 1927 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 1928 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
| 1929 | _pte++, _address += PAGE_SIZE) { | ||
| 1930 | pte_t pteval = *_pte; | ||
| 1931 | if (pte_none(pteval)) { | ||
| 1932 | if (++none <= khugepaged_max_ptes_none) | ||
| 1933 | continue; | ||
| 1934 | else | ||
| 1935 | goto out_unmap; | ||
| 1936 | } | ||
| 1937 | if (!pte_present(pteval) || !pte_write(pteval)) | ||
| 1938 | goto out_unmap; | ||
| 1939 | page = vm_normal_page(vma, _address, pteval); | ||
| 1940 | if (unlikely(!page)) | ||
| 1941 | goto out_unmap; | ||
| 1942 | VM_BUG_ON(PageCompound(page)); | ||
| 1943 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | ||
| 1944 | goto out_unmap; | ||
| 1945 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
| 1946 | if (page_count(page) != 1) | ||
| 1947 | goto out_unmap; | ||
| 1948 | if (pte_young(pteval) || PageReferenced(page) || | ||
| 1949 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
| 1950 | referenced = 1; | ||
| 1951 | } | ||
| 1952 | if (referenced) | ||
| 1953 | ret = 1; | ||
| 1954 | out_unmap: | ||
| 1955 | pte_unmap_unlock(pte, ptl); | ||
| 1956 | if (ret) | ||
| 1957 | /* collapse_huge_page will return with the mmap_sem released */ | ||
| 1958 | collapse_huge_page(mm, address, hpage, vma); | ||
| 1959 | out: | ||
| 1960 | return ret; | ||
| 1961 | } | ||
| 1962 | |||
| 1963 | static void collect_mm_slot(struct mm_slot *mm_slot) | ||
| 1964 | { | ||
| 1965 | struct mm_struct *mm = mm_slot->mm; | ||
| 1966 | |||
| 1967 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | ||
| 1968 | |||
| 1969 | if (khugepaged_test_exit(mm)) { | ||
| 1970 | /* free mm_slot */ | ||
| 1971 | hlist_del(&mm_slot->hash); | ||
| 1972 | list_del(&mm_slot->mm_node); | ||
| 1973 | |||
| 1974 | /* | ||
| 1975 | * Not strictly needed because the mm exited already. | ||
| 1976 | * | ||
| 1977 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
| 1978 | */ | ||
| 1979 | |||
| 1980 | /* khugepaged_mm_lock actually not necessary for the below */ | ||
| 1981 | free_mm_slot(mm_slot); | ||
| 1982 | mmdrop(mm); | ||
| 1983 | } | ||
| 1984 | } | ||
| 1985 | |||
| 1986 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | ||
| 1987 | struct page **hpage) | ||
| 1988 | { | ||
| 1989 | struct mm_slot *mm_slot; | ||
| 1990 | struct mm_struct *mm; | ||
| 1991 | struct vm_area_struct *vma; | ||
| 1992 | int progress = 0; | ||
| 1993 | |||
| 1994 | VM_BUG_ON(!pages); | ||
| 1995 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | ||
| 1996 | |||
| 1997 | if (khugepaged_scan.mm_slot) | ||
| 1998 | mm_slot = khugepaged_scan.mm_slot; | ||
| 1999 | else { | ||
| 2000 | mm_slot = list_entry(khugepaged_scan.mm_head.next, | ||
| 2001 | struct mm_slot, mm_node); | ||
| 2002 | khugepaged_scan.address = 0; | ||
| 2003 | khugepaged_scan.mm_slot = mm_slot; | ||
| 2004 | } | ||
| 2005 | spin_unlock(&khugepaged_mm_lock); | ||
| 2006 | |||
| 2007 | mm = mm_slot->mm; | ||
| 2008 | down_read(&mm->mmap_sem); | ||
| 2009 | if (unlikely(khugepaged_test_exit(mm))) | ||
| 2010 | vma = NULL; | ||
| 2011 | else | ||
| 2012 | vma = find_vma(mm, khugepaged_scan.address); | ||
| 2013 | |||
| 2014 | progress++; | ||
| 2015 | for (; vma; vma = vma->vm_next) { | ||
| 2016 | unsigned long hstart, hend; | ||
| 2017 | |||
| 2018 | cond_resched(); | ||
| 2019 | if (unlikely(khugepaged_test_exit(mm))) { | ||
| 2020 | progress++; | ||
| 2021 | break; | ||
| 2022 | } | ||
| 2023 | |||
| 2024 | if ((!(vma->vm_flags & VM_HUGEPAGE) && | ||
| 2025 | !khugepaged_always()) || | ||
| 2026 | (vma->vm_flags & VM_NOHUGEPAGE)) { | ||
| 2027 | progress++; | ||
| 2028 | continue; | ||
| 2029 | } | ||
| 2030 | |||
| 2031 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | ||
| 2032 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { | ||
| 2033 | khugepaged_scan.address = vma->vm_end; | ||
| 2034 | progress++; | ||
| 2035 | continue; | ||
| 2036 | } | ||
| 2037 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
| 2038 | |||
| 2039 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
| 2040 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
| 2041 | if (hstart >= hend) { | ||
| 2042 | progress++; | ||
| 2043 | continue; | ||
| 2044 | } | ||
| 2045 | if (khugepaged_scan.address < hstart) | ||
| 2046 | khugepaged_scan.address = hstart; | ||
| 2047 | if (khugepaged_scan.address > hend) { | ||
| 2048 | khugepaged_scan.address = hend + HPAGE_PMD_SIZE; | ||
| 2049 | progress++; | ||
| 2050 | continue; | ||
| 2051 | } | ||
| 2052 | BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); | ||
| 2053 | |||
| 2054 | while (khugepaged_scan.address < hend) { | ||
| 2055 | int ret; | ||
| 2056 | cond_resched(); | ||
| 2057 | if (unlikely(khugepaged_test_exit(mm))) | ||
| 2058 | goto breakouterloop; | ||
| 2059 | |||
| 2060 | VM_BUG_ON(khugepaged_scan.address < hstart || | ||
| 2061 | khugepaged_scan.address + HPAGE_PMD_SIZE > | ||
| 2062 | hend); | ||
| 2063 | ret = khugepaged_scan_pmd(mm, vma, | ||
| 2064 | khugepaged_scan.address, | ||
| 2065 | hpage); | ||
| 2066 | /* move to next address */ | ||
| 2067 | khugepaged_scan.address += HPAGE_PMD_SIZE; | ||
| 2068 | progress += HPAGE_PMD_NR; | ||
| 2069 | if (ret) | ||
| 2070 | /* we released mmap_sem so break loop */ | ||
| 2071 | goto breakouterloop_mmap_sem; | ||
| 2072 | if (progress >= pages) | ||
| 2073 | goto breakouterloop; | ||
| 2074 | } | ||
| 2075 | } | ||
| 2076 | breakouterloop: | ||
| 2077 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ | ||
| 2078 | breakouterloop_mmap_sem: | ||
| 2079 | |||
| 2080 | spin_lock(&khugepaged_mm_lock); | ||
| 2081 | BUG_ON(khugepaged_scan.mm_slot != mm_slot); | ||
| 2082 | /* | ||
| 2083 | * Release the current mm_slot if this mm is about to die, or | ||
| 2084 | * if we scanned all vmas of this mm. | ||
| 2085 | */ | ||
| 2086 | if (khugepaged_test_exit(mm) || !vma) { | ||
| 2087 | /* | ||
| 2088 | * Make sure that if mm_users is reaching zero while | ||
| 2089 | * khugepaged runs here, khugepaged_exit will find | ||
| 2090 | * mm_slot not pointing to the exiting mm. | ||
| 2091 | */ | ||
| 2092 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { | ||
| 2093 | khugepaged_scan.mm_slot = list_entry( | ||
| 2094 | mm_slot->mm_node.next, | ||
| 2095 | struct mm_slot, mm_node); | ||
| 2096 | khugepaged_scan.address = 0; | ||
| 2097 | } else { | ||
| 2098 | khugepaged_scan.mm_slot = NULL; | ||
| 2099 | khugepaged_full_scans++; | ||
| 2100 | } | ||
| 2101 | |||
| 2102 | collect_mm_slot(mm_slot); | ||
| 2103 | } | ||
| 2104 | |||
| 2105 | return progress; | ||
| 2106 | } | ||
| 2107 | |||
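For illustration only: khugepaged_scan_mm_slot() above scans just the portion of a VMA that can hold whole huge pages, rounding vm_start up and vm_end down to HPAGE_PMD_SIZE boundaries. A minimal user-space sketch of that rounding, assuming 2 MiB huge pages (the x86-64 default) and a hypothetical VMA range not taken from the patch:

#include <stdio.h>

#define HPAGE_PMD_SIZE (2UL * 1024 * 1024)   /* assumed 2 MiB huge pages */
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

int main(void)
{
    /* hypothetical VMA range */
    unsigned long vm_start = 0x7f1234501000UL;
    unsigned long vm_end   = 0x7f1234a01000UL;

    /* round the start up and the end down to huge-page boundaries */
    unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
    unsigned long hend   = vm_end & HPAGE_PMD_MASK;

    if (hstart < hend)
        printf("scannable: %#lx-%#lx (%lu huge pages)\n",
               hstart, hend, (hend - hstart) / HPAGE_PMD_SIZE);
    else
        printf("VMA cannot hold a whole huge page\n");
    return 0;
}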
| 2108 | static int khugepaged_has_work(void) | ||
| 2109 | { | ||
| 2110 | return !list_empty(&khugepaged_scan.mm_head) && | ||
| 2111 | khugepaged_enabled(); | ||
| 2112 | } | ||
| 2113 | |||
| 2114 | static int khugepaged_wait_event(void) | ||
| 2115 | { | ||
| 2116 | return !list_empty(&khugepaged_scan.mm_head) || | ||
| 2117 | !khugepaged_enabled(); | ||
| 2118 | } | ||
| 2119 | |||
| 2120 | static void khugepaged_do_scan(struct page **hpage) | ||
| 2121 | { | ||
| 2122 | unsigned int progress = 0, pass_through_head = 0; | ||
| 2123 | unsigned int pages = khugepaged_pages_to_scan; | ||
| 2124 | |||
| 2125 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | ||
| 2126 | |||
| 2127 | while (progress < pages) { | ||
| 2128 | cond_resched(); | ||
| 2129 | |||
| 2130 | #ifndef CONFIG_NUMA | ||
| 2131 | if (!*hpage) { | ||
| 2132 | *hpage = alloc_hugepage(khugepaged_defrag()); | ||
| 2133 | if (unlikely(!*hpage)) | ||
| 2134 | break; | ||
| 2135 | } | ||
| 2136 | #else | ||
| 2137 | if (IS_ERR(*hpage)) | ||
| 2138 | break; | ||
| 2139 | #endif | ||
| 2140 | |||
| 2141 | if (unlikely(kthread_should_stop() || freezing(current))) | ||
| 2142 | break; | ||
| 2143 | |||
| 2144 | spin_lock(&khugepaged_mm_lock); | ||
| 2145 | if (!khugepaged_scan.mm_slot) | ||
| 2146 | pass_through_head++; | ||
| 2147 | if (khugepaged_has_work() && | ||
| 2148 | pass_through_head < 2) | ||
| 2149 | progress += khugepaged_scan_mm_slot(pages - progress, | ||
| 2150 | hpage); | ||
| 2151 | else | ||
| 2152 | progress = pages; | ||
| 2153 | spin_unlock(&khugepaged_mm_lock); | ||
| 2154 | } | ||
| 2155 | } | ||
| 2156 | |||
| 2157 | static void khugepaged_alloc_sleep(void) | ||
| 2158 | { | ||
| 2159 | DEFINE_WAIT(wait); | ||
| 2160 | add_wait_queue(&khugepaged_wait, &wait); | ||
| 2161 | schedule_timeout_interruptible( | ||
| 2162 | msecs_to_jiffies( | ||
| 2163 | khugepaged_alloc_sleep_millisecs)); | ||
| 2164 | remove_wait_queue(&khugepaged_wait, &wait); | ||
| 2165 | } | ||
| 2166 | |||
| 2167 | #ifndef CONFIG_NUMA | ||
| 2168 | static struct page *khugepaged_alloc_hugepage(void) | ||
| 2169 | { | ||
| 2170 | struct page *hpage; | ||
| 2171 | |||
| 2172 | do { | ||
| 2173 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
| 2174 | if (!hpage) | ||
| 2175 | khugepaged_alloc_sleep(); | ||
| 2176 | } while (unlikely(!hpage) && | ||
| 2177 | likely(khugepaged_enabled())); | ||
| 2178 | return hpage; | ||
| 2179 | } | ||
| 2180 | #endif | ||
| 2181 | |||
| 2182 | static void khugepaged_loop(void) | ||
| 2183 | { | ||
| 2184 | struct page *hpage; | ||
| 2185 | |||
| 2186 | #ifdef CONFIG_NUMA | ||
| 2187 | hpage = NULL; | ||
| 2188 | #endif | ||
| 2189 | while (likely(khugepaged_enabled())) { | ||
| 2190 | #ifndef CONFIG_NUMA | ||
| 2191 | hpage = khugepaged_alloc_hugepage(); | ||
| 2192 | if (unlikely(!hpage)) | ||
| 2193 | break; | ||
| 2194 | #else | ||
| 2195 | if (IS_ERR(hpage)) { | ||
| 2196 | khugepaged_alloc_sleep(); | ||
| 2197 | hpage = NULL; | ||
| 2198 | } | ||
| 2199 | #endif | ||
| 2200 | |||
| 2201 | khugepaged_do_scan(&hpage); | ||
| 2202 | #ifndef CONFIG_NUMA | ||
| 2203 | if (hpage) | ||
| 2204 | put_page(hpage); | ||
| 2205 | #endif | ||
| 2206 | try_to_freeze(); | ||
| 2207 | if (unlikely(kthread_should_stop())) | ||
| 2208 | break; | ||
| 2209 | if (khugepaged_has_work()) { | ||
| 2210 | DEFINE_WAIT(wait); | ||
| 2211 | if (!khugepaged_scan_sleep_millisecs) | ||
| 2212 | continue; | ||
| 2213 | add_wait_queue(&khugepaged_wait, &wait); | ||
| 2214 | schedule_timeout_interruptible( | ||
| 2215 | msecs_to_jiffies( | ||
| 2216 | khugepaged_scan_sleep_millisecs)); | ||
| 2217 | remove_wait_queue(&khugepaged_wait, &wait); | ||
| 2218 | } else if (khugepaged_enabled()) | ||
| 2219 | wait_event_freezable(khugepaged_wait, | ||
| 2220 | khugepaged_wait_event()); | ||
| 2221 | } | ||
| 2222 | } | ||
| 2223 | |||
| 2224 | static int khugepaged(void *none) | ||
| 2225 | { | ||
| 2226 | struct mm_slot *mm_slot; | ||
| 2227 | |||
| 2228 | set_freezable(); | ||
| 2229 | set_user_nice(current, 19); | ||
| 2230 | |||
| 2231 | /* serialize with start_khugepaged() */ | ||
| 2232 | mutex_lock(&khugepaged_mutex); | ||
| 2233 | |||
| 2234 | for (;;) { | ||
| 2235 | mutex_unlock(&khugepaged_mutex); | ||
| 2236 | BUG_ON(khugepaged_thread != current); | ||
| 2237 | khugepaged_loop(); | ||
| 2238 | BUG_ON(khugepaged_thread != current); | ||
| 2239 | |||
| 2240 | mutex_lock(&khugepaged_mutex); | ||
| 2241 | if (!khugepaged_enabled()) | ||
| 2242 | break; | ||
| 2243 | if (unlikely(kthread_should_stop())) | ||
| 2244 | break; | ||
| 2245 | } | ||
| 2246 | |||
| 2247 | spin_lock(&khugepaged_mm_lock); | ||
| 2248 | mm_slot = khugepaged_scan.mm_slot; | ||
| 2249 | khugepaged_scan.mm_slot = NULL; | ||
| 2250 | if (mm_slot) | ||
| 2251 | collect_mm_slot(mm_slot); | ||
| 2252 | spin_unlock(&khugepaged_mm_lock); | ||
| 2253 | |||
| 2254 | khugepaged_thread = NULL; | ||
| 2255 | mutex_unlock(&khugepaged_mutex); | ||
| 2256 | |||
| 2257 | return 0; | ||
| 2258 | } | ||
| 2259 | |||
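For illustration only: the daemon above paces itself with khugepaged_pages_to_scan, khugepaged_scan_sleep_millisecs and khugepaged_alloc_sleep_millisecs, and counts completed passes in khugepaged_full_scans. The sysfs glue exporting these is defined earlier in the file and not shown in this hunk, so the /sys/kernel/mm/transparent_hugepage/khugepaged/ paths below are an assumption; a small user-space sketch that reads them:

#include <stdio.h>

static void show(const char *name)
{
    char path[256], buf[64];
    FILE *f;

    snprintf(path, sizeof(path),
             "/sys/kernel/mm/transparent_hugepage/khugepaged/%s", name);
    f = fopen(path, "r");
    if (!f) {
        perror(path);
        return;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("%-24s %s", name, buf);
    fclose(f);
}

int main(void)
{
    /* knob names assumed to match the variables used above */
    show("pages_to_scan");
    show("scan_sleep_millisecs");
    show("alloc_sleep_millisecs");
    show("full_scans");
    return 0;
}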
| 2260 | void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | ||
| 2261 | { | ||
| 2262 | struct page *page; | ||
| 2263 | |||
| 2264 | spin_lock(&mm->page_table_lock); | ||
| 2265 | if (unlikely(!pmd_trans_huge(*pmd))) { | ||
| 2266 | spin_unlock(&mm->page_table_lock); | ||
| 2267 | return; | ||
| 2268 | } | ||
| 2269 | page = pmd_page(*pmd); | ||
| 2270 | VM_BUG_ON(!page_count(page)); | ||
| 2271 | get_page(page); | ||
| 2272 | spin_unlock(&mm->page_table_lock); | ||
| 2273 | |||
| 2274 | split_huge_page(page); | ||
| 2275 | |||
| 2276 | put_page(page); | ||
| 2277 | BUG_ON(pmd_trans_huge(*pmd)); | ||
| 2278 | } | ||
| 2279 | |||
| 2280 | static void split_huge_page_address(struct mm_struct *mm, | ||
| 2281 | unsigned long address) | ||
| 2282 | { | ||
| 2283 | pgd_t *pgd; | ||
| 2284 | pud_t *pud; | ||
| 2285 | pmd_t *pmd; | ||
| 2286 | |||
| 2287 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | ||
| 2288 | |||
| 2289 | pgd = pgd_offset(mm, address); | ||
| 2290 | if (!pgd_present(*pgd)) | ||
| 2291 | return; | ||
| 2292 | |||
| 2293 | pud = pud_offset(pgd, address); | ||
| 2294 | if (!pud_present(*pud)) | ||
| 2295 | return; | ||
| 2296 | |||
| 2297 | pmd = pmd_offset(pud, address); | ||
| 2298 | if (!pmd_present(*pmd)) | ||
| 2299 | return; | ||
| 2300 | /* | ||
| 2301 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | ||
| 2302 | * materialize from under us. | ||
| 2303 | */ | ||
| 2304 | split_huge_page_pmd(mm, pmd); | ||
| 2305 | } | ||
| 2306 | |||
| 2307 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, | ||
| 2308 | unsigned long start, | ||
| 2309 | unsigned long end, | ||
| 2310 | long adjust_next) | ||
| 2311 | { | ||
| 2312 | /* | ||
| 2313 | * If the new start address isn't hpage aligned and it could | ||
| 2314 | * previously contain a hugepage: check if we need to split | ||
| 2315 | * a huge pmd. | ||
| 2316 | */ | ||
| 2317 | if (start & ~HPAGE_PMD_MASK && | ||
| 2318 | (start & HPAGE_PMD_MASK) >= vma->vm_start && | ||
| 2319 | (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | ||
| 2320 | split_huge_page_address(vma->vm_mm, start); | ||
| 2321 | |||
| 2322 | /* | ||
| 2323 | * If the new end address isn't hpage aligned and it could | ||
| 2324 | * previously contain a hugepage: check if we need to split | ||
| 2325 | * a huge pmd. | ||
| 2326 | */ | ||
| 2327 | if (end & ~HPAGE_PMD_MASK && | ||
| 2328 | (end & HPAGE_PMD_MASK) >= vma->vm_start && | ||
| 2329 | (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | ||
| 2330 | split_huge_page_address(vma->vm_mm, end); | ||
| 2331 | |||
| 2332 | /* | ||
| 2333 | * If we're also updating vma->vm_next->vm_start, and the new | ||
| 2334 | * vm_next->vm_start isn't page aligned and could previously | ||
| 2335 | * contain a hugepage: check if we need to split a huge pmd. | ||
| 2336 | */ | ||
| 2337 | if (adjust_next > 0) { | ||
| 2338 | struct vm_area_struct *next = vma->vm_next; | ||
| 2339 | unsigned long nstart = next->vm_start; | ||
| 2340 | nstart += adjust_next << PAGE_SHIFT; | ||
| 2341 | if (nstart & ~HPAGE_PMD_MASK && | ||
| 2342 | (nstart & HPAGE_PMD_MASK) >= next->vm_start && | ||
| 2343 | (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) | ||
| 2344 | split_huge_page_address(next->vm_mm, nstart); | ||
| 2345 | } | ||
| 2346 | } | ||
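For illustration only: __vma_adjust_trans_huge() above calls split_huge_page_address() only when the adjusted boundary is not PMD-aligned and the aligned huge-page region around it still lies entirely inside the VMA. The same predicate restated as a stand-alone user-space check, assuming a 2 MiB huge page size and made-up addresses:

#include <stdio.h>
#include <stdbool.h>

#define HPAGE_PMD_SIZE (2UL * 1024 * 1024)   /* assumed */
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

/* would moving a VMA boundary to 'addr' require splitting a huge pmd? */
static bool needs_split(unsigned long addr,
                        unsigned long vm_start, unsigned long vm_end)
{
    return (addr & ~HPAGE_PMD_MASK) &&
           (addr & HPAGE_PMD_MASK) >= vm_start &&
           (addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vm_end;
}

int main(void)
{
    unsigned long vm_start = 0x600000, vm_end = 0xe00000; /* hypothetical */

    printf("0x700000: %d\n", needs_split(0x700000, vm_start, vm_end)); /* 1 */
    printf("0x800000: %d\n", needs_split(0x800000, vm_start, vm_end)); /* 0: aligned */
    printf("0xd00000: %d\n", needs_split(0xd00000, vm_start, vm_end)); /* 1 */
    return 0;
}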
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c4a3558589ab..bb0b7c128015 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) | |||
| 394 | return 0; | 394 | return 0; |
| 395 | } | 395 | } |
| 396 | 396 | ||
| 397 | static void clear_gigantic_page(struct page *page, | ||
| 398 | unsigned long addr, unsigned long sz) | ||
| 399 | { | ||
| 400 | int i; | ||
| 401 | struct page *p = page; | ||
| 402 | |||
| 403 | might_sleep(); | ||
| 404 | for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { | ||
| 405 | cond_resched(); | ||
| 406 | clear_user_highpage(p, addr + i * PAGE_SIZE); | ||
| 407 | } | ||
| 408 | } | ||
| 409 | static void clear_huge_page(struct page *page, | ||
| 410 | unsigned long addr, unsigned long sz) | ||
| 411 | { | ||
| 412 | int i; | ||
| 413 | |||
| 414 | if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { | ||
| 415 | clear_gigantic_page(page, addr, sz); | ||
| 416 | return; | ||
| 417 | } | ||
| 418 | |||
| 419 | might_sleep(); | ||
| 420 | for (i = 0; i < sz/PAGE_SIZE; i++) { | ||
| 421 | cond_resched(); | ||
| 422 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | ||
| 423 | } | ||
| 424 | } | ||
| 425 | |||
| 426 | static void copy_user_gigantic_page(struct page *dst, struct page *src, | ||
| 427 | unsigned long addr, struct vm_area_struct *vma) | ||
| 428 | { | ||
| 429 | int i; | ||
| 430 | struct hstate *h = hstate_vma(vma); | ||
| 431 | struct page *dst_base = dst; | ||
| 432 | struct page *src_base = src; | ||
| 433 | |||
| 434 | for (i = 0; i < pages_per_huge_page(h); ) { | ||
| 435 | cond_resched(); | ||
| 436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | ||
| 437 | |||
| 438 | i++; | ||
| 439 | dst = mem_map_next(dst, dst_base, i); | ||
| 440 | src = mem_map_next(src, src_base, i); | ||
| 441 | } | ||
| 442 | } | ||
| 443 | |||
| 444 | static void copy_user_huge_page(struct page *dst, struct page *src, | ||
| 445 | unsigned long addr, struct vm_area_struct *vma) | ||
| 446 | { | ||
| 447 | int i; | ||
| 448 | struct hstate *h = hstate_vma(vma); | ||
| 449 | |||
| 450 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | ||
| 451 | copy_user_gigantic_page(dst, src, addr, vma); | ||
| 452 | return; | ||
| 453 | } | ||
| 454 | |||
| 455 | might_sleep(); | ||
| 456 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
| 457 | cond_resched(); | ||
| 458 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | ||
| 459 | } | ||
| 460 | } | ||
| 461 | |||
| 462 | static void copy_gigantic_page(struct page *dst, struct page *src) | 397 | static void copy_gigantic_page(struct page *dst, struct page *src) |
| 463 | { | 398 | { |
| 464 | int i; | 399 | int i; |
| @@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, | |||
| 1428 | 1363 | ||
| 1429 | return sprintf(buf, "%lu\n", nr_huge_pages); | 1364 | return sprintf(buf, "%lu\n", nr_huge_pages); |
| 1430 | } | 1365 | } |
| 1366 | |||
| 1431 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | 1367 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
| 1432 | struct kobject *kobj, struct kobj_attribute *attr, | 1368 | struct kobject *kobj, struct kobj_attribute *attr, |
| 1433 | const char *buf, size_t len) | 1369 | const char *buf, size_t len) |
| @@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
| 1440 | 1376 | ||
| 1441 | err = strict_strtoul(buf, 10, &count); | 1377 | err = strict_strtoul(buf, 10, &count); |
| 1442 | if (err) | 1378 | if (err) |
| 1443 | return 0; | 1379 | goto out; |
| 1444 | 1380 | ||
| 1445 | h = kobj_to_hstate(kobj, &nid); | 1381 | h = kobj_to_hstate(kobj, &nid); |
| 1382 | if (h->order >= MAX_ORDER) { | ||
| 1383 | err = -EINVAL; | ||
| 1384 | goto out; | ||
| 1385 | } | ||
| 1386 | |||
| 1446 | if (nid == NUMA_NO_NODE) { | 1387 | if (nid == NUMA_NO_NODE) { |
| 1447 | /* | 1388 | /* |
| 1448 | * global hstate attribute | 1389 | * global hstate attribute |
| @@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
| 1468 | NODEMASK_FREE(nodes_allowed); | 1409 | NODEMASK_FREE(nodes_allowed); |
| 1469 | 1410 | ||
| 1470 | return len; | 1411 | return len; |
| 1412 | out: | ||
| 1413 | NODEMASK_FREE(nodes_allowed); | ||
| 1414 | return err; | ||
| 1471 | } | 1415 | } |
| 1472 | 1416 | ||
| 1473 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1417 | static ssize_t nr_hugepages_show(struct kobject *kobj, |
| @@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | |||
| 1510 | struct hstate *h = kobj_to_hstate(kobj, NULL); | 1454 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
| 1511 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1455 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
| 1512 | } | 1456 | } |
| 1457 | |||
| 1513 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1458 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
| 1514 | struct kobj_attribute *attr, const char *buf, size_t count) | 1459 | struct kobj_attribute *attr, const char *buf, size_t count) |
| 1515 | { | 1460 | { |
| @@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
| 1517 | unsigned long input; | 1462 | unsigned long input; |
| 1518 | struct hstate *h = kobj_to_hstate(kobj, NULL); | 1463 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
| 1519 | 1464 | ||
| 1465 | if (h->order >= MAX_ORDER) | ||
| 1466 | return -EINVAL; | ||
| 1467 | |||
| 1520 | err = strict_strtoul(buf, 10, &input); | 1468 | err = strict_strtoul(buf, 10, &input); |
| 1521 | if (err) | 1469 | if (err) |
| 1522 | return 0; | 1470 | return err; |
| 1523 | 1471 | ||
| 1524 | spin_lock(&hugetlb_lock); | 1472 | spin_lock(&hugetlb_lock); |
| 1525 | h->nr_overcommit_huge_pages = input; | 1473 | h->nr_overcommit_huge_pages = input; |
| @@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
| 1922 | { | 1870 | { |
| 1923 | struct hstate *h = &default_hstate; | 1871 | struct hstate *h = &default_hstate; |
| 1924 | unsigned long tmp; | 1872 | unsigned long tmp; |
| 1873 | int ret; | ||
| 1925 | 1874 | ||
| 1926 | if (!write) | 1875 | if (!write) |
| 1927 | tmp = h->max_huge_pages; | 1876 | tmp = h->max_huge_pages; |
| 1928 | 1877 | ||
| 1878 | if (write && h->order >= MAX_ORDER) | ||
| 1879 | return -EINVAL; | ||
| 1880 | |||
| 1929 | table->data = &tmp; | 1881 | table->data = &tmp; |
| 1930 | table->maxlen = sizeof(unsigned long); | 1882 | table->maxlen = sizeof(unsigned long); |
| 1931 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1883 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); |
| 1884 | if (ret) | ||
| 1885 | goto out; | ||
| 1932 | 1886 | ||
| 1933 | if (write) { | 1887 | if (write) { |
| 1934 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, | 1888 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
| @@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
| 1943 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 1897 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) |
| 1944 | NODEMASK_FREE(nodes_allowed); | 1898 | NODEMASK_FREE(nodes_allowed); |
| 1945 | } | 1899 | } |
| 1946 | 1900 | out: | |
| 1947 | return 0; | 1901 | return ret; |
| 1948 | } | 1902 | } |
| 1949 | 1903 | ||
| 1950 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1904 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
| @@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
| 1982 | { | 1936 | { |
| 1983 | struct hstate *h = &default_hstate; | 1937 | struct hstate *h = &default_hstate; |
| 1984 | unsigned long tmp; | 1938 | unsigned long tmp; |
| 1939 | int ret; | ||
| 1985 | 1940 | ||
| 1986 | if (!write) | 1941 | if (!write) |
| 1987 | tmp = h->nr_overcommit_huge_pages; | 1942 | tmp = h->nr_overcommit_huge_pages; |
| 1988 | 1943 | ||
| 1944 | if (write && h->order >= MAX_ORDER) | ||
| 1945 | return -EINVAL; | ||
| 1946 | |||
| 1989 | table->data = &tmp; | 1947 | table->data = &tmp; |
| 1990 | table->maxlen = sizeof(unsigned long); | 1948 | table->maxlen = sizeof(unsigned long); |
| 1991 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1949 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); |
| 1950 | if (ret) | ||
| 1951 | goto out; | ||
| 1992 | 1952 | ||
| 1993 | if (write) { | 1953 | if (write) { |
| 1994 | spin_lock(&hugetlb_lock); | 1954 | spin_lock(&hugetlb_lock); |
| 1995 | h->nr_overcommit_huge_pages = tmp; | 1955 | h->nr_overcommit_huge_pages = tmp; |
| 1996 | spin_unlock(&hugetlb_lock); | 1956 | spin_unlock(&hugetlb_lock); |
| 1997 | } | 1957 | } |
| 1998 | 1958 | out: | |
| 1999 | return 0; | 1959 | return ret; |
| 2000 | } | 1960 | } |
| 2001 | 1961 | ||
| 2002 | #endif /* CONFIG_SYSCTL */ | 1962 | #endif /* CONFIG_SYSCTL */ |
| @@ -2454,7 +2414,8 @@ retry_avoidcopy: | |||
| 2454 | return VM_FAULT_OOM; | 2414 | return VM_FAULT_OOM; |
| 2455 | } | 2415 | } |
| 2456 | 2416 | ||
| 2457 | copy_user_huge_page(new_page, old_page, address, vma); | 2417 | copy_user_huge_page(new_page, old_page, address, vma, |
| 2418 | pages_per_huge_page(h)); | ||
| 2458 | __SetPageUptodate(new_page); | 2419 | __SetPageUptodate(new_page); |
| 2459 | 2420 | ||
| 2460 | /* | 2421 | /* |
| @@ -2558,7 +2519,7 @@ retry: | |||
| 2558 | ret = -PTR_ERR(page); | 2519 | ret = -PTR_ERR(page); |
| 2559 | goto out; | 2520 | goto out; |
| 2560 | } | 2521 | } |
| 2561 | clear_huge_page(page, address, huge_page_size(h)); | 2522 | clear_huge_page(page, address, pages_per_huge_page(h)); |
| 2562 | __SetPageUptodate(page); | 2523 | __SetPageUptodate(page); |
| 2563 | 2524 | ||
| 2564 | if (vma->vm_flags & VM_MAYSHARE) { | 2525 | if (vma->vm_flags & VM_MAYSHARE) { |
| @@ -2738,7 +2699,8 @@ out_page_table_lock: | |||
| 2738 | unlock_page(pagecache_page); | 2699 | unlock_page(pagecache_page); |
| 2739 | put_page(pagecache_page); | 2700 | put_page(pagecache_page); |
| 2740 | } | 2701 | } |
| 2741 | unlock_page(page); | 2702 | if (page != pagecache_page) |
| 2703 | unlock_page(page); | ||
| 2742 | 2704 | ||
| 2743 | out_mutex: | 2705 | out_mutex: |
| 2744 | mutex_unlock(&hugetlb_instantiation_mutex); | 2706 | mutex_unlock(&hugetlb_instantiation_mutex); |
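For illustration only: with the hunks above, nr_hugepages_store_common() and hugetlb_overcommit_handler() return -EINVAL for gigantic hstates and propagate parse errors instead of silently returning success. A user-space sketch that would observe this through the usual /proc/sys/vm/nr_overcommit_hugepages sysctl (path assumed available, i.e. CONFIG_SYSCTL and hugetlb support enabled):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    const char *path = "/proc/sys/vm/nr_overcommit_hugepages";
    const char *val = "64\n";
    int fd = open(path, O_WRONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, val, strlen(val)) < 0)
        /* e.g. EINVAL if the default hstate uses gigantic pages */
        fprintf(stderr, "write: %s\n", strerror(errno));
    else
        printf("nr_overcommit_hugepages set to %s", val);
    close(fd);
    return 0;
}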
diff --git a/mm/internal.h b/mm/internal.h index dedb0aff673f..4c98630f0f77 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -39,6 +39,15 @@ static inline void __put_page(struct page *page) | |||
| 39 | 39 | ||
| 40 | extern unsigned long highest_memmap_pfn; | 40 | extern unsigned long highest_memmap_pfn; |
| 41 | 41 | ||
| 42 | #ifdef CONFIG_SMP | ||
| 43 | extern int putback_active_lru_page(struct zone *zone, struct page *page); | ||
| 44 | #else | ||
| 45 | static inline int putback_active_lru_page(struct zone *zone, struct page *page) | ||
| 46 | { | ||
| 47 | return 0; | ||
| 48 | } | ||
| 49 | #endif | ||
| 50 | |||
| 42 | /* | 51 | /* |
| 43 | * in mm/vmscan.c: | 52 | * in mm/vmscan.c: |
| 44 | */ | 53 | */ |
| @@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
| 134 | } | 143 | } |
| 135 | } | 144 | } |
| 136 | 145 | ||
| 146 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 147 | extern unsigned long vma_address(struct page *page, | ||
| 148 | struct vm_area_struct *vma); | ||
| 149 | #endif | ||
| 137 | #else /* !CONFIG_MMU */ | 150 | #else /* !CONFIG_MMU */ |
| 138 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 151 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
| 139 | { | 152 | { |
| @@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
| 243 | 256 | ||
| 244 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 257 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 245 | unsigned long start, int len, unsigned int foll_flags, | 258 | unsigned long start, int len, unsigned int foll_flags, |
| 246 | struct page **pages, struct vm_area_struct **vmas); | 259 | struct page **pages, struct vm_area_struct **vmas, |
| 260 | int *nonblocking); | ||
| 247 | 261 | ||
| 248 | #define ZONE_RECLAIM_NOSCAN -2 | 262 | #define ZONE_RECLAIM_NOSCAN -2 |
| 249 | #define ZONE_RECLAIM_FULL -1 | 263 | #define ZONE_RECLAIM_FULL -1 |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
| @@ -34,6 +34,7 @@ | |||
| 34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
| 35 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
| 36 | #include <linux/hash.h> | 36 | #include <linux/hash.h> |
| 37 | #include <linux/freezer.h> | ||
| 37 | 38 | ||
| 38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
| 39 | #include "internal.h" | 40 | #include "internal.h" |
| @@ -411,6 +412,20 @@ out: | |||
| 411 | up_read(&mm->mmap_sem); | 412 | up_read(&mm->mmap_sem); |
| 412 | } | 413 | } |
| 413 | 414 | ||
| 415 | static struct page *page_trans_compound_anon(struct page *page) | ||
| 416 | { | ||
| 417 | if (PageTransCompound(page)) { | ||
| 418 | struct page *head = compound_trans_head(page); | ||
| 419 | /* | ||
| 420 | * head may actually be split and freed from under | ||
| 421 | * us but it's ok here. | ||
| 422 | */ | ||
| 423 | if (PageAnon(head)) | ||
| 424 | return head; | ||
| 425 | } | ||
| 426 | return NULL; | ||
| 427 | } | ||
| 428 | |||
| 414 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) | 429 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) |
| 415 | { | 430 | { |
| 416 | struct mm_struct *mm = rmap_item->mm; | 431 | struct mm_struct *mm = rmap_item->mm; |
| @@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
| 430 | page = follow_page(vma, addr, FOLL_GET); | 445 | page = follow_page(vma, addr, FOLL_GET); |
| 431 | if (IS_ERR_OR_NULL(page)) | 446 | if (IS_ERR_OR_NULL(page)) |
| 432 | goto out; | 447 | goto out; |
| 433 | if (PageAnon(page)) { | 448 | if (PageAnon(page) || page_trans_compound_anon(page)) { |
| 434 | flush_anon_page(vma, page, addr); | 449 | flush_anon_page(vma, page, addr); |
| 435 | flush_dcache_page(page); | 450 | flush_dcache_page(page); |
| 436 | } else { | 451 | } else { |
| @@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
| 708 | if (addr == -EFAULT) | 723 | if (addr == -EFAULT) |
| 709 | goto out; | 724 | goto out; |
| 710 | 725 | ||
| 726 | BUG_ON(PageTransCompound(page)); | ||
| 711 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 727 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
| 712 | if (!ptep) | 728 | if (!ptep) |
| 713 | goto out; | 729 | goto out; |
| @@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
| 783 | goto out; | 799 | goto out; |
| 784 | 800 | ||
| 785 | pmd = pmd_offset(pud, addr); | 801 | pmd = pmd_offset(pud, addr); |
| 802 | BUG_ON(pmd_trans_huge(*pmd)); | ||
| 786 | if (!pmd_present(*pmd)) | 803 | if (!pmd_present(*pmd)) |
| 787 | goto out; | 804 | goto out; |
| 788 | 805 | ||
| @@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
| 800 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 817 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
| 801 | 818 | ||
| 802 | page_remove_rmap(page); | 819 | page_remove_rmap(page); |
| 820 | if (!page_mapped(page)) | ||
| 821 | try_to_free_swap(page); | ||
| 803 | put_page(page); | 822 | put_page(page); |
| 804 | 823 | ||
| 805 | pte_unmap_unlock(ptep, ptl); | 824 | pte_unmap_unlock(ptep, ptl); |
| @@ -808,6 +827,33 @@ out: | |||
| 808 | return err; | 827 | return err; |
| 809 | } | 828 | } |
| 810 | 829 | ||
| 830 | static int page_trans_compound_anon_split(struct page *page) | ||
| 831 | { | ||
| 832 | int ret = 0; | ||
| 833 | struct page *transhuge_head = page_trans_compound_anon(page); | ||
| 834 | if (transhuge_head) { | ||
| 835 | /* Get the reference on the head to split it. */ | ||
| 836 | if (get_page_unless_zero(transhuge_head)) { | ||
| 837 | /* | ||
| 838 | * Recheck we got the reference while the head | ||
| 839 | * was still anonymous. | ||
| 840 | */ | ||
| 841 | if (PageAnon(transhuge_head)) | ||
| 842 | ret = split_huge_page(transhuge_head); | ||
| 843 | else | ||
| 844 | /* | ||
| 845 | * Retry later if split_huge_page ran | ||
| 846 | * from under us. | ||
| 847 | */ | ||
| 848 | ret = 1; | ||
| 849 | put_page(transhuge_head); | ||
| 850 | } else | ||
| 851 | /* Retry later if split_huge_page ran from under us. */ | ||
| 852 | ret = 1; | ||
| 853 | } | ||
| 854 | return ret; | ||
| 855 | } | ||
| 856 | |||
| 811 | /* | 857 | /* |
| 812 | * try_to_merge_one_page - take two pages and merge them into one | 858 | * try_to_merge_one_page - take two pages and merge them into one |
| 813 | * @vma: the vma that holds the pte pointing to page | 859 | * @vma: the vma that holds the pte pointing to page |
| @@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
| 828 | 874 | ||
| 829 | if (!(vma->vm_flags & VM_MERGEABLE)) | 875 | if (!(vma->vm_flags & VM_MERGEABLE)) |
| 830 | goto out; | 876 | goto out; |
| 877 | if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | ||
| 878 | goto out; | ||
| 879 | BUG_ON(PageTransCompound(page)); | ||
| 831 | if (!PageAnon(page)) | 880 | if (!PageAnon(page)) |
| 832 | goto out; | 881 | goto out; |
| 833 | 882 | ||
| @@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
| 1247 | 1296 | ||
| 1248 | slot = ksm_scan.mm_slot; | 1297 | slot = ksm_scan.mm_slot; |
| 1249 | if (slot == &ksm_mm_head) { | 1298 | if (slot == &ksm_mm_head) { |
| 1299 | /* | ||
| 1300 | * A number of pages can hang around indefinitely on per-cpu | ||
| 1301 | * pagevecs, raised page count preventing write_protect_page | ||
| 1302 | * from merging them. Though it doesn't really matter much, | ||
| 1303 | * it is puzzling to see some stuck in pages_volatile until | ||
| 1304 | * other activity jostles them out, and they also prevented | ||
| 1305 | * LTP's KSM test from succeeding deterministically; so drain | ||
| 1306 | * them here (here rather than on entry to ksm_do_scan(), | ||
| 1307 | * so we don't IPI too often when pages_to_scan is set low). | ||
| 1308 | */ | ||
| 1309 | lru_add_drain_all(); | ||
| 1310 | |||
| 1250 | root_unstable_tree = RB_ROOT; | 1311 | root_unstable_tree = RB_ROOT; |
| 1251 | 1312 | ||
| 1252 | spin_lock(&ksm_mmlist_lock); | 1313 | spin_lock(&ksm_mmlist_lock); |
| @@ -1277,7 +1338,13 @@ next_mm: | |||
| 1277 | if (ksm_test_exit(mm)) | 1338 | if (ksm_test_exit(mm)) |
| 1278 | break; | 1339 | break; |
| 1279 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); | 1340 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); |
| 1280 | if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { | 1341 | if (IS_ERR_OR_NULL(*page)) { |
| 1342 | ksm_scan.address += PAGE_SIZE; | ||
| 1343 | cond_resched(); | ||
| 1344 | continue; | ||
| 1345 | } | ||
| 1346 | if (PageAnon(*page) || | ||
| 1347 | page_trans_compound_anon(*page)) { | ||
| 1281 | flush_anon_page(vma, *page, ksm_scan.address); | 1348 | flush_anon_page(vma, *page, ksm_scan.address); |
| 1282 | flush_dcache_page(*page); | 1349 | flush_dcache_page(*page); |
| 1283 | rmap_item = get_next_rmap_item(slot, | 1350 | rmap_item = get_next_rmap_item(slot, |
| @@ -1291,8 +1358,7 @@ next_mm: | |||
| 1291 | up_read(&mm->mmap_sem); | 1358 | up_read(&mm->mmap_sem); |
| 1292 | return rmap_item; | 1359 | return rmap_item; |
| 1293 | } | 1360 | } |
| 1294 | if (!IS_ERR_OR_NULL(*page)) | 1361 | put_page(*page); |
| 1295 | put_page(*page); | ||
| 1296 | ksm_scan.address += PAGE_SIZE; | 1362 | ksm_scan.address += PAGE_SIZE; |
| 1297 | cond_resched(); | 1363 | cond_resched(); |
| 1298 | } | 1364 | } |
| @@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
| 1352 | struct rmap_item *rmap_item; | 1418 | struct rmap_item *rmap_item; |
| 1353 | struct page *uninitialized_var(page); | 1419 | struct page *uninitialized_var(page); |
| 1354 | 1420 | ||
| 1355 | while (scan_npages--) { | 1421 | while (scan_npages-- && likely(!freezing(current))) { |
| 1356 | cond_resched(); | 1422 | cond_resched(); |
| 1357 | rmap_item = scan_get_next_rmap_item(&page); | 1423 | rmap_item = scan_get_next_rmap_item(&page); |
| 1358 | if (!rmap_item) | 1424 | if (!rmap_item) |
| @@ -1370,6 +1436,7 @@ static int ksmd_should_run(void) | |||
| 1370 | 1436 | ||
| 1371 | static int ksm_scan_thread(void *nothing) | 1437 | static int ksm_scan_thread(void *nothing) |
| 1372 | { | 1438 | { |
| 1439 | set_freezable(); | ||
| 1373 | set_user_nice(current, 5); | 1440 | set_user_nice(current, 5); |
| 1374 | 1441 | ||
| 1375 | while (!kthread_should_stop()) { | 1442 | while (!kthread_should_stop()) { |
| @@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing) | |||
| 1378 | ksm_do_scan(ksm_thread_pages_to_scan); | 1445 | ksm_do_scan(ksm_thread_pages_to_scan); |
| 1379 | mutex_unlock(&ksm_thread_mutex); | 1446 | mutex_unlock(&ksm_thread_mutex); |
| 1380 | 1447 | ||
| 1448 | try_to_freeze(); | ||
| 1449 | |||
| 1381 | if (ksmd_should_run()) { | 1450 | if (ksmd_should_run()) { |
| 1382 | schedule_timeout_interruptible( | 1451 | schedule_timeout_interruptible( |
| 1383 | msecs_to_jiffies(ksm_thread_sleep_millisecs)); | 1452 | msecs_to_jiffies(ksm_thread_sleep_millisecs)); |
| 1384 | } else { | 1453 | } else { |
| 1385 | wait_event_interruptible(ksm_thread_wait, | 1454 | wait_event_freezable(ksm_thread_wait, |
| 1386 | ksmd_should_run() || kthread_should_stop()); | 1455 | ksmd_should_run() || kthread_should_stop()); |
| 1387 | } | 1456 | } |
| 1388 | } | 1457 | } |
| @@ -1724,8 +1793,13 @@ static int ksm_memory_callback(struct notifier_block *self, | |||
| 1724 | /* | 1793 | /* |
| 1725 | * Keep it very simple for now: just lock out ksmd and | 1794 | * Keep it very simple for now: just lock out ksmd and |
| 1726 | * MADV_UNMERGEABLE while any memory is going offline. | 1795 | * MADV_UNMERGEABLE while any memory is going offline. |
| 1796 | * mutex_lock_nested() is necessary because lockdep was alarmed | ||
| 1797 | * that here we take ksm_thread_mutex inside notifier chain | ||
| 1798 | * mutex, and later take notifier chain mutex inside | ||
| 1799 | * ksm_thread_mutex to unlock it. But that's safe because both | ||
| 1800 | * are inside mem_hotplug_mutex. | ||
| 1727 | */ | 1801 | */ |
| 1728 | mutex_lock(&ksm_thread_mutex); | 1802 | mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); |
| 1729 | break; | 1803 | break; |
| 1730 | 1804 | ||
| 1731 | case MEM_OFFLINE: | 1805 | case MEM_OFFLINE: |
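For illustration only: the ksm.c changes above let ksmd handle ranges containing transparent huge pages by splitting them before merging; registration from user space is unchanged. A minimal sketch using madvise(MADV_MERGEABLE) on two identical anonymous buffers; the /sys/kernel/mm/ksm/run and pages_sharing files mentioned here are assumed to be the usual KSM control interface and are not part of this diff:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#define LEN (4UL * 1024 * 1024)

static void *mergeable_buf(void)
{
    void *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return NULL;
    }
    memset(p, 0x5a, LEN);                 /* identical contents in both buffers */
    if (madvise(p, LEN, MADV_MERGEABLE))  /* hand the range to ksmd */
        perror("madvise(MADV_MERGEABLE)");
    return p;
}

int main(void)
{
    void *a = mergeable_buf();
    void *b = mergeable_buf();

    if (!a || !b)
        return 1;
    printf("buffers at %p and %p registered; check /sys/kernel/mm/ksm/pages_sharing\n",
           a, b);
    pause();  /* keep the mappings alive while ksmd scans */
    return 0;
}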
diff --git a/mm/madvise.c b/mm/madvise.c index 319528b8db74..2221491ed503 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
| 71 | if (error) | 71 | if (error) |
| 72 | goto out; | 72 | goto out; |
| 73 | break; | 73 | break; |
| 74 | case MADV_HUGEPAGE: | ||
| 75 | case MADV_NOHUGEPAGE: | ||
| 76 | error = hugepage_madvise(vma, &new_flags, behavior); | ||
| 77 | if (error) | ||
| 78 | goto out; | ||
| 79 | break; | ||
| 74 | } | 80 | } |
| 75 | 81 | ||
| 76 | if (new_flags == vma->vm_flags) { | 82 | if (new_flags == vma->vm_flags) { |
| @@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior) | |||
| 283 | case MADV_MERGEABLE: | 289 | case MADV_MERGEABLE: |
| 284 | case MADV_UNMERGEABLE: | 290 | case MADV_UNMERGEABLE: |
| 285 | #endif | 291 | #endif |
| 292 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 293 | case MADV_HUGEPAGE: | ||
| 294 | case MADV_NOHUGEPAGE: | ||
| 295 | #endif | ||
| 286 | return 1; | 296 | return 1; |
| 287 | 297 | ||
| 288 | default: | 298 | default: |
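For illustration only: with MADV_HUGEPAGE/MADV_NOHUGEPAGE accepted by madvise_behavior() above, an application can opt an individual mapping into or out of transparent huge pages. A sketch that aligns an anonymous mapping to an assumed 2 MiB huge page size (and an assumed 4 KiB base page) before advising it:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)   /* assumed huge page size */

int main(void)
{
    size_t len = 8 * HPAGE_SIZE;
    /* over-allocate so an HPAGE_SIZE-aligned subrange certainly exists */
    unsigned char *raw = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (raw == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    unsigned char *aligned = (unsigned char *)
        (((unsigned long)raw + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1));

    if (madvise(aligned, len, MADV_HUGEPAGE))
        perror("madvise(MADV_HUGEPAGE)");   /* fails if THP is not configured */

    for (size_t i = 0; i < len; i += 4096)
        aligned[i] = 1;   /* fault the range in; khugepaged may collapse it */

    printf("touched %zu bytes at %p\n", len, (void *)aligned);
    return 0;
}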
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7a22b4129211..8ab841031436 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -292,7 +292,6 @@ static struct move_charge_struct { | |||
| 292 | unsigned long moved_charge; | 292 | unsigned long moved_charge; |
| 293 | unsigned long moved_swap; | 293 | unsigned long moved_swap; |
| 294 | struct task_struct *moving_task; /* a task moving charges */ | 294 | struct task_struct *moving_task; /* a task moving charges */ |
| 295 | struct mm_struct *mm; | ||
| 296 | wait_queue_head_t waitq; /* a waitq for other context */ | 295 | wait_queue_head_t waitq; /* a waitq for other context */ |
| 297 | } mc = { | 296 | } mc = { |
| 298 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), | 297 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), |
| @@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | |||
| 821 | return; | 820 | return; |
| 822 | VM_BUG_ON(list_empty(&pc->lru)); | 821 | VM_BUG_ON(list_empty(&pc->lru)); |
| 823 | list_del_init(&pc->lru); | 822 | list_del_init(&pc->lru); |
| 824 | return; | ||
| 825 | } | 823 | } |
| 826 | 824 | ||
| 827 | void mem_cgroup_del_lru(struct page *page) | 825 | void mem_cgroup_del_lru(struct page *page) |
| @@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
| 1087 | case 0: | 1085 | case 0: |
| 1088 | list_move(&page->lru, dst); | 1086 | list_move(&page->lru, dst); |
| 1089 | mem_cgroup_del_lru(page); | 1087 | mem_cgroup_del_lru(page); |
| 1090 | nr_taken++; | 1088 | nr_taken += hpage_nr_pages(page); |
| 1091 | break; | 1089 | break; |
| 1092 | case -EBUSY: | 1090 | case -EBUSY: |
| 1093 | /* we don't affect global LRU but rotate in our LRU */ | 1091 | /* we don't affect global LRU but rotate in our LRU */ |
| @@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
| 1312 | u64 limit; | 1310 | u64 limit; |
| 1313 | u64 memsw; | 1311 | u64 memsw; |
| 1314 | 1312 | ||
| 1315 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + | 1313 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
| 1316 | total_swap_pages; | 1314 | limit += total_swap_pages << PAGE_SHIFT; |
| 1315 | |||
| 1317 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 1316 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
| 1318 | /* | 1317 | /* |
| 1319 | * If memsw is finite and limits the amount of swap space available | 1318 | * If memsw is finite and limits the amount of swap space available |
| @@ -1600,11 +1599,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
| 1600 | * possibility of race condition. If there is, we take a lock. | 1599 | * possibility of race condition. If there is, we take a lock. |
| 1601 | */ | 1600 | */ |
| 1602 | 1601 | ||
| 1603 | static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) | 1602 | void mem_cgroup_update_page_stat(struct page *page, |
| 1603 | enum mem_cgroup_page_stat_item idx, int val) | ||
| 1604 | { | 1604 | { |
| 1605 | struct mem_cgroup *mem; | 1605 | struct mem_cgroup *mem; |
| 1606 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1606 | struct page_cgroup *pc = lookup_page_cgroup(page); |
| 1607 | bool need_unlock = false; | 1607 | bool need_unlock = false; |
| 1608 | unsigned long uninitialized_var(flags); | ||
| 1608 | 1609 | ||
| 1609 | if (unlikely(!pc)) | 1610 | if (unlikely(!pc)) |
| 1610 | return; | 1611 | return; |
| @@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) | |||
| 1616 | /* pc->mem_cgroup is unstable ? */ | 1617 | /* pc->mem_cgroup is unstable ? */ |
| 1617 | if (unlikely(mem_cgroup_stealed(mem))) { | 1618 | if (unlikely(mem_cgroup_stealed(mem))) { |
| 1618 | /* take a lock against to access pc->mem_cgroup */ | 1619 | /* take a lock against to access pc->mem_cgroup */ |
| 1619 | lock_page_cgroup(pc); | 1620 | move_lock_page_cgroup(pc, &flags); |
| 1620 | need_unlock = true; | 1621 | need_unlock = true; |
| 1621 | mem = pc->mem_cgroup; | 1622 | mem = pc->mem_cgroup; |
| 1622 | if (!mem || !PageCgroupUsed(pc)) | 1623 | if (!mem || !PageCgroupUsed(pc)) |
| 1623 | goto out; | 1624 | goto out; |
| 1624 | } | 1625 | } |
| 1625 | 1626 | ||
| 1626 | this_cpu_add(mem->stat->count[idx], val); | ||
| 1627 | |||
| 1628 | switch (idx) { | 1627 | switch (idx) { |
| 1629 | case MEM_CGROUP_STAT_FILE_MAPPED: | 1628 | case MEMCG_NR_FILE_MAPPED: |
| 1630 | if (val > 0) | 1629 | if (val > 0) |
| 1631 | SetPageCgroupFileMapped(pc); | 1630 | SetPageCgroupFileMapped(pc); |
| 1632 | else if (!page_mapped(page)) | 1631 | else if (!page_mapped(page)) |
| 1633 | ClearPageCgroupFileMapped(pc); | 1632 | ClearPageCgroupFileMapped(pc); |
| 1633 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | ||
| 1634 | break; | 1634 | break; |
| 1635 | default: | 1635 | default: |
| 1636 | BUG(); | 1636 | BUG(); |
| 1637 | } | 1637 | } |
| 1638 | 1638 | ||
| 1639 | this_cpu_add(mem->stat->count[idx], val); | ||
| 1640 | |||
| 1639 | out: | 1641 | out: |
| 1640 | if (unlikely(need_unlock)) | 1642 | if (unlikely(need_unlock)) |
| 1641 | unlock_page_cgroup(pc); | 1643 | move_unlock_page_cgroup(pc, &flags); |
| 1642 | rcu_read_unlock(); | 1644 | rcu_read_unlock(); |
| 1643 | return; | 1645 | return; |
| 1644 | } | 1646 | } |
| 1645 | 1647 | EXPORT_SYMBOL(mem_cgroup_update_page_stat); | |
| 1646 | void mem_cgroup_update_file_mapped(struct page *page, int val) | ||
| 1647 | { | ||
| 1648 | mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); | ||
| 1649 | } | ||
| 1650 | 1648 | ||
| 1651 | /* | 1649 | /* |
| 1652 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1650 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
| @@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
| 1887 | * oom-killer can be invoked. | 1885 | * oom-killer can be invoked. |
| 1888 | */ | 1886 | */ |
| 1889 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1887 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
| 1890 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) | 1888 | gfp_t gfp_mask, |
| 1889 | struct mem_cgroup **memcg, bool oom, | ||
| 1890 | int page_size) | ||
| 1891 | { | 1891 | { |
| 1892 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1892 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
| 1893 | struct mem_cgroup *mem = NULL; | 1893 | struct mem_cgroup *mem = NULL; |
| 1894 | int ret; | 1894 | int ret; |
| 1895 | int csize = CHARGE_SIZE; | 1895 | int csize = max(CHARGE_SIZE, (unsigned long) page_size); |
| 1896 | 1896 | ||
| 1897 | /* | 1897 | /* |
| 1898 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage | 1898 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage |
| @@ -1917,7 +1917,7 @@ again: | |||
| 1917 | VM_BUG_ON(css_is_removed(&mem->css)); | 1917 | VM_BUG_ON(css_is_removed(&mem->css)); |
| 1918 | if (mem_cgroup_is_root(mem)) | 1918 | if (mem_cgroup_is_root(mem)) |
| 1919 | goto done; | 1919 | goto done; |
| 1920 | if (consume_stock(mem)) | 1920 | if (page_size == PAGE_SIZE && consume_stock(mem)) |
| 1921 | goto done; | 1921 | goto done; |
| 1922 | css_get(&mem->css); | 1922 | css_get(&mem->css); |
| 1923 | } else { | 1923 | } else { |
| @@ -1925,23 +1925,22 @@ again: | |||
| 1925 | 1925 | ||
| 1926 | rcu_read_lock(); | 1926 | rcu_read_lock(); |
| 1927 | p = rcu_dereference(mm->owner); | 1927 | p = rcu_dereference(mm->owner); |
| 1928 | VM_BUG_ON(!p); | ||
| 1929 | /* | 1928 | /* |
| 1930 | * because we don't have task_lock(), "p" can exit while | 1929 | * Because we don't have task_lock(), "p" can exit. |
| 1931 | * we're here. In that case, "mem" can point to root | 1930 | * In that case, "mem" can point to root or p can be NULL with |
| 1932 | * cgroup but never be NULL. (and task_struct itself is freed | 1931 | * race with swapoff. Then, we have small risk of mis-accounting. |
| 1933 | * by RCU, cgroup itself is RCU safe.) Then, we have small | 1932 | * But such kind of mis-account by race always happens because |
| 1934 | * risk here to get wrong cgroup. But such kind of mis-account | 1933 | * we don't have cgroup_mutex(). It's overkill and we allow that |
| 1935 | * by race always happens because we don't have cgroup_mutex(). | 1934 | * small race, here. |
| 1936 | * It's overkill and we allow that small race, here. | 1935 | * (*) swapoff et al will charge against mm-struct not against |
| 1936 | * task-struct. So, mm->owner can be NULL. | ||
| 1937 | */ | 1937 | */ |
| 1938 | mem = mem_cgroup_from_task(p); | 1938 | mem = mem_cgroup_from_task(p); |
| 1939 | VM_BUG_ON(!mem); | 1939 | if (!mem || mem_cgroup_is_root(mem)) { |
| 1940 | if (mem_cgroup_is_root(mem)) { | ||
| 1941 | rcu_read_unlock(); | 1940 | rcu_read_unlock(); |
| 1942 | goto done; | 1941 | goto done; |
| 1943 | } | 1942 | } |
| 1944 | if (consume_stock(mem)) { | 1943 | if (page_size == PAGE_SIZE && consume_stock(mem)) { |
| 1945 | /* | 1944 | /* |
| 1946 | * It seems dangerous to access memcg without css_get(). | 1945 | * It seems dangerous to access memcg without css_get(). |
| 1947 | * But considering how consume_stock works, it's not | 1946 | * But considering how consume_stock works, it's not |
| @@ -1982,7 +1981,7 @@ again: | |||
| 1982 | case CHARGE_OK: | 1981 | case CHARGE_OK: |
| 1983 | break; | 1982 | break; |
| 1984 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 1983 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
| 1985 | csize = PAGE_SIZE; | 1984 | csize = page_size; |
| 1986 | css_put(&mem->css); | 1985 | css_put(&mem->css); |
| 1987 | mem = NULL; | 1986 | mem = NULL; |
| 1988 | goto again; | 1987 | goto again; |
| @@ -2003,8 +2002,8 @@ again: | |||
| 2003 | } | 2002 | } |
| 2004 | } while (ret != CHARGE_OK); | 2003 | } while (ret != CHARGE_OK); |
| 2005 | 2004 | ||
| 2006 | if (csize > PAGE_SIZE) | 2005 | if (csize > page_size) |
| 2007 | refill_stock(mem, csize - PAGE_SIZE); | 2006 | refill_stock(mem, csize - page_size); |
| 2008 | css_put(&mem->css); | 2007 | css_put(&mem->css); |
| 2009 | done: | 2008 | done: |
| 2010 | *memcg = mem; | 2009 | *memcg = mem; |
| @@ -2032,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | |||
| 2032 | } | 2031 | } |
| 2033 | } | 2032 | } |
| 2034 | 2033 | ||
| 2035 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | 2034 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
| 2035 | int page_size) | ||
| 2036 | { | 2036 | { |
| 2037 | __mem_cgroup_cancel_charge(mem, 1); | 2037 | __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); |
| 2038 | } | 2038 | } |
| 2039 | 2039 | ||
| 2040 | /* | 2040 | /* |
| @@ -2088,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
| 2088 | * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be | 2088 | * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be |
| 2089 | * USED state. If already USED, uncharge and return. | 2089 | * USED state. If already USED, uncharge and return. |
| 2090 | */ | 2090 | */ |
| 2091 | 2091 | static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, | |
| 2092 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 2092 | struct page_cgroup *pc, |
| 2093 | struct page_cgroup *pc, | 2093 | enum charge_type ctype) |
| 2094 | enum charge_type ctype) | ||
| 2095 | { | 2094 | { |
| 2096 | /* try_charge() can return NULL to *memcg, taking care of it. */ | ||
| 2097 | if (!mem) | ||
| 2098 | return; | ||
| 2099 | |||
| 2100 | lock_page_cgroup(pc); | ||
| 2101 | if (unlikely(PageCgroupUsed(pc))) { | ||
| 2102 | unlock_page_cgroup(pc); | ||
| 2103 | mem_cgroup_cancel_charge(mem); | ||
| 2104 | return; | ||
| 2105 | } | ||
| 2106 | |||
| 2107 | pc->mem_cgroup = mem; | 2095 | pc->mem_cgroup = mem; |
| 2108 | /* | 2096 | /* |
| 2109 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | 2097 | * We access a page_cgroup asynchronously without lock_page_cgroup(). |
| @@ -2128,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
| 2128 | } | 2116 | } |
| 2129 | 2117 | ||
| 2130 | mem_cgroup_charge_statistics(mem, pc, true); | 2118 | mem_cgroup_charge_statistics(mem, pc, true); |
| 2119 | } | ||
| 2120 | |||
| 2121 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | ||
| 2122 | struct page_cgroup *pc, | ||
| 2123 | enum charge_type ctype, | ||
| 2124 | int page_size) | ||
| 2125 | { | ||
| 2126 | int i; | ||
| 2127 | int count = page_size >> PAGE_SHIFT; | ||
| 2128 | |||
| 2129 | /* try_charge() can return NULL to *memcg, taking care of it. */ | ||
| 2130 | if (!mem) | ||
| 2131 | return; | ||
| 2132 | |||
| 2133 | lock_page_cgroup(pc); | ||
| 2134 | if (unlikely(PageCgroupUsed(pc))) { | ||
| 2135 | unlock_page_cgroup(pc); | ||
| 2136 | mem_cgroup_cancel_charge(mem, page_size); | ||
| 2137 | return; | ||
| 2138 | } | ||
| 2139 | |||
| 2140 | /* | ||
| 2141 | * we don't need page_cgroup_lock for tail pages, because they are not | ||
| 2142 | * accessed by any other context at this point. | ||
| 2143 | */ | ||
| 2144 | for (i = 0; i < count; i++) | ||
| 2145 | ____mem_cgroup_commit_charge(mem, pc + i, ctype); | ||
| 2131 | 2146 | ||
| 2132 | unlock_page_cgroup(pc); | 2147 | unlock_page_cgroup(pc); |
| 2133 | /* | 2148 | /* |
| @@ -2174,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 2174 | mem_cgroup_charge_statistics(from, pc, false); | 2189 | mem_cgroup_charge_statistics(from, pc, false); |
| 2175 | if (uncharge) | 2190 | if (uncharge) |
| 2176 | /* This is not "cancel", but cancel_charge does all we need. */ | 2191 | /* This is not "cancel", but cancel_charge does all we need. */ |
| 2177 | mem_cgroup_cancel_charge(from); | 2192 | mem_cgroup_cancel_charge(from, PAGE_SIZE); |
| 2178 | 2193 | ||
| 2179 | /* caller should have done css_get */ | 2194 | /* caller should have done css_get */ |
| 2180 | pc->mem_cgroup = to; | 2195 | pc->mem_cgroup = to; |
| @@ -2196,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 2196 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) | 2211 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
| 2197 | { | 2212 | { |
| 2198 | int ret = -EINVAL; | 2213 | int ret = -EINVAL; |
| 2214 | unsigned long flags; | ||
| 2215 | |||
| 2199 | lock_page_cgroup(pc); | 2216 | lock_page_cgroup(pc); |
| 2200 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | 2217 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { |
| 2218 | move_lock_page_cgroup(pc, &flags); | ||
| 2201 | __mem_cgroup_move_account(pc, from, to, uncharge); | 2219 | __mem_cgroup_move_account(pc, from, to, uncharge); |
| 2220 | move_unlock_page_cgroup(pc, &flags); | ||
| 2202 | ret = 0; | 2221 | ret = 0; |
| 2203 | } | 2222 | } |
| 2204 | unlock_page_cgroup(pc); | 2223 | unlock_page_cgroup(pc); |
| @@ -2235,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
| 2235 | goto put; | 2254 | goto put; |
| 2236 | 2255 | ||
| 2237 | parent = mem_cgroup_from_cont(pcg); | 2256 | parent = mem_cgroup_from_cont(pcg); |
| 2238 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 2257 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, |
| 2258 | PAGE_SIZE); | ||
| 2239 | if (ret || !parent) | 2259 | if (ret || !parent) |
| 2240 | goto put_back; | 2260 | goto put_back; |
| 2241 | 2261 | ||
| 2242 | ret = mem_cgroup_move_account(pc, child, parent, true); | 2262 | ret = mem_cgroup_move_account(pc, child, parent, true); |
| 2243 | if (ret) | 2263 | if (ret) |
| 2244 | mem_cgroup_cancel_charge(parent); | 2264 | mem_cgroup_cancel_charge(parent, PAGE_SIZE); |
| 2245 | put_back: | 2265 | put_back: |
| 2246 | putback_lru_page(page); | 2266 | putback_lru_page(page); |
| 2247 | put: | 2267 | put: |
| @@ -2262,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 2262 | struct mem_cgroup *mem = NULL; | 2282 | struct mem_cgroup *mem = NULL; |
| 2263 | struct page_cgroup *pc; | 2283 | struct page_cgroup *pc; |
| 2264 | int ret; | 2284 | int ret; |
| 2285 | int page_size = PAGE_SIZE; | ||
| 2286 | |||
| 2287 | if (PageTransHuge(page)) { | ||
| 2288 | page_size <<= compound_order(page); | ||
| 2289 | VM_BUG_ON(!PageTransHuge(page)); | ||
| 2290 | } | ||
| 2265 | 2291 | ||
| 2266 | pc = lookup_page_cgroup(page); | 2292 | pc = lookup_page_cgroup(page); |
| 2267 | /* can happen at boot */ | 2293 | /* can happen at boot */ |
| @@ -2269,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 2269 | return 0; | 2295 | return 0; |
| 2270 | prefetchw(pc); | 2296 | prefetchw(pc); |
| 2271 | 2297 | ||
| 2272 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 2298 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); |
| 2273 | if (ret || !mem) | 2299 | if (ret || !mem) |
| 2274 | return ret; | 2300 | return ret; |
| 2275 | 2301 | ||
| 2276 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2302 | __mem_cgroup_commit_charge(mem, pc, ctype, page_size); |
| 2277 | return 0; | 2303 | return 0; |
| 2278 | } | 2304 | } |
| 2279 | 2305 | ||
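For illustration only: mem_cgroup_charge_common() above charges a transparent huge page in one go by widening page_size to PAGE_SIZE << compound_order(page), then committing one page_cgroup per base page. The arithmetic for an assumed order-9 compound page (2 MiB with 4 KiB base pages):

#include <stdio.h>

#define PAGE_SHIFT 12                      /* assumed 4 KiB base pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
    unsigned int order = 9;                /* assumed huge page order */
    unsigned long page_size = PAGE_SIZE << order;
    unsigned long count = page_size >> PAGE_SHIFT;

    printf("charge size: %lu bytes (%lu KiB), page_cgroups committed: %lu\n",
           page_size, page_size / 1024, count);
    /* prints: charge size: 2097152 bytes (2048 KiB), page_cgroups committed: 512 */
    return 0;
}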
| @@ -2282,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
| 2282 | { | 2308 | { |
| 2283 | if (mem_cgroup_disabled()) | 2309 | if (mem_cgroup_disabled()) |
| 2284 | return 0; | 2310 | return 0; |
| 2285 | if (PageCompound(page)) | ||
| 2286 | return 0; | ||
| 2287 | /* | 2311 | /* |
| 2288 | * If already mapped, we don't have to account. | 2312 | * If already mapped, we don't have to account. |
| 2289 | * If page cache, page->mapping has address_space. | 2313 | * If page cache, page->mapping has address_space. |
| @@ -2389,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
| 2389 | if (!mem) | 2413 | if (!mem) |
| 2390 | goto charge_cur_mm; | 2414 | goto charge_cur_mm; |
| 2391 | *ptr = mem; | 2415 | *ptr = mem; |
| 2392 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | 2416 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); |
| 2393 | css_put(&mem->css); | 2417 | css_put(&mem->css); |
| 2394 | return ret; | 2418 | return ret; |
| 2395 | charge_cur_mm: | 2419 | charge_cur_mm: |
| 2396 | if (unlikely(!mm)) | 2420 | if (unlikely(!mm)) |
| 2397 | mm = &init_mm; | 2421 | mm = &init_mm; |
| 2398 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | 2422 | return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); |
| 2399 | } | 2423 | } |
| 2400 | 2424 | ||
| 2401 | static void | 2425 | static void |
| @@ -2411,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
| 2411 | cgroup_exclude_rmdir(&ptr->css); | 2435 | cgroup_exclude_rmdir(&ptr->css); |
| 2412 | pc = lookup_page_cgroup(page); | 2436 | pc = lookup_page_cgroup(page); |
| 2413 | mem_cgroup_lru_del_before_commit_swapcache(page); | 2437 | mem_cgroup_lru_del_before_commit_swapcache(page); |
| 2414 | __mem_cgroup_commit_charge(ptr, pc, ctype); | 2438 | __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); |
| 2415 | mem_cgroup_lru_add_after_commit_swapcache(page); | 2439 | mem_cgroup_lru_add_after_commit_swapcache(page); |
| 2416 | /* | 2440 | /* |
| 2417 | * Now swap is on-memory. This means this page may be | 2441 | * Now swap is on-memory. This means this page may be |
| @@ -2460,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
| 2460 | return; | 2484 | return; |
| 2461 | if (!mem) | 2485 | if (!mem) |
| 2462 | return; | 2486 | return; |
| 2463 | mem_cgroup_cancel_charge(mem); | 2487 | mem_cgroup_cancel_charge(mem, PAGE_SIZE); |
| 2464 | } | 2488 | } |
| 2465 | 2489 | ||
| 2466 | static void | 2490 | static void |
| 2467 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | 2491 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, |
| 2492 | int page_size) | ||
| 2468 | { | 2493 | { |
| 2469 | struct memcg_batch_info *batch = NULL; | 2494 | struct memcg_batch_info *batch = NULL; |
| 2470 | bool uncharge_memsw = true; | 2495 | bool uncharge_memsw = true; |
| @@ -2491,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
| 2491 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | 2516 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) |
| 2492 | goto direct_uncharge; | 2517 | goto direct_uncharge; |
| 2493 | 2518 | ||
| 2519 | if (page_size != PAGE_SIZE) | ||
| 2520 | goto direct_uncharge; | ||
| 2521 | |||
| 2494 | /* | 2522 | /* |
| 2495 | * In typical case, batch->memcg == mem. This means we can | 2523 | * In typical case, batch->memcg == mem. This means we can |
| 2496 | * merge a series of uncharges to an uncharge of res_counter. | 2524 | * merge a series of uncharges to an uncharge of res_counter. |
| @@ -2504,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
| 2504 | batch->memsw_bytes += PAGE_SIZE; | 2532 | batch->memsw_bytes += PAGE_SIZE; |
| 2505 | return; | 2533 | return; |
| 2506 | direct_uncharge: | 2534 | direct_uncharge: |
| 2507 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2535 | res_counter_uncharge(&mem->res, page_size); |
| 2508 | if (uncharge_memsw) | 2536 | if (uncharge_memsw) |
| 2509 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 2537 | res_counter_uncharge(&mem->memsw, page_size); |
| 2510 | if (unlikely(batch->memcg != mem)) | 2538 | if (unlikely(batch->memcg != mem)) |
| 2511 | memcg_oom_recover(mem); | 2539 | memcg_oom_recover(mem); |
| 2512 | return; | 2540 | return; |
| @@ -2518,8 +2546,11 @@ direct_uncharge: | |||
| 2518 | static struct mem_cgroup * | 2546 | static struct mem_cgroup * |
| 2519 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2547 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
| 2520 | { | 2548 | { |
| 2549 | int i; | ||
| 2550 | int count; | ||
| 2521 | struct page_cgroup *pc; | 2551 | struct page_cgroup *pc; |
| 2522 | struct mem_cgroup *mem = NULL; | 2552 | struct mem_cgroup *mem = NULL; |
| 2553 | int page_size = PAGE_SIZE; | ||
| 2523 | 2554 | ||
| 2524 | if (mem_cgroup_disabled()) | 2555 | if (mem_cgroup_disabled()) |
| 2525 | return NULL; | 2556 | return NULL; |
| @@ -2527,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 2527 | if (PageSwapCache(page)) | 2558 | if (PageSwapCache(page)) |
| 2528 | return NULL; | 2559 | return NULL; |
| 2529 | 2560 | ||
| 2561 | if (PageTransHuge(page)) { | ||
| 2562 | page_size <<= compound_order(page); | ||
| 2563 | VM_BUG_ON(!PageTransHuge(page)); | ||
| 2564 | } | ||
| 2565 | |||
| 2566 | count = page_size >> PAGE_SHIFT; | ||
| 2530 | /* | 2567 | /* |
| 2531 | * Check if our page_cgroup is valid | 2568 | * Check if our page_cgroup is valid |
| 2532 | */ | 2569 | */ |
| @@ -2559,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 2559 | break; | 2596 | break; |
| 2560 | } | 2597 | } |
| 2561 | 2598 | ||
| 2562 | mem_cgroup_charge_statistics(mem, pc, false); | 2599 | for (i = 0; i < count; i++) |
| 2600 | mem_cgroup_charge_statistics(mem, pc + i, false); | ||
| 2563 | 2601 | ||
| 2564 | ClearPageCgroupUsed(pc); | 2602 | ClearPageCgroupUsed(pc); |
| 2565 | /* | 2603 | /* |
| @@ -2580,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 2580 | mem_cgroup_get(mem); | 2618 | mem_cgroup_get(mem); |
| 2581 | } | 2619 | } |
| 2582 | if (!mem_cgroup_is_root(mem)) | 2620 | if (!mem_cgroup_is_root(mem)) |
| 2583 | __do_uncharge(mem, ctype); | 2621 | __do_uncharge(mem, ctype, page_size); |
| 2584 | 2622 | ||
| 2585 | return mem; | 2623 | return mem; |
| 2586 | 2624 | ||
| @@ -2775,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
| 2775 | enum charge_type ctype; | 2813 | enum charge_type ctype; |
| 2776 | int ret = 0; | 2814 | int ret = 0; |
| 2777 | 2815 | ||
| 2816 | VM_BUG_ON(PageTransHuge(page)); | ||
| 2778 | if (mem_cgroup_disabled()) | 2817 | if (mem_cgroup_disabled()) |
| 2779 | return 0; | 2818 | return 0; |
| 2780 | 2819 | ||
| @@ -2824,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
| 2824 | return 0; | 2863 | return 0; |
| 2825 | 2864 | ||
| 2826 | *ptr = mem; | 2865 | *ptr = mem; |
| 2827 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); | 2866 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); |
| 2828 | css_put(&mem->css);/* drop extra refcnt */ | 2867 | css_put(&mem->css);/* drop extra refcnt */ |
| 2829 | if (ret || *ptr == NULL) { | 2868 | if (ret || *ptr == NULL) { |
| 2830 | if (PageAnon(page)) { | 2869 | if (PageAnon(page)) { |
| @@ -2851,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
| 2851 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 2890 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
| 2852 | else | 2891 | else |
| 2853 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 2892 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
| 2854 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2893 | __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); |
| 2855 | return ret; | 2894 | return ret; |
| 2856 | } | 2895 | } |
| 2857 | 2896 | ||
| 2858 | /* remove redundant charge if migration failed*/ | 2897 | /* remove redundant charge if migration failed*/ |
| 2859 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 2898 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
| 2860 | struct page *oldpage, struct page *newpage) | 2899 | struct page *oldpage, struct page *newpage, bool migration_ok) |
| 2861 | { | 2900 | { |
| 2862 | struct page *used, *unused; | 2901 | struct page *used, *unused; |
| 2863 | struct page_cgroup *pc; | 2902 | struct page_cgroup *pc; |
| @@ -2866,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
| 2866 | return; | 2905 | return; |
| 2867 | /* blocks rmdir() */ | 2906 | /* blocks rmdir() */ |
| 2868 | cgroup_exclude_rmdir(&mem->css); | 2907 | cgroup_exclude_rmdir(&mem->css); |
| 2869 | /* at migration success, oldpage->mapping is NULL. */ | 2908 | if (!migration_ok) { |
| 2870 | if (oldpage->mapping) { | ||
| 2871 | used = oldpage; | 2909 | used = oldpage; |
| 2872 | unused = newpage; | 2910 | unused = newpage; |
| 2873 | } else { | 2911 | } else { |
| @@ -4177,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
| 4177 | */ | 4215 | */ |
| 4178 | if (!node_state(node, N_NORMAL_MEMORY)) | 4216 | if (!node_state(node, N_NORMAL_MEMORY)) |
| 4179 | tmp = -1; | 4217 | tmp = -1; |
| 4180 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); | 4218 | pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); |
| 4181 | if (!pn) | 4219 | if (!pn) |
| 4182 | return 1; | 4220 | return 1; |
| 4183 | 4221 | ||
| 4184 | mem->info.nodeinfo[node] = pn; | 4222 | mem->info.nodeinfo[node] = pn; |
| 4185 | memset(pn, 0, sizeof(*pn)); | ||
| 4186 | |||
| 4187 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4223 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
| 4188 | mz = &pn->zoneinfo[zone]; | 4224 | mz = &pn->zoneinfo[zone]; |
| 4189 | for_each_lru(l) | 4225 | for_each_lru(l) |
| @@ -4207,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
| 4207 | 4243 | ||
| 4208 | /* Can be very big if MAX_NUMNODES is very big */ | 4244 | /* Can be very big if MAX_NUMNODES is very big */ |
| 4209 | if (size < PAGE_SIZE) | 4245 | if (size < PAGE_SIZE) |
| 4210 | mem = kmalloc(size, GFP_KERNEL); | 4246 | mem = kzalloc(size, GFP_KERNEL); |
| 4211 | else | 4247 | else |
| 4212 | mem = vmalloc(size); | 4248 | mem = vzalloc(size); |
| 4213 | 4249 | ||
| 4214 | if (!mem) | 4250 | if (!mem) |
| 4215 | return NULL; | 4251 | return NULL; |
| 4216 | 4252 | ||
| 4217 | memset(mem, 0, size); | ||
| 4218 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | 4253 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); |
| 4219 | if (!mem->stat) | 4254 | if (!mem->stat) |
| 4220 | goto out_free; | 4255 | goto out_free; |
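The allocation cleanups above replace kmalloc()+memset() with kzalloc() and vmalloc()+memset() with the new vzalloc(), so the zeroing happens inside the allocator. A rough userspace analogue of the same simplification, with calloc() standing in for the zeroing allocator:

    #include <stdlib.h>
    #include <string.h>

    struct stats { long counters[64]; };

    int main(void)
    {
        /* old style: allocate, then clear by hand */
        struct stats *a = malloc(sizeof(*a));
        if (!a)
            return 1;
        memset(a, 0, sizeof(*a));

        /* new style: one call that hands back zeroed memory, like kzalloc()/vzalloc() */
        struct stats *b = calloc(1, sizeof(*b));
        if (!b) {
            free(a);
            return 1;
        }

        free(a);
        free(b);
        return 0;
    }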
| @@ -4462,7 +4497,8 @@ one_by_one: | |||
| 4462 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 4497 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
| 4463 | cond_resched(); | 4498 | cond_resched(); |
| 4464 | } | 4499 | } |
| 4465 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | 4500 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, |
| 4501 | PAGE_SIZE); | ||
| 4466 | if (ret || !mem) | 4502 | if (ret || !mem) |
| 4467 | /* mem_cgroup_clear_mc() will do uncharge later */ | 4503 | /* mem_cgroup_clear_mc() will do uncharge later */ |
| 4468 | return -ENOMEM; | 4504 | return -ENOMEM; |
| @@ -4624,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
| 4624 | pte_t *pte; | 4660 | pte_t *pte; |
| 4625 | spinlock_t *ptl; | 4661 | spinlock_t *ptl; |
| 4626 | 4662 | ||
| 4663 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
| 4627 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 4664 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| 4628 | for (; addr != end; pte++, addr += PAGE_SIZE) | 4665 | for (; addr != end; pte++, addr += PAGE_SIZE) |
| 4629 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | 4666 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) |
| @@ -4639,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
| 4639 | unsigned long precharge; | 4676 | unsigned long precharge; |
| 4640 | struct vm_area_struct *vma; | 4677 | struct vm_area_struct *vma; |
| 4641 | 4678 | ||
| 4642 | /* We've already held the mmap_sem */ | 4679 | down_read(&mm->mmap_sem); |
| 4643 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4680 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| 4644 | struct mm_walk mem_cgroup_count_precharge_walk = { | 4681 | struct mm_walk mem_cgroup_count_precharge_walk = { |
| 4645 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | 4682 | .pmd_entry = mem_cgroup_count_precharge_pte_range, |
| @@ -4651,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
| 4651 | walk_page_range(vma->vm_start, vma->vm_end, | 4688 | walk_page_range(vma->vm_start, vma->vm_end, |
| 4652 | &mem_cgroup_count_precharge_walk); | 4689 | &mem_cgroup_count_precharge_walk); |
| 4653 | } | 4690 | } |
| 4691 | up_read(&mm->mmap_sem); | ||
| 4654 | 4692 | ||
| 4655 | precharge = mc.precharge; | 4693 | precharge = mc.precharge; |
| 4656 | mc.precharge = 0; | 4694 | mc.precharge = 0; |
| @@ -4660,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
| 4660 | 4698 | ||
| 4661 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | 4699 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) |
| 4662 | { | 4700 | { |
| 4663 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | 4701 | unsigned long precharge = mem_cgroup_count_precharge(mm); |
| 4702 | |||
| 4703 | VM_BUG_ON(mc.moving_task); | ||
| 4704 | mc.moving_task = current; | ||
| 4705 | return mem_cgroup_do_precharge(precharge); | ||
| 4664 | } | 4706 | } |
| 4665 | 4707 | ||
| 4666 | static void mem_cgroup_clear_mc(void) | 4708 | /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ |
| 4709 | static void __mem_cgroup_clear_mc(void) | ||
| 4667 | { | 4710 | { |
| 4668 | struct mem_cgroup *from = mc.from; | 4711 | struct mem_cgroup *from = mc.from; |
| 4669 | struct mem_cgroup *to = mc.to; | 4712 | struct mem_cgroup *to = mc.to; |
| @@ -4698,23 +4741,28 @@ static void mem_cgroup_clear_mc(void) | |||
| 4698 | PAGE_SIZE * mc.moved_swap); | 4741 | PAGE_SIZE * mc.moved_swap); |
| 4699 | } | 4742 | } |
| 4700 | /* we've already done mem_cgroup_get(mc.to) */ | 4743 | /* we've already done mem_cgroup_get(mc.to) */ |
| 4701 | |||
| 4702 | mc.moved_swap = 0; | 4744 | mc.moved_swap = 0; |
| 4703 | } | 4745 | } |
| 4704 | if (mc.mm) { | 4746 | memcg_oom_recover(from); |
| 4705 | up_read(&mc.mm->mmap_sem); | 4747 | memcg_oom_recover(to); |
| 4706 | mmput(mc.mm); | 4748 | wake_up_all(&mc.waitq); |
| 4707 | } | 4749 | } |
| 4750 | |||
| 4751 | static void mem_cgroup_clear_mc(void) | ||
| 4752 | { | ||
| 4753 | struct mem_cgroup *from = mc.from; | ||
| 4754 | |||
| 4755 | /* | ||
| 4756 | * we must clear moving_task before waking up waiters at the end of | ||
| 4757 | * task migration. | ||
| 4758 | */ | ||
| 4759 | mc.moving_task = NULL; | ||
| 4760 | __mem_cgroup_clear_mc(); | ||
| 4708 | spin_lock(&mc.lock); | 4761 | spin_lock(&mc.lock); |
| 4709 | mc.from = NULL; | 4762 | mc.from = NULL; |
| 4710 | mc.to = NULL; | 4763 | mc.to = NULL; |
| 4711 | spin_unlock(&mc.lock); | 4764 | spin_unlock(&mc.lock); |
| 4712 | mc.moving_task = NULL; | ||
| 4713 | mc.mm = NULL; | ||
| 4714 | mem_cgroup_end_move(from); | 4765 | mem_cgroup_end_move(from); |
| 4715 | memcg_oom_recover(from); | ||
| 4716 | memcg_oom_recover(to); | ||
| 4717 | wake_up_all(&mc.waitq); | ||
| 4718 | } | 4766 | } |
| 4719 | 4767 | ||
| 4720 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 4768 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
| @@ -4736,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
| 4736 | return 0; | 4784 | return 0; |
| 4737 | /* We move charges only when we move a owner of the mm */ | 4785 | /* We move charges only when we move a owner of the mm */ |
| 4738 | if (mm->owner == p) { | 4786 | if (mm->owner == p) { |
| 4739 | /* | ||
| 4740 | * We do all the move charge works under one mmap_sem to | ||
| 4741 | * avoid deadlock with down_write(&mmap_sem) | ||
| 4742 | * -> try_charge() -> if (mc.moving_task) -> sleep. | ||
| 4743 | */ | ||
| 4744 | down_read(&mm->mmap_sem); | ||
| 4745 | |||
| 4746 | VM_BUG_ON(mc.from); | 4787 | VM_BUG_ON(mc.from); |
| 4747 | VM_BUG_ON(mc.to); | 4788 | VM_BUG_ON(mc.to); |
| 4748 | VM_BUG_ON(mc.precharge); | 4789 | VM_BUG_ON(mc.precharge); |
| 4749 | VM_BUG_ON(mc.moved_charge); | 4790 | VM_BUG_ON(mc.moved_charge); |
| 4750 | VM_BUG_ON(mc.moved_swap); | 4791 | VM_BUG_ON(mc.moved_swap); |
| 4751 | VM_BUG_ON(mc.moving_task); | ||
| 4752 | VM_BUG_ON(mc.mm); | ||
| 4753 | |||
| 4754 | mem_cgroup_start_move(from); | 4792 | mem_cgroup_start_move(from); |
| 4755 | spin_lock(&mc.lock); | 4793 | spin_lock(&mc.lock); |
| 4756 | mc.from = from; | 4794 | mc.from = from; |
| 4757 | mc.to = mem; | 4795 | mc.to = mem; |
| 4758 | mc.precharge = 0; | ||
| 4759 | mc.moved_charge = 0; | ||
| 4760 | mc.moved_swap = 0; | ||
| 4761 | spin_unlock(&mc.lock); | 4796 | spin_unlock(&mc.lock); |
| 4762 | mc.moving_task = current; | 4797 | /* We set mc.moving_task later */ |
| 4763 | mc.mm = mm; | ||
| 4764 | 4798 | ||
| 4765 | ret = mem_cgroup_precharge_mc(mm); | 4799 | ret = mem_cgroup_precharge_mc(mm); |
| 4766 | if (ret) | 4800 | if (ret) |
| 4767 | mem_cgroup_clear_mc(); | 4801 | mem_cgroup_clear_mc(); |
| 4768 | /* We call up_read() and mmput() in clear_mc(). */ | 4802 | } |
| 4769 | } else | 4803 | mmput(mm); |
| 4770 | mmput(mm); | ||
| 4771 | } | 4804 | } |
| 4772 | return ret; | 4805 | return ret; |
| 4773 | } | 4806 | } |
| @@ -4790,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
| 4790 | spinlock_t *ptl; | 4823 | spinlock_t *ptl; |
| 4791 | 4824 | ||
| 4792 | retry: | 4825 | retry: |
| 4826 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
| 4793 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 4827 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| 4794 | for (; addr != end; addr += PAGE_SIZE) { | 4828 | for (; addr != end; addr += PAGE_SIZE) { |
| 4795 | pte_t ptent = *(pte++); | 4829 | pte_t ptent = *(pte++); |
| @@ -4855,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
| 4855 | struct vm_area_struct *vma; | 4889 | struct vm_area_struct *vma; |
| 4856 | 4890 | ||
| 4857 | lru_add_drain_all(); | 4891 | lru_add_drain_all(); |
| 4858 | /* We've already held the mmap_sem */ | 4892 | retry: |
| 4893 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | ||
| 4894 | /* | ||
| 4895 | * Someone who is holding the mmap_sem might be waiting in | ||
| 4896 | * waitq. So we cancel all extra charges, wake up all waiters, | ||
| 4897 | * and retry. Because we cancel precharges, we might not be able | ||
| 4898 | * to move enough charges, but moving charge is a best-effort | ||
| 4899 | * feature anyway, so it wouldn't be a big problem. | ||
| 4900 | */ | ||
| 4901 | __mem_cgroup_clear_mc(); | ||
| 4902 | cond_resched(); | ||
| 4903 | goto retry; | ||
| 4904 | } | ||
| 4859 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4905 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| 4860 | int ret; | 4906 | int ret; |
| 4861 | struct mm_walk mem_cgroup_move_charge_walk = { | 4907 | struct mm_walk mem_cgroup_move_charge_walk = { |
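mem_cgroup_move_charge() now takes mmap_sem with down_read_trylock() and, on failure, cancels its precharges and wakes any waiters before retrying, because a holder of mmap_sem may itself be asleep on the charge waitqueue. A hedged userspace model of that retry loop, with pthread primitives standing in for the kernel's rwsem and cond_resched() (the helper name cancel_precharges_and_wake() is an invention for this sketch):

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

    /* stand-in for __mem_cgroup_clear_mc(): drop precharges, wake waiters */
    static void cancel_precharges_and_wake(void)
    {
        /* nothing to do in this toy model */
    }

    static void move_charge(void)
    {
    retry:
        if (pthread_rwlock_tryrdlock(&mmap_sem) != 0) {
            /*
             * A holder of the lock may itself be blocked on our waitqueue;
             * cancel the extra charges, wake everyone up, and try again.
             * Moving charges is best effort, so losing precharges is fine.
             */
            cancel_precharges_and_wake();
            sched_yield();          /* loosely plays the role of cond_resched() */
            goto retry;
        }
        /* ... walk the address space and move charges ... */
        pthread_rwlock_unlock(&mmap_sem);
    }

    int main(void)
    {
        move_charge();
        puts("done");
        return 0;
    }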
| @@ -4874,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
| 4874 | */ | 4920 | */ |
| 4875 | break; | 4921 | break; |
| 4876 | } | 4922 | } |
| 4923 | up_read(&mm->mmap_sem); | ||
| 4877 | } | 4924 | } |
| 4878 | 4925 | ||
| 4879 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 4926 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
| @@ -4882,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
| 4882 | struct task_struct *p, | 4929 | struct task_struct *p, |
| 4883 | bool threadgroup) | 4930 | bool threadgroup) |
| 4884 | { | 4931 | { |
| 4885 | if (!mc.mm) | 4932 | struct mm_struct *mm; |
| 4933 | |||
| 4934 | if (!mc.to) | ||
| 4886 | /* no need to move charge */ | 4935 | /* no need to move charge */ |
| 4887 | return; | 4936 | return; |
| 4888 | 4937 | ||
| 4889 | mem_cgroup_move_charge(mc.mm); | 4938 | mm = get_task_mm(p); |
| 4939 | if (mm) { | ||
| 4940 | mem_cgroup_move_charge(mm); | ||
| 4941 | mmput(mm); | ||
| 4942 | } | ||
| 4890 | mem_cgroup_clear_mc(); | 4943 | mem_cgroup_clear_mc(); |
| 4891 | } | 4944 | } |
| 4892 | #else /* !CONFIG_MMU */ | 4945 | #else /* !CONFIG_MMU */ |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 124324134ff6..548fbd70f026 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #include <linux/slab.h> | 51 | #include <linux/slab.h> |
| 52 | #include <linux/swapops.h> | 52 | #include <linux/swapops.h> |
| 53 | #include <linux/hugetlb.h> | 53 | #include <linux/hugetlb.h> |
| 54 | #include <linux/memory_hotplug.h> | ||
| 54 | #include "internal.h" | 55 | #include "internal.h" |
| 55 | 56 | ||
| 56 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 57 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
| @@ -202,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
| 202 | #ifdef __ARCH_SI_TRAPNO | 203 | #ifdef __ARCH_SI_TRAPNO |
| 203 | si.si_trapno = trapno; | 204 | si.si_trapno = trapno; |
| 204 | #endif | 205 | #endif |
| 205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; | 206 | si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; |
| 206 | /* | 207 | /* |
| 207 | * Don't use force here, it's convenient if the signal | 208 | * Don't use force here, it's convenient if the signal |
| 208 | * can be temporarily blocked. | 209 | * can be temporarily blocked. |
| @@ -385,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
| 385 | struct task_struct *tsk; | 386 | struct task_struct *tsk; |
| 386 | struct anon_vma *av; | 387 | struct anon_vma *av; |
| 387 | 388 | ||
| 389 | if (!PageHuge(page) && unlikely(split_huge_page(page))) | ||
| 390 | return; | ||
| 388 | read_lock(&tasklist_lock); | 391 | read_lock(&tasklist_lock); |
| 389 | av = page_lock_anon_vma(page); | 392 | av = page_lock_anon_vma(page); |
| 390 | if (av == NULL) /* Not actually mapped anymore */ | 393 | if (av == NULL) /* Not actually mapped anymore */ |
| @@ -927,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 927 | static void set_page_hwpoison_huge_page(struct page *hpage) | 930 | static void set_page_hwpoison_huge_page(struct page *hpage) |
| 928 | { | 931 | { |
| 929 | int i; | 932 | int i; |
| 930 | int nr_pages = 1 << compound_order(hpage); | 933 | int nr_pages = 1 << compound_trans_order(hpage); |
| 931 | for (i = 0; i < nr_pages; i++) | 934 | for (i = 0; i < nr_pages; i++) |
| 932 | SetPageHWPoison(hpage + i); | 935 | SetPageHWPoison(hpage + i); |
| 933 | } | 936 | } |
| @@ -935,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) | |||
| 935 | static void clear_page_hwpoison_huge_page(struct page *hpage) | 938 | static void clear_page_hwpoison_huge_page(struct page *hpage) |
| 936 | { | 939 | { |
| 937 | int i; | 940 | int i; |
| 938 | int nr_pages = 1 << compound_order(hpage); | 941 | int nr_pages = 1 << compound_trans_order(hpage); |
| 939 | for (i = 0; i < nr_pages; i++) | 942 | for (i = 0; i < nr_pages; i++) |
| 940 | ClearPageHWPoison(hpage + i); | 943 | ClearPageHWPoison(hpage + i); |
| 941 | } | 944 | } |
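Hwpoison bookkeeping now derives the page count from compound_trans_order(), i.e. nr_pages = 1 << order, and walks every base page of the compound page. A small standalone sketch of that loop (the array size, order 9 and the flag encoding are illustrative assumptions):

    #include <stdio.h>

    #define MAX_PAGES (1 << 9)      /* room for one assumed 2 MiB compound page */

    static unsigned char hwpoison[MAX_PAGES];

    /* mirrors set/clear_page_hwpoison_huge_page(): tag every base page */
    static void set_compound_hwpoison(unsigned int order, unsigned char val)
    {
        unsigned long nr_pages = 1UL << order;   /* like 1 << compound_trans_order(hpage) */
        unsigned long i;

        for (i = 0; i < nr_pages; i++)
            hwpoison[i] = val;
    }

    int main(void)
    {
        set_compound_hwpoison(9, 1);
        printf("page 0 poisoned=%u, page 511 poisoned=%u\n", hwpoison[0], hwpoison[511]);
        set_compound_hwpoison(9, 0);
        return 0;
    }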
| @@ -965,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 965 | return 0; | 968 | return 0; |
| 966 | } | 969 | } |
| 967 | 970 | ||
| 968 | nr_pages = 1 << compound_order(hpage); | 971 | nr_pages = 1 << compound_trans_order(hpage); |
| 969 | atomic_long_add(nr_pages, &mce_bad_pages); | 972 | atomic_long_add(nr_pages, &mce_bad_pages); |
| 970 | 973 | ||
| 971 | /* | 974 | /* |
| @@ -1163,7 +1166,7 @@ int unpoison_memory(unsigned long pfn) | |||
| 1163 | return 0; | 1166 | return 0; |
| 1164 | } | 1167 | } |
| 1165 | 1168 | ||
| 1166 | nr_pages = 1 << compound_order(page); | 1169 | nr_pages = 1 << compound_trans_order(page); |
| 1167 | 1170 | ||
| 1168 | if (!get_page_unless_zero(page)) { | 1171 | if (!get_page_unless_zero(page)) { |
| 1169 | /* | 1172 | /* |
| @@ -1230,11 +1233,10 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
| 1230 | return 1; | 1233 | return 1; |
| 1231 | 1234 | ||
| 1232 | /* | 1235 | /* |
| 1233 | * The lock_system_sleep prevents a race with memory hotplug, | 1236 | * The lock_memory_hotplug prevents a race with memory hotplug. |
| 1234 | * because the isolation assumes there's only a single user. | ||
| 1235 | * This is a big hammer, a better approach would be nicer. | 1237 | * This is a big hammer, a better approach would be nicer. |
| 1236 | */ | 1238 | */ |
| 1237 | lock_system_sleep(); | 1239 | lock_memory_hotplug(); |
| 1238 | 1240 | ||
| 1239 | /* | 1241 | /* |
| 1240 | * Isolate the page, so that it doesn't get reallocated if it | 1242 | * Isolate the page, so that it doesn't get reallocated if it |
| @@ -1264,7 +1266,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
| 1264 | ret = 1; | 1266 | ret = 1; |
| 1265 | } | 1267 | } |
| 1266 | unset_migratetype_isolate(p); | 1268 | unset_migratetype_isolate(p); |
| 1267 | unlock_system_sleep(); | 1269 | unlock_memory_hotplug(); |
| 1268 | return ret; | 1270 | return ret; |
| 1269 | } | 1271 | } |
| 1270 | 1272 | ||
| @@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
| 1290 | /* Keep page count to indicate a given hugepage is isolated. */ | 1292 | /* Keep page count to indicate a given hugepage is isolated. */ |
| 1291 | 1293 | ||
| 1292 | list_add(&hpage->lru, &pagelist); | 1294 | list_add(&hpage->lru, &pagelist); |
| 1293 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | 1295 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, |
| 1296 | true); | ||
| 1294 | if (ret) { | 1297 | if (ret) { |
| 1295 | putback_lru_pages(&pagelist); | 1298 | putback_lru_pages(&pagelist); |
| 1296 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | 1299 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", |
| 1297 | pfn, ret, page->flags); | 1300 | pfn, ret, page->flags); |
| 1298 | if (ret > 0) | 1301 | if (ret > 0) |
| @@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
| 1301 | } | 1304 | } |
| 1302 | done: | 1305 | done: |
| 1303 | if (!PageHWPoison(hpage)) | 1306 | if (!PageHWPoison(hpage)) |
| 1304 | atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); | 1307 | atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); |
| 1305 | set_page_hwpoison_huge_page(hpage); | 1308 | set_page_hwpoison_huge_page(hpage); |
| 1306 | dequeue_hwpoisoned_huge_page(hpage); | 1309 | dequeue_hwpoisoned_huge_page(hpage); |
| 1307 | /* keep elevated page count for bad page */ | 1310 | /* keep elevated page count for bad page */ |
| @@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1413 | LIST_HEAD(pagelist); | 1416 | LIST_HEAD(pagelist); |
| 1414 | 1417 | ||
| 1415 | list_add(&page->lru, &pagelist); | 1418 | list_add(&page->lru, &pagelist); |
| 1416 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | 1419 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
| 1420 | 0, true); | ||
| 1417 | if (ret) { | 1421 | if (ret) { |
| 1418 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1422 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
| 1419 | pfn, ret, page->flags); | 1423 | pfn, ret, page->flags); |
diff --git a/mm/memory.c b/mm/memory.c index 02e48aa0ed13..31250faff390 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 394 | } | 394 | } |
| 395 | } | 395 | } |
| 396 | 396 | ||
| 397 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | 397 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, |
| 398 | pmd_t *pmd, unsigned long address) | ||
| 398 | { | 399 | { |
| 399 | pgtable_t new = pte_alloc_one(mm, address); | 400 | pgtable_t new = pte_alloc_one(mm, address); |
| 401 | int wait_split_huge_page; | ||
| 400 | if (!new) | 402 | if (!new) |
| 401 | return -ENOMEM; | 403 | return -ENOMEM; |
| 402 | 404 | ||
| @@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | |||
| 416 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ | 418 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ |
| 417 | 419 | ||
| 418 | spin_lock(&mm->page_table_lock); | 420 | spin_lock(&mm->page_table_lock); |
| 419 | if (!pmd_present(*pmd)) { /* Has another populated it ? */ | 421 | wait_split_huge_page = 0; |
| 422 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ | ||
| 420 | mm->nr_ptes++; | 423 | mm->nr_ptes++; |
| 421 | pmd_populate(mm, pmd, new); | 424 | pmd_populate(mm, pmd, new); |
| 422 | new = NULL; | 425 | new = NULL; |
| 423 | } | 426 | } else if (unlikely(pmd_trans_splitting(*pmd))) |
| 427 | wait_split_huge_page = 1; | ||
| 424 | spin_unlock(&mm->page_table_lock); | 428 | spin_unlock(&mm->page_table_lock); |
| 425 | if (new) | 429 | if (new) |
| 426 | pte_free(mm, new); | 430 | pte_free(mm, new); |
| 431 | if (wait_split_huge_page) | ||
| 432 | wait_split_huge_page(vma->anon_vma, pmd); | ||
| 427 | return 0; | 433 | return 0; |
| 428 | } | 434 | } |
| 429 | 435 | ||
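__pte_alloc() keeps the usual pattern of allocating the page table outside the lock, re-checking the pmd under page_table_lock, and discarding the new table if somebody else populated the slot first; the new twist is that a pmd caught mid-split makes the caller wait outside the lock. A userspace sketch of the allocate/recheck/install half of that pattern (malloc and a pthread mutex stand in for pte_alloc_one() and page_table_lock):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;
    static int *slot;                       /* plays the role of the pmd entry */

    static int pte_alloc_like(void)
    {
        int *new = malloc(sizeof(*new));    /* allocate before taking the lock */
        if (!new)
            return -1;
        *new = 0;

        pthread_mutex_lock(&page_table_lock);
        if (slot == NULL) {                 /* "pmd_none": nobody populated it yet */
            slot = new;
            new = NULL;
        }
        /* the real code would also notice a pmd in the middle of a split here */
        pthread_mutex_unlock(&page_table_lock);

        free(new);                          /* lost the race: discard our copy */
        return 0;
    }

    int main(void)
    {
        if (pte_alloc_like() == 0)
            printf("slot installed: %p\n", (void *)slot);
        free(slot);
        return 0;
    }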
| @@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
| 436 | smp_wmb(); /* See comment in __pte_alloc */ | 442 | smp_wmb(); /* See comment in __pte_alloc */ |
| 437 | 443 | ||
| 438 | spin_lock(&init_mm.page_table_lock); | 444 | spin_lock(&init_mm.page_table_lock); |
| 439 | if (!pmd_present(*pmd)) { /* Has another populated it ? */ | 445 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ |
| 440 | pmd_populate_kernel(&init_mm, pmd, new); | 446 | pmd_populate_kernel(&init_mm, pmd, new); |
| 441 | new = NULL; | 447 | new = NULL; |
| 442 | } | 448 | } else |
| 449 | VM_BUG_ON(pmd_trans_splitting(*pmd)); | ||
| 443 | spin_unlock(&init_mm.page_table_lock); | 450 | spin_unlock(&init_mm.page_table_lock); |
| 444 | if (new) | 451 | if (new) |
| 445 | pte_free_kernel(&init_mm, new); | 452 | pte_free_kernel(&init_mm, new); |
| @@ -719,9 +726,9 @@ out_set_pte: | |||
| 719 | return 0; | 726 | return 0; |
| 720 | } | 727 | } |
| 721 | 728 | ||
| 722 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 729 | int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
| 723 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 730 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
| 724 | unsigned long addr, unsigned long end) | 731 | unsigned long addr, unsigned long end) |
| 725 | { | 732 | { |
| 726 | pte_t *orig_src_pte, *orig_dst_pte; | 733 | pte_t *orig_src_pte, *orig_dst_pte; |
| 727 | pte_t *src_pte, *dst_pte; | 734 | pte_t *src_pte, *dst_pte; |
| @@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src | |||
| 795 | src_pmd = pmd_offset(src_pud, addr); | 802 | src_pmd = pmd_offset(src_pud, addr); |
| 796 | do { | 803 | do { |
| 797 | next = pmd_addr_end(addr, end); | 804 | next = pmd_addr_end(addr, end); |
| 805 | if (pmd_trans_huge(*src_pmd)) { | ||
| 806 | int err; | ||
| 807 | VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); | ||
| 808 | err = copy_huge_pmd(dst_mm, src_mm, | ||
| 809 | dst_pmd, src_pmd, addr, vma); | ||
| 810 | if (err == -ENOMEM) | ||
| 811 | return -ENOMEM; | ||
| 812 | if (!err) | ||
| 813 | continue; | ||
| 814 | /* fall through */ | ||
| 815 | } | ||
| 798 | if (pmd_none_or_clear_bad(src_pmd)) | 816 | if (pmd_none_or_clear_bad(src_pmd)) |
| 799 | continue; | 817 | continue; |
| 800 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, | 818 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, |
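copy_pmd_range() treats the result of copy_huge_pmd() as three-way: -ENOMEM aborts the copy, 0 means the range was handled as a single huge entry, and any other value falls through to the per-pte loop. A toy illustration of that tri-state contract (copy_huge_entry() and its decision rule are made up for this sketch):

    #include <errno.h>
    #include <stdio.h>

    /* hypothetical helper mirroring the assumed copy_huge_pmd() contract */
    static int copy_huge_entry(int i)
    {
        if (i == 3)
            return -ENOMEM;   /* abort the whole copy */
        if (i % 2 == 0)
            return 0;         /* handled as one huge entry, skip the per-page loop */
        return 1;             /* not huge any more: fall through to the slow path */
    }

    int main(void)
    {
        int i;

        for (i = 0; i < 5; i++) {
            int err = copy_huge_entry(i);
            if (err == -ENOMEM) {
                puts("out of memory, abort");
                return 1;
            }
            if (!err)
                continue;              /* huge copy done for this range */
            printf("falling back to per-page copy for %d\n", i);
        }
        return 0;
    }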
| @@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
| 997 | pmd = pmd_offset(pud, addr); | 1015 | pmd = pmd_offset(pud, addr); |
| 998 | do { | 1016 | do { |
| 999 | next = pmd_addr_end(addr, end); | 1017 | next = pmd_addr_end(addr, end); |
| 1018 | if (pmd_trans_huge(*pmd)) { | ||
| 1019 | if (next-addr != HPAGE_PMD_SIZE) { | ||
| 1020 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | ||
| 1021 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
| 1022 | } else if (zap_huge_pmd(tlb, vma, pmd)) { | ||
| 1023 | (*zap_work)--; | ||
| 1024 | continue; | ||
| 1025 | } | ||
| 1026 | /* fall through */ | ||
| 1027 | } | ||
| 1000 | if (pmd_none_or_clear_bad(pmd)) { | 1028 | if (pmd_none_or_clear_bad(pmd)) { |
| 1001 | (*zap_work)--; | 1029 | (*zap_work)--; |
| 1002 | continue; | 1030 | continue; |
| @@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
| 1262 | pud = pud_offset(pgd, address); | 1290 | pud = pud_offset(pgd, address); |
| 1263 | if (pud_none(*pud)) | 1291 | if (pud_none(*pud)) |
| 1264 | goto no_page_table; | 1292 | goto no_page_table; |
| 1265 | if (pud_huge(*pud)) { | 1293 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
| 1266 | BUG_ON(flags & FOLL_GET); | 1294 | BUG_ON(flags & FOLL_GET); |
| 1267 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 1295 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); |
| 1268 | goto out; | 1296 | goto out; |
| @@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
| 1273 | pmd = pmd_offset(pud, address); | 1301 | pmd = pmd_offset(pud, address); |
| 1274 | if (pmd_none(*pmd)) | 1302 | if (pmd_none(*pmd)) |
| 1275 | goto no_page_table; | 1303 | goto no_page_table; |
| 1276 | if (pmd_huge(*pmd)) { | 1304 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
| 1277 | BUG_ON(flags & FOLL_GET); | 1305 | BUG_ON(flags & FOLL_GET); |
| 1278 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1306 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
| 1279 | goto out; | 1307 | goto out; |
| 1280 | } | 1308 | } |
| 1309 | if (pmd_trans_huge(*pmd)) { | ||
| 1310 | if (flags & FOLL_SPLIT) { | ||
| 1311 | split_huge_page_pmd(mm, pmd); | ||
| 1312 | goto split_fallthrough; | ||
| 1313 | } | ||
| 1314 | spin_lock(&mm->page_table_lock); | ||
| 1315 | if (likely(pmd_trans_huge(*pmd))) { | ||
| 1316 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
| 1317 | spin_unlock(&mm->page_table_lock); | ||
| 1318 | wait_split_huge_page(vma->anon_vma, pmd); | ||
| 1319 | } else { | ||
| 1320 | page = follow_trans_huge_pmd(mm, address, | ||
| 1321 | pmd, flags); | ||
| 1322 | spin_unlock(&mm->page_table_lock); | ||
| 1323 | goto out; | ||
| 1324 | } | ||
| 1325 | } else | ||
| 1326 | spin_unlock(&mm->page_table_lock); | ||
| 1327 | /* fall through */ | ||
| 1328 | } | ||
| 1329 | split_fallthrough: | ||
| 1281 | if (unlikely(pmd_bad(*pmd))) | 1330 | if (unlikely(pmd_bad(*pmd))) |
| 1282 | goto no_page_table; | 1331 | goto no_page_table; |
| 1283 | 1332 | ||
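In follow_page() the huge-pmd case is re-checked under page_table_lock: a pmd that is currently being split makes the caller drop the lock and wait, anything else is handled while the lock is held. A loose userspace model of that check-under-lock-or-wait logic (a condition variable plays the role of the splitting waitqueue):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  split_done      = PTHREAD_COND_INITIALIZER;

    enum pmd_state { PMD_NORMAL, PMD_HUGE, PMD_SPLITTING };
    static enum pmd_state pmd = PMD_HUGE;

    /* recheck under the lock, wait if the entry is mid-split, else handle it */
    static void follow(void)
    {
        pthread_mutex_lock(&page_table_lock);
        if (pmd == PMD_SPLITTING) {
            while (pmd == PMD_SPLITTING)
                pthread_cond_wait(&split_done, &page_table_lock);
            pthread_mutex_unlock(&page_table_lock);
            puts("split finished, retry as a normal entry");
            return;
        }
        if (pmd == PMD_HUGE)
            puts("handled as a huge entry under the lock");
        pthread_mutex_unlock(&page_table_lock);
    }

    int main(void)
    {
        follow();
        return 0;
    }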
| @@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
| 1310 | */ | 1359 | */ |
| 1311 | mark_page_accessed(page); | 1360 | mark_page_accessed(page); |
| 1312 | } | 1361 | } |
| 1362 | if (flags & FOLL_MLOCK) { | ||
| 1363 | /* | ||
| 1364 | * The preliminary mapping check is mainly to avoid the | ||
| 1365 | * pointless overhead of lock_page on the ZERO_PAGE | ||
| 1366 | * which might bounce very badly if there is contention. | ||
| 1367 | * | ||
| 1368 | * If the page is already locked, we don't need to | ||
| 1369 | * handle it now - vmscan will handle it later if and | ||
| 1370 | * when it attempts to reclaim the page. | ||
| 1371 | */ | ||
| 1372 | if (page->mapping && trylock_page(page)) { | ||
| 1373 | lru_add_drain(); /* push cached pages to LRU */ | ||
| 1374 | /* | ||
| 1375 | * Because we lock page here and migration is | ||
| 1376 | * blocked by the pte's page reference, we need | ||
| 1377 | * only check for file-cache page truncation. | ||
| 1378 | */ | ||
| 1379 | if (page->mapping) | ||
| 1380 | mlock_vma_page(page); | ||
| 1381 | unlock_page(page); | ||
| 1382 | } | ||
| 1383 | } | ||
| 1313 | unlock: | 1384 | unlock: |
| 1314 | pte_unmap_unlock(ptep, ptl); | 1385 | pte_unmap_unlock(ptep, ptl); |
| 1315 | out: | 1386 | out: |
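The new FOLL_MLOCK handling is deliberately best-effort: the page is only mlocked if trylock_page() succeeds and the mapping is still there once the lock is held, otherwise it is simply skipped and left for vmscan. A userspace sketch of that trylock guard (the fake_page structure is of course an invention):

    #include <pthread.h>
    #include <stdio.h>

    struct fake_page {
        pthread_mutex_t lock;
        void *mapping;        /* NULL once the page has been truncated */
        int mlocked;
    };

    /* best-effort: skip the page rather than block on its lock */
    static void maybe_mlock(struct fake_page *page)
    {
        if (page->mapping && pthread_mutex_trylock(&page->lock) == 0) {
            if (page->mapping)        /* re-check after taking the lock */
                page->mlocked = 1;
            pthread_mutex_unlock(&page->lock);
        }
    }

    int main(void)
    {
        static struct fake_page p = { PTHREAD_MUTEX_INITIALIZER, NULL, 0 };

        p.mapping = &p;               /* pretend the page is still mapped */
        maybe_mlock(&p);
        printf("mlocked=%d\n", p.mlocked);
        return 0;
    }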
| @@ -1341,7 +1412,8 @@ no_page_table: | |||
| 1341 | 1412 | ||
| 1342 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1413 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 1343 | unsigned long start, int nr_pages, unsigned int gup_flags, | 1414 | unsigned long start, int nr_pages, unsigned int gup_flags, |
| 1344 | struct page **pages, struct vm_area_struct **vmas) | 1415 | struct page **pages, struct vm_area_struct **vmas, |
| 1416 | int *nonblocking) | ||
| 1345 | { | 1417 | { |
| 1346 | int i; | 1418 | int i; |
| 1347 | unsigned long vm_flags; | 1419 | unsigned long vm_flags; |
| @@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1386 | pmd = pmd_offset(pud, pg); | 1458 | pmd = pmd_offset(pud, pg); |
| 1387 | if (pmd_none(*pmd)) | 1459 | if (pmd_none(*pmd)) |
| 1388 | return i ? : -EFAULT; | 1460 | return i ? : -EFAULT; |
| 1461 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
| 1389 | pte = pte_offset_map(pmd, pg); | 1462 | pte = pte_offset_map(pmd, pg); |
| 1390 | if (pte_none(*pte)) { | 1463 | if (pte_none(*pte)) { |
| 1391 | pte_unmap(pte); | 1464 | pte_unmap(pte); |
| @@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1441 | cond_resched(); | 1514 | cond_resched(); |
| 1442 | while (!(page = follow_page(vma, start, foll_flags))) { | 1515 | while (!(page = follow_page(vma, start, foll_flags))) { |
| 1443 | int ret; | 1516 | int ret; |
| 1517 | unsigned int fault_flags = 0; | ||
| 1518 | |||
| 1519 | if (foll_flags & FOLL_WRITE) | ||
| 1520 | fault_flags |= FAULT_FLAG_WRITE; | ||
| 1521 | if (nonblocking) | ||
| 1522 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
| 1444 | 1523 | ||
| 1445 | ret = handle_mm_fault(mm, vma, start, | 1524 | ret = handle_mm_fault(mm, vma, start, |
| 1446 | (foll_flags & FOLL_WRITE) ? | 1525 | fault_flags); |
| 1447 | FAULT_FLAG_WRITE : 0); | ||
| 1448 | 1526 | ||
| 1449 | if (ret & VM_FAULT_ERROR) { | 1527 | if (ret & VM_FAULT_ERROR) { |
| 1450 | if (ret & VM_FAULT_OOM) | 1528 | if (ret & VM_FAULT_OOM) |
| @@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1460 | else | 1538 | else |
| 1461 | tsk->min_flt++; | 1539 | tsk->min_flt++; |
| 1462 | 1540 | ||
| 1541 | if (ret & VM_FAULT_RETRY) { | ||
| 1542 | *nonblocking = 0; | ||
| 1543 | return i; | ||
| 1544 | } | ||
| 1545 | |||
| 1463 | /* | 1546 | /* |
| 1464 | * The VM_FAULT_WRITE bit tells us that | 1547 | * The VM_FAULT_WRITE bit tells us that |
| 1465 | * do_wp_page has broken COW when necessary, | 1548 | * do_wp_page has broken COW when necessary, |
| @@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1559 | if (force) | 1642 | if (force) |
| 1560 | flags |= FOLL_FORCE; | 1643 | flags |= FOLL_FORCE; |
| 1561 | 1644 | ||
| 1562 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); | 1645 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, |
| 1646 | NULL); | ||
| 1563 | } | 1647 | } |
| 1564 | EXPORT_SYMBOL(get_user_pages); | 1648 | EXPORT_SYMBOL(get_user_pages); |
| 1565 | 1649 | ||
| @@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr) | |||
| 1584 | struct page *page; | 1668 | struct page *page; |
| 1585 | 1669 | ||
| 1586 | if (__get_user_pages(current, current->mm, addr, 1, | 1670 | if (__get_user_pages(current, current->mm, addr, 1, |
| 1587 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) | 1671 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, |
| 1672 | NULL) < 1) | ||
| 1588 | return NULL; | 1673 | return NULL; |
| 1589 | flush_cache_page(vma, addr, page_to_pfn(page)); | 1674 | flush_cache_page(vma, addr, page_to_pfn(page)); |
| 1590 | return page; | 1675 | return page; |
| @@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, | |||
| 1598 | pud_t * pud = pud_alloc(mm, pgd, addr); | 1683 | pud_t * pud = pud_alloc(mm, pgd, addr); |
| 1599 | if (pud) { | 1684 | if (pud) { |
| 1600 | pmd_t * pmd = pmd_alloc(mm, pud, addr); | 1685 | pmd_t * pmd = pmd_alloc(mm, pud, addr); |
| 1601 | if (pmd) | 1686 | if (pmd) { |
| 1687 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
| 1602 | return pte_alloc_map_lock(mm, pmd, addr, ptl); | 1688 | return pte_alloc_map_lock(mm, pmd, addr, ptl); |
| 1689 | } | ||
| 1603 | } | 1690 | } |
| 1604 | return NULL; | 1691 | return NULL; |
| 1605 | } | 1692 | } |
| @@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
| 1818 | pmd = pmd_alloc(mm, pud, addr); | 1905 | pmd = pmd_alloc(mm, pud, addr); |
| 1819 | if (!pmd) | 1906 | if (!pmd) |
| 1820 | return -ENOMEM; | 1907 | return -ENOMEM; |
| 1908 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
| 1821 | do { | 1909 | do { |
| 1822 | next = pmd_addr_end(addr, end); | 1910 | next = pmd_addr_end(addr, end); |
| 1823 | if (remap_pte_range(mm, pmd, addr, next, | 1911 | if (remap_pte_range(mm, pmd, addr, next, |
| @@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | |||
| 2048 | return same; | 2136 | return same; |
| 2049 | } | 2137 | } |
| 2050 | 2138 | ||
| 2051 | /* | ||
| 2052 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when | ||
| 2053 | * servicing faults for write access. In the normal case, do always want | ||
| 2054 | * pte_mkwrite. But get_user_pages can cause write faults for mappings | ||
| 2055 | * that do not have writing enabled, when used by access_process_vm. | ||
| 2056 | */ | ||
| 2057 | static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | ||
| 2058 | { | ||
| 2059 | if (likely(vma->vm_flags & VM_WRITE)) | ||
| 2060 | pte = pte_mkwrite(pte); | ||
| 2061 | return pte; | ||
| 2062 | } | ||
| 2063 | |||
| 2064 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 2139 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
| 2065 | { | 2140 | { |
| 2066 | /* | 2141 | /* |
| @@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2112 | { | 2187 | { |
| 2113 | struct page *old_page, *new_page; | 2188 | struct page *old_page, *new_page; |
| 2114 | pte_t entry; | 2189 | pte_t entry; |
| 2115 | int reuse = 0, ret = 0; | 2190 | int ret = 0; |
| 2116 | int page_mkwrite = 0; | 2191 | int page_mkwrite = 0; |
| 2117 | struct page *dirty_page = NULL; | 2192 | struct page *dirty_page = NULL; |
| 2118 | 2193 | ||
| @@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2149 | } | 2224 | } |
| 2150 | page_cache_release(old_page); | 2225 | page_cache_release(old_page); |
| 2151 | } | 2226 | } |
| 2152 | reuse = reuse_swap_page(old_page); | 2227 | if (reuse_swap_page(old_page)) { |
| 2153 | if (reuse) | ||
| 2154 | /* | 2228 | /* |
| 2155 | * The page is all ours. Move it to our anon_vma so | 2229 | * The page is all ours. Move it to our anon_vma so |
| 2156 | * the rmap code will not search our parent or siblings. | 2230 | * the rmap code will not search our parent or siblings. |
| 2157 | * Protected against the rmap code by the page lock. | 2231 | * Protected against the rmap code by the page lock. |
| 2158 | */ | 2232 | */ |
| 2159 | page_move_anon_rmap(old_page, vma, address); | 2233 | page_move_anon_rmap(old_page, vma, address); |
| 2234 | unlock_page(old_page); | ||
| 2235 | goto reuse; | ||
| 2236 | } | ||
| 2160 | unlock_page(old_page); | 2237 | unlock_page(old_page); |
| 2161 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2238 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
| 2162 | (VM_WRITE|VM_SHARED))) { | 2239 | (VM_WRITE|VM_SHARED))) { |
| @@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2220 | } | 2297 | } |
| 2221 | dirty_page = old_page; | 2298 | dirty_page = old_page; |
| 2222 | get_page(dirty_page); | 2299 | get_page(dirty_page); |
| 2223 | reuse = 1; | ||
| 2224 | } | ||
| 2225 | 2300 | ||
| 2226 | if (reuse) { | ||
| 2227 | reuse: | 2301 | reuse: |
| 2228 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2302 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 2229 | entry = pte_mkyoung(orig_pte); | 2303 | entry = pte_mkyoung(orig_pte); |
| 2230 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2304 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2231 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2305 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
| 2232 | update_mmu_cache(vma, address, page_table); | 2306 | update_mmu_cache(vma, address, page_table); |
| 2307 | pte_unmap_unlock(page_table, ptl); | ||
| 2233 | ret |= VM_FAULT_WRITE; | 2308 | ret |= VM_FAULT_WRITE; |
| 2234 | goto unlock; | 2309 | |
| 2310 | if (!dirty_page) | ||
| 2311 | return ret; | ||
| 2312 | |||
| 2313 | /* | ||
| 2314 | * Yes, Virginia, this is actually required to prevent a race | ||
| 2315 | * with clear_page_dirty_for_io() from clearing the page dirty | ||
| 2316 | * bit after it clears all dirty ptes, but before a racing | ||
| 2317 | * do_wp_page installs a dirty pte. | ||
| 2318 | * | ||
| 2319 | * do_no_page is protected similarly. | ||
| 2320 | */ | ||
| 2321 | if (!page_mkwrite) { | ||
| 2322 | wait_on_page_locked(dirty_page); | ||
| 2323 | set_page_dirty_balance(dirty_page, page_mkwrite); | ||
| 2324 | } | ||
| 2325 | put_page(dirty_page); | ||
| 2326 | if (page_mkwrite) { | ||
| 2327 | struct address_space *mapping = dirty_page->mapping; | ||
| 2328 | |||
| 2329 | set_page_dirty(dirty_page); | ||
| 2330 | unlock_page(dirty_page); | ||
| 2331 | page_cache_release(dirty_page); | ||
| 2332 | if (mapping) { | ||
| 2333 | /* | ||
| 2334 | * Some device drivers do not set page.mapping | ||
| 2335 | * but still dirty their pages | ||
| 2336 | */ | ||
| 2337 | balance_dirty_pages_ratelimited(mapping); | ||
| 2338 | } | ||
| 2339 | } | ||
| 2340 | |||
| 2341 | /* file_update_time outside page_lock */ | ||
| 2342 | if (vma->vm_file) | ||
| 2343 | file_update_time(vma->vm_file); | ||
| 2344 | |||
| 2345 | return ret; | ||
| 2235 | } | 2346 | } |
| 2236 | 2347 | ||
| 2237 | /* | 2348 | /* |
| @@ -2337,39 +2448,6 @@ gotten: | |||
| 2337 | page_cache_release(old_page); | 2448 | page_cache_release(old_page); |
| 2338 | unlock: | 2449 | unlock: |
| 2339 | pte_unmap_unlock(page_table, ptl); | 2450 | pte_unmap_unlock(page_table, ptl); |
| 2340 | if (dirty_page) { | ||
| 2341 | /* | ||
| 2342 | * Yes, Virginia, this is actually required to prevent a race | ||
| 2343 | * with clear_page_dirty_for_io() from clearing the page dirty | ||
| 2344 | * bit after it clear all dirty ptes, but before a racing | ||
| 2345 | * do_wp_page installs a dirty pte. | ||
| 2346 | * | ||
| 2347 | * do_no_page is protected similarly. | ||
| 2348 | */ | ||
| 2349 | if (!page_mkwrite) { | ||
| 2350 | wait_on_page_locked(dirty_page); | ||
| 2351 | set_page_dirty_balance(dirty_page, page_mkwrite); | ||
| 2352 | } | ||
| 2353 | put_page(dirty_page); | ||
| 2354 | if (page_mkwrite) { | ||
| 2355 | struct address_space *mapping = dirty_page->mapping; | ||
| 2356 | |||
| 2357 | set_page_dirty(dirty_page); | ||
| 2358 | unlock_page(dirty_page); | ||
| 2359 | page_cache_release(dirty_page); | ||
| 2360 | if (mapping) { | ||
| 2361 | /* | ||
| 2362 | * Some device drivers do not set page.mapping | ||
| 2363 | * but still dirty their pages | ||
| 2364 | */ | ||
| 2365 | balance_dirty_pages_ratelimited(mapping); | ||
| 2366 | } | ||
| 2367 | } | ||
| 2368 | |||
| 2369 | /* file_update_time outside page_lock */ | ||
| 2370 | if (vma->vm_file) | ||
| 2371 | file_update_time(vma->vm_file); | ||
| 2372 | } | ||
| 2373 | return ret; | 2451 | return ret; |
| 2374 | oom_free_new: | 2452 | oom_free_new: |
| 2375 | page_cache_release(new_page); | 2453 | page_cache_release(new_page); |
| @@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3147 | * but allow concurrent faults), and pte mapped but not yet locked. | 3225 | * but allow concurrent faults), and pte mapped but not yet locked. |
| 3148 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 3226 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
| 3149 | */ | 3227 | */ |
| 3150 | static inline int handle_pte_fault(struct mm_struct *mm, | 3228 | int handle_pte_fault(struct mm_struct *mm, |
| 3151 | struct vm_area_struct *vma, unsigned long address, | 3229 | struct vm_area_struct *vma, unsigned long address, |
| 3152 | pte_t *pte, pmd_t *pmd, unsigned int flags) | 3230 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
| 3153 | { | 3231 | { |
| 3154 | pte_t entry; | 3232 | pte_t entry; |
| 3155 | spinlock_t *ptl; | 3233 | spinlock_t *ptl; |
| @@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3228 | pmd = pmd_alloc(mm, pud, address); | 3306 | pmd = pmd_alloc(mm, pud, address); |
| 3229 | if (!pmd) | 3307 | if (!pmd) |
| 3230 | return VM_FAULT_OOM; | 3308 | return VM_FAULT_OOM; |
| 3231 | pte = pte_alloc_map(mm, pmd, address); | 3309 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { |
| 3232 | if (!pte) | 3310 | if (!vma->vm_ops) |
| 3311 | return do_huge_pmd_anonymous_page(mm, vma, address, | ||
| 3312 | pmd, flags); | ||
| 3313 | } else { | ||
| 3314 | pmd_t orig_pmd = *pmd; | ||
| 3315 | barrier(); | ||
| 3316 | if (pmd_trans_huge(orig_pmd)) { | ||
| 3317 | if (flags & FAULT_FLAG_WRITE && | ||
| 3318 | !pmd_write(orig_pmd) && | ||
| 3319 | !pmd_trans_splitting(orig_pmd)) | ||
| 3320 | return do_huge_pmd_wp_page(mm, vma, address, | ||
| 3321 | pmd, orig_pmd); | ||
| 3322 | return 0; | ||
| 3323 | } | ||
| 3324 | } | ||
| 3325 | |||
| 3326 | /* | ||
| 3327 | * Use __pte_alloc instead of pte_alloc_map, because we can't | ||
| 3328 | * run pte_offset_map on the pmd, if a huge pmd could | ||
| 3329 | * materialize from under us from a different thread. | ||
| 3330 | */ | ||
| 3331 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
| 3233 | return VM_FAULT_OOM; | 3332 | return VM_FAULT_OOM; |
| 3333 | /* if a huge pmd materialized from under us, just retry later */ | ||
| 3334 | if (unlikely(pmd_trans_huge(*pmd))) | ||
| 3335 | return 0; | ||
| 3336 | /* | ||
| 3337 | * A regular pmd is established and it can't morph into a huge pmd | ||
| 3338 | * from under us anymore at this point because we hold the mmap_sem | ||
| 3339 | * read mode and khugepaged takes it in write mode. So now it's | ||
| 3340 | * safe to run pte_offset_map(). | ||
| 3341 | */ | ||
| 3342 | pte = pte_offset_map(pmd, address); | ||
| 3234 | 3343 | ||
| 3235 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3344 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
| 3236 | } | 3345 | } |
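handle_mm_fault() now tries the huge paths first: an empty pmd in a THP-enabled anonymous vma goes to do_huge_pmd_anonymous_page(), an existing huge pmd either takes the huge write-protect path or needs no work at all, and only otherwise are ptes allocated for handle_pte_fault(). A compact sketch of that decision order (a plain classifier, not the kernel control flow):

    #include <stdbool.h>
    #include <stdio.h>

    enum fault_kind { HUGE_ANON_FAULT, HUGE_WP_FAULT, HUGE_ALREADY_MAPPED, PTE_FAULT };

    /* simplified model of the checks added to handle_mm_fault() by this patch */
    static enum fault_kind classify(bool pmd_none, bool thp_enabled, bool anon_vma,
                                    bool pmd_huge, bool write, bool pmd_writable)
    {
        if (pmd_none && thp_enabled && anon_vma)
            return HUGE_ANON_FAULT;             /* do_huge_pmd_anonymous_page() */
        if (pmd_huge) {
            if (write && !pmd_writable)
                return HUGE_WP_FAULT;           /* do_huge_pmd_wp_page() */
            return HUGE_ALREADY_MAPPED;         /* nothing to do, return 0 */
        }
        return PTE_FAULT;                       /* allocate ptes, handle_pte_fault() */
    }

    int main(void)
    {
        printf("%d\n", classify(true, true, true, false, true, false));
        printf("%d\n", classify(false, true, true, true, true, false));
        printf("%d\n", classify(false, true, true, false, false, false));
        return 0;
    }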
| @@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
| 3296 | vma = find_vma(current->mm, addr); | 3405 | vma = find_vma(current->mm, addr); |
| 3297 | if (!vma) | 3406 | if (!vma) |
| 3298 | return -ENOMEM; | 3407 | return -ENOMEM; |
| 3299 | write = (vma->vm_flags & VM_WRITE) != 0; | 3408 | /* |
| 3409 | * We want to touch writable mappings with a write fault in order | ||
| 3410 | * to break COW, except for shared mappings because these don't COW | ||
| 3411 | * and we would not want to dirty them for nothing. | ||
| 3412 | */ | ||
| 3413 | write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; | ||
| 3300 | BUG_ON(addr >= end); | 3414 | BUG_ON(addr >= end); |
| 3301 | BUG_ON(end > vma->vm_end); | 3415 | BUG_ON(end > vma->vm_end); |
| 3302 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | 3416 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; |
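make_pages_present() now only forces write faults on writable private mappings; shared writable mappings do not COW, so dirtying them would be pointless. The test is a single mask comparison, shown here with illustrative flag values (the real VM_WRITE/VM_SHARED bits differ):

    #include <stdio.h>

    #define VM_WRITE  0x1   /* illustrative values, not the kernel's */
    #define VM_SHARED 0x2

    static int want_write_fault(unsigned long vm_flags)
    {
        /* true only for writable *private* mappings */
        return (vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
    }

    int main(void)
    {
        printf("private rw: %d\n", want_write_fault(VM_WRITE));
        printf("shared  rw: %d\n", want_write_fault(VM_WRITE | VM_SHARED));
        printf("private ro: %d\n", want_write_fault(0));
        return 0;
    }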
| @@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, | |||
| 3368 | goto out; | 3482 | goto out; |
| 3369 | 3483 | ||
| 3370 | pmd = pmd_offset(pud, address); | 3484 | pmd = pmd_offset(pud, address); |
| 3485 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
| 3371 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | 3486 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
| 3372 | goto out; | 3487 | goto out; |
| 3373 | 3488 | ||
| @@ -3608,3 +3723,74 @@ void might_fault(void) | |||
| 3608 | } | 3723 | } |
| 3609 | EXPORT_SYMBOL(might_fault); | 3724 | EXPORT_SYMBOL(might_fault); |
| 3610 | #endif | 3725 | #endif |
| 3726 | |||
| 3727 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) | ||
| 3728 | static void clear_gigantic_page(struct page *page, | ||
| 3729 | unsigned long addr, | ||
| 3730 | unsigned int pages_per_huge_page) | ||
| 3731 | { | ||
| 3732 | int i; | ||
| 3733 | struct page *p = page; | ||
| 3734 | |||
| 3735 | might_sleep(); | ||
| 3736 | for (i = 0; i < pages_per_huge_page; | ||
| 3737 | i++, p = mem_map_next(p, page, i)) { | ||
| 3738 | cond_resched(); | ||
| 3739 | clear_user_highpage(p, addr + i * PAGE_SIZE); | ||
| 3740 | } | ||
| 3741 | } | ||
| 3742 | void clear_huge_page(struct page *page, | ||
| 3743 | unsigned long addr, unsigned int pages_per_huge_page) | ||
| 3744 | { | ||
| 3745 | int i; | ||
| 3746 | |||
| 3747 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | ||
| 3748 | clear_gigantic_page(page, addr, pages_per_huge_page); | ||
| 3749 | return; | ||
| 3750 | } | ||
| 3751 | |||
| 3752 | might_sleep(); | ||
| 3753 | for (i = 0; i < pages_per_huge_page; i++) { | ||
| 3754 | cond_resched(); | ||
| 3755 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | ||
| 3756 | } | ||
| 3757 | } | ||
| 3758 | |||
| 3759 | static void copy_user_gigantic_page(struct page *dst, struct page *src, | ||
| 3760 | unsigned long addr, | ||
| 3761 | struct vm_area_struct *vma, | ||
| 3762 | unsigned int pages_per_huge_page) | ||
| 3763 | { | ||
| 3764 | int i; | ||
| 3765 | struct page *dst_base = dst; | ||
| 3766 | struct page *src_base = src; | ||
| 3767 | |||
| 3768 | for (i = 0; i < pages_per_huge_page; ) { | ||
| 3769 | cond_resched(); | ||
| 3770 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | ||
| 3771 | |||
| 3772 | i++; | ||
| 3773 | dst = mem_map_next(dst, dst_base, i); | ||
| 3774 | src = mem_map_next(src, src_base, i); | ||
| 3775 | } | ||
| 3776 | } | ||
| 3777 | |||
| 3778 | void copy_user_huge_page(struct page *dst, struct page *src, | ||
| 3779 | unsigned long addr, struct vm_area_struct *vma, | ||
| 3780 | unsigned int pages_per_huge_page) | ||
| 3781 | { | ||
| 3782 | int i; | ||
| 3783 | |||
| 3784 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | ||
| 3785 | copy_user_gigantic_page(dst, src, addr, vma, | ||
| 3786 | pages_per_huge_page); | ||
| 3787 | return; | ||
| 3788 | } | ||
| 3789 | |||
| 3790 | might_sleep(); | ||
| 3791 | for (i = 0; i < pages_per_huge_page; i++) { | ||
| 3792 | cond_resched(); | ||
| 3793 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | ||
| 3794 | } | ||
| 3795 | } | ||
| 3796 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | ||
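clear_huge_page() and copy_user_huge_page() work one base page at a time with a cond_resched() between pages, and switch to the mem_map_next()-based gigantic variants only above MAX_ORDER_NR_PAGES. A userspace analogue of the clearing loop (4 KiB pages and a 2 MiB region are assumptions; sched_yield() loosely stands in for cond_resched()):

    #include <sched.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_SIZE 4096UL   /* assumed base page size for this sketch */

    /* clear one base page at a time, yielding between pages */
    static void clear_huge_region(unsigned char *buf, unsigned int pages)
    {
        unsigned int i;

        for (i = 0; i < pages; i++) {
            sched_yield();
            memset(buf + i * PAGE_SIZE, 0, PAGE_SIZE);
        }
    }

    int main(void)
    {
        unsigned int pages = 512;   /* 2 MiB worth of 4 KiB pages */
        unsigned char *buf = malloc(pages * PAGE_SIZE);

        if (!buf)
            return 1;
        clear_huge_region(buf, pages);
        printf("first byte: %u\n", buf[0]);
        free(buf);
        return 0;
    }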
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9260314a221e..e92f04749fcb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -34,6 +34,23 @@ | |||
| 34 | 34 | ||
| 35 | #include "internal.h" | 35 | #include "internal.h" |
| 36 | 36 | ||
| 37 | DEFINE_MUTEX(mem_hotplug_mutex); | ||
| 38 | |||
| 39 | void lock_memory_hotplug(void) | ||
| 40 | { | ||
| 41 | mutex_lock(&mem_hotplug_mutex); | ||
| 42 | |||
| 43 | /* for exclusive hibernation if CONFIG_HIBERNATION=y */ | ||
| 44 | lock_system_sleep(); | ||
| 45 | } | ||
| 46 | |||
| 47 | void unlock_memory_hotplug(void) | ||
| 48 | { | ||
| 49 | unlock_system_sleep(); | ||
| 50 | mutex_unlock(&mem_hotplug_mutex); | ||
| 51 | } | ||
| 52 | |||
| 53 | |||
| 37 | /* add this memory to iomem resource */ | 54 | /* add this memory to iomem resource */ |
| 38 | static struct resource *register_memory_resource(u64 start, u64 size) | 55 | static struct resource *register_memory_resource(u64 start, u64 size) |
| 39 | { | 56 | { |
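lock_memory_hotplug() serialises hotplug users on a dedicated mutex and still takes the hibernation lock underneath, releasing in the reverse order. A sketch of that paired wrapper shape with two pthread mutexes (the second lock is only a stand-in for lock_system_sleep()):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t mem_hotplug_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t system_sleep_lock = PTHREAD_MUTEX_INITIALIZER;

    /* take the hotplug lock first, then the (pretend) hibernation lock */
    static void lock_memory_hotplug_like(void)
    {
        pthread_mutex_lock(&mem_hotplug_mutex);
        pthread_mutex_lock(&system_sleep_lock);
    }

    /* release in the opposite order */
    static void unlock_memory_hotplug_like(void)
    {
        pthread_mutex_unlock(&system_sleep_lock);
        pthread_mutex_unlock(&mem_hotplug_mutex);
    }

    int main(void)
    {
        lock_memory_hotplug_like();
        puts("hotplug section");
        unlock_memory_hotplug_like();
        return 0;
    }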
| @@ -65,9 +82,10 @@ static void release_memory_resource(struct resource *res) | |||
| 65 | 82 | ||
| 66 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | 83 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE |
| 67 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 84 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
| 68 | static void get_page_bootmem(unsigned long info, struct page *page, int type) | 85 | static void get_page_bootmem(unsigned long info, struct page *page, |
| 86 | unsigned long type) | ||
| 69 | { | 87 | { |
| 70 | atomic_set(&page->_mapcount, type); | 88 | page->lru.next = (struct list_head *) type; |
| 71 | SetPagePrivate(page); | 89 | SetPagePrivate(page); |
| 72 | set_page_private(page, info); | 90 | set_page_private(page, info); |
| 73 | atomic_inc(&page->_count); | 91 | atomic_inc(&page->_count); |
| @@ -77,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) | |||
| 77 | * so use __ref to tell modpost not to generate a warning */ | 95 | * so use __ref to tell modpost not to generate a warning */ |
| 78 | void __ref put_page_bootmem(struct page *page) | 96 | void __ref put_page_bootmem(struct page *page) |
| 79 | { | 97 | { |
| 80 | int type; | 98 | unsigned long type; |
| 81 | 99 | ||
| 82 | type = atomic_read(&page->_mapcount); | 100 | type = (unsigned long) page->lru.next; |
| 83 | BUG_ON(type >= -1); | 101 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
| 102 | type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); | ||
| 84 | 103 | ||
| 85 | if (atomic_dec_return(&page->_count) == 1) { | 104 | if (atomic_dec_return(&page->_count) == 1) { |
| 86 | ClearPagePrivate(page); | 105 | ClearPagePrivate(page); |
| 87 | set_page_private(page, 0); | 106 | set_page_private(page, 0); |
| 88 | reset_page_mapcount(page); | 107 | INIT_LIST_HEAD(&page->lru); |
| 89 | __free_pages_bootmem(page, 0); | 108 | __free_pages_bootmem(page, 0); |
| 90 | } | 109 | } |
| 91 | 110 | ||
| @@ -493,7 +512,7 @@ int mem_online_node(int nid) | |||
| 493 | pg_data_t *pgdat; | 512 | pg_data_t *pgdat; |
| 494 | int ret; | 513 | int ret; |
| 495 | 514 | ||
| 496 | lock_system_sleep(); | 515 | lock_memory_hotplug(); |
| 497 | pgdat = hotadd_new_pgdat(nid, 0); | 516 | pgdat = hotadd_new_pgdat(nid, 0); |
| 498 | if (pgdat) { | 517 | if (pgdat) { |
| 499 | ret = -ENOMEM; | 518 | ret = -ENOMEM; |
| @@ -504,7 +523,7 @@ int mem_online_node(int nid) | |||
| 504 | BUG_ON(ret); | 523 | BUG_ON(ret); |
| 505 | 524 | ||
| 506 | out: | 525 | out: |
| 507 | unlock_system_sleep(); | 526 | unlock_memory_hotplug(); |
| 508 | return ret; | 527 | return ret; |
| 509 | } | 528 | } |
| 510 | 529 | ||
| @@ -516,7 +535,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
| 516 | struct resource *res; | 535 | struct resource *res; |
| 517 | int ret; | 536 | int ret; |
| 518 | 537 | ||
| 519 | lock_system_sleep(); | 538 | lock_memory_hotplug(); |
| 520 | 539 | ||
| 521 | res = register_memory_resource(start, size); | 540 | res = register_memory_resource(start, size); |
| 522 | ret = -EEXIST; | 541 | ret = -EEXIST; |
| @@ -563,7 +582,7 @@ error: | |||
| 563 | release_memory_resource(res); | 582 | release_memory_resource(res); |
| 564 | 583 | ||
| 565 | out: | 584 | out: |
| 566 | unlock_system_sleep(); | 585 | unlock_memory_hotplug(); |
| 567 | return ret; | 586 | return ret; |
| 568 | } | 587 | } |
| 569 | EXPORT_SYMBOL_GPL(add_memory); | 588 | EXPORT_SYMBOL_GPL(add_memory); |
| @@ -716,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
| 716 | goto out; | 735 | goto out; |
| 717 | } | 736 | } |
| 718 | /* this function returns # of failed pages */ | 737 | /* this function returns # of failed pages */ |
| 719 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); | 738 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, |
| 739 | true, true); | ||
| 720 | if (ret) | 740 | if (ret) |
| 721 | putback_lru_pages(&source); | 741 | putback_lru_pages(&source); |
| 722 | } | 742 | } |
| @@ -791,7 +811,7 @@ static int offline_pages(unsigned long start_pfn, | |||
| 791 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | 811 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) |
| 792 | return -EINVAL; | 812 | return -EINVAL; |
| 793 | 813 | ||
| 794 | lock_system_sleep(); | 814 | lock_memory_hotplug(); |
| 795 | 815 | ||
| 796 | zone = page_zone(pfn_to_page(start_pfn)); | 816 | zone = page_zone(pfn_to_page(start_pfn)); |
| 797 | node = zone_to_nid(zone); | 817 | node = zone_to_nid(zone); |
| @@ -880,7 +900,7 @@ repeat: | |||
| 880 | writeback_set_ratelimit(); | 900 | writeback_set_ratelimit(); |
| 881 | 901 | ||
| 882 | memory_notify(MEM_OFFLINE, &arg); | 902 | memory_notify(MEM_OFFLINE, &arg); |
| 883 | unlock_system_sleep(); | 903 | unlock_memory_hotplug(); |
| 884 | return 0; | 904 | return 0; |
| 885 | 905 | ||
| 886 | failed_removal: | 906 | failed_removal: |
| @@ -891,7 +911,7 @@ failed_removal: | |||
| 891 | undo_isolate_page_range(start_pfn, end_pfn); | 911 | undo_isolate_page_range(start_pfn, end_pfn); |
| 892 | 912 | ||
| 893 | out: | 913 | out: |
| 894 | unlock_system_sleep(); | 914 | unlock_memory_hotplug(); |
| 895 | return ret; | 915 | return ret; |
| 896 | } | 916 | } |
| 897 | 917 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4a57f135b76e..368fc9d23610 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
| 514 | pmd = pmd_offset(pud, addr); | 514 | pmd = pmd_offset(pud, addr); |
| 515 | do { | 515 | do { |
| 516 | next = pmd_addr_end(addr, end); | 516 | next = pmd_addr_end(addr, end); |
| 517 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
| 517 | if (pmd_none_or_clear_bad(pmd)) | 518 | if (pmd_none_or_clear_bad(pmd)) |
| 518 | continue; | 519 | continue; |
| 519 | if (check_pte_range(vma, pmd, addr, next, nodes, | 520 | if (check_pte_range(vma, pmd, addr, next, nodes, |
| @@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
| 935 | return PTR_ERR(vma); | 936 | return PTR_ERR(vma); |
| 936 | 937 | ||
| 937 | if (!list_empty(&pagelist)) { | 938 | if (!list_empty(&pagelist)) { |
| 938 | err = migrate_pages(&pagelist, new_node_page, dest, 0); | 939 | err = migrate_pages(&pagelist, new_node_page, dest, |
| 940 | false, true); | ||
| 939 | if (err) | 941 | if (err) |
| 940 | putback_lru_pages(&pagelist); | 942 | putback_lru_pages(&pagelist); |
| 941 | } | 943 | } |
| @@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 1155 | 1157 | ||
| 1156 | if (!list_empty(&pagelist)) { | 1158 | if (!list_empty(&pagelist)) { |
| 1157 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1159 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
| 1158 | (unsigned long)vma, 0); | 1160 | (unsigned long)vma, |
| 1161 | false, true); | ||
| 1159 | if (nr_failed) | 1162 | if (nr_failed) |
| 1160 | putback_lru_pages(&pagelist); | 1163 | putback_lru_pages(&pagelist); |
| 1161 | } | 1164 | } |
| @@ -1307,15 +1310,15 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
| 1307 | goto out; | 1310 | goto out; |
| 1308 | 1311 | ||
| 1309 | /* Find the mm_struct */ | 1312 | /* Find the mm_struct */ |
| 1310 | read_lock(&tasklist_lock); | 1313 | rcu_read_lock(); |
| 1311 | task = pid ? find_task_by_vpid(pid) : current; | 1314 | task = pid ? find_task_by_vpid(pid) : current; |
| 1312 | if (!task) { | 1315 | if (!task) { |
| 1313 | read_unlock(&tasklist_lock); | 1316 | rcu_read_unlock(); |
| 1314 | err = -ESRCH; | 1317 | err = -ESRCH; |
| 1315 | goto out; | 1318 | goto out; |
| 1316 | } | 1319 | } |
| 1317 | mm = get_task_mm(task); | 1320 | mm = get_task_mm(task); |
| 1318 | read_unlock(&tasklist_lock); | 1321 | rcu_read_unlock(); |
| 1319 | 1322 | ||
| 1320 | err = -EINVAL; | 1323 | err = -EINVAL; |
| 1321 | if (!mm) | 1324 | if (!mm) |
| @@ -1793,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
| 1793 | } | 1796 | } |
| 1794 | 1797 | ||
| 1795 | /** | 1798 | /** |
| 1796 | * alloc_page_vma - Allocate a page for a VMA. | 1799 | * alloc_pages_vma - Allocate a page for a VMA. |
| 1797 | * | 1800 | * |
| 1798 | * @gfp: | 1801 | * @gfp: |
| 1799 | * %GFP_USER user allocation. | 1802 | * %GFP_USER user allocation. |
| @@ -1802,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
| 1802 | * %GFP_FS allocation should not call back into a file system. | 1805 | * %GFP_FS allocation should not call back into a file system. |
| 1803 | * %GFP_ATOMIC don't sleep. | 1806 | * %GFP_ATOMIC don't sleep. |
| 1804 | * | 1807 | * |
| 1808 | * @order:Order of the GFP allocation. | ||
| 1805 | * @vma: Pointer to VMA or NULL if not available. | 1809 | * @vma: Pointer to VMA or NULL if not available. |
| 1806 | * @addr: Virtual Address of the allocation. Must be inside the VMA. | 1810 | * @addr: Virtual Address of the allocation. Must be inside the VMA. |
| 1807 | * | 1811 | * |
| @@ -1815,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
| 1815 | * Should be called with the mm_sem of the vma hold. | 1819 | * Should be called with the mm_sem of the vma hold. |
| 1816 | */ | 1820 | */ |
| 1817 | struct page * | 1821 | struct page * |
| 1818 | alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | 1822 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
| 1823 | unsigned long addr) | ||
| 1819 | { | 1824 | { |
| 1820 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1825 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
| 1821 | struct zonelist *zl; | 1826 | struct zonelist *zl; |
| @@ -1827,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 1827 | 1832 | ||
| 1828 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); | 1833 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
| 1829 | mpol_cond_put(pol); | 1834 | mpol_cond_put(pol); |
| 1830 | page = alloc_page_interleave(gfp, 0, nid); | 1835 | page = alloc_page_interleave(gfp, order, nid); |
| 1831 | put_mems_allowed(); | 1836 | put_mems_allowed(); |
| 1832 | return page; | 1837 | return page; |
| 1833 | } | 1838 | } |
| @@ -1836,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 1836 | /* | 1841 | /* |
| 1837 | * slow path: ref counted shared policy | 1842 | * slow path: ref counted shared policy |
| 1838 | */ | 1843 | */ |
| 1839 | struct page *page = __alloc_pages_nodemask(gfp, 0, | 1844 | struct page *page = __alloc_pages_nodemask(gfp, order, |
| 1840 | zl, policy_nodemask(gfp, pol)); | 1845 | zl, policy_nodemask(gfp, pol)); |
| 1841 | __mpol_put(pol); | 1846 | __mpol_put(pol); |
| 1842 | put_mems_allowed(); | 1847 | put_mems_allowed(); |
| @@ -1845,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 1845 | /* | 1850 | /* |
| 1846 | * fast path: default or task policy | 1851 | * fast path: default or task policy |
| 1847 | */ | 1852 | */ |
| 1848 | page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); | 1853 | page = __alloc_pages_nodemask(gfp, order, zl, |
| 1854 | policy_nodemask(gfp, pol)); | ||
| 1849 | put_mems_allowed(); | 1855 | put_mems_allowed(); |
| 1850 | return page; | 1856 | return page; |
| 1851 | } | 1857 | } |
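alloc_pages_vma() growing an order argument is what allows higher-order, policy-aware allocations from a single VMA fault; interleave and shared-policy paths simply forward the order to alloc_page_interleave() and __alloc_pages_nodemask(), as the hunks show. A hedged sketch of a caller against the new prototype; the GFP mask and the use of HPAGE_PMD_ORDER are assumptions about how a huge-page fault path would invoke it, not lines from this patch:

/* sketch: allocate one huge page while honouring the VMA's mempolicy */
static struct page *alloc_hugepage_vma_sketch(struct vm_area_struct *vma,
                                              unsigned long haddr)
{
        /* order > 0 is now supported; the old alloc_page_vma() was order-0 only */
        return alloc_pages_vma(GFP_HIGHUSER_MOVABLE | __GFP_COMP,
                               HPAGE_PMD_ORDER, vma, haddr);
}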
diff --git a/mm/migrate.c b/mm/migrate.c index fe5a3c6a5426..46fe8cc13d67 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -35,6 +35,8 @@ | |||
| 35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
| 36 | #include <linux/gfp.h> | 36 | #include <linux/gfp.h> |
| 37 | 37 | ||
| 38 | #include <asm/tlbflush.h> | ||
| 39 | |||
| 38 | #include "internal.h" | 40 | #include "internal.h" |
| 39 | 41 | ||
| 40 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 42 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
| @@ -111,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
| 111 | goto out; | 113 | goto out; |
| 112 | 114 | ||
| 113 | pmd = pmd_offset(pud, addr); | 115 | pmd = pmd_offset(pud, addr); |
| 116 | if (pmd_trans_huge(*pmd)) | ||
| 117 | goto out; | ||
| 114 | if (!pmd_present(*pmd)) | 118 | if (!pmd_present(*pmd)) |
| 115 | goto out; | 119 | goto out; |
| 116 | 120 | ||
| @@ -244,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 244 | 248 | ||
| 245 | expected_count = 2 + page_has_private(page); | 249 | expected_count = 2 + page_has_private(page); |
| 246 | if (page_count(page) != expected_count || | 250 | if (page_count(page) != expected_count || |
| 247 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 251 | radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { |
| 248 | spin_unlock_irq(&mapping->tree_lock); | 252 | spin_unlock_irq(&mapping->tree_lock); |
| 249 | return -EAGAIN; | 253 | return -EAGAIN; |
| 250 | } | 254 | } |
| @@ -316,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
| 316 | 320 | ||
| 317 | expected_count = 2 + page_has_private(page); | 321 | expected_count = 2 + page_has_private(page); |
| 318 | if (page_count(page) != expected_count || | 322 | if (page_count(page) != expected_count || |
| 319 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 323 | radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { |
| 320 | spin_unlock_irq(&mapping->tree_lock); | 324 | spin_unlock_irq(&mapping->tree_lock); |
| 321 | return -EAGAIN; | 325 | return -EAGAIN; |
| 322 | } | 326 | } |
| @@ -612,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
| 612 | * to the newly allocated page in newpage. | 616 | * to the newly allocated page in newpage. |
| 613 | */ | 617 | */ |
| 614 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 618 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
| 615 | struct page *page, int force, int offlining) | 619 | struct page *page, int force, bool offlining, bool sync) |
| 616 | { | 620 | { |
| 617 | int rc = 0; | 621 | int rc = 0; |
| 618 | int *result = NULL; | 622 | int *result = NULL; |
| 619 | struct page *newpage = get_new_page(page, private, &result); | 623 | struct page *newpage = get_new_page(page, private, &result); |
| 620 | int remap_swapcache = 1; | 624 | int remap_swapcache = 1; |
| 621 | int rcu_locked = 0; | ||
| 622 | int charge = 0; | 625 | int charge = 0; |
| 623 | struct mem_cgroup *mem = NULL; | 626 | struct mem_cgroup *mem = NULL; |
| 624 | struct anon_vma *anon_vma = NULL; | 627 | struct anon_vma *anon_vma = NULL; |
| @@ -630,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 630 | /* page was freed from under us. So we are done. */ | 633 | /* page was freed from under us. So we are done. */ |
| 631 | goto move_newpage; | 634 | goto move_newpage; |
| 632 | } | 635 | } |
| 636 | if (unlikely(PageTransHuge(page))) | ||
| 637 | if (unlikely(split_huge_page(page))) | ||
| 638 | goto move_newpage; | ||
| 633 | 639 | ||
| 634 | /* prepare cgroup just returns 0 or -ENOMEM */ | 640 | /* prepare cgroup just returns 0 or -ENOMEM */ |
| 635 | rc = -EAGAIN; | 641 | rc = -EAGAIN; |
| @@ -637,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 637 | if (!trylock_page(page)) { | 643 | if (!trylock_page(page)) { |
| 638 | if (!force) | 644 | if (!force) |
| 639 | goto move_newpage; | 645 | goto move_newpage; |
| 646 | |||
| 647 | /* | ||
| 648 | * It's not safe for direct compaction to call lock_page. | ||
| 649 | * For example, during page readahead pages are added locked | ||
| 650 | * to the LRU. Later, when the IO completes the pages are | ||
| 651 | * marked uptodate and unlocked. However, the queueing | ||
| 652 | * could be merging multiple pages for one bio (e.g. | ||
| 653 | * mpage_readpages). If an allocation happens for the | ||
| 654 | * second or third page, the process can end up locking | ||
| 655 | * the same page twice and deadlocking. Rather than | ||
| 656 | * trying to be clever about what pages can be locked, | ||
| 657 | * avoid the use of lock_page for direct compaction | ||
| 658 | * altogether. | ||
| 659 | */ | ||
| 660 | if (current->flags & PF_MEMALLOC) | ||
| 661 | goto move_newpage; | ||
| 662 | |||
| 640 | lock_page(page); | 663 | lock_page(page); |
| 641 | } | 664 | } |
| 642 | 665 | ||
| @@ -663,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 663 | BUG_ON(charge); | 686 | BUG_ON(charge); |
| 664 | 687 | ||
| 665 | if (PageWriteback(page)) { | 688 | if (PageWriteback(page)) { |
| 666 | if (!force) | 689 | if (!force || !sync) |
| 667 | goto uncharge; | 690 | goto uncharge; |
| 668 | wait_on_page_writeback(page); | 691 | wait_on_page_writeback(page); |
| 669 | } | 692 | } |
| 670 | /* | 693 | /* |
| 671 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, | 694 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, |
| 672 | * we cannot notice that anon_vma is freed while we migrates a page. | 695 | * we cannot notice that anon_vma is freed while we migrates a page. |
| 673 | * This rcu_read_lock() delays freeing anon_vma pointer until the end | 696 | * This get_anon_vma() delays freeing anon_vma pointer until the end |
| 674 | * of migration. File cache pages are no problem because of page_lock() | 697 | * of migration. File cache pages are no problem because of page_lock() |
| 675 | * File Caches may use write_page() or lock_page() in migration, then, | 698 | * File Caches may use write_page() or lock_page() in migration, then, |
| 676 | * just care Anon page here. | 699 | * just care Anon page here. |
| 677 | */ | 700 | */ |
| 678 | if (PageAnon(page)) { | 701 | if (PageAnon(page)) { |
| 679 | rcu_read_lock(); | 702 | /* |
| 680 | rcu_locked = 1; | 703 | * Only page_lock_anon_vma() understands the subtleties of |
| 681 | 704 | * getting a hold on an anon_vma from outside one of its mms. | |
| 682 | /* Determine how to safely use anon_vma */ | 705 | */ |
| 683 | if (!page_mapped(page)) { | 706 | anon_vma = page_lock_anon_vma(page); |
| 684 | if (!PageSwapCache(page)) | 707 | if (anon_vma) { |
| 685 | goto rcu_unlock; | 708 | /* |
| 686 | 709 | * Take a reference count on the anon_vma if the | |
| 710 | * page is mapped so that it is guaranteed to | ||
| 711 | * exist when the page is remapped later | ||
| 712 | */ | ||
| 713 | get_anon_vma(anon_vma); | ||
| 714 | page_unlock_anon_vma(anon_vma); | ||
| 715 | } else if (PageSwapCache(page)) { | ||
| 687 | /* | 716 | /* |
| 688 | * We cannot be sure that the anon_vma of an unmapped | 717 | * We cannot be sure that the anon_vma of an unmapped |
| 689 | * swapcache page is safe to use because we don't | 718 | * swapcache page is safe to use because we don't |
| @@ -698,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 698 | */ | 727 | */ |
| 699 | remap_swapcache = 0; | 728 | remap_swapcache = 0; |
| 700 | } else { | 729 | } else { |
| 701 | /* | 730 | goto uncharge; |
| 702 | * Take a reference count on the anon_vma if the | ||
| 703 | * page is mapped so that it is guaranteed to | ||
| 704 | * exist when the page is remapped later | ||
| 705 | */ | ||
| 706 | anon_vma = page_anon_vma(page); | ||
| 707 | get_anon_vma(anon_vma); | ||
| 708 | } | 731 | } |
| 709 | } | 732 | } |
| 710 | 733 | ||
| @@ -721,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 721 | * free the metadata, so the page can be freed. | 744 | * free the metadata, so the page can be freed. |
| 722 | */ | 745 | */ |
| 723 | if (!page->mapping) { | 746 | if (!page->mapping) { |
| 724 | if (!PageAnon(page) && page_has_private(page)) { | 747 | VM_BUG_ON(PageAnon(page)); |
| 725 | /* | 748 | if (page_has_private(page)) { |
| 726 | * Go direct to try_to_free_buffers() here because | ||
| 727 | * a) that's what try_to_release_page() would do anyway | ||
| 728 | * b) we may be under rcu_read_lock() here, so we can't | ||
| 729 | * use GFP_KERNEL which is what try_to_release_page() | ||
| 730 | * needs to be effective. | ||
| 731 | */ | ||
| 732 | try_to_free_buffers(page); | 749 | try_to_free_buffers(page); |
| 733 | goto rcu_unlock; | 750 | goto uncharge; |
| 734 | } | 751 | } |
| 735 | goto skip_unmap; | 752 | goto skip_unmap; |
| 736 | } | 753 | } |
| @@ -744,17 +761,14 @@ skip_unmap: | |||
| 744 | 761 | ||
| 745 | if (rc && remap_swapcache) | 762 | if (rc && remap_swapcache) |
| 746 | remove_migration_ptes(page, page); | 763 | remove_migration_ptes(page, page); |
| 747 | rcu_unlock: | ||
| 748 | 764 | ||
| 749 | /* Drop an anon_vma reference if we took one */ | 765 | /* Drop an anon_vma reference if we took one */ |
| 750 | if (anon_vma) | 766 | if (anon_vma) |
| 751 | drop_anon_vma(anon_vma); | 767 | drop_anon_vma(anon_vma); |
| 752 | 768 | ||
| 753 | if (rcu_locked) | ||
| 754 | rcu_read_unlock(); | ||
| 755 | uncharge: | 769 | uncharge: |
| 756 | if (!charge) | 770 | if (!charge) |
| 757 | mem_cgroup_end_migration(mem, page, newpage); | 771 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
| 758 | unlock: | 772 | unlock: |
| 759 | unlock_page(page); | 773 | unlock_page(page); |
| 760 | 774 | ||
| @@ -808,12 +822,11 @@ move_newpage: | |||
| 808 | */ | 822 | */ |
| 809 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 823 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
| 810 | unsigned long private, struct page *hpage, | 824 | unsigned long private, struct page *hpage, |
| 811 | int force, int offlining) | 825 | int force, bool offlining, bool sync) |
| 812 | { | 826 | { |
| 813 | int rc = 0; | 827 | int rc = 0; |
| 814 | int *result = NULL; | 828 | int *result = NULL; |
| 815 | struct page *new_hpage = get_new_page(hpage, private, &result); | 829 | struct page *new_hpage = get_new_page(hpage, private, &result); |
| 816 | int rcu_locked = 0; | ||
| 817 | struct anon_vma *anon_vma = NULL; | 830 | struct anon_vma *anon_vma = NULL; |
| 818 | 831 | ||
| 819 | if (!new_hpage) | 832 | if (!new_hpage) |
| @@ -822,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
| 822 | rc = -EAGAIN; | 835 | rc = -EAGAIN; |
| 823 | 836 | ||
| 824 | if (!trylock_page(hpage)) { | 837 | if (!trylock_page(hpage)) { |
| 825 | if (!force) | 838 | if (!force || !sync) |
| 826 | goto out; | 839 | goto out; |
| 827 | lock_page(hpage); | 840 | lock_page(hpage); |
| 828 | } | 841 | } |
| 829 | 842 | ||
| 830 | if (PageAnon(hpage)) { | 843 | if (PageAnon(hpage)) { |
| 831 | rcu_read_lock(); | 844 | anon_vma = page_lock_anon_vma(hpage); |
| 832 | rcu_locked = 1; | 845 | if (anon_vma) { |
| 833 | 846 | get_anon_vma(anon_vma); | |
| 834 | if (page_mapped(hpage)) { | 847 | page_unlock_anon_vma(anon_vma); |
| 835 | anon_vma = page_anon_vma(hpage); | ||
| 836 | atomic_inc(&anon_vma->external_refcount); | ||
| 837 | } | 848 | } |
| 838 | } | 849 | } |
| 839 | 850 | ||
| @@ -845,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
| 845 | if (rc) | 856 | if (rc) |
| 846 | remove_migration_ptes(hpage, hpage); | 857 | remove_migration_ptes(hpage, hpage); |
| 847 | 858 | ||
| 848 | if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, | 859 | if (anon_vma) |
| 849 | &anon_vma->lock)) { | 860 | drop_anon_vma(anon_vma); |
| 850 | int empty = list_empty(&anon_vma->head); | ||
| 851 | spin_unlock(&anon_vma->lock); | ||
| 852 | if (empty) | ||
| 853 | anon_vma_free(anon_vma); | ||
| 854 | } | ||
| 855 | |||
| 856 | if (rcu_locked) | ||
| 857 | rcu_read_unlock(); | ||
| 858 | out: | 861 | out: |
| 859 | unlock_page(hpage); | 862 | unlock_page(hpage); |
| 860 | 863 | ||
| @@ -890,7 +893,8 @@ out: | |||
| 890 | * Return: Number of pages not migrated or error code. | 893 | * Return: Number of pages not migrated or error code. |
| 891 | */ | 894 | */ |
| 892 | int migrate_pages(struct list_head *from, | 895 | int migrate_pages(struct list_head *from, |
| 893 | new_page_t get_new_page, unsigned long private, int offlining) | 896 | new_page_t get_new_page, unsigned long private, bool offlining, |
| 897 | bool sync) | ||
| 894 | { | 898 | { |
| 895 | int retry = 1; | 899 | int retry = 1; |
| 896 | int nr_failed = 0; | 900 | int nr_failed = 0; |
| @@ -910,7 +914,8 @@ int migrate_pages(struct list_head *from, | |||
| 910 | cond_resched(); | 914 | cond_resched(); |
| 911 | 915 | ||
| 912 | rc = unmap_and_move(get_new_page, private, | 916 | rc = unmap_and_move(get_new_page, private, |
| 913 | page, pass > 2, offlining); | 917 | page, pass > 2, offlining, |
| 918 | sync); | ||
| 914 | 919 | ||
| 915 | switch(rc) { | 920 | switch(rc) { |
| 916 | case -ENOMEM: | 921 | case -ENOMEM: |
| @@ -939,7 +944,8 @@ out: | |||
| 939 | } | 944 | } |
| 940 | 945 | ||
| 941 | int migrate_huge_pages(struct list_head *from, | 946 | int migrate_huge_pages(struct list_head *from, |
| 942 | new_page_t get_new_page, unsigned long private, int offlining) | 947 | new_page_t get_new_page, unsigned long private, bool offlining, |
| 948 | bool sync) | ||
| 943 | { | 949 | { |
| 944 | int retry = 1; | 950 | int retry = 1; |
| 945 | int nr_failed = 0; | 951 | int nr_failed = 0; |
| @@ -955,7 +961,8 @@ int migrate_huge_pages(struct list_head *from, | |||
| 955 | cond_resched(); | 961 | cond_resched(); |
| 956 | 962 | ||
| 957 | rc = unmap_and_move_huge_page(get_new_page, | 963 | rc = unmap_and_move_huge_page(get_new_page, |
| 958 | private, page, pass > 2, offlining); | 964 | private, page, pass > 2, offlining, |
| 965 | sync); | ||
| 959 | 966 | ||
| 960 | switch(rc) { | 967 | switch(rc) { |
| 961 | case -ENOMEM: | 968 | case -ENOMEM: |
| @@ -1040,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
| 1040 | if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) | 1047 | if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) |
| 1041 | goto set_status; | 1048 | goto set_status; |
| 1042 | 1049 | ||
| 1043 | page = follow_page(vma, pp->addr, FOLL_GET); | 1050 | page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); |
| 1044 | 1051 | ||
| 1045 | err = PTR_ERR(page); | 1052 | err = PTR_ERR(page); |
| 1046 | if (IS_ERR(page)) | 1053 | if (IS_ERR(page)) |
| @@ -1088,7 +1095,7 @@ set_status: | |||
| 1088 | err = 0; | 1095 | err = 0; |
| 1089 | if (!list_empty(&pagelist)) { | 1096 | if (!list_empty(&pagelist)) { |
| 1090 | err = migrate_pages(&pagelist, new_page_node, | 1097 | err = migrate_pages(&pagelist, new_page_node, |
| 1091 | (unsigned long)pm, 0); | 1098 | (unsigned long)pm, 0, true); |
| 1092 | if (err) | 1099 | if (err) |
| 1093 | putback_lru_pages(&pagelist); | 1100 | putback_lru_pages(&pagelist); |
| 1094 | } | 1101 | } |
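With the signature change, every migrate_pages()/migrate_huge_pages() caller now states explicitly whether it is offlining memory and whether it may block (sync). Asynchronous callers such as direct compaction are expected to pass sync == false, which is what the new !sync early exits on trylock_page()/PageWriteback() are for. A hedged sketch of a synchronous caller, condensed from the do_move_page_to_node_array() hunk above:

LIST_HEAD(pagelist);
int nr_failed;

/* pages are isolated onto &pagelist first, then: */
nr_failed = migrate_pages(&pagelist, new_page_node,
                          (unsigned long)pm,
                          false,        /* offlining: not a memory hot-remove */
                          true);        /* sync: may wait on page lock / writeback */
if (nr_failed)
        putback_lru_pages(&pagelist);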
diff --git a/mm/mincore.c b/mm/mincore.c index 9ac42dc6d7b6..a4e6b9d75c76 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
| @@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
| 154 | pmd = pmd_offset(pud, addr); | 154 | pmd = pmd_offset(pud, addr); |
| 155 | do { | 155 | do { |
| 156 | next = pmd_addr_end(addr, end); | 156 | next = pmd_addr_end(addr, end); |
| 157 | if (pmd_trans_huge(*pmd)) { | ||
| 158 | if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { | ||
| 159 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 160 | continue; | ||
| 161 | } | ||
| 162 | /* fall through */ | ||
| 163 | } | ||
| 157 | if (pmd_none_or_clear_bad(pmd)) | 164 | if (pmd_none_or_clear_bad(pmd)) |
| 158 | mincore_unmapped_range(vma, addr, next, vec); | 165 | mincore_unmapped_range(vma, addr, next, vec); |
| 159 | else | 166 | else |
diff --git a/mm/mlock.c b/mm/mlock.c index b70919ce4f72..13e81ee8be9d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add | |||
| 155 | * vma->vm_mm->mmap_sem must be held for at least read. | 155 | * vma->vm_mm->mmap_sem must be held for at least read. |
| 156 | */ | 156 | */ |
| 157 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | 157 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, |
| 158 | unsigned long start, unsigned long end) | 158 | unsigned long start, unsigned long end, |
| 159 | int *nonblocking) | ||
| 159 | { | 160 | { |
| 160 | struct mm_struct *mm = vma->vm_mm; | 161 | struct mm_struct *mm = vma->vm_mm; |
| 161 | unsigned long addr = start; | 162 | unsigned long addr = start; |
| 162 | struct page *pages[16]; /* 16 gives a reasonable batch */ | ||
| 163 | int nr_pages = (end - start) / PAGE_SIZE; | 163 | int nr_pages = (end - start) / PAGE_SIZE; |
| 164 | int ret = 0; | ||
| 165 | int gup_flags; | 164 | int gup_flags; |
| 166 | 165 | ||
| 167 | VM_BUG_ON(start & ~PAGE_MASK); | 166 | VM_BUG_ON(start & ~PAGE_MASK); |
| @@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
| 170 | VM_BUG_ON(end > vma->vm_end); | 169 | VM_BUG_ON(end > vma->vm_end); |
| 171 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 170 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); |
| 172 | 171 | ||
| 173 | gup_flags = FOLL_TOUCH | FOLL_GET; | 172 | gup_flags = FOLL_TOUCH; |
| 174 | if (vma->vm_flags & VM_WRITE) | 173 | /* |
| 174 | * We want to touch writable mappings with a write fault in order | ||
| 175 | * to break COW, except for shared mappings because these don't COW | ||
| 176 | * and we would not want to dirty them for nothing. | ||
| 177 | */ | ||
| 178 | if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) | ||
| 175 | gup_flags |= FOLL_WRITE; | 179 | gup_flags |= FOLL_WRITE; |
| 176 | 180 | ||
| 181 | if (vma->vm_flags & VM_LOCKED) | ||
| 182 | gup_flags |= FOLL_MLOCK; | ||
| 183 | |||
| 177 | /* We don't try to access the guard page of a stack vma */ | 184 | /* We don't try to access the guard page of a stack vma */ |
| 178 | if (stack_guard_page(vma, start)) { | 185 | if (stack_guard_page(vma, start)) { |
| 179 | addr += PAGE_SIZE; | 186 | addr += PAGE_SIZE; |
| 180 | nr_pages--; | 187 | nr_pages--; |
| 181 | } | 188 | } |
| 182 | 189 | ||
| 183 | while (nr_pages > 0) { | 190 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, |
| 184 | int i; | 191 | NULL, NULL, nonblocking); |
| 185 | |||
| 186 | cond_resched(); | ||
| 187 | |||
| 188 | /* | ||
| 189 | * get_user_pages makes pages present if we are | ||
| 190 | * setting mlock. and this extra reference count will | ||
| 191 | * disable migration of this page. However, page may | ||
| 192 | * still be truncated out from under us. | ||
| 193 | */ | ||
| 194 | ret = __get_user_pages(current, mm, addr, | ||
| 195 | min_t(int, nr_pages, ARRAY_SIZE(pages)), | ||
| 196 | gup_flags, pages, NULL); | ||
| 197 | /* | ||
| 198 | * This can happen for, e.g., VM_NONLINEAR regions before | ||
| 199 | * a page has been allocated and mapped at a given offset, | ||
| 200 | * or for addresses that map beyond end of a file. | ||
| 201 | * We'll mlock the pages if/when they get faulted in. | ||
| 202 | */ | ||
| 203 | if (ret < 0) | ||
| 204 | break; | ||
| 205 | |||
| 206 | lru_add_drain(); /* push cached pages to LRU */ | ||
| 207 | |||
| 208 | for (i = 0; i < ret; i++) { | ||
| 209 | struct page *page = pages[i]; | ||
| 210 | |||
| 211 | if (page->mapping) { | ||
| 212 | /* | ||
| 213 | * That preliminary check is mainly to avoid | ||
| 214 | * the pointless overhead of lock_page on the | ||
| 215 | * ZERO_PAGE: which might bounce very badly if | ||
| 216 | * there is contention. However, we're still | ||
| 217 | * dirtying its cacheline with get/put_page: | ||
| 218 | * we'll add another __get_user_pages flag to | ||
| 219 | * avoid it if that case turns out to matter. | ||
| 220 | */ | ||
| 221 | lock_page(page); | ||
| 222 | /* | ||
| 223 | * Because we lock page here and migration is | ||
| 224 | * blocked by the elevated reference, we need | ||
| 225 | * only check for file-cache page truncation. | ||
| 226 | */ | ||
| 227 | if (page->mapping) | ||
| 228 | mlock_vma_page(page); | ||
| 229 | unlock_page(page); | ||
| 230 | } | ||
| 231 | put_page(page); /* ref from get_user_pages() */ | ||
| 232 | } | ||
| 233 | |||
| 234 | addr += ret * PAGE_SIZE; | ||
| 235 | nr_pages -= ret; | ||
| 236 | ret = 0; | ||
| 237 | } | ||
| 238 | |||
| 239 | return ret; /* 0 or negative error code */ | ||
| 240 | } | 192 | } |
| 241 | 193 | ||
| 242 | /* | 194 | /* |
| @@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
| 280 | is_vm_hugetlb_page(vma) || | 232 | is_vm_hugetlb_page(vma) || |
| 281 | vma == get_gate_vma(current))) { | 233 | vma == get_gate_vma(current))) { |
| 282 | 234 | ||
| 283 | __mlock_vma_pages_range(vma, start, end); | 235 | __mlock_vma_pages_range(vma, start, end, NULL); |
| 284 | 236 | ||
| 285 | /* Hide errors from mmap() and other callers */ | 237 | /* Hide errors from mmap() and other callers */ |
| 286 | return 0; | 238 | return 0; |
| @@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
| 372 | int ret = 0; | 324 | int ret = 0; |
| 373 | int lock = newflags & VM_LOCKED; | 325 | int lock = newflags & VM_LOCKED; |
| 374 | 326 | ||
| 375 | if (newflags == vma->vm_flags || | 327 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || |
| 376 | (vma->vm_flags & (VM_IO | VM_PFNMAP))) | 328 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) |
| 377 | goto out; /* don't set VM_LOCKED, don't count */ | 329 | goto out; /* don't set VM_LOCKED, don't count */ |
| 378 | 330 | ||
| 379 | if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | ||
| 380 | is_vm_hugetlb_page(vma) || | ||
| 381 | vma == get_gate_vma(current)) { | ||
| 382 | if (lock) | ||
| 383 | make_pages_present(start, end); | ||
| 384 | goto out; /* don't set VM_LOCKED, don't count */ | ||
| 385 | } | ||
| 386 | |||
| 387 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 331 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
| 388 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, | 332 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, |
| 389 | vma->vm_file, pgoff, vma_policy(vma)); | 333 | vma->vm_file, pgoff, vma_policy(vma)); |
| @@ -419,14 +363,10 @@ success: | |||
| 419 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. | 363 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. |
| 420 | */ | 364 | */ |
| 421 | 365 | ||
| 422 | if (lock) { | 366 | if (lock) |
| 423 | vma->vm_flags = newflags; | 367 | vma->vm_flags = newflags; |
| 424 | ret = __mlock_vma_pages_range(vma, start, end); | 368 | else |
| 425 | if (ret < 0) | ||
| 426 | ret = __mlock_posix_error_return(ret); | ||
| 427 | } else { | ||
| 428 | munlock_vma_pages_range(vma, start, end); | 369 | munlock_vma_pages_range(vma, start, end); |
| 429 | } | ||
| 430 | 370 | ||
| 431 | out: | 371 | out: |
| 432 | *prev = vma; | 372 | *prev = vma; |
| @@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
| 439 | struct vm_area_struct * vma, * prev; | 379 | struct vm_area_struct * vma, * prev; |
| 440 | int error; | 380 | int error; |
| 441 | 381 | ||
| 442 | len = PAGE_ALIGN(len); | 382 | VM_BUG_ON(start & ~PAGE_MASK); |
| 383 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
| 443 | end = start + len; | 384 | end = start + len; |
| 444 | if (end < start) | 385 | if (end < start) |
| 445 | return -EINVAL; | 386 | return -EINVAL; |
| @@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
| 482 | return error; | 423 | return error; |
| 483 | } | 424 | } |
| 484 | 425 | ||
| 426 | static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | ||
| 427 | { | ||
| 428 | struct mm_struct *mm = current->mm; | ||
| 429 | unsigned long end, nstart, nend; | ||
| 430 | struct vm_area_struct *vma = NULL; | ||
| 431 | int locked = 0; | ||
| 432 | int ret = 0; | ||
| 433 | |||
| 434 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 435 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
| 436 | end = start + len; | ||
| 437 | |||
| 438 | for (nstart = start; nstart < end; nstart = nend) { | ||
| 439 | /* | ||
| 440 | * We want to fault in pages for [nstart; end) address range. | ||
| 441 | * Find first corresponding VMA. | ||
| 442 | */ | ||
| 443 | if (!locked) { | ||
| 444 | locked = 1; | ||
| 445 | down_read(&mm->mmap_sem); | ||
| 446 | vma = find_vma(mm, nstart); | ||
| 447 | } else if (nstart >= vma->vm_end) | ||
| 448 | vma = vma->vm_next; | ||
| 449 | if (!vma || vma->vm_start >= end) | ||
| 450 | break; | ||
| 451 | /* | ||
| 452 | * Set [nstart; nend) to intersection of desired address | ||
| 453 | * range with the first VMA. Also, skip undesirable VMA types. | ||
| 454 | */ | ||
| 455 | nend = min(end, vma->vm_end); | ||
| 456 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
| 457 | continue; | ||
| 458 | if (nstart < vma->vm_start) | ||
| 459 | nstart = vma->vm_start; | ||
| 460 | /* | ||
| 461 | * Now fault in a range of pages. __mlock_vma_pages_range() | ||
| 462 | * double checks the vma flags, so that it won't mlock pages | ||
| 463 | * if the vma was already munlocked. | ||
| 464 | */ | ||
| 465 | ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); | ||
| 466 | if (ret < 0) { | ||
| 467 | if (ignore_errors) { | ||
| 468 | ret = 0; | ||
| 469 | continue; /* continue at next VMA */ | ||
| 470 | } | ||
| 471 | ret = __mlock_posix_error_return(ret); | ||
| 472 | break; | ||
| 473 | } | ||
| 474 | nend = nstart + ret * PAGE_SIZE; | ||
| 475 | ret = 0; | ||
| 476 | } | ||
| 477 | if (locked) | ||
| 478 | up_read(&mm->mmap_sem); | ||
| 479 | return ret; /* 0 or negative error code */ | ||
| 480 | } | ||
| 481 | |||
| 485 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | 482 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) |
| 486 | { | 483 | { |
| 487 | unsigned long locked; | 484 | unsigned long locked; |
| @@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
| 507 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 504 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
| 508 | error = do_mlock(start, len, 1); | 505 | error = do_mlock(start, len, 1); |
| 509 | up_write(¤t->mm->mmap_sem); | 506 | up_write(¤t->mm->mmap_sem); |
| 507 | if (!error) | ||
| 508 | error = do_mlock_pages(start, len, 0); | ||
| 510 | return error; | 509 | return error; |
| 511 | } | 510 | } |
| 512 | 511 | ||
| @@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
| 571 | capable(CAP_IPC_LOCK)) | 570 | capable(CAP_IPC_LOCK)) |
| 572 | ret = do_mlockall(flags); | 571 | ret = do_mlockall(flags); |
| 573 | up_write(¤t->mm->mmap_sem); | 572 | up_write(¤t->mm->mmap_sem); |
| 573 | if (!ret && (flags & MCL_CURRENT)) { | ||
| 574 | /* Ignore errors */ | ||
| 575 | do_mlock_pages(0, TASK_SIZE, 1); | ||
| 576 | } | ||
| 574 | out: | 577 | out: |
| 575 | return ret; | 578 | return ret; |
| 576 | } | 579 | } |
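Nothing changes for user space here; what moves is where the faulting happens: mlock()/mlockall() set VM_LOCKED under the mmap_sem write lock as before, and the population of the range is now done afterwards by do_mlock_pages() via __get_user_pages() with FOLL_MLOCK. A small runnable user-space program (plain C, not part of the patch) that exercises both paths:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 16 * 4096;
        char *buf = malloc(len);

        if (!buf)
                return 1;
        memset(buf, 0xaa, len);

        /* sets VM_LOCKED, then the kernel faults the range in via do_mlock_pages() */
        if (mlock(buf, len) != 0)
                perror("mlock");

        /* MCL_CURRENT now triggers do_mlock_pages(0, TASK_SIZE, 1), ignoring
         * per-VMA errors such as VM_IO regions */
        if (mlockall(MCL_CURRENT) != 0)
                perror("mlockall");

        munlockall();
        free(buf);
        return 0;
}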
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -29,6 +29,7 @@ | |||
| 29 | #include <linux/mmu_notifier.h> | 29 | #include <linux/mmu_notifier.h> |
| 30 | #include <linux/perf_event.h> | 30 | #include <linux/perf_event.h> |
| 31 | #include <linux/audit.h> | 31 | #include <linux/audit.h> |
| 32 | #include <linux/khugepaged.h> | ||
| 32 | 33 | ||
| 33 | #include <asm/uaccess.h> | 34 | #include <asm/uaccess.h> |
| 34 | #include <asm/cacheflush.h> | 35 | #include <asm/cacheflush.h> |
| @@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
| 253 | down_write(&mm->mmap_sem); | 254 | down_write(&mm->mmap_sem); |
| 254 | 255 | ||
| 255 | #ifdef CONFIG_COMPAT_BRK | 256 | #ifdef CONFIG_COMPAT_BRK |
| 256 | min_brk = mm->end_code; | 257 | /* |
| 258 | * CONFIG_COMPAT_BRK can still be overridden by setting | ||
| 259 | * randomize_va_space to 2, which will still cause mm->start_brk | ||
| 260 | * to be arbitrarily shifted | ||
| 261 | */ | ||
| 262 | if (mm->start_brk > PAGE_ALIGN(mm->end_data)) | ||
| 263 | min_brk = mm->start_brk; | ||
| 264 | else | ||
| 265 | min_brk = mm->end_data; | ||
| 257 | #else | 266 | #else |
| 258 | min_brk = mm->start_brk; | 267 | min_brk = mm->start_brk; |
| 259 | #endif | 268 | #endif |
| @@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 588 | } | 597 | } |
| 589 | } | 598 | } |
| 590 | 599 | ||
| 600 | vma_adjust_trans_huge(vma, start, end, adjust_next); | ||
| 601 | |||
| 591 | /* | 602 | /* |
| 592 | * When changing only vma->vm_end, we don't really need anon_vma | 603 | * When changing only vma->vm_end, we don't really need anon_vma |
| 593 | * lock. This is a fairly rare case by itself, but the anon_vma | 604 | * lock. This is a fairly rare case by itself, but the anon_vma |
| @@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 815 | end, prev->vm_pgoff, NULL); | 826 | end, prev->vm_pgoff, NULL); |
| 816 | if (err) | 827 | if (err) |
| 817 | return NULL; | 828 | return NULL; |
| 829 | khugepaged_enter_vma_merge(prev); | ||
| 818 | return prev; | 830 | return prev; |
| 819 | } | 831 | } |
| 820 | 832 | ||
| @@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 833 | next->vm_pgoff - pglen, NULL); | 845 | next->vm_pgoff - pglen, NULL); |
| 834 | if (err) | 846 | if (err) |
| 835 | return NULL; | 847 | return NULL; |
| 848 | khugepaged_enter_vma_merge(area); | ||
| 836 | return area; | 849 | return area; |
| 837 | } | 850 | } |
| 838 | 851 | ||
| @@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
| 1761 | } | 1774 | } |
| 1762 | } | 1775 | } |
| 1763 | vma_unlock_anon_vma(vma); | 1776 | vma_unlock_anon_vma(vma); |
| 1777 | khugepaged_enter_vma_merge(vma); | ||
| 1764 | return error; | 1778 | return error; |
| 1765 | } | 1779 | } |
| 1766 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | 1780 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ |
| @@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma, | |||
| 1808 | } | 1822 | } |
| 1809 | } | 1823 | } |
| 1810 | vma_unlock_anon_vma(vma); | 1824 | vma_unlock_anon_vma(vma); |
| 1825 | khugepaged_enter_vma_merge(vma); | ||
| 1811 | return error; | 1826 | return error; |
| 1812 | } | 1827 | } |
| 1813 | 1828 | ||
| @@ -2462,6 +2477,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
| 2462 | unsigned long addr, unsigned long len, | 2477 | unsigned long addr, unsigned long len, |
| 2463 | unsigned long vm_flags, struct page **pages) | 2478 | unsigned long vm_flags, struct page **pages) |
| 2464 | { | 2479 | { |
| 2480 | int ret; | ||
| 2465 | struct vm_area_struct *vma; | 2481 | struct vm_area_struct *vma; |
| 2466 | 2482 | ||
| 2467 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 2483 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
| @@ -2479,16 +2495,23 @@ int install_special_mapping(struct mm_struct *mm, | |||
| 2479 | vma->vm_ops = &special_mapping_vmops; | 2495 | vma->vm_ops = &special_mapping_vmops; |
| 2480 | vma->vm_private_data = pages; | 2496 | vma->vm_private_data = pages; |
| 2481 | 2497 | ||
| 2482 | if (unlikely(insert_vm_struct(mm, vma))) { | 2498 | ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); |
| 2483 | kmem_cache_free(vm_area_cachep, vma); | 2499 | if (ret) |
| 2484 | return -ENOMEM; | 2500 | goto out; |
| 2485 | } | 2501 | |
| 2502 | ret = insert_vm_struct(mm, vma); | ||
| 2503 | if (ret) | ||
| 2504 | goto out; | ||
| 2486 | 2505 | ||
| 2487 | mm->total_vm += len >> PAGE_SHIFT; | 2506 | mm->total_vm += len >> PAGE_SHIFT; |
| 2488 | 2507 | ||
| 2489 | perf_event_mmap(vma); | 2508 | perf_event_mmap(vma); |
| 2490 | 2509 | ||
| 2491 | return 0; | 2510 | return 0; |
| 2511 | |||
| 2512 | out: | ||
| 2513 | kmem_cache_free(vm_area_cachep, vma); | ||
| 2514 | return ret; | ||
| 2492 | } | 2515 | } |
| 2493 | 2516 | ||
| 2494 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2517 | static DEFINE_MUTEX(mm_all_locks_mutex); |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d366f2..8d032de4088e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
| @@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
| 100 | return young; | 100 | return young; |
| 101 | } | 101 | } |
| 102 | 102 | ||
| 103 | int __mmu_notifier_test_young(struct mm_struct *mm, | ||
| 104 | unsigned long address) | ||
| 105 | { | ||
| 106 | struct mmu_notifier *mn; | ||
| 107 | struct hlist_node *n; | ||
| 108 | int young = 0; | ||
| 109 | |||
| 110 | rcu_read_lock(); | ||
| 111 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
| 112 | if (mn->ops->test_young) { | ||
| 113 | young = mn->ops->test_young(mn, mm, address); | ||
| 114 | if (young) | ||
| 115 | break; | ||
| 116 | } | ||
| 117 | } | ||
| 118 | rcu_read_unlock(); | ||
| 119 | |||
| 120 | return young; | ||
| 121 | } | ||
| 122 | |||
| 103 | void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | 123 | void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, |
| 104 | pte_t pte) | 124 | pte_t pte) |
| 105 | { | 125 | { |
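__mmu_notifier_test_young() gives secondary MMUs (e.g. KVM) a way to report whether an address is referenced without clearing that state, which the huge-page scanning code can use for its "young" heuristics. A hedged sketch of a notifier supplying the new hook; only the ->test_young signature comes from the hunk above, the handler body is a stub:

static int demo_test_young(struct mmu_notifier *mn, struct mm_struct *mm,
                           unsigned long address)
{
        /* return non-zero if the secondary MMU has seen this address
         * referenced; unlike ->clear_flush_young, do not clear the state */
        return 0;
}

static const struct mmu_notifier_ops demo_notifier_ops = {
        .test_young = demo_test_young,
        /* .clear_flush_young, .change_pte, .release, ... as before */
};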
diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c855..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
| @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, | |||
| 87 | return 1; | 87 | return 1; |
| 88 | } | 88 | } |
| 89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | 89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ |
| 90 | |||
| 91 | #ifdef CONFIG_SMP | ||
| 92 | /* Called when a more accurate view of NR_FREE_PAGES is needed */ | ||
| 93 | unsigned long zone_nr_free_pages(struct zone *zone) | ||
| 94 | { | ||
| 95 | unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
| 96 | |||
| 97 | /* | ||
| 98 | * While kswapd is awake, it is considered the zone is under some | ||
| 99 | * memory pressure. Under pressure, there is a risk that | ||
| 100 | * per-cpu-counter-drift will allow the min watermark to be breached | ||
| 101 | * potentially causing a live-lock. While kswapd is awake and | ||
| 102 | * free pages are low, get a better estimate for free pages | ||
| 103 | */ | ||
| 104 | if (nr_free_pages < zone->percpu_drift_mark && | ||
| 105 | !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) | ||
| 106 | return zone_page_state_snapshot(zone, NR_FREE_PAGES); | ||
| 107 | |||
| 108 | return nr_free_pages; | ||
| 109 | } | ||
| 110 | #endif /* CONFIG_SMP */ | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c5133873097..5a688a2756be 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 78 | pte_unmap_unlock(pte - 1, ptl); | 78 | pte_unmap_unlock(pte - 1, ptl); |
| 79 | } | 79 | } |
| 80 | 80 | ||
| 81 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | 81 | static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
| 82 | unsigned long addr, unsigned long end, pgprot_t newprot, | 82 | unsigned long addr, unsigned long end, pgprot_t newprot, |
| 83 | int dirty_accountable) | 83 | int dirty_accountable) |
| 84 | { | 84 | { |
| @@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
| 88 | pmd = pmd_offset(pud, addr); | 88 | pmd = pmd_offset(pud, addr); |
| 89 | do { | 89 | do { |
| 90 | next = pmd_addr_end(addr, end); | 90 | next = pmd_addr_end(addr, end); |
| 91 | if (pmd_trans_huge(*pmd)) { | ||
| 92 | if (next - addr != HPAGE_PMD_SIZE) | ||
| 93 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
| 94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | ||
| 95 | continue; | ||
| 96 | /* fall through */ | ||
| 97 | } | ||
| 91 | if (pmd_none_or_clear_bad(pmd)) | 98 | if (pmd_none_or_clear_bad(pmd)) |
| 92 | continue; | 99 | continue; |
| 93 | change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); | 100 | change_pte_range(vma->vm_mm, pmd, addr, next, newprot, |
| 101 | dirty_accountable); | ||
| 94 | } while (pmd++, addr = next, addr != end); | 102 | } while (pmd++, addr = next, addr != end); |
| 95 | } | 103 | } |
| 96 | 104 | ||
| 97 | static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | 105 | static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
| 98 | unsigned long addr, unsigned long end, pgprot_t newprot, | 106 | unsigned long addr, unsigned long end, pgprot_t newprot, |
| 99 | int dirty_accountable) | 107 | int dirty_accountable) |
| 100 | { | 108 | { |
| @@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
| 106 | next = pud_addr_end(addr, end); | 114 | next = pud_addr_end(addr, end); |
| 107 | if (pud_none_or_clear_bad(pud)) | 115 | if (pud_none_or_clear_bad(pud)) |
| 108 | continue; | 116 | continue; |
| 109 | change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); | 117 | change_pmd_range(vma, pud, addr, next, newprot, |
| 118 | dirty_accountable); | ||
| 110 | } while (pud++, addr = next, addr != end); | 119 | } while (pud++, addr = next, addr != end); |
| 111 | } | 120 | } |
| 112 | 121 | ||
| @@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma, | |||
| 126 | next = pgd_addr_end(addr, end); | 135 | next = pgd_addr_end(addr, end); |
| 127 | if (pgd_none_or_clear_bad(pgd)) | 136 | if (pgd_none_or_clear_bad(pgd)) |
| 128 | continue; | 137 | continue; |
| 129 | change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); | 138 | change_pud_range(vma, pgd, addr, next, newprot, |
| 139 | dirty_accountable); | ||
| 130 | } while (pgd++, addr = next, addr != end); | 140 | } while (pgd++, addr = next, addr != end); |
| 131 | flush_tlb_range(vma, start, end); | 141 | flush_tlb_range(vma, start, end); |
| 132 | } | 142 | } |
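Variants of this pattern show up in the walkers touched by this series: mincore and mprotect handle the trans-huge pmd in place when the range covers the whole huge page, while mempolicy and mremap simply split it; in every case a split falls through to the ordinary pte loop. Condensed into one template from the change_pmd_range() hunk above (a sketch, not literal kernel code):

next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
        if (next - addr != HPAGE_PMD_SIZE)
                split_huge_page_pmd(vma->vm_mm, pmd);   /* partial range: split first */
        else if (change_huge_pmd(vma, pmd, addr, newprot))
                continue;                               /* whole pmd handled at once */
        /* fall through: the pmd was split (or the huge handler declined) */
}
if (pmd_none_or_clear_bad(pmd))
        continue;
change_pte_range(vma->vm_mm, pmd, addr, next, newprot, dirty_accountable);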
diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd6293a..9925b6391b80 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) | |||
| 41 | return NULL; | 41 | return NULL; |
| 42 | 42 | ||
| 43 | pmd = pmd_offset(pud, addr); | 43 | pmd = pmd_offset(pud, addr); |
| 44 | split_huge_page_pmd(mm, pmd); | ||
| 44 | if (pmd_none_or_clear_bad(pmd)) | 45 | if (pmd_none_or_clear_bad(pmd)) |
| 45 | return NULL; | 46 | return NULL; |
| 46 | 47 | ||
| 47 | return pmd; | 48 | return pmd; |
| 48 | } | 49 | } |
| 49 | 50 | ||
| 50 | static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) | 51 | static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, |
| 52 | unsigned long addr) | ||
| 51 | { | 53 | { |
| 52 | pgd_t *pgd; | 54 | pgd_t *pgd; |
| 53 | pud_t *pud; | 55 | pud_t *pud; |
| @@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) | |||
| 62 | if (!pmd) | 64 | if (!pmd) |
| 63 | return NULL; | 65 | return NULL; |
| 64 | 66 | ||
| 65 | if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) | 67 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
| 68 | if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) | ||
| 66 | return NULL; | 69 | return NULL; |
| 67 | 70 | ||
| 68 | return pmd; | 71 | return pmd; |
| @@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
| 147 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); | 150 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); |
| 148 | if (!old_pmd) | 151 | if (!old_pmd) |
| 149 | continue; | 152 | continue; |
| 150 | new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); | 153 | new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); |
| 151 | if (!new_pmd) | 154 | if (!new_pmd) |
| 152 | break; | 155 | break; |
| 153 | next = (new_addr + PMD_SIZE) & PMD_MASK; | 156 | next = (new_addr + PMD_SIZE) & PMD_MASK; |
diff --git a/mm/nommu.c b/mm/nommu.c index 27a9ac588516..f59e1424d3db 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -10,7 +10,7 @@ | |||
| 10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> | 10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> |
| 11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> | 11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> |
| 12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> | 12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> |
| 13 | * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> | 13 | * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> |
| 14 | */ | 14 | */ |
| 15 | 15 | ||
| 16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
| @@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp) | |||
| 127 | 127 | ||
| 128 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 128 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 129 | unsigned long start, int nr_pages, unsigned int foll_flags, | 129 | unsigned long start, int nr_pages, unsigned int foll_flags, |
| 130 | struct page **pages, struct vm_area_struct **vmas) | 130 | struct page **pages, struct vm_area_struct **vmas, |
| 131 | int *retry) | ||
| 131 | { | 132 | { |
| 132 | struct vm_area_struct *vma; | 133 | struct vm_area_struct *vma; |
| 133 | unsigned long vm_flags; | 134 | unsigned long vm_flags; |
| @@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 185 | if (force) | 186 | if (force) |
| 186 | flags |= FOLL_FORCE; | 187 | flags |= FOLL_FORCE; |
| 187 | 188 | ||
| 188 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); | 189 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, |
| 190 | NULL); | ||
| 189 | } | 191 | } |
| 190 | EXPORT_SYMBOL(get_user_pages); | 192 | EXPORT_SYMBOL(get_user_pages); |
| 191 | 193 | ||
| @@ -328,6 +330,7 @@ void *vmalloc_node(unsigned long size, int node) | |||
| 328 | { | 330 | { |
| 329 | return vmalloc(size); | 331 | return vmalloc(size); |
| 330 | } | 332 | } |
| 333 | EXPORT_SYMBOL(vmalloc_node); | ||
| 331 | 334 | ||
| 332 | /** | 335 | /** |
| 333 | * vzalloc_node - allocate memory on a specific node with zero fill | 336 | * vzalloc_node - allocate memory on a specific node with zero fill |
| @@ -440,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void) | |||
| 440 | { | 443 | { |
| 441 | } | 444 | } |
| 442 | 445 | ||
| 446 | /** | ||
| 447 | * alloc_vm_area - allocate a range of kernel address space | ||
| 448 | * @size: size of the area | ||
| 449 | * | ||
| 450 | * Returns: NULL on failure, vm_struct on success | ||
| 451 | * | ||
| 452 | * This function reserves a range of kernel address space, and | ||
| 453 | * allocates pagetables to map that range. No actual mappings | ||
| 454 | * are created. If the kernel address space is not shared | ||
| 455 | * between processes, it syncs the pagetable across all | ||
| 456 | * processes. | ||
| 457 | */ | ||
| 458 | struct vm_struct *alloc_vm_area(size_t size) | ||
| 459 | { | ||
| 460 | BUG(); | ||
| 461 | return NULL; | ||
| 462 | } | ||
| 463 | EXPORT_SYMBOL_GPL(alloc_vm_area); | ||
| 464 | |||
| 465 | void free_vm_area(struct vm_struct *area) | ||
| 466 | { | ||
| 467 | BUG(); | ||
| 468 | } | ||
| 469 | EXPORT_SYMBOL_GPL(free_vm_area); | ||
| 470 | |||
| 443 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | 471 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
| 444 | struct page *page) | 472 | struct page *page) |
| 445 | { | 473 | { |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b840afa89761..2cb01f6ec5d0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -404,15 +404,18 @@ unsigned long determine_dirtyable_memory(void) | |||
| 404 | * - vm.dirty_background_ratio or vm.dirty_background_bytes | 404 | * - vm.dirty_background_ratio or vm.dirty_background_bytes |
| 405 | * - vm.dirty_ratio or vm.dirty_bytes | 405 | * - vm.dirty_ratio or vm.dirty_bytes |
| 406 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and | 406 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and |
| 407 | * runtime tasks. | 407 | * real-time tasks. |
| 408 | */ | 408 | */ |
| 409 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | 409 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) |
| 410 | { | 410 | { |
| 411 | unsigned long background; | 411 | unsigned long background; |
| 412 | unsigned long dirty; | 412 | unsigned long dirty; |
| 413 | unsigned long available_memory = determine_dirtyable_memory(); | 413 | unsigned long uninitialized_var(available_memory); |
| 414 | struct task_struct *tsk; | 414 | struct task_struct *tsk; |
| 415 | 415 | ||
| 416 | if (!vm_dirty_bytes || !dirty_background_bytes) | ||
| 417 | available_memory = determine_dirtyable_memory(); | ||
| 418 | |||
| 416 | if (vm_dirty_bytes) | 419 | if (vm_dirty_bytes) |
| 417 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); | 420 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); |
| 418 | else | 421 | else |
| @@ -563,7 +566,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
| 563 | break; /* We've done our duty */ | 566 | break; /* We've done our duty */ |
| 564 | } | 567 | } |
| 565 | trace_wbc_balance_dirty_wait(&wbc, bdi); | 568 | trace_wbc_balance_dirty_wait(&wbc, bdi); |
| 566 | __set_current_state(TASK_INTERRUPTIBLE); | 569 | __set_current_state(TASK_UNINTERRUPTIBLE); |
| 567 | io_schedule_timeout(pause); | 570 | io_schedule_timeout(pause); |
| 568 | 571 | ||
| 569 | /* | 572 | /* |
| @@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page); | |||
| 1103 | int __set_page_dirty_no_writeback(struct page *page) | 1106 | int __set_page_dirty_no_writeback(struct page *page) |
| 1104 | { | 1107 | { |
| 1105 | if (!PageDirty(page)) | 1108 | if (!PageDirty(page)) |
| 1106 | SetPageDirty(page); | 1109 | return !TestSetPageDirty(page); |
| 1107 | return 0; | 1110 | return 0; |
| 1108 | } | 1111 | } |
| 1109 | 1112 | ||
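The global_dirty_limits() change earlier in this file is a small optimization: determine_dirtyable_memory() sums a number of page counters, so it is now only called when at least one of the two limits is ratio-based rather than byte-based. A hedged sketch of the resulting control flow; the ratio arithmetic in the else branch is an assumption about the surrounding (unchanged) code, not part of the hunk:

unsigned long dirty;
unsigned long uninitialized_var(available_memory);

if (!vm_dirty_bytes || !dirty_background_bytes)
        available_memory = determine_dirtyable_memory();     /* only when needed */

if (vm_dirty_bytes)
        dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);     /* absolute limit */
else
        dirty = (vm_dirty_ratio * available_memory) / 100;   /* assumed ratio form */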
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e4092704c1a9..90c1439549fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | |||
| 104 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | 104 | * only be modified with pm_mutex held, unless the suspend/hibernate code is |
| 105 | * guaranteed not to run in parallel with that modification). | 105 | * guaranteed not to run in parallel with that modification). |
| 106 | */ | 106 | */ |
| 107 | void set_gfp_allowed_mask(gfp_t mask) | 107 | |
| 108 | static gfp_t saved_gfp_mask; | ||
| 109 | |||
| 110 | void pm_restore_gfp_mask(void) | ||
| 108 | { | 111 | { |
| 109 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 112 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
| 110 | gfp_allowed_mask = mask; | 113 | if (saved_gfp_mask) { |
| 114 | gfp_allowed_mask = saved_gfp_mask; | ||
| 115 | saved_gfp_mask = 0; | ||
| 116 | } | ||
| 111 | } | 117 | } |
| 112 | 118 | ||
| 113 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | 119 | void pm_restrict_gfp_mask(void) |
| 114 | { | 120 | { |
| 115 | gfp_t ret = gfp_allowed_mask; | ||
| 116 | |||
| 117 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 121 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
| 118 | gfp_allowed_mask &= ~mask; | 122 | WARN_ON(saved_gfp_mask); |
| 119 | return ret; | 123 | saved_gfp_mask = gfp_allowed_mask; |
| 124 | gfp_allowed_mask &= ~GFP_IOFS; | ||
| 120 | } | 125 | } |
| 121 | #endif /* CONFIG_PM_SLEEP */ | 126 | #endif /* CONFIG_PM_SLEEP */ |
| 122 | 127 | ||
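pm_restrict_gfp_mask()/pm_restore_gfp_mask() replace the old set/clear helpers with a save-and-restore pair, so the suspend code can no longer leave the mask half-cleared. A hedged sketch of the intended call pattern from the hibernation side; only the two pm_* helpers come from this hunk, the surrounding steps are assumptions:

/* pm_mutex is expected to be held, per the WARN_ON()s above */
pm_restrict_gfp_mask();         /* saves gfp_allowed_mask, masks out GFP_IOFS */

/* ... build or restore the hibernation image: allocations that would need
 *     __GFP_IO / __GFP_FS now fail fast instead of touching devices ... */

pm_restore_gfp_mask();          /* restores the saved mask, exactly once */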
| @@ -352,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
| 352 | } | 357 | } |
| 353 | } | 358 | } |
| 354 | 359 | ||
| 360 | /* update __split_huge_page_refcount if you change this function */ | ||
| 355 | static int destroy_compound_page(struct page *page, unsigned long order) | 361 | static int destroy_compound_page(struct page *page, unsigned long order) |
| 356 | { | 362 | { |
| 357 | int i; | 363 | int i; |
| @@ -421,18 +427,10 @@ static inline void rmv_page_order(struct page *page) | |||
| 421 | * | 427 | * |
| 422 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 428 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
| 423 | */ | 429 | */ |
| 424 | static inline struct page * | ||
| 425 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | ||
| 426 | { | ||
| 427 | unsigned long buddy_idx = page_idx ^ (1 << order); | ||
| 428 | |||
| 429 | return page + (buddy_idx - page_idx); | ||
| 430 | } | ||
| 431 | |||
| 432 | static inline unsigned long | 430 | static inline unsigned long |
| 433 | __find_combined_index(unsigned long page_idx, unsigned int order) | 431 | __find_buddy_index(unsigned long page_idx, unsigned int order) |
| 434 | { | 432 | { |
| 435 | return (page_idx & ~(1 << order)); | 433 | return page_idx ^ (1 << order); |
| 436 | } | 434 | } |
| 437 | 435 | ||
| 438 | /* | 436 | /* |
| @@ -443,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
| 443 | * (c) a page and its buddy have the same order && | 441 | * (c) a page and its buddy have the same order && |
| 444 | * (d) a page and its buddy are in the same zone. | 442 | * (d) a page and its buddy are in the same zone. |
| 445 | * | 443 | * |
| 446 | * For recording whether a page is in the buddy system, we use PG_buddy. | 444 | * For recording whether a page is in the buddy system, we set ->_mapcount -2. |
| 447 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | 445 | * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. |
| 448 | * | 446 | * |
| 449 | * For recording page's order, we use page_private(page). | 447 | * For recording page's order, we use page_private(page). |
| 450 | */ | 448 | */ |
| @@ -477,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 477 | * as necessary, plus some accounting needed to play nicely with other | 475 | * as necessary, plus some accounting needed to play nicely with other |
| 478 | * parts of the VM system. | 476 | * parts of the VM system. |
| 479 | * At each level, we keep a list of pages, which are heads of continuous | 477 | * At each level, we keep a list of pages, which are heads of continuous |
| 480 | * free pages of length of (1 << order) and marked with PG_buddy. Page's | 478 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's |
| 481 | * order is recorded in page_private(page) field. | 479 | * order is recorded in page_private(page) field. |
| 482 | * So when we are allocating or freeing one, we can derive the state of the | 480 | * So when we are allocating or freeing one, we can derive the state of the |
| 483 | * other. That is, if we allocate a small block, and both were | 481 | * other. That is, if we allocate a small block, and both were |
| @@ -494,6 +492,7 @@ static inline void __free_one_page(struct page *page, | |||
| 494 | { | 492 | { |
| 495 | unsigned long page_idx; | 493 | unsigned long page_idx; |
| 496 | unsigned long combined_idx; | 494 | unsigned long combined_idx; |
| 495 | unsigned long uninitialized_var(buddy_idx); | ||
| 497 | struct page *buddy; | 496 | struct page *buddy; |
| 498 | 497 | ||
| 499 | if (unlikely(PageCompound(page))) | 498 | if (unlikely(PageCompound(page))) |
| @@ -508,7 +507,8 @@ static inline void __free_one_page(struct page *page, | |||
| 508 | VM_BUG_ON(bad_range(zone, page)); | 507 | VM_BUG_ON(bad_range(zone, page)); |
| 509 | 508 | ||
| 510 | while (order < MAX_ORDER-1) { | 509 | while (order < MAX_ORDER-1) { |
| 511 | buddy = __page_find_buddy(page, page_idx, order); | 510 | buddy_idx = __find_buddy_index(page_idx, order); |
| 511 | buddy = page + (buddy_idx - page_idx); | ||
| 512 | if (!page_is_buddy(page, buddy, order)) | 512 | if (!page_is_buddy(page, buddy, order)) |
| 513 | break; | 513 | break; |
| 514 | 514 | ||
| @@ -516,7 +516,7 @@ static inline void __free_one_page(struct page *page, | |||
| 516 | list_del(&buddy->lru); | 516 | list_del(&buddy->lru); |
| 517 | zone->free_area[order].nr_free--; | 517 | zone->free_area[order].nr_free--; |
| 518 | rmv_page_order(buddy); | 518 | rmv_page_order(buddy); |
| 519 | combined_idx = __find_combined_index(page_idx, order); | 519 | combined_idx = buddy_idx & page_idx; |
| 520 | page = page + (combined_idx - page_idx); | 520 | page = page + (combined_idx - page_idx); |
| 521 | page_idx = combined_idx; | 521 | page_idx = combined_idx; |
| 522 | order++; | 522 | order++; |
| @@ -533,9 +533,10 @@ static inline void __free_one_page(struct page *page, | |||
| 533 | */ | 533 | */ |
| 534 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { | 534 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { |
| 535 | struct page *higher_page, *higher_buddy; | 535 | struct page *higher_page, *higher_buddy; |
| 536 | combined_idx = __find_combined_index(page_idx, order); | 536 | combined_idx = buddy_idx & page_idx; |
| 537 | higher_page = page + combined_idx - page_idx; | 537 | higher_page = page + (combined_idx - page_idx); |
| 538 | higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); | 538 | buddy_idx = __find_buddy_index(combined_idx, order + 1); |
| 539 | higher_buddy = page + (buddy_idx - combined_idx); | ||
| 539 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | 540 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { |
| 540 | list_add_tail(&page->lru, | 541 | list_add_tail(&page->lru, |
| 541 | &zone->free_area[order].free_list[migratetype]); | 542 | &zone->free_area[order].free_list[migratetype]); |
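For reference, the buddy arithmetic used in the hunks above is plain bit manipulation: the buddy of a block at page_idx is found by flipping bit 'order' (the XOR in __find_buddy_index()), and the merged block starts at buddy_idx & page_idx. A minimal user-space sketch of that arithmetic (illustrative only, not kernel code):

#include <stdio.h>

/* Same index math as __find_buddy_index() and the combined_idx lines above. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;	/* an order-2 block: pages 12..15 */
	unsigned int order = 2;
	unsigned long buddy_idx = find_buddy_index(page_idx, order);	/* 8 */
	unsigned long combined_idx = buddy_idx & page_idx;		/* 8 */

	/* Merging the two order-2 buddies yields an order-3 block at index 8. */
	printf("buddy=%lu combined=%lu\n", buddy_idx, combined_idx);
	return 0;
}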
| @@ -646,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
| 646 | trace_mm_page_free_direct(page, order); | 647 | trace_mm_page_free_direct(page, order); |
| 647 | kmemcheck_free_shadow(page, order); | 648 | kmemcheck_free_shadow(page, order); |
| 648 | 649 | ||
| 649 | for (i = 0; i < (1 << order); i++) { | 650 | if (PageAnon(page)) |
| 650 | struct page *pg = page + i; | 651 | page->mapping = NULL; |
| 651 | 652 | for (i = 0; i < (1 << order); i++) | |
| 652 | if (PageAnon(pg)) | 653 | bad += free_pages_check(page + i); |
| 653 | pg->mapping = NULL; | ||
| 654 | bad += free_pages_check(pg); | ||
| 655 | } | ||
| 656 | if (bad) | 654 | if (bad) |
| 657 | return false; | 655 | return false; |
| 658 | 656 | ||
| @@ -1455,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
| 1455 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1453 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
| 1456 | 1454 | ||
| 1457 | /* | 1455 | /* |
| 1458 | * Return 1 if free pages are above 'mark'. This takes into account the order | 1456 | * Return true if free pages are above 'mark'. This takes into account the order |
| 1459 | * of the allocation. | 1457 | * of the allocation. |
| 1460 | */ | 1458 | */ |
| 1461 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1459 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
| 1462 | int classzone_idx, int alloc_flags) | 1460 | int classzone_idx, int alloc_flags, long free_pages) |
| 1463 | { | 1461 | { |
| 1464 | /* free_pages may go negative - that's OK */ | 1462 | /* free_pages may go negative - that's OK */ |
| 1465 | long min = mark; | 1463 | long min = mark; |
| 1466 | long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; | ||
| 1467 | int o; | 1464 | int o; |
| 1468 | 1465 | ||
| 1466 | free_pages -= (1 << order) + 1; | ||
| 1469 | if (alloc_flags & ALLOC_HIGH) | 1467 | if (alloc_flags & ALLOC_HIGH) |
| 1470 | min -= min / 2; | 1468 | min -= min / 2; |
| 1471 | if (alloc_flags & ALLOC_HARDER) | 1469 | if (alloc_flags & ALLOC_HARDER) |
| 1472 | min -= min / 4; | 1470 | min -= min / 4; |
| 1473 | 1471 | ||
| 1474 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1472 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
| 1475 | return 0; | 1473 | return false; |
| 1476 | for (o = 0; o < order; o++) { | 1474 | for (o = 0; o < order; o++) { |
| 1477 | /* At the next order, this order's pages become unavailable */ | 1475 | /* At the next order, this order's pages become unavailable */ |
| 1478 | free_pages -= z->free_area[o].nr_free << o; | 1476 | free_pages -= z->free_area[o].nr_free << o; |
| @@ -1481,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
| 1481 | min >>= 1; | 1479 | min >>= 1; |
| 1482 | 1480 | ||
| 1483 | if (free_pages <= min) | 1481 | if (free_pages <= min) |
| 1484 | return 0; | 1482 | return false; |
| 1485 | } | 1483 | } |
| 1486 | return 1; | 1484 | return true; |
| 1485 | } | ||
| 1486 | |||
| 1487 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | ||
| 1488 | int classzone_idx, int alloc_flags) | ||
| 1489 | { | ||
| 1490 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
| 1491 | zone_page_state(z, NR_FREE_PAGES)); | ||
| 1492 | } | ||
| 1493 | |||
| 1494 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | ||
| 1495 | int classzone_idx, int alloc_flags) | ||
| 1496 | { | ||
| 1497 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | ||
| 1498 | |||
| 1499 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | ||
| 1500 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | ||
| 1501 | |||
| 1502 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
| 1503 | free_pages); | ||
| 1487 | } | 1504 | } |
| 1488 | 1505 | ||
| 1489 | #ifdef CONFIG_NUMA | 1506 | #ifdef CONFIG_NUMA |
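The split above exists so that zone_watermark_ok_safe() can pass in a drift-corrected free page count (the zone_page_state_snapshot() path) while the common case keeps using the cheap per-zone counter. The test itself is unchanged; a self-contained model of it is sketched below. This is illustrative only: the helper and the nr_free_at[] array are made up here and stand in for zone->free_area[o].nr_free and the alloc_flags bits.

#include <stdbool.h>

static bool watermark_ok_model(long free_pages, long mark, unsigned int order,
			       long lowmem_reserve, bool alloc_high,
			       bool alloc_harder, const unsigned long *nr_free_at)
{
	long min = mark;
	unsigned int o;

	free_pages -= (1L << order) + 1;	/* discount for this request, as in the hunk above */
	if (alloc_high)
		min -= min / 2;			/* ALLOC_HIGH */
	if (alloc_harder)
		min -= min / 4;			/* ALLOC_HARDER */

	if (free_pages <= min + lowmem_reserve)
		return false;

	for (o = 0; o < order; o++) {
		/* at the next order, this order's pages become unavailable */
		free_pages -= nr_free_at[o] << o;
		/* require fewer pages free at the higher orders */
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}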
| @@ -1788,15 +1805,18 @@ static struct page * | |||
| 1788 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1805 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
| 1789 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1806 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
| 1790 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1807 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
| 1791 | int migratetype, unsigned long *did_some_progress) | 1808 | int migratetype, unsigned long *did_some_progress, |
| 1809 | bool sync_migration) | ||
| 1792 | { | 1810 | { |
| 1793 | struct page *page; | 1811 | struct page *page; |
| 1794 | 1812 | ||
| 1795 | if (!order || compaction_deferred(preferred_zone)) | 1813 | if (!order || compaction_deferred(preferred_zone)) |
| 1796 | return NULL; | 1814 | return NULL; |
| 1797 | 1815 | ||
| 1816 | current->flags |= PF_MEMALLOC; | ||
| 1798 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 1817 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
| 1799 | nodemask); | 1818 | nodemask, sync_migration); |
| 1819 | current->flags &= ~PF_MEMALLOC; | ||
| 1800 | if (*did_some_progress != COMPACT_SKIPPED) { | 1820 | if (*did_some_progress != COMPACT_SKIPPED) { |
| 1801 | 1821 | ||
| 1802 | /* Page migration frees to the PCP lists but we want merging */ | 1822 | /* Page migration frees to the PCP lists but we want merging */ |
| @@ -1832,7 +1852,8 @@ static inline struct page * | |||
| 1832 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1852 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
| 1833 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1853 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
| 1834 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1854 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
| 1835 | int migratetype, unsigned long *did_some_progress) | 1855 | int migratetype, unsigned long *did_some_progress, |
| 1856 | bool sync_migration) | ||
| 1836 | { | 1857 | { |
| 1837 | return NULL; | 1858 | return NULL; |
| 1838 | } | 1859 | } |
| @@ -1847,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
| 1847 | { | 1868 | { |
| 1848 | struct page *page = NULL; | 1869 | struct page *page = NULL; |
| 1849 | struct reclaim_state reclaim_state; | 1870 | struct reclaim_state reclaim_state; |
| 1850 | struct task_struct *p = current; | ||
| 1851 | bool drained = false; | 1871 | bool drained = false; |
| 1852 | 1872 | ||
| 1853 | cond_resched(); | 1873 | cond_resched(); |
| 1854 | 1874 | ||
| 1855 | /* We now go into synchronous reclaim */ | 1875 | /* We now go into synchronous reclaim */ |
| 1856 | cpuset_memory_pressure_bump(); | 1876 | cpuset_memory_pressure_bump(); |
| 1857 | p->flags |= PF_MEMALLOC; | 1877 | current->flags |= PF_MEMALLOC; |
| 1858 | lockdep_set_current_reclaim_state(gfp_mask); | 1878 | lockdep_set_current_reclaim_state(gfp_mask); |
| 1859 | reclaim_state.reclaimed_slab = 0; | 1879 | reclaim_state.reclaimed_slab = 0; |
| 1860 | p->reclaim_state = &reclaim_state; | 1880 | current->reclaim_state = &reclaim_state; |
| 1861 | 1881 | ||
| 1862 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 1882 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); |
| 1863 | 1883 | ||
| 1864 | p->reclaim_state = NULL; | 1884 | current->reclaim_state = NULL; |
| 1865 | lockdep_clear_current_reclaim_state(); | 1885 | lockdep_clear_current_reclaim_state(); |
| 1866 | p->flags &= ~PF_MEMALLOC; | 1886 | current->flags &= ~PF_MEMALLOC; |
| 1867 | 1887 | ||
| 1868 | cond_resched(); | 1888 | cond_resched(); |
| 1869 | 1889 | ||
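Both direct compaction and direct reclaim now run with PF_MEMALLOC set on current->flags, and the slowpath refuses to enter reclaim if the flag is already set, which stops reclaim from recursing into itself when its own allocations run short. A stand-alone illustration of that guard pattern (the flag value and the reclaim stub below are stand-ins, not the kernel implementation):

#include <stdio.h>

#define PF_MEMALLOC (1u << 11)	/* value only matters as a distinct bit in this sketch */

static unsigned int task_flags;	/* stands in for current->flags */

static long reclaim_some_memory(void)
{
	return 1;	/* stand-in for try_to_free_pages()/try_to_compact_pages() */
}

static long direct_reclaim_example(void)
{
	long progress;

	if (task_flags & PF_MEMALLOC)
		return 0;		/* already reclaiming: do not recurse */

	task_flags |= PF_MEMALLOC;	/* allocations made while reclaiming may dip into reserves */
	progress = reclaim_some_memory();
	task_flags &= ~PF_MEMALLOC;

	return progress;
}

int main(void)
{
	printf("progress=%ld\n", direct_reclaim_example());
	return 0;
}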
| @@ -1915,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
| 1915 | 1935 | ||
| 1916 | static inline | 1936 | static inline |
| 1917 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | 1937 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, |
| 1918 | enum zone_type high_zoneidx) | 1938 | enum zone_type high_zoneidx, |
| 1939 | enum zone_type classzone_idx) | ||
| 1919 | { | 1940 | { |
| 1920 | struct zoneref *z; | 1941 | struct zoneref *z; |
| 1921 | struct zone *zone; | 1942 | struct zone *zone; |
| 1922 | 1943 | ||
| 1923 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1944 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
| 1924 | wakeup_kswapd(zone, order); | 1945 | wakeup_kswapd(zone, order, classzone_idx); |
| 1925 | } | 1946 | } |
| 1926 | 1947 | ||
| 1927 | static inline int | 1948 | static inline int |
| 1928 | gfp_to_alloc_flags(gfp_t gfp_mask) | 1949 | gfp_to_alloc_flags(gfp_t gfp_mask) |
| 1929 | { | 1950 | { |
| 1930 | struct task_struct *p = current; | ||
| 1931 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | 1951 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; |
| 1932 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 1952 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
| 1933 | 1953 | ||
| @@ -1943,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
| 1943 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); | 1963 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); |
| 1944 | 1964 | ||
| 1945 | if (!wait) { | 1965 | if (!wait) { |
| 1946 | alloc_flags |= ALLOC_HARDER; | 1966 | /* |
| 1967 | * Not worth trying to allocate harder for | ||
| 1968 | * __GFP_NOMEMALLOC even if it can't schedule. | ||
| 1969 | */ | ||
| 1970 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | ||
| 1971 | alloc_flags |= ALLOC_HARDER; | ||
| 1947 | /* | 1972 | /* |
| 1948 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 1973 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
| 1949 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1974 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
| 1950 | */ | 1975 | */ |
| 1951 | alloc_flags &= ~ALLOC_CPUSET; | 1976 | alloc_flags &= ~ALLOC_CPUSET; |
| 1952 | } else if (unlikely(rt_task(p)) && !in_interrupt()) | 1977 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
| 1953 | alloc_flags |= ALLOC_HARDER; | 1978 | alloc_flags |= ALLOC_HARDER; |
| 1954 | 1979 | ||
| 1955 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 1980 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
| 1956 | if (!in_interrupt() && | 1981 | if (!in_interrupt() && |
| 1957 | ((p->flags & PF_MEMALLOC) || | 1982 | ((current->flags & PF_MEMALLOC) || |
| 1958 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 1983 | unlikely(test_thread_flag(TIF_MEMDIE)))) |
| 1959 | alloc_flags |= ALLOC_NO_WATERMARKS; | 1984 | alloc_flags |= ALLOC_NO_WATERMARKS; |
| 1960 | } | 1985 | } |
| @@ -1973,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
| 1973 | int alloc_flags; | 1998 | int alloc_flags; |
| 1974 | unsigned long pages_reclaimed = 0; | 1999 | unsigned long pages_reclaimed = 0; |
| 1975 | unsigned long did_some_progress; | 2000 | unsigned long did_some_progress; |
| 1976 | struct task_struct *p = current; | 2001 | bool sync_migration = false; |
| 1977 | 2002 | ||
| 1978 | /* | 2003 | /* |
| 1979 | * In the slowpath, we sanity check order to avoid ever trying to | 2004 | * In the slowpath, we sanity check order to avoid ever trying to |
| @@ -1998,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
| 1998 | goto nopage; | 2023 | goto nopage; |
| 1999 | 2024 | ||
| 2000 | restart: | 2025 | restart: |
| 2001 | wake_all_kswapd(order, zonelist, high_zoneidx); | 2026 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
| 2027 | wake_all_kswapd(order, zonelist, high_zoneidx, | ||
| 2028 | zone_idx(preferred_zone)); | ||
| 2002 | 2029 | ||
| 2003 | /* | 2030 | /* |
| 2004 | * OK, we're below the kswapd watermark and have kicked background | 2031 | * OK, we're below the kswapd watermark and have kicked background |
| @@ -2029,21 +2056,26 @@ rebalance: | |||
| 2029 | goto nopage; | 2056 | goto nopage; |
| 2030 | 2057 | ||
| 2031 | /* Avoid recursion of direct reclaim */ | 2058 | /* Avoid recursion of direct reclaim */ |
| 2032 | if (p->flags & PF_MEMALLOC) | 2059 | if (current->flags & PF_MEMALLOC) |
| 2033 | goto nopage; | 2060 | goto nopage; |
| 2034 | 2061 | ||
| 2035 | /* Avoid allocations with no watermarks from looping endlessly */ | 2062 | /* Avoid allocations with no watermarks from looping endlessly */ |
| 2036 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | 2063 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
| 2037 | goto nopage; | 2064 | goto nopage; |
| 2038 | 2065 | ||
| 2039 | /* Try direct compaction */ | 2066 | /* |
| 2067 | * Try direct compaction. The first pass is asynchronous. Subsequent | ||
| 2068 | * attempts after direct reclaim are synchronous | ||
| 2069 | */ | ||
| 2040 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2070 | page = __alloc_pages_direct_compact(gfp_mask, order, |
| 2041 | zonelist, high_zoneidx, | 2071 | zonelist, high_zoneidx, |
| 2042 | nodemask, | 2072 | nodemask, |
| 2043 | alloc_flags, preferred_zone, | 2073 | alloc_flags, preferred_zone, |
| 2044 | migratetype, &did_some_progress); | 2074 | migratetype, &did_some_progress, |
| 2075 | sync_migration); | ||
| 2045 | if (page) | 2076 | if (page) |
| 2046 | goto got_pg; | 2077 | goto got_pg; |
| 2078 | sync_migration = true; | ||
| 2047 | 2079 | ||
| 2048 | /* Try direct reclaim and then allocating */ | 2080 | /* Try direct reclaim and then allocating */ |
| 2049 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2081 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
| @@ -2097,13 +2129,27 @@ rebalance: | |||
| 2097 | /* Wait for some write requests to complete then retry */ | 2129 | /* Wait for some write requests to complete then retry */ |
| 2098 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2130 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
| 2099 | goto rebalance; | 2131 | goto rebalance; |
| 2132 | } else { | ||
| 2133 | /* | ||
| 2134 | * High-order allocations do not necessarily loop after | ||
| 2135 | * direct reclaim and reclaim/compaction depends on compaction | ||
| 2136 | * being called after reclaim so call directly if necessary | ||
| 2137 | */ | ||
| 2138 | page = __alloc_pages_direct_compact(gfp_mask, order, | ||
| 2139 | zonelist, high_zoneidx, | ||
| 2140 | nodemask, | ||
| 2141 | alloc_flags, preferred_zone, | ||
| 2142 | migratetype, &did_some_progress, | ||
| 2143 | sync_migration); | ||
| 2144 | if (page) | ||
| 2145 | goto got_pg; | ||
| 2100 | } | 2146 | } |
| 2101 | 2147 | ||
| 2102 | nopage: | 2148 | nopage: |
| 2103 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { | 2149 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { |
| 2104 | printk(KERN_WARNING "%s: page allocation failure." | 2150 | printk(KERN_WARNING "%s: page allocation failure." |
| 2105 | " order:%d, mode:0x%x\n", | 2151 | " order:%d, mode:0x%x\n", |
| 2106 | p->comm, order, gfp_mask); | 2152 | current->comm, order, gfp_mask); |
| 2107 | dump_stack(); | 2153 | dump_stack(); |
| 2108 | show_mem(); | 2154 | show_mem(); |
| 2109 | } | 2155 | } |
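Taken together, the slowpath hunks above order the expensive paths as: one asynchronous compaction pass, then direct reclaim, and, for callers that will not loop, a final synchronous compaction pass so reclaim/compaction can still serve high-order requests. A control-flow outline follows; the stubs are illustrative stand-ins, not the real __alloc_pages_slowpath():

#include <stdbool.h>
#include <stddef.h>

struct page;

/* Stubs standing in for the real direct-compact/direct-reclaim helpers. */
static struct page *direct_compact(bool sync_migration) { (void)sync_migration; return NULL; }
static struct page *direct_reclaim(void) { return NULL; }
static bool should_retry(void) { return false; }

struct page *slowpath_outline(void)
{
	bool sync_migration = false;
	struct page *page;

	page = direct_compact(sync_migration);		/* first pass: asynchronous migration */
	if (page)
		return page;
	sync_migration = true;				/* later passes migrate synchronously */

	page = direct_reclaim();
	if (page)
		return page;

	if (should_retry())
		return NULL;				/* caller loops back to reclaim */

	return direct_compact(sync_migration);		/* last chance: synchronous compaction */
}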
| @@ -2437,7 +2483,7 @@ void show_free_areas(void) | |||
| 2437 | " all_unreclaimable? %s" | 2483 | " all_unreclaimable? %s" |
| 2438 | "\n", | 2484 | "\n", |
| 2439 | zone->name, | 2485 | zone->name, |
| 2440 | K(zone_nr_free_pages(zone)), | 2486 | K(zone_page_state(zone, NR_FREE_PAGES)), |
| 2441 | K(min_wmark_pages(zone)), | 2487 | K(min_wmark_pages(zone)), |
| 2442 | K(low_wmark_pages(zone)), | 2488 | K(low_wmark_pages(zone)), |
| 2443 | K(high_wmark_pages(zone)), | 2489 | K(high_wmark_pages(zone)), |
| @@ -2580,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s) | |||
| 2580 | 2626 | ||
| 2581 | static __init int setup_numa_zonelist_order(char *s) | 2627 | static __init int setup_numa_zonelist_order(char *s) |
| 2582 | { | 2628 | { |
| 2583 | if (s) | 2629 | int ret; |
| 2584 | return __parse_numa_zonelist_order(s); | 2630 | |
| 2585 | return 0; | 2631 | if (!s) |
| 2632 | return 0; | ||
| 2633 | |||
| 2634 | ret = __parse_numa_zonelist_order(s); | ||
| 2635 | if (ret == 0) | ||
| 2636 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); | ||
| 2637 | |||
| 2638 | return ret; | ||
| 2586 | } | 2639 | } |
| 2587 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | 2640 | early_param("numa_zonelist_order", setup_numa_zonelist_order); |
| 2588 | 2641 | ||
| @@ -4009,7 +4062,7 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
| 4009 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | 4062 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); |
| 4010 | } | 4063 | } |
| 4011 | #else | 4064 | #else |
| 4012 | static void inline setup_usemap(struct pglist_data *pgdat, | 4065 | static inline void setup_usemap(struct pglist_data *pgdat, |
| 4013 | struct zone *zone, unsigned long zonesize) {} | 4066 | struct zone *zone, unsigned long zonesize) {} |
| 4014 | #endif /* CONFIG_SPARSEMEM */ | 4067 | #endif /* CONFIG_SPARSEMEM */ |
| 4015 | 4068 | ||
| @@ -5512,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = { | |||
| 5512 | {1UL << PG_swapcache, "swapcache" }, | 5565 | {1UL << PG_swapcache, "swapcache" }, |
| 5513 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | 5566 | {1UL << PG_mappedtodisk, "mappedtodisk" }, |
| 5514 | {1UL << PG_reclaim, "reclaim" }, | 5567 | {1UL << PG_reclaim, "reclaim" }, |
| 5515 | {1UL << PG_buddy, "buddy" }, | ||
| 5516 | {1UL << PG_swapbacked, "swapbacked" }, | 5568 | {1UL << PG_swapbacked, "swapbacked" }, |
| 5517 | {1UL << PG_unevictable, "unevictable" }, | 5569 | {1UL << PG_unevictable, "unevictable" }, |
| 5518 | #ifdef CONFIG_MMU | 5570 | #ifdef CONFIG_MMU |
| @@ -5560,7 +5612,7 @@ void dump_page(struct page *page) | |||
| 5560 | { | 5612 | { |
| 5561 | printk(KERN_ALERT | 5613 | printk(KERN_ALERT |
| 5562 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 5614 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
| 5563 | page, page_count(page), page_mapcount(page), | 5615 | page, atomic_read(&page->_count), page_mapcount(page), |
| 5564 | page->mapping, page->index); | 5616 | page->mapping, page->index); |
| 5565 | dump_page_flags(page->flags); | 5617 | dump_page_flags(page->flags); |
| 5566 | } | 5618 | } |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 38cc58b8b2b0..7cfa6ae02303 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
| @@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
| 34 | pmd = pmd_offset(pud, addr); | 34 | pmd = pmd_offset(pud, addr); |
| 35 | do { | 35 | do { |
| 36 | next = pmd_addr_end(addr, end); | 36 | next = pmd_addr_end(addr, end); |
| 37 | split_huge_page_pmd(walk->mm, pmd); | ||
| 37 | if (pmd_none_or_clear_bad(pmd)) { | 38 | if (pmd_none_or_clear_bad(pmd)) { |
| 38 | if (walk->pte_hole) | 39 | if (walk->pte_hole) |
| 39 | err = walk->pte_hole(addr, next, walk); | 40 | err = walk->pte_hole(addr, next, walk); |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0ebd3f..ea534960a04b 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
| @@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) | |||
| 421 | return NULL; | 421 | return NULL; |
| 422 | 422 | ||
| 423 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, | 423 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, |
| 424 | pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); | 424 | pcpu_nr_groups, pcpu_atom_size); |
| 425 | if (!vms) { | 425 | if (!vms) { |
| 426 | pcpu_free_chunk(chunk); | 426 | pcpu_free_chunk(chunk); |
| 427 | return NULL; | 427 | return NULL; |
diff --git a/mm/percpu.c b/mm/percpu.c index efe816856a9d..3f930018aa60 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
| @@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, | |||
| 258 | 258 | ||
| 259 | /* | 259 | /* |
| 260 | * (Un)populated page region iterators. Iterate over (un)populated | 260 | * (Un)populated page region iterators. Iterate over (un)populated |
| 261 | * page regions betwen @start and @end in @chunk. @rs and @re should | 261 | * page regions between @start and @end in @chunk. @rs and @re should |
| 262 | * be integer variables and will be set to start and end page index of | 262 | * be integer variables and will be set to start and end page index of |
| 263 | * the current region. | 263 | * the current region. |
| 264 | */ | 264 | */ |
| @@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size) | |||
| 293 | 293 | ||
| 294 | if (size <= PAGE_SIZE) | 294 | if (size <= PAGE_SIZE) |
| 295 | return kzalloc(size, GFP_KERNEL); | 295 | return kzalloc(size, GFP_KERNEL); |
| 296 | else { | 296 | else |
| 297 | void *ptr = vmalloc(size); | 297 | return vzalloc(size); |
| 298 | if (ptr) | ||
| 299 | memset(ptr, 0, size); | ||
| 300 | return ptr; | ||
| 301 | } | ||
| 302 | } | 298 | } |
| 303 | 299 | ||
| 304 | /** | 300 | /** |
| @@ -1268,7 +1264,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
| 1268 | 1264 | ||
| 1269 | /* we're done parsing the input, undefine BUG macro and dump config */ | 1265 | /* we're done parsing the input, undefine BUG macro and dump config */ |
| 1270 | #undef PCPU_SETUP_BUG_ON | 1266 | #undef PCPU_SETUP_BUG_ON |
| 1271 | pcpu_dump_alloc_info(KERN_INFO, ai); | 1267 | pcpu_dump_alloc_info(KERN_DEBUG, ai); |
| 1272 | 1268 | ||
| 1273 | pcpu_nr_groups = ai->nr_groups; | 1269 | pcpu_nr_groups = ai->nr_groups; |
| 1274 | pcpu_group_offsets = group_offsets; | 1270 | pcpu_group_offsets = group_offsets; |
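The pcpu_mem_alloc() hunk above is a straight conversion to vzalloc(), which returns virtually contiguous, zeroed memory in one call instead of open-coding vmalloc() plus memset(). A minimal before/after sketch using only the standard vmalloc API:

#include <linux/vmalloc.h>
#include <linux/string.h>

/* Old pattern: allocate, then zero by hand. */
static void *alloc_zeroed_old(unsigned long size)
{
	void *p = vmalloc(size);

	if (p)
		memset(p, 0, size);
	return p;
}

/* New pattern: vzalloc() does both, mirroring the kzalloc() naming. */
static void *alloc_zeroed_new(unsigned long size)
{
	return vzalloc(size);
}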
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 000000000000..d030548047e2 --- /dev/null +++ b/mm/pgtable-generic.c | |||
| @@ -0,0 +1,123 @@ | |||
| 1 | /* | ||
| 2 | * mm/pgtable-generic.c | ||
| 3 | * | ||
| 4 | * Generic pgtable methods declared in asm-generic/pgtable.h | ||
| 5 | * | ||
| 6 | * Copyright (C) 2010 Linus Torvalds | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <asm/tlb.h> | ||
| 10 | #include <asm-generic/pgtable.h> | ||
| 11 | |||
| 12 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | ||
| 13 | /* | ||
| 14 | * Only sets the access flags (dirty, accessed, and | ||
| 15 | * writable). Furthermore, we know it always gets set to a "more | ||
| 16 | * permissive" setting, which allows most architectures to optimize | ||
| 17 | * this. We return whether the PTE actually changed, which in turn | ||
| 18 | * instructs the caller to do things like update_mmu_cache(). This | ||
| 19 | * used to be done in the caller, but sparc needs minor faults to | ||
| 20 | * force that call on sun4c so we changed this macro slightly | ||
| 21 | */ | ||
| 22 | int ptep_set_access_flags(struct vm_area_struct *vma, | ||
| 23 | unsigned long address, pte_t *ptep, | ||
| 24 | pte_t entry, int dirty) | ||
| 25 | { | ||
| 26 | int changed = !pte_same(*ptep, entry); | ||
| 27 | if (changed) { | ||
| 28 | set_pte_at(vma->vm_mm, address, ptep, entry); | ||
| 29 | flush_tlb_page(vma, address); | ||
| 30 | } | ||
| 31 | return changed; | ||
| 32 | } | ||
| 33 | #endif | ||
| 34 | |||
| 35 | #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS | ||
| 36 | int pmdp_set_access_flags(struct vm_area_struct *vma, | ||
| 37 | unsigned long address, pmd_t *pmdp, | ||
| 38 | pmd_t entry, int dirty) | ||
| 39 | { | ||
| 40 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 41 | int changed = !pmd_same(*pmdp, entry); | ||
| 42 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
| 43 | if (changed) { | ||
| 44 | set_pmd_at(vma->vm_mm, address, pmdp, entry); | ||
| 45 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
| 46 | } | ||
| 47 | return changed; | ||
| 48 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
| 49 | BUG(); | ||
| 50 | return 0; | ||
| 51 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
| 52 | } | ||
| 53 | #endif | ||
| 54 | |||
| 55 | #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH | ||
| 56 | int ptep_clear_flush_young(struct vm_area_struct *vma, | ||
| 57 | unsigned long address, pte_t *ptep) | ||
| 58 | { | ||
| 59 | int young; | ||
| 60 | young = ptep_test_and_clear_young(vma, address, ptep); | ||
| 61 | if (young) | ||
| 62 | flush_tlb_page(vma, address); | ||
| 63 | return young; | ||
| 64 | } | ||
| 65 | #endif | ||
| 66 | |||
| 67 | #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH | ||
| 68 | int pmdp_clear_flush_young(struct vm_area_struct *vma, | ||
| 69 | unsigned long address, pmd_t *pmdp) | ||
| 70 | { | ||
| 71 | int young; | ||
| 72 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 73 | BUG(); | ||
| 74 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
| 75 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
| 76 | young = pmdp_test_and_clear_young(vma, address, pmdp); | ||
| 77 | if (young) | ||
| 78 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
| 79 | return young; | ||
| 80 | } | ||
| 81 | #endif | ||
| 82 | |||
| 83 | #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH | ||
| 84 | pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | ||
| 85 | pte_t *ptep) | ||
| 86 | { | ||
| 87 | pte_t pte; | ||
| 88 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); | ||
| 89 | flush_tlb_page(vma, address); | ||
| 90 | return pte; | ||
| 91 | } | ||
| 92 | #endif | ||
| 93 | |||
| 94 | #ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH | ||
| 95 | pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, | ||
| 96 | pmd_t *pmdp) | ||
| 97 | { | ||
| 98 | pmd_t pmd; | ||
| 99 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 100 | BUG(); | ||
| 101 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
| 102 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
| 103 | pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); | ||
| 104 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
| 105 | return pmd; | ||
| 106 | } | ||
| 107 | #endif | ||
| 108 | |||
| 109 | #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
| 110 | pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | ||
| 111 | pmd_t *pmdp) | ||
| 112 | { | ||
| 113 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 114 | pmd_t pmd = pmd_mksplitting(*pmdp); | ||
| 115 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
| 116 | set_pmd_at(vma->vm_mm, address, pmdp, pmd); | ||
| 117 | /* tlb flush only to serialize against gup-fast */ | ||
| 118 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
| 119 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
| 120 | BUG(); | ||
| 121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
| 122 | } | ||
| 123 | #endif | ||
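Every helper in the new file is guarded by #ifndef __HAVE_ARCH_..., so an architecture with a cheaper primitive defines that macro in its own asm/pgtable.h and provides its own version, while everyone else gets these generic fallbacks. A hedged sketch of how an architecture might override one of them (the file path and the arch-specific flush primitive below are assumptions for illustration, not part of this patch):

/* arch/example/include/asm/pgtable.h (hypothetical) */
#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
				     unsigned long address, pte_t *ptep)
{
	pte_t pte = ptep_get_and_clear(vma->vm_mm, address, ptep);

	/* arch-specific single-entry shootdown instead of flush_tlb_page() */
	example_flush_tlb_one(vma, address);
	return pte;
}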
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -94,7 +94,7 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | |||
| 94 | * anonymous pages mapped into it with that anon_vma. | 94 | * anonymous pages mapped into it with that anon_vma. |
| 95 | * | 95 | * |
| 96 | * The common case will be that we already have one, but if | 96 | * The common case will be that we already have one, but if |
| 97 | * if not we either need to find an adjacent mapping that we | 97 | * not we either need to find an adjacent mapping that we |
| 98 | * can re-use the anon_vma from (very common when the only | 98 | * can re-use the anon_vma from (very common when the only |
| 99 | * reason for splitting a vma has been mprotect()), or we | 99 | * reason for splitting a vma has been mprotect()), or we |
| 100 | * allocate a new one. | 100 | * allocate a new one. |
| @@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
| 177 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 177 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
| 178 | 178 | ||
| 179 | anon_vma_lock(anon_vma); | 179 | anon_vma_lock(anon_vma); |
| 180 | /* | ||
| 181 | * It's critical to add new vmas to the tail of the anon_vma, | ||
| 182 | * see comment in huge_memory.c:__split_huge_page(). | ||
| 183 | */ | ||
| 180 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | 184 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); |
| 181 | anon_vma_unlock(anon_vma); | 185 | anon_vma_unlock(anon_vma); |
| 182 | } | 186 | } |
| @@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) | |||
| 360 | * Returns virtual address or -EFAULT if page's index/offset is not | 364 | * Returns virtual address or -EFAULT if page's index/offset is not |
| 361 | * within the range mapped the @vma. | 365 | * within the range mapped the @vma. |
| 362 | */ | 366 | */ |
| 363 | static inline unsigned long | 367 | inline unsigned long |
| 364 | vma_address(struct page *page, struct vm_area_struct *vma) | 368 | vma_address(struct page *page, struct vm_area_struct *vma) |
| 365 | { | 369 | { |
| 366 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 370 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
| @@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | |||
| 435 | pmd = pmd_offset(pud, address); | 439 | pmd = pmd_offset(pud, address); |
| 436 | if (!pmd_present(*pmd)) | 440 | if (!pmd_present(*pmd)) |
| 437 | return NULL; | 441 | return NULL; |
| 442 | if (pmd_trans_huge(*pmd)) | ||
| 443 | return NULL; | ||
| 438 | 444 | ||
| 439 | pte = pte_offset_map(pmd, address); | 445 | pte = pte_offset_map(pmd, address); |
| 440 | /* Make a quick check before getting the lock */ | 446 | /* Make a quick check before getting the lock */ |
| @@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
| 489 | unsigned long *vm_flags) | 495 | unsigned long *vm_flags) |
| 490 | { | 496 | { |
| 491 | struct mm_struct *mm = vma->vm_mm; | 497 | struct mm_struct *mm = vma->vm_mm; |
| 492 | pte_t *pte; | ||
| 493 | spinlock_t *ptl; | ||
| 494 | int referenced = 0; | 498 | int referenced = 0; |
| 495 | 499 | ||
| 496 | pte = page_check_address(page, mm, address, &ptl, 0); | ||
| 497 | if (!pte) | ||
| 498 | goto out; | ||
| 499 | |||
| 500 | /* | 500 | /* |
| 501 | * Don't want to elevate referenced for mlocked page that gets this far, | 501 | * Don't want to elevate referenced for mlocked page that gets this far, |
| 502 | * in order that it progresses to try_to_unmap and is moved to the | 502 | * in order that it progresses to try_to_unmap and is moved to the |
| 503 | * unevictable list. | 503 | * unevictable list. |
| 504 | */ | 504 | */ |
| 505 | if (vma->vm_flags & VM_LOCKED) { | 505 | if (vma->vm_flags & VM_LOCKED) { |
| 506 | *mapcount = 1; /* break early from loop */ | 506 | *mapcount = 0; /* break early from loop */ |
| 507 | *vm_flags |= VM_LOCKED; | 507 | *vm_flags |= VM_LOCKED; |
| 508 | goto out_unmap; | 508 | goto out; |
| 509 | } | ||
| 510 | |||
| 511 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | ||
| 512 | /* | ||
| 513 | * Don't treat a reference through a sequentially read | ||
| 514 | * mapping as such. If the page has been used in | ||
| 515 | * another mapping, we will catch it; if this other | ||
| 516 | * mapping is already gone, the unmap path will have | ||
| 517 | * set PG_referenced or activated the page. | ||
| 518 | */ | ||
| 519 | if (likely(!VM_SequentialReadHint(vma))) | ||
| 520 | referenced++; | ||
| 521 | } | 509 | } |
| 522 | 510 | ||
| 523 | /* Pretend the page is referenced if the task has the | 511 | /* Pretend the page is referenced if the task has the |
| @@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
| 526 | rwsem_is_locked(&mm->mmap_sem)) | 514 | rwsem_is_locked(&mm->mmap_sem)) |
| 527 | referenced++; | 515 | referenced++; |
| 528 | 516 | ||
| 529 | out_unmap: | 517 | if (unlikely(PageTransHuge(page))) { |
| 518 | pmd_t *pmd; | ||
| 519 | |||
| 520 | spin_lock(&mm->page_table_lock); | ||
| 521 | pmd = page_check_address_pmd(page, mm, address, | ||
| 522 | PAGE_CHECK_ADDRESS_PMD_FLAG); | ||
| 523 | if (pmd && !pmd_trans_splitting(*pmd) && | ||
| 524 | pmdp_clear_flush_young_notify(vma, address, pmd)) | ||
| 525 | referenced++; | ||
| 526 | spin_unlock(&mm->page_table_lock); | ||
| 527 | } else { | ||
| 528 | pte_t *pte; | ||
| 529 | spinlock_t *ptl; | ||
| 530 | |||
| 531 | pte = page_check_address(page, mm, address, &ptl, 0); | ||
| 532 | if (!pte) | ||
| 533 | goto out; | ||
| 534 | |||
| 535 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | ||
| 536 | /* | ||
| 537 | * Don't treat a reference through a sequentially read | ||
| 538 | * mapping as such. If the page has been used in | ||
| 539 | * another mapping, we will catch it; if this other | ||
| 540 | * mapping is already gone, the unmap path will have | ||
| 541 | * set PG_referenced or activated the page. | ||
| 542 | */ | ||
| 543 | if (likely(!VM_SequentialReadHint(vma))) | ||
| 544 | referenced++; | ||
| 545 | } | ||
| 546 | pte_unmap_unlock(pte, ptl); | ||
| 547 | } | ||
| 548 | |||
| 530 | (*mapcount)--; | 549 | (*mapcount)--; |
| 531 | pte_unmap_unlock(pte, ptl); | ||
| 532 | 550 | ||
| 533 | if (referenced) | 551 | if (referenced) |
| 534 | *vm_flags |= vma->vm_flags; | 552 | *vm_flags |= vma->vm_flags; |
| @@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, | |||
| 864 | struct vm_area_struct *vma, unsigned long address, int exclusive) | 882 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
| 865 | { | 883 | { |
| 866 | int first = atomic_inc_and_test(&page->_mapcount); | 884 | int first = atomic_inc_and_test(&page->_mapcount); |
| 867 | if (first) | 885 | if (first) { |
| 868 | __inc_zone_page_state(page, NR_ANON_PAGES); | 886 | if (!PageTransHuge(page)) |
| 887 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
| 888 | else | ||
| 889 | __inc_zone_page_state(page, | ||
| 890 | NR_ANON_TRANSPARENT_HUGEPAGES); | ||
| 891 | } | ||
| 869 | if (unlikely(PageKsm(page))) | 892 | if (unlikely(PageKsm(page))) |
| 870 | return; | 893 | return; |
| 871 | 894 | ||
| @@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
| 893 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 916 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
| 894 | SetPageSwapBacked(page); | 917 | SetPageSwapBacked(page); |
| 895 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 918 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
| 896 | __inc_zone_page_state(page, NR_ANON_PAGES); | 919 | if (!PageTransHuge(page)) |
| 920 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
| 921 | else | ||
| 922 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
| 897 | __page_set_anon_rmap(page, vma, address, 1); | 923 | __page_set_anon_rmap(page, vma, address, 1); |
| 898 | if (page_evictable(page, vma)) | 924 | if (page_evictable(page, vma)) |
| 899 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 925 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
| @@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page) | |||
| 911 | { | 937 | { |
| 912 | if (atomic_inc_and_test(&page->_mapcount)) { | 938 | if (atomic_inc_and_test(&page->_mapcount)) { |
| 913 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 939 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
| 914 | mem_cgroup_update_file_mapped(page, 1); | 940 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); |
| 915 | } | 941 | } |
| 916 | } | 942 | } |
| 917 | 943 | ||
| @@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page) | |||
| 946 | return; | 972 | return; |
| 947 | if (PageAnon(page)) { | 973 | if (PageAnon(page)) { |
| 948 | mem_cgroup_uncharge_page(page); | 974 | mem_cgroup_uncharge_page(page); |
| 949 | __dec_zone_page_state(page, NR_ANON_PAGES); | 975 | if (!PageTransHuge(page)) |
| 976 | __dec_zone_page_state(page, NR_ANON_PAGES); | ||
| 977 | else | ||
| 978 | __dec_zone_page_state(page, | ||
| 979 | NR_ANON_TRANSPARENT_HUGEPAGES); | ||
| 950 | } else { | 980 | } else { |
| 951 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 981 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
| 952 | mem_cgroup_update_file_mapped(page, -1); | 982 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); |
| 953 | } | 983 | } |
| 954 | /* | 984 | /* |
| 955 | * It would be tidy to reset the PageAnon mapping here, | 985 | * It would be tidy to reset the PageAnon mapping here, |
| @@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
| 1202 | return ret; | 1232 | return ret; |
| 1203 | } | 1233 | } |
| 1204 | 1234 | ||
| 1205 | static bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1235 | bool is_vma_temporary_stack(struct vm_area_struct *vma) |
| 1206 | { | 1236 | { |
| 1207 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | 1237 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
| 1208 | 1238 | ||
| @@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
| 1400 | int ret; | 1430 | int ret; |
| 1401 | 1431 | ||
| 1402 | BUG_ON(!PageLocked(page)); | 1432 | BUG_ON(!PageLocked(page)); |
| 1433 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); | ||
| 1403 | 1434 | ||
| 1404 | if (unlikely(PageKsm(page))) | 1435 | if (unlikely(PageKsm(page))) |
| 1405 | ret = try_to_unmap_ksm(page, flags); | 1436 | ret = try_to_unmap_ksm(page, flags); |
diff --git a/mm/shmem.c b/mm/shmem.c index 47fdeeb9d636..5ee67c990602 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) | |||
| 2415 | return &p->vfs_inode; | 2415 | return &p->vfs_inode; |
| 2416 | } | 2416 | } |
| 2417 | 2417 | ||
| 2418 | static void shmem_i_callback(struct rcu_head *head) | ||
| 2419 | { | ||
| 2420 | struct inode *inode = container_of(head, struct inode, i_rcu); | ||
| 2421 | INIT_LIST_HEAD(&inode->i_dentry); | ||
| 2422 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | ||
| 2423 | } | ||
| 2424 | |||
| 2418 | static void shmem_destroy_inode(struct inode *inode) | 2425 | static void shmem_destroy_inode(struct inode *inode) |
| 2419 | { | 2426 | { |
| 2420 | if ((inode->i_mode & S_IFMT) == S_IFREG) { | 2427 | if ((inode->i_mode & S_IFMT) == S_IFREG) { |
| 2421 | /* only struct inode is valid if it's an inline symlink */ | 2428 | /* only struct inode is valid if it's an inline symlink */ |
| 2422 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2429 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
| 2423 | } | 2430 | } |
| 2424 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2431 | call_rcu(&inode->i_rcu, shmem_i_callback); |
| 2425 | } | 2432 | } |
| 2426 | 2433 | ||
| 2427 | static void init_once(void *foo) | 2434 | static void init_once(void *foo) |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -829,12 +829,12 @@ static void init_reap_node(int cpu) | |||
| 829 | 829 | ||
| 830 | static void next_reap_node(void) | 830 | static void next_reap_node(void) |
| 831 | { | 831 | { |
| 832 | int node = __get_cpu_var(slab_reap_node); | 832 | int node = __this_cpu_read(slab_reap_node); |
| 833 | 833 | ||
| 834 | node = next_node(node, node_online_map); | 834 | node = next_node(node, node_online_map); |
| 835 | if (unlikely(node >= MAX_NUMNODES)) | 835 | if (unlikely(node >= MAX_NUMNODES)) |
| 836 | node = first_node(node_online_map); | 836 | node = first_node(node_online_map); |
| 837 | __get_cpu_var(slab_reap_node) = node; | 837 | __this_cpu_write(slab_reap_node, node); |
| 838 | } | 838 | } |
| 839 | 839 | ||
| 840 | #else | 840 | #else |
| @@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
| 1012 | */ | 1012 | */ |
| 1013 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | 1013 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) |
| 1014 | { | 1014 | { |
| 1015 | int node = __get_cpu_var(slab_reap_node); | 1015 | int node = __this_cpu_read(slab_reap_node); |
| 1016 | 1016 | ||
| 1017 | if (l3->alien) { | 1017 | if (l3->alien) { |
| 1018 | struct array_cache *ac = l3->alien[node]; | 1018 | struct array_cache *ac = l3->alien[node]; |
| @@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
| 1293 | * anything expensive but will only modify reap_work | 1293 | * anything expensive but will only modify reap_work |
| 1294 | * and reschedule the timer. | 1294 | * and reschedule the timer. |
| 1295 | */ | 1295 | */ |
| 1296 | cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); | 1296 | cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); |
| 1297 | /* Now the cache_reaper is guaranteed to be not running. */ | 1297 | /* Now the cache_reaper is guaranteed to be not running. */ |
| 1298 | per_cpu(slab_reap_work, cpu).work.func = NULL; | 1298 | per_cpu(slab_reap_work, cpu).work.func = NULL; |
| 1299 | break; | 1299 | break; |
| @@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, | |||
| 2781 | /* | 2781 | /* |
| 2782 | * Map pages beginning at addr to the given cache and slab. This is required | 2782 | * Map pages beginning at addr to the given cache and slab. This is required |
| 2783 | * for the slab allocator to be able to lookup the cache and slab of a | 2783 | * for the slab allocator to be able to lookup the cache and slab of a |
| 2784 | * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. | 2784 | * virtual address for kfree, ksize, and slab debugging. |
| 2785 | */ | 2785 | */ |
| 2786 | static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | 2786 | static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, |
| 2787 | void *addr) | 2787 | void *addr) |
| @@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3653 | EXPORT_SYMBOL(kmem_cache_alloc); | 3653 | EXPORT_SYMBOL(kmem_cache_alloc); |
| 3654 | 3654 | ||
| 3655 | #ifdef CONFIG_TRACING | 3655 | #ifdef CONFIG_TRACING |
| 3656 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) | 3656 | void * |
| 3657 | kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) | ||
| 3657 | { | 3658 | { |
| 3658 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3659 | void *ret; |
| 3659 | } | ||
| 3660 | EXPORT_SYMBOL(kmem_cache_alloc_notrace); | ||
| 3661 | #endif | ||
| 3662 | 3660 | ||
| 3663 | /** | 3661 | ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); |
| 3664 | * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. | ||
| 3665 | * @cachep: the cache we're checking against | ||
| 3666 | * @ptr: pointer to validate | ||
| 3667 | * | ||
| 3668 | * This verifies that the untrusted pointer looks sane; | ||
| 3669 | * it is _not_ a guarantee that the pointer is actually | ||
| 3670 | * part of the slab cache in question, but it at least | ||
| 3671 | * validates that the pointer can be dereferenced and | ||
| 3672 | * looks half-way sane. | ||
| 3673 | * | ||
| 3674 | * Currently only used for dentry validation. | ||
| 3675 | */ | ||
| 3676 | int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) | ||
| 3677 | { | ||
| 3678 | unsigned long size = cachep->buffer_size; | ||
| 3679 | struct page *page; | ||
| 3680 | 3662 | ||
| 3681 | if (unlikely(!kern_ptr_validate(ptr, size))) | 3663 | trace_kmalloc(_RET_IP_, ret, |
| 3682 | goto out; | 3664 | size, slab_buffer_size(cachep), flags); |
| 3683 | page = virt_to_page(ptr); | 3665 | return ret; |
| 3684 | if (unlikely(!PageSlab(page))) | ||
| 3685 | goto out; | ||
| 3686 | if (unlikely(page_get_cache(page) != cachep)) | ||
| 3687 | goto out; | ||
| 3688 | return 1; | ||
| 3689 | out: | ||
| 3690 | return 0; | ||
| 3691 | } | 3666 | } |
| 3667 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | ||
| 3668 | #endif | ||
| 3692 | 3669 | ||
| 3693 | #ifdef CONFIG_NUMA | 3670 | #ifdef CONFIG_NUMA |
| 3694 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3671 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
| @@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 3705 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3682 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
| 3706 | 3683 | ||
| 3707 | #ifdef CONFIG_TRACING | 3684 | #ifdef CONFIG_TRACING |
| 3708 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, | 3685 | void *kmem_cache_alloc_node_trace(size_t size, |
| 3709 | gfp_t flags, | 3686 | struct kmem_cache *cachep, |
| 3710 | int nodeid) | 3687 | gfp_t flags, |
| 3688 | int nodeid) | ||
| 3711 | { | 3689 | { |
| 3712 | return __cache_alloc_node(cachep, flags, nodeid, | 3690 | void *ret; |
| 3691 | |||
| 3692 | ret = __cache_alloc_node(cachep, flags, nodeid, | ||
| 3713 | __builtin_return_address(0)); | 3693 | __builtin_return_address(0)); |
| 3694 | trace_kmalloc_node(_RET_IP_, ret, | ||
| 3695 | size, slab_buffer_size(cachep), | ||
| 3696 | flags, nodeid); | ||
| 3697 | return ret; | ||
| 3714 | } | 3698 | } |
| 3715 | EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | 3699 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
| 3716 | #endif | 3700 | #endif |
| 3717 | 3701 | ||
| 3718 | static __always_inline void * | 3702 | static __always_inline void * |
| 3719 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | 3703 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) |
| 3720 | { | 3704 | { |
| 3721 | struct kmem_cache *cachep; | 3705 | struct kmem_cache *cachep; |
| 3722 | void *ret; | ||
| 3723 | 3706 | ||
| 3724 | cachep = kmem_find_general_cachep(size, flags); | 3707 | cachep = kmem_find_general_cachep(size, flags); |
| 3725 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3708 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
| 3726 | return cachep; | 3709 | return cachep; |
| 3727 | ret = kmem_cache_alloc_node_notrace(cachep, flags, node); | 3710 | return kmem_cache_alloc_node_trace(size, cachep, flags, node); |
| 3728 | |||
| 3729 | trace_kmalloc_node((unsigned long) caller, ret, | ||
| 3730 | size, cachep->buffer_size, flags, node); | ||
| 3731 | |||
| 3732 | return ret; | ||
| 3733 | } | 3711 | } |
| 3734 | 3712 | ||
| 3735 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | 3713 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
| @@ -678,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d) | |||
| 678 | } | 678 | } |
| 679 | EXPORT_SYMBOL(kmem_cache_shrink); | 679 | EXPORT_SYMBOL(kmem_cache_shrink); |
| 680 | 680 | ||
| 681 | int kmem_ptr_validate(struct kmem_cache *a, const void *b) | ||
| 682 | { | ||
| 683 | return 0; | ||
| 684 | } | ||
| 685 | |||
| 686 | static unsigned int slob_ready __read_mostly; | 681 | static unsigned int slob_ready __read_mostly; |
| 687 | 682 | ||
| 688 | int slab_is_available(void) | 683 | int slab_is_available(void) |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -28,6 +28,8 @@ | |||
| 28 | #include <linux/math64.h> | 28 | #include <linux/math64.h> |
| 29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
| 30 | 30 | ||
| 31 | #include <trace/events/kmem.h> | ||
| 32 | |||
| 31 | /* | 33 | /* |
| 32 | * Lock order: | 34 | * Lock order: |
| 33 | * 1. slab_lock(page) | 35 | * 1. slab_lock(page) |
| @@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
| 1774 | EXPORT_SYMBOL(kmem_cache_alloc); | 1776 | EXPORT_SYMBOL(kmem_cache_alloc); |
| 1775 | 1777 | ||
| 1776 | #ifdef CONFIG_TRACING | 1778 | #ifdef CONFIG_TRACING |
| 1777 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) | 1779 | void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) |
| 1780 | { | ||
| 1781 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | ||
| 1782 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | ||
| 1783 | return ret; | ||
| 1784 | } | ||
| 1785 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | ||
| 1786 | |||
| 1787 | void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) | ||
| 1778 | { | 1788 | { |
| 1779 | return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 1789 | void *ret = kmalloc_order(size, flags, order); |
| 1790 | trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); | ||
| 1791 | return ret; | ||
| 1780 | } | 1792 | } |
| 1781 | EXPORT_SYMBOL(kmem_cache_alloc_notrace); | 1793 | EXPORT_SYMBOL(kmalloc_order_trace); |
| 1782 | #endif | 1794 | #endif |
| 1783 | 1795 | ||
| 1784 | #ifdef CONFIG_NUMA | 1796 | #ifdef CONFIG_NUMA |
| @@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
| 1794 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 1806 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
| 1795 | 1807 | ||
| 1796 | #ifdef CONFIG_TRACING | 1808 | #ifdef CONFIG_TRACING |
| 1797 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, | 1809 | void *kmem_cache_alloc_node_trace(struct kmem_cache *s, |
| 1798 | gfp_t gfpflags, | 1810 | gfp_t gfpflags, |
| 1799 | int node) | 1811 | int node, size_t size) |
| 1800 | { | 1812 | { |
| 1801 | return slab_alloc(s, gfpflags, node, _RET_IP_); | 1813 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); |
| 1814 | |||
| 1815 | trace_kmalloc_node(_RET_IP_, ret, | ||
| 1816 | size, s->size, gfpflags, node); | ||
| 1817 | return ret; | ||
| 1802 | } | 1818 | } |
| 1803 | EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | 1819 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
| 1804 | #endif | 1820 | #endif |
| 1805 | #endif | 1821 | #endif |
| 1806 | 1822 | ||
| @@ -1917,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x) | |||
| 1917 | } | 1933 | } |
| 1918 | EXPORT_SYMBOL(kmem_cache_free); | 1934 | EXPORT_SYMBOL(kmem_cache_free); |
| 1919 | 1935 | ||
| 1920 | /* Figure out on which slab page the object resides */ | ||
| 1921 | static struct page *get_object_page(const void *x) | ||
| 1922 | { | ||
| 1923 | struct page *page = virt_to_head_page(x); | ||
| 1924 | |||
| 1925 | if (!PageSlab(page)) | ||
| 1926 | return NULL; | ||
| 1927 | |||
| 1928 | return page; | ||
| 1929 | } | ||
| 1930 | |||
| 1931 | /* | 1936 | /* |
| 1932 | * Object placement in a slab is made very easy because we always start at | 1937 | * Object placement in a slab is made very easy because we always start at |
| 1933 | * offset 0. If we tune the size of the object to the alignment then we can | 1938 | * offset 0. If we tune the size of the object to the alignment then we can |
| @@ -2386,35 +2391,6 @@ error: | |||
| 2386 | } | 2391 | } |
| 2387 | 2392 | ||
| 2388 | /* | 2393 | /* |
| 2389 | * Check if a given pointer is valid | ||
| 2390 | */ | ||
| 2391 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) | ||
| 2392 | { | ||
| 2393 | struct page *page; | ||
| 2394 | |||
| 2395 | if (!kern_ptr_validate(object, s->size)) | ||
| 2396 | return 0; | ||
| 2397 | |||
| 2398 | page = get_object_page(object); | ||
| 2399 | |||
| 2400 | if (!page || s != page->slab) | ||
| 2401 | /* No slab or wrong slab */ | ||
| 2402 | return 0; | ||
| 2403 | |||
| 2404 | if (!check_valid_pointer(s, page, object)) | ||
| 2405 | return 0; | ||
| 2406 | |||
| 2407 | /* | ||
| 2408 | * We could also check if the object is on the slabs freelist. | ||
| 2409 | * But this would be too expensive and it seems that the main | ||
| 2410 | * purpose of kmem_ptr_valid() is to check if the object belongs | ||
| 2411 | * to a certain slab. | ||
| 2412 | */ | ||
| 2413 | return 1; | ||
| 2414 | } | ||
| 2415 | EXPORT_SYMBOL(kmem_ptr_validate); | ||
| 2416 | |||
| 2417 | /* | ||
| 2418 | * Determine the size of a slab object | 2394 | * Determine the size of a slab object |
| 2419 | */ | 2395 | */ |
| 2420 | unsigned int kmem_cache_size(struct kmem_cache *s) | 2396 | unsigned int kmem_cache_size(struct kmem_cache *s) |
| @@ -3401,13 +3377,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page, | |||
| 3401 | 3377 | ||
| 3402 | for_each_free_object(p, s, page->freelist) { | 3378 | for_each_free_object(p, s, page->freelist) { |
| 3403 | set_bit(slab_index(p, s, addr), map); | 3379 | set_bit(slab_index(p, s, addr), map); |
| 3404 | if (!check_object(s, page, p, 0)) | 3380 | if (!check_object(s, page, p, SLUB_RED_INACTIVE)) |
| 3405 | return 0; | 3381 | return 0; |
| 3406 | } | 3382 | } |
| 3407 | 3383 | ||
| 3408 | for_each_object(p, s, addr, page->objects) | 3384 | for_each_object(p, s, addr, page->objects) |
| 3409 | if (!test_bit(slab_index(p, s, addr), map)) | 3385 | if (!test_bit(slab_index(p, s, addr), map)) |
| 3410 | if (!check_object(s, page, p, 1)) | 3386 | if (!check_object(s, page, p, SLUB_RED_ACTIVE)) |
| 3411 | return 0; | 3387 | return 0; |
| 3412 | return 1; | 3388 | return 1; |
| 3413 | } | 3389 | } |
| @@ -3660,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
| 3660 | len += sprintf(buf + len, "%7ld ", l->count); | 3636 | len += sprintf(buf + len, "%7ld ", l->count); |
| 3661 | 3637 | ||
| 3662 | if (l->addr) | 3638 | if (l->addr) |
| 3663 | len += sprint_symbol(buf + len, (unsigned long)l->addr); | 3639 | len += sprintf(buf + len, "%pS", (void *)l->addr); |
| 3664 | else | 3640 | else |
| 3665 | len += sprintf(buf + len, "<not-available>"); | 3641 | len += sprintf(buf + len, "<not-available>"); |
| 3666 | 3642 | ||
| @@ -3970,12 +3946,9 @@ SLAB_ATTR(min_partial); | |||
| 3970 | 3946 | ||
| 3971 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) | 3947 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) |
| 3972 | { | 3948 | { |
| 3973 | if (s->ctor) { | 3949 | if (!s->ctor) |
| 3974 | int n = sprint_symbol(buf, (unsigned long)s->ctor); | 3950 | return 0; |
| 3975 | 3951 | return sprintf(buf, "%pS\n", s->ctor); | |
| 3976 | return n + sprintf(buf + n, "\n"); | ||
| 3977 | } | ||
| 3978 | return 0; | ||
| 3979 | } | 3952 | } |
| 3980 | SLAB_ATTR_RO(ctor); | 3953 | SLAB_ATTR_RO(ctor); |
| 3981 | 3954 | ||
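The last two slub.c hunks replace sprint_symbol() with the %pS vsprintf extension, which resolves a kernel text address to symbol+offset directly from the format string and so needs no explicit length bookkeeping. Typical usage, for example:

	pr_info("allocation came from %pS\n", __builtin_return_address(0));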
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 29d6cbffb283..64b984091edb 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
| @@ -9,7 +9,7 @@ | |||
| 9 | * | 9 | * |
| 10 | * However, virtual mappings need a page table and TLBs. Many Linux | 10 | * However, virtual mappings need a page table and TLBs. Many Linux |
| 11 | * architectures already map their physical space using 1-1 mappings | 11 | * architectures already map their physical space using 1-1 mappings |
| 12 | * via TLBs. For those arches the virtual memmory map is essentially | 12 | * via TLBs. For those arches the virtual memory map is essentially |
| 13 | * for free if we use the same page size as the 1-1 mappings. In that | 13 | * for free if we use the same page size as the 1-1 mappings. In that |
| 14 | * case the overhead consists of a few additional pages that are | 14 | * case the overhead consists of a few additional pages that are |
| 15 | * allocated to create a view of memory for vmemmap. | 15 | * allocated to create a view of memory for vmemmap. |
diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219af379..93250207c5cf 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
| 671 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) | 671 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) |
| 672 | { | 672 | { |
| 673 | unsigned long maps_section_nr, removing_section_nr, i; | 673 | unsigned long maps_section_nr, removing_section_nr, i; |
| 674 | int magic; | 674 | unsigned long magic; |
| 675 | 675 | ||
| 676 | for (i = 0; i < nr_pages; i++, page++) { | 676 | for (i = 0; i < nr_pages; i++, page++) { |
| 677 | magic = atomic_read(&page->_mapcount); | 677 | magic = (unsigned long) page->lru.next; |
| 678 | 678 | ||
| 679 | BUG_ON(magic == NODE_INFO); | 679 | BUG_ON(magic == NODE_INFO); |
| 680 | 680 | ||
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page) | |||
| 56 | del_page_from_lru(zone, page); | 56 | del_page_from_lru(zone, page); |
| 57 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 57 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 58 | } | 58 | } |
| 59 | } | ||
| 60 | |||
| 61 | static void __put_single_page(struct page *page) | ||
| 62 | { | ||
| 63 | __page_cache_release(page); | ||
| 59 | free_hot_cold_page(page, 0); | 64 | free_hot_cold_page(page, 0); |
| 60 | } | 65 | } |
| 61 | 66 | ||
| 62 | static void put_compound_page(struct page *page) | 67 | static void __put_compound_page(struct page *page) |
| 63 | { | 68 | { |
| 64 | page = compound_head(page); | 69 | compound_page_dtor *dtor; |
| 65 | if (put_page_testzero(page)) { | ||
| 66 | compound_page_dtor *dtor; | ||
| 67 | 70 | ||
| 68 | dtor = get_compound_page_dtor(page); | 71 | __page_cache_release(page); |
| 69 | (*dtor)(page); | 72 | dtor = get_compound_page_dtor(page); |
| 73 | (*dtor)(page); | ||
| 74 | } | ||
| 75 | |||
| 76 | static void put_compound_page(struct page *page) | ||
| 77 | { | ||
| 78 | if (unlikely(PageTail(page))) { | ||
| 79 | /* __split_huge_page_refcount can run under us */ | ||
| 80 | struct page *page_head = page->first_page; | ||
| 81 | smp_rmb(); | ||
| 82 | /* | ||
| 83 | * If PageTail is still set after smp_rmb() we can be sure | ||
| 84 | * that the page->first_page we read wasn't a dangling pointer. | ||
| 85 | * See __split_huge_page_refcount() smp_wmb(). | ||
| 86 | */ | ||
| 87 | if (likely(PageTail(page) && get_page_unless_zero(page_head))) { | ||
| 88 | unsigned long flags; | ||
| 89 | /* | ||
| 90 | * Verify that our page_head wasn't converted | ||
| 91 | * to a a regular page before we got a | ||
| 92 | * reference on it. | ||
| 93 | */ | ||
| 94 | if (unlikely(!PageHead(page_head))) { | ||
| 95 | /* PageHead is cleared after PageTail */ | ||
| 96 | smp_rmb(); | ||
| 97 | VM_BUG_ON(PageTail(page)); | ||
| 98 | goto out_put_head; | ||
| 99 | } | ||
| 100 | /* | ||
| 101 | * Only run compound_lock on a valid PageHead, | ||
| 102 | * after having it pinned with | ||
| 103 | * get_page_unless_zero() above. | ||
| 104 | */ | ||
| 105 | smp_mb(); | ||
| 106 | /* page_head wasn't a dangling pointer */ | ||
| 107 | flags = compound_lock_irqsave(page_head); | ||
| 108 | if (unlikely(!PageTail(page))) { | ||
| 109 | /* __split_huge_page_refcount run before us */ | ||
| 110 | compound_unlock_irqrestore(page_head, flags); | ||
| 111 | VM_BUG_ON(PageHead(page_head)); | ||
| 112 | out_put_head: | ||
| 113 | if (put_page_testzero(page_head)) | ||
| 114 | __put_single_page(page_head); | ||
| 115 | out_put_single: | ||
| 116 | if (put_page_testzero(page)) | ||
| 117 | __put_single_page(page); | ||
| 118 | return; | ||
| 119 | } | ||
| 120 | VM_BUG_ON(page_head != page->first_page); | ||
| 121 | /* | ||
| 122 | * We can release the refcount taken by | ||
| 123 | * get_page_unless_zero now that | ||
| 124 | * split_huge_page_refcount is blocked on the | ||
| 125 | * compound_lock. | ||
| 126 | */ | ||
| 127 | if (put_page_testzero(page_head)) | ||
| 128 | VM_BUG_ON(1); | ||
| 129 | /* __split_huge_page_refcount will wait now */ | ||
| 130 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | ||
| 131 | atomic_dec(&page->_count); | ||
| 132 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
| 133 | compound_unlock_irqrestore(page_head, flags); | ||
| 134 | if (put_page_testzero(page_head)) { | ||
| 135 | if (PageHead(page_head)) | ||
| 136 | __put_compound_page(page_head); | ||
| 137 | else | ||
| 138 | __put_single_page(page_head); | ||
| 139 | } | ||
| 140 | } else { | ||
| 141 | /* page_head is a dangling pointer */ | ||
| 142 | VM_BUG_ON(PageTail(page)); | ||
| 143 | goto out_put_single; | ||
| 144 | } | ||
| 145 | } else if (put_page_testzero(page)) { | ||
| 146 | if (PageHead(page)) | ||
| 147 | __put_compound_page(page); | ||
| 148 | else | ||
| 149 | __put_single_page(page); | ||
| 70 | } | 150 | } |
| 71 | } | 151 | } |
| 72 | 152 | ||
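
The reworked put_compound_page() above leans on get_page_unless_zero(): it only takes a reference if the refcount is still positive, so it is safe to try on a page->first_page pointer that may already be stale, provided the caller re-validates afterwards (here, by re-checking PageTail/PageHead under the compound lock). A minimal illustration of that pin-then-revalidate discipline (hypothetical helper, kernel context assumed, not the full race handling shown above):

    /*
     * Try to pin a page found by a lockless lookup.  Returns the page with
     * one extra reference held, or NULL if it was already being freed.
     * Callers must still re-check that the page is the one they expected.
     */
    static struct page *try_pin_page(struct page *page)
    {
            return get_page_unless_zero(page) ? page : NULL;
    }
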
| @@ -75,7 +155,7 @@ void put_page(struct page *page) | |||
| 75 | if (unlikely(PageCompound(page))) | 155 | if (unlikely(PageCompound(page))) |
| 76 | put_compound_page(page); | 156 | put_compound_page(page); |
| 77 | else if (put_page_testzero(page)) | 157 | else if (put_page_testzero(page)) |
| 78 | __page_cache_release(page); | 158 | __put_single_page(page); |
| 79 | } | 159 | } |
| 80 | EXPORT_SYMBOL(put_page); | 160 | EXPORT_SYMBOL(put_page); |
| 81 | 161 | ||
| @@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages) | |||
| 98 | } | 178 | } |
| 99 | EXPORT_SYMBOL(put_pages_list); | 179 | EXPORT_SYMBOL(put_pages_list); |
| 100 | 180 | ||
| 101 | /* | 181 | static void pagevec_lru_move_fn(struct pagevec *pvec, |
| 102 | * pagevec_move_tail() must be called with IRQ disabled. | 182 | void (*move_fn)(struct page *page, void *arg), |
| 103 | * Otherwise this may cause nasty races. | 183 | void *arg) |
| 104 | */ | ||
| 105 | static void pagevec_move_tail(struct pagevec *pvec) | ||
| 106 | { | 184 | { |
| 107 | int i; | 185 | int i; |
| 108 | int pgmoved = 0; | ||
| 109 | struct zone *zone = NULL; | 186 | struct zone *zone = NULL; |
| 187 | unsigned long flags = 0; | ||
| 110 | 188 | ||
| 111 | for (i = 0; i < pagevec_count(pvec); i++) { | 189 | for (i = 0; i < pagevec_count(pvec); i++) { |
| 112 | struct page *page = pvec->pages[i]; | 190 | struct page *page = pvec->pages[i]; |
| @@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
| 114 | 192 | ||
| 115 | if (pagezone != zone) { | 193 | if (pagezone != zone) { |
| 116 | if (zone) | 194 | if (zone) |
| 117 | spin_unlock(&zone->lru_lock); | 195 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 118 | zone = pagezone; | 196 | zone = pagezone; |
| 119 | spin_lock(&zone->lru_lock); | 197 | spin_lock_irqsave(&zone->lru_lock, flags); |
| 120 | } | ||
| 121 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | ||
| 122 | int lru = page_lru_base_type(page); | ||
| 123 | list_move_tail(&page->lru, &zone->lru[lru].list); | ||
| 124 | pgmoved++; | ||
| 125 | } | 198 | } |
| 199 | |||
| 200 | (*move_fn)(page, arg); | ||
| 126 | } | 201 | } |
| 127 | if (zone) | 202 | if (zone) |
| 128 | spin_unlock(&zone->lru_lock); | 203 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 129 | __count_vm_events(PGROTATED, pgmoved); | 204 | release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); |
| 130 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
| 131 | pagevec_reinit(pvec); | 205 | pagevec_reinit(pvec); |
| 132 | } | 206 | } |
| 133 | 207 | ||
| 208 | static void pagevec_move_tail_fn(struct page *page, void *arg) | ||
| 209 | { | ||
| 210 | int *pgmoved = arg; | ||
| 211 | struct zone *zone = page_zone(page); | ||
| 212 | |||
| 213 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | ||
| 214 | int lru = page_lru_base_type(page); | ||
| 215 | list_move_tail(&page->lru, &zone->lru[lru].list); | ||
| 216 | (*pgmoved)++; | ||
| 217 | } | ||
| 218 | } | ||
| 219 | |||
| 220 | /* | ||
| 221 | * pagevec_move_tail() must be called with IRQ disabled. | ||
| 222 | * Otherwise this may cause nasty races. | ||
| 223 | */ | ||
| 224 | static void pagevec_move_tail(struct pagevec *pvec) | ||
| 225 | { | ||
| 226 | int pgmoved = 0; | ||
| 227 | |||
| 228 | pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); | ||
| 229 | __count_vm_events(PGROTATED, pgmoved); | ||
| 230 | } | ||
| 231 | |||
| 134 | /* | 232 | /* |
| 135 | * Writeback is about to end against a page which has been marked for immediate | 233 | * Writeback is about to end against a page which has been marked for immediate |
| 136 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 234 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
| 137 | * inactive list. | 235 | * inactive list. |
| 138 | */ | 236 | */ |
| 139 | void rotate_reclaimable_page(struct page *page) | 237 | void rotate_reclaimable_page(struct page *page) |
| 140 | { | 238 | { |
| 141 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && | 239 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && |
| 142 | !PageUnevictable(page) && PageLRU(page)) { | 240 | !PageUnevictable(page) && PageLRU(page)) { |
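
pagevec_lru_move_fn() introduced above factors out the locking pattern that pagevec_move_tail(), the activate_page batches and (further down) ____pagevec_lru_add() all share: walk the vector, re-take zone->lru_lock with IRQs disabled only when the zone changes, and run a per-page callback. The same batching shape, reduced to a freestanding userspace sketch (hypothetical item/bucket types, pthreads standing in for the zone lock):

    #include <stddef.h>
    #include <pthread.h>

    struct bucket { pthread_mutex_t lock; };
    struct item   { struct bucket *bucket; };

    /*
     * Apply fn() to every item, holding each bucket lock only while items
     * from that bucket are processed -- the analogue of the
     * "if (pagezone != zone)" test in pagevec_lru_move_fn().
     */
    static void batched_apply(struct item **items, size_t n,
                              void (*fn)(struct item *, void *), void *arg)
    {
            struct bucket *locked = NULL;
            size_t i;

            for (i = 0; i < n; i++) {
                    struct bucket *b = items[i]->bucket;

                    if (b != locked) {
                            if (locked)
                                    pthread_mutex_unlock(&locked->lock);
                            locked = b;
                            pthread_mutex_lock(&locked->lock);
                    }
                    fn(items[i], arg);
            }
            if (locked)
                    pthread_mutex_unlock(&locked->lock);
    }
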
| @@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page, | |||
| 173 | } | 271 | } |
| 174 | 272 | ||
| 175 | /* | 273 | /* |
| 176 | * FIXME: speed this up? | 274 | * A page goes to the active list either via activate_page or putback_lru_page. |
| 275 | * In the activate_page case, the page does not have the active bit set. The page might | ||
| 276 | * not be in the LRU list because it was isolated before it could be moved to the | ||
| 277 | * active list. The window is small because a pagevec only holds a few pages. | ||
| 278 | * In that case we do nothing for the page. | ||
| 279 | * In the putback_lru_page case, the page isn't on the LRU list but has the active | ||
| 280 | * bit set. | ||
| 177 | */ | 281 | */ |
| 178 | void activate_page(struct page *page) | 282 | static void __activate_page(struct page *page, void *arg) |
| 179 | { | 283 | { |
| 180 | struct zone *zone = page_zone(page); | 284 | struct zone *zone = page_zone(page); |
| 285 | int file = page_is_file_cache(page); | ||
| 286 | int lru = page_lru_base_type(page); | ||
| 287 | bool putback = !PageLRU(page); | ||
| 181 | 288 | ||
| 182 | spin_lock_irq(&zone->lru_lock); | 289 | /* The page is isolated before it's moved to active list */ |
| 183 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 290 | if (!PageLRU(page) && !PageActive(page)) |
| 184 | int file = page_is_file_cache(page); | 291 | return; |
| 185 | int lru = page_lru_base_type(page); | 292 | if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page)) |
| 293 | return; | ||
| 294 | |||
| 295 | if (!putback) | ||
| 186 | del_page_from_lru_list(zone, page, lru); | 296 | del_page_from_lru_list(zone, page, lru); |
| 297 | else | ||
| 298 | SetPageLRU(page); | ||
| 187 | 299 | ||
| 188 | SetPageActive(page); | 300 | SetPageActive(page); |
| 189 | lru += LRU_ACTIVE; | 301 | lru += LRU_ACTIVE; |
| 190 | add_page_to_lru_list(zone, page, lru); | 302 | add_page_to_lru_list(zone, page, lru); |
| 191 | __count_vm_event(PGACTIVATE); | ||
| 192 | 303 | ||
| 193 | update_page_reclaim_stat(zone, page, file, 1); | 304 | if (putback) |
| 305 | return; | ||
| 306 | __count_vm_event(PGACTIVATE); | ||
| 307 | update_page_reclaim_stat(zone, page, file, 1); | ||
| 308 | } | ||
| 309 | |||
| 310 | #ifdef CONFIG_SMP | ||
| 311 | static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); | ||
| 312 | |||
| 313 | static void activate_page_drain(int cpu) | ||
| 314 | { | ||
| 315 | struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); | ||
| 316 | |||
| 317 | if (pagevec_count(pvec)) | ||
| 318 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | ||
| 319 | } | ||
| 320 | |||
| 321 | void activate_page(struct page *page) | ||
| 322 | { | ||
| 323 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | ||
| 324 | struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); | ||
| 325 | |||
| 326 | page_cache_get(page); | ||
| 327 | if (!pagevec_add(pvec, page)) | ||
| 328 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | ||
| 329 | put_cpu_var(activate_page_pvecs); | ||
| 330 | } | ||
| 331 | } | ||
| 332 | |||
| 333 | /* Caller should hold zone->lru_lock */ | ||
| 334 | int putback_active_lru_page(struct zone *zone, struct page *page) | ||
| 335 | { | ||
| 336 | struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); | ||
| 337 | |||
| 338 | if (!pagevec_add(pvec, page)) { | ||
| 339 | spin_unlock_irq(&zone->lru_lock); | ||
| 340 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | ||
| 341 | spin_lock_irq(&zone->lru_lock); | ||
| 194 | } | 342 | } |
| 343 | put_cpu_var(activate_page_pvecs); | ||
| 344 | return 1; | ||
| 345 | } | ||
| 346 | |||
| 347 | #else | ||
| 348 | static inline void activate_page_drain(int cpu) | ||
| 349 | { | ||
| 350 | } | ||
| 351 | |||
| 352 | void activate_page(struct page *page) | ||
| 353 | { | ||
| 354 | struct zone *zone = page_zone(page); | ||
| 355 | |||
| 356 | spin_lock_irq(&zone->lru_lock); | ||
| 357 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) | ||
| 358 | __activate_page(page, NULL); | ||
| 195 | spin_unlock_irq(&zone->lru_lock); | 359 | spin_unlock_irq(&zone->lru_lock); |
| 196 | } | 360 | } |
| 361 | #endif | ||
| 197 | 362 | ||
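
With CONFIG_SMP, activate_page() above batches pages into a per-CPU pagevec and only touches zone->lru_lock when the vector fills or drain_cpu_pagevecs() runs. The per-CPU access itself is the standard get_cpu_var()/put_cpu_var() pair, which disables preemption around the use of this CPU's instance. A tiny sketch of just that pattern (hypothetical counter, kernel context assumed):

    static DEFINE_PER_CPU(unsigned long, activate_batch_hits);

    static void note_batch_hit(void)
    {
            unsigned long *hits = &get_cpu_var(activate_batch_hits);

            (*hits)++;                       /* safe: preemption is off */
            put_cpu_var(activate_batch_hits);
    }
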
| 198 | /* | 363 | /* |
| 199 | * Mark a page as having seen activity. | 364 | * Mark a page as having seen activity. |
| @@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu) | |||
| 292 | pagevec_move_tail(pvec); | 457 | pagevec_move_tail(pvec); |
| 293 | local_irq_restore(flags); | 458 | local_irq_restore(flags); |
| 294 | } | 459 | } |
| 460 | activate_page_drain(cpu); | ||
| 295 | } | 461 | } |
| 296 | 462 | ||
| 297 | void lru_add_drain(void) | 463 | void lru_add_drain(void) |
| @@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec) | |||
| 399 | 565 | ||
| 400 | EXPORT_SYMBOL(__pagevec_release); | 566 | EXPORT_SYMBOL(__pagevec_release); |
| 401 | 567 | ||
| 568 | /* used by __split_huge_page_refcount() */ | ||
| 569 | void lru_add_page_tail(struct zone* zone, | ||
| 570 | struct page *page, struct page *page_tail) | ||
| 571 | { | ||
| 572 | int active; | ||
| 573 | enum lru_list lru; | ||
| 574 | const int file = 0; | ||
| 575 | struct list_head *head; | ||
| 576 | |||
| 577 | VM_BUG_ON(!PageHead(page)); | ||
| 578 | VM_BUG_ON(PageCompound(page_tail)); | ||
| 579 | VM_BUG_ON(PageLRU(page_tail)); | ||
| 580 | VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); | ||
| 581 | |||
| 582 | SetPageLRU(page_tail); | ||
| 583 | |||
| 584 | if (page_evictable(page_tail, NULL)) { | ||
| 585 | if (PageActive(page)) { | ||
| 586 | SetPageActive(page_tail); | ||
| 587 | active = 1; | ||
| 588 | lru = LRU_ACTIVE_ANON; | ||
| 589 | } else { | ||
| 590 | active = 0; | ||
| 591 | lru = LRU_INACTIVE_ANON; | ||
| 592 | } | ||
| 593 | update_page_reclaim_stat(zone, page_tail, file, active); | ||
| 594 | if (likely(PageLRU(page))) | ||
| 595 | head = page->lru.prev; | ||
| 596 | else | ||
| 597 | head = &zone->lru[lru].list; | ||
| 598 | __add_page_to_lru_list(zone, page_tail, lru, head); | ||
| 599 | } else { | ||
| 600 | SetPageUnevictable(page_tail); | ||
| 601 | add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); | ||
| 602 | } | ||
| 603 | } | ||
| 604 | |||
| 605 | static void ____pagevec_lru_add_fn(struct page *page, void *arg) | ||
| 606 | { | ||
| 607 | enum lru_list lru = (enum lru_list)arg; | ||
| 608 | struct zone *zone = page_zone(page); | ||
| 609 | int file = is_file_lru(lru); | ||
| 610 | int active = is_active_lru(lru); | ||
| 611 | |||
| 612 | VM_BUG_ON(PageActive(page)); | ||
| 613 | VM_BUG_ON(PageUnevictable(page)); | ||
| 614 | VM_BUG_ON(PageLRU(page)); | ||
| 615 | |||
| 616 | SetPageLRU(page); | ||
| 617 | if (active) | ||
| 618 | SetPageActive(page); | ||
| 619 | update_page_reclaim_stat(zone, page, file, active); | ||
| 620 | add_page_to_lru_list(zone, page, lru); | ||
| 621 | } | ||
| 622 | |||
| 402 | /* | 623 | /* |
| 403 | * Add the passed pages to the LRU, then drop the caller's refcount | 624 | * Add the passed pages to the LRU, then drop the caller's refcount |
| 404 | * on them. Reinitialises the caller's pagevec. | 625 | * on them. Reinitialises the caller's pagevec. |
| 405 | */ | 626 | */ |
| 406 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | 627 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) |
| 407 | { | 628 | { |
| 408 | int i; | ||
| 409 | struct zone *zone = NULL; | ||
| 410 | |||
| 411 | VM_BUG_ON(is_unevictable_lru(lru)); | 629 | VM_BUG_ON(is_unevictable_lru(lru)); |
| 412 | 630 | ||
| 413 | for (i = 0; i < pagevec_count(pvec); i++) { | 631 | pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); |
| 414 | struct page *page = pvec->pages[i]; | ||
| 415 | struct zone *pagezone = page_zone(page); | ||
| 416 | int file; | ||
| 417 | int active; | ||
| 418 | |||
| 419 | if (pagezone != zone) { | ||
| 420 | if (zone) | ||
| 421 | spin_unlock_irq(&zone->lru_lock); | ||
| 422 | zone = pagezone; | ||
| 423 | spin_lock_irq(&zone->lru_lock); | ||
| 424 | } | ||
| 425 | VM_BUG_ON(PageActive(page)); | ||
| 426 | VM_BUG_ON(PageUnevictable(page)); | ||
| 427 | VM_BUG_ON(PageLRU(page)); | ||
| 428 | SetPageLRU(page); | ||
| 429 | active = is_active_lru(lru); | ||
| 430 | file = is_file_lru(lru); | ||
| 431 | if (active) | ||
| 432 | SetPageActive(page); | ||
| 433 | update_page_reclaim_stat(zone, page, file, active); | ||
| 434 | add_page_to_lru_list(zone, page, lru); | ||
| 435 | } | ||
| 436 | if (zone) | ||
| 437 | spin_unlock_irq(&zone->lru_lock); | ||
| 438 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
| 439 | pagevec_reinit(pvec); | ||
| 440 | } | 632 | } |
| 441 | 633 | ||
| 442 | EXPORT_SYMBOL(____pagevec_lru_add); | 634 | EXPORT_SYMBOL(____pagevec_lru_add); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index e10f5833167f..5c8cfabbc9bc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -157,6 +157,12 @@ int add_to_swap(struct page *page) | |||
| 157 | if (!entry.val) | 157 | if (!entry.val) |
| 158 | return 0; | 158 | return 0; |
| 159 | 159 | ||
| 160 | if (unlikely(PageTransHuge(page))) | ||
| 161 | if (unlikely(split_huge_page(page))) { | ||
| 162 | swapcache_free(entry, NULL); | ||
| 163 | return 0; | ||
| 164 | } | ||
| 165 | |||
| 160 | /* | 166 | /* |
| 161 | * Radix-tree node allocations from PF_MEMALLOC contexts could | 167 | * Radix-tree node allocations from PF_MEMALLOC contexts could |
| 162 | * completely exhaust the page allocator. __GFP_NOMEMALLOC | 168 | * completely exhaust the page allocator. __GFP_NOMEMALLOC |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 67ddaaf98c74..07a458d72fa8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
| 964 | pmd = pmd_offset(pud, addr); | 964 | pmd = pmd_offset(pud, addr); |
| 965 | do { | 965 | do { |
| 966 | next = pmd_addr_end(addr, end); | 966 | next = pmd_addr_end(addr, end); |
| 967 | if (unlikely(pmd_trans_huge(*pmd))) | ||
| 968 | continue; | ||
| 967 | if (pmd_none_or_clear_bad(pmd)) | 969 | if (pmd_none_or_clear_bad(pmd)) |
| 968 | continue; | 970 | continue; |
| 969 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 971 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
| @@ -1677,7 +1679,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1677 | if (S_ISBLK(inode->i_mode)) { | 1679 | if (S_ISBLK(inode->i_mode)) { |
| 1678 | struct block_device *bdev = I_BDEV(inode); | 1680 | struct block_device *bdev = I_BDEV(inode); |
| 1679 | set_blocksize(bdev, p->old_block_size); | 1681 | set_blocksize(bdev, p->old_block_size); |
| 1680 | bd_release(bdev); | 1682 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
| 1681 | } else { | 1683 | } else { |
| 1682 | mutex_lock(&inode->i_mutex); | 1684 | mutex_lock(&inode->i_mutex); |
| 1683 | inode->i_flags &= ~S_SWAPFILE; | 1685 | inode->i_flags &= ~S_SWAPFILE; |
| @@ -1939,7 +1941,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1939 | error = -EINVAL; | 1941 | error = -EINVAL; |
| 1940 | if (S_ISBLK(inode->i_mode)) { | 1942 | if (S_ISBLK(inode->i_mode)) { |
| 1941 | bdev = I_BDEV(inode); | 1943 | bdev = I_BDEV(inode); |
| 1942 | error = bd_claim(bdev, sys_swapon); | 1944 | error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, |
| 1945 | sys_swapon); | ||
| 1943 | if (error < 0) { | 1946 | if (error < 0) { |
| 1944 | bdev = NULL; | 1947 | bdev = NULL; |
| 1945 | error = -EINVAL; | 1948 | error = -EINVAL; |
| @@ -2136,7 +2139,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 2136 | bad_swap: | 2139 | bad_swap: |
| 2137 | if (bdev) { | 2140 | if (bdev) { |
| 2138 | set_blocksize(bdev, p->old_block_size); | 2141 | set_blocksize(bdev, p->old_block_size); |
| 2139 | bd_release(bdev); | 2142 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
| 2140 | } | 2143 | } |
| 2141 | destroy_swap_extents(p); | 2144 | destroy_swap_extents(p); |
| 2142 | swap_cgroup_swapoff(type); | 2145 | swap_cgroup_swapoff(type); |
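
The swapon/swapoff hunks above are part of the move from bd_claim()/bd_release() to exclusive block-device opens: FMODE_EXCL plus a holder cookie passed to blkdev_get(), with the matching blkdev_put() repeating FMODE_EXCL. A hedged sketch of that pairing as used here (helper names are made up; the holder can be any unique pointer, sys_swapon in this file):

    /* Claim a block device exclusively for swap, and release it again. */
    static int claim_swap_bdev(struct block_device *bdev, void *holder)
    {
            return blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
                              holder);
    }

    static void release_swap_bdev(struct block_device *bdev)
    {
            blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
    }
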
diff --git a/mm/truncate.c b/mm/truncate.c index ba887bff48c5..3c2d5ddfa0d4 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
| 390 | __remove_from_page_cache(page); | 390 | __remove_from_page_cache(page); |
| 391 | spin_unlock_irq(&mapping->tree_lock); | 391 | spin_unlock_irq(&mapping->tree_lock); |
| 392 | mem_cgroup_uncharge_cache_page(page); | 392 | mem_cgroup_uncharge_cache_page(page); |
| 393 | |||
| 394 | if (mapping->a_ops->freepage) | ||
| 395 | mapping->a_ops->freepage(page); | ||
| 396 | |||
| 393 | page_cache_release(page); /* pagecache ref */ | 397 | page_cache_release(page); /* pagecache ref */ |
| 394 | return 1; | 398 | return 1; |
| 395 | failed: | 399 | failed: |
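
Both this hunk and the __remove_mapping() change in mm/vmscan.c below call a new ->freepage() address_space operation once the page is gone from the radix tree (and, in the vmscan case, after tree_lock has been dropped). A hedged sketch of how a filesystem might hook it (the filesystem name and callback body are hypothetical; the .freepage field is the one this series adds):

    static void examplefs_freepage(struct page *page)
    {
            /*
             * The page is no longer visible in the page cache when this
             * runs; drop any per-page bookkeeping the filesystem kept.
             */
    }

    static const struct address_space_operations examplefs_aops = {
            /* ... the usual readpage/writepage operations go here ... */
            .freepage = examplefs_freepage,
    };
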
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -186,27 +186,6 @@ void kzfree(const void *p) | |||
| 186 | } | 186 | } |
| 187 | EXPORT_SYMBOL(kzfree); | 187 | EXPORT_SYMBOL(kzfree); |
| 188 | 188 | ||
| 189 | int kern_ptr_validate(const void *ptr, unsigned long size) | ||
| 190 | { | ||
| 191 | unsigned long addr = (unsigned long)ptr; | ||
| 192 | unsigned long min_addr = PAGE_OFFSET; | ||
| 193 | unsigned long align_mask = sizeof(void *) - 1; | ||
| 194 | |||
| 195 | if (unlikely(addr < min_addr)) | ||
| 196 | goto out; | ||
| 197 | if (unlikely(addr > (unsigned long)high_memory - size)) | ||
| 198 | goto out; | ||
| 199 | if (unlikely(addr & align_mask)) | ||
| 200 | goto out; | ||
| 201 | if (unlikely(!kern_addr_valid(addr))) | ||
| 202 | goto out; | ||
| 203 | if (unlikely(!kern_addr_valid(addr + size - 1))) | ||
| 204 | goto out; | ||
| 205 | return 1; | ||
| 206 | out: | ||
| 207 | return 0; | ||
| 208 | } | ||
| 209 | |||
| 210 | /* | 189 | /* |
| 211 | * strndup_user - duplicate an existing string from user space | 190 | * strndup_user - duplicate an existing string from user space |
| 212 | * @s: The string to duplicate | 191 | * @s: The string to duplicate |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3d66b3dc5cb..f9b166732e70 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -31,8 +31,6 @@ | |||
| 31 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
| 32 | #include <asm/shmparam.h> | 32 | #include <asm/shmparam.h> |
| 33 | 33 | ||
| 34 | bool vmap_lazy_unmap __read_mostly = true; | ||
| 35 | |||
| 36 | /*** Page table manipulation functions ***/ | 34 | /*** Page table manipulation functions ***/ |
| 37 | 35 | ||
| 38 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 36 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
| @@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void) | |||
| 503 | { | 501 | { |
| 504 | unsigned int log; | 502 | unsigned int log; |
| 505 | 503 | ||
| 506 | if (!vmap_lazy_unmap) | ||
| 507 | return 0; | ||
| 508 | |||
| 509 | log = fls(num_online_cpus()); | 504 | log = fls(num_online_cpus()); |
| 510 | 505 | ||
| 511 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); | 506 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); |
| @@ -566,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
| 566 | if (va->va_end > *end) | 561 | if (va->va_end > *end) |
| 567 | *end = va->va_end; | 562 | *end = va->va_end; |
| 568 | nr += (va->va_end - va->va_start) >> PAGE_SHIFT; | 563 | nr += (va->va_end - va->va_start) >> PAGE_SHIFT; |
| 569 | unmap_vmap_area(va); | ||
| 570 | list_add_tail(&va->purge_list, &valist); | 564 | list_add_tail(&va->purge_list, &valist); |
| 571 | va->flags |= VM_LAZY_FREEING; | 565 | va->flags |= VM_LAZY_FREEING; |
| 572 | va->flags &= ~VM_LAZY_FREE; | 566 | va->flags &= ~VM_LAZY_FREE; |
| @@ -611,10 +605,11 @@ static void purge_vmap_area_lazy(void) | |||
| 611 | } | 605 | } |
| 612 | 606 | ||
| 613 | /* | 607 | /* |
| 614 | * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been | 608 | * Free a vmap area, caller ensuring that the area has been unmapped |
| 615 | * called for the correct range previously. | 609 | * and flush_cache_vunmap had been called for the correct range |
| 610 | * previously. | ||
| 616 | */ | 611 | */ |
| 617 | static void free_unmap_vmap_area_noflush(struct vmap_area *va) | 612 | static void free_vmap_area_noflush(struct vmap_area *va) |
| 618 | { | 613 | { |
| 619 | va->flags |= VM_LAZY_FREE; | 614 | va->flags |= VM_LAZY_FREE; |
| 620 | atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); | 615 | atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); |
| @@ -623,6 +618,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va) | |||
| 623 | } | 618 | } |
| 624 | 619 | ||
| 625 | /* | 620 | /* |
| 621 | * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been | ||
| 622 | * called for the correct range previously. | ||
| 623 | */ | ||
| 624 | static void free_unmap_vmap_area_noflush(struct vmap_area *va) | ||
| 625 | { | ||
| 626 | unmap_vmap_area(va); | ||
| 627 | free_vmap_area_noflush(va); | ||
| 628 | } | ||
| 629 | |||
| 630 | /* | ||
| 626 | * Free and unmap a vmap area | 631 | * Free and unmap a vmap area |
| 627 | */ | 632 | */ |
| 628 | static void free_unmap_vmap_area(struct vmap_area *va) | 633 | static void free_unmap_vmap_area(struct vmap_area *va) |
| @@ -743,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
| 743 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, | 748 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, |
| 744 | VMALLOC_START, VMALLOC_END, | 749 | VMALLOC_START, VMALLOC_END, |
| 745 | node, gfp_mask); | 750 | node, gfp_mask); |
| 746 | if (unlikely(IS_ERR(va))) { | 751 | if (IS_ERR(va)) { |
| 747 | kfree(vb); | 752 | kfree(vb); |
| 748 | return ERR_CAST(va); | 753 | return ERR_CAST(va); |
| 749 | } | 754 | } |
| @@ -798,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb) | |||
| 798 | spin_unlock(&vmap_block_tree_lock); | 803 | spin_unlock(&vmap_block_tree_lock); |
| 799 | BUG_ON(tmp != vb); | 804 | BUG_ON(tmp != vb); |
| 800 | 805 | ||
| 801 | free_unmap_vmap_area_noflush(vb->va); | 806 | free_vmap_area_noflush(vb->va); |
| 802 | call_rcu(&vb->rcu_head, rcu_free_vb); | 807 | call_rcu(&vb->rcu_head, rcu_free_vb); |
| 803 | } | 808 | } |
| 804 | 809 | ||
| @@ -936,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size) | |||
| 936 | rcu_read_unlock(); | 941 | rcu_read_unlock(); |
| 937 | BUG_ON(!vb); | 942 | BUG_ON(!vb); |
| 938 | 943 | ||
| 944 | vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); | ||
| 945 | |||
| 939 | spin_lock(&vb->lock); | 946 | spin_lock(&vb->lock); |
| 940 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); | 947 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); |
| 941 | 948 | ||
| @@ -988,7 +995,6 @@ void vm_unmap_aliases(void) | |||
| 988 | 995 | ||
| 989 | s = vb->va->va_start + (i << PAGE_SHIFT); | 996 | s = vb->va->va_start + (i << PAGE_SHIFT); |
| 990 | e = vb->va->va_start + (j << PAGE_SHIFT); | 997 | e = vb->va->va_start + (j << PAGE_SHIFT); |
| 991 | vunmap_page_range(s, e); | ||
| 992 | flush = 1; | 998 | flush = 1; |
| 993 | 999 | ||
| 994 | if (s < start) | 1000 | if (s < start) |
| @@ -1169,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) | |||
| 1169 | { | 1175 | { |
| 1170 | vunmap_page_range(addr, addr + size); | 1176 | vunmap_page_range(addr, addr + size); |
| 1171 | } | 1177 | } |
| 1178 | EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); | ||
| 1172 | 1179 | ||
| 1173 | /** | 1180 | /** |
| 1174 | * unmap_kernel_range - unmap kernel VM area and flush cache and TLB | 1181 | * unmap_kernel_range - unmap kernel VM area and flush cache and TLB |
| @@ -1309,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | |||
| 1309 | -1, GFP_KERNEL, caller); | 1316 | -1, GFP_KERNEL, caller); |
| 1310 | } | 1317 | } |
| 1311 | 1318 | ||
| 1312 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | ||
| 1313 | int node, gfp_t gfp_mask) | ||
| 1314 | { | ||
| 1315 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | ||
| 1316 | node, gfp_mask, __builtin_return_address(0)); | ||
| 1317 | } | ||
| 1318 | |||
| 1319 | static struct vm_struct *find_vm_area(const void *addr) | 1319 | static struct vm_struct *find_vm_area(const void *addr) |
| 1320 | { | 1320 | { |
| 1321 | struct vmap_area *va; | 1321 | struct vmap_area *va; |
| @@ -1531,25 +1531,12 @@ fail: | |||
| 1531 | return NULL; | 1531 | return NULL; |
| 1532 | } | 1532 | } |
| 1533 | 1533 | ||
| 1534 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | ||
| 1535 | { | ||
| 1536 | void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, | ||
| 1537 | __builtin_return_address(0)); | ||
| 1538 | |||
| 1539 | /* | ||
| 1540 | * A ref_count = 3 is needed because the vm_struct and vmap_area | ||
| 1541 | * structures allocated in the __get_vm_area_node() function contain | ||
| 1542 | * references to the virtual address of the vmalloc'ed block. | ||
| 1543 | */ | ||
| 1544 | kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); | ||
| 1545 | |||
| 1546 | return addr; | ||
| 1547 | } | ||
| 1548 | |||
| 1549 | /** | 1534 | /** |
| 1550 | * __vmalloc_node - allocate virtually contiguous memory | 1535 | * __vmalloc_node_range - allocate virtually contiguous memory |
| 1551 | * @size: allocation size | 1536 | * @size: allocation size |
| 1552 | * @align: desired alignment | 1537 | * @align: desired alignment |
| 1538 | * @start: vm area range start | ||
| 1539 | * @end: vm area range end | ||
| 1553 | * @gfp_mask: flags for the page level allocator | 1540 | * @gfp_mask: flags for the page level allocator |
| 1554 | * @prot: protection mask for the allocated pages | 1541 | * @prot: protection mask for the allocated pages |
| 1555 | * @node: node to use for allocation or -1 | 1542 | * @node: node to use for allocation or -1 |
| @@ -1559,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
| 1559 | * allocator with @gfp_mask flags. Map them into contiguous | 1546 | * allocator with @gfp_mask flags. Map them into contiguous |
| 1560 | * kernel virtual space, using a pagetable protection of @prot. | 1547 | * kernel virtual space, using a pagetable protection of @prot. |
| 1561 | */ | 1548 | */ |
| 1562 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1549 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
| 1563 | gfp_t gfp_mask, pgprot_t prot, | 1550 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
| 1564 | int node, void *caller) | 1551 | pgprot_t prot, int node, void *caller) |
| 1565 | { | 1552 | { |
| 1566 | struct vm_struct *area; | 1553 | struct vm_struct *area; |
| 1567 | void *addr; | 1554 | void *addr; |
| @@ -1571,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
| 1571 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1558 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
| 1572 | return NULL; | 1559 | return NULL; |
| 1573 | 1560 | ||
| 1574 | area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, | 1561 | area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, |
| 1575 | VMALLOC_END, node, gfp_mask, caller); | 1562 | gfp_mask, caller); |
| 1576 | 1563 | ||
| 1577 | if (!area) | 1564 | if (!area) |
| 1578 | return NULL; | 1565 | return NULL; |
| @@ -1589,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
| 1589 | return addr; | 1576 | return addr; |
| 1590 | } | 1577 | } |
| 1591 | 1578 | ||
| 1579 | /** | ||
| 1580 | * __vmalloc_node - allocate virtually contiguous memory | ||
| 1581 | * @size: allocation size | ||
| 1582 | * @align: desired alignment | ||
| 1583 | * @gfp_mask: flags for the page level allocator | ||
| 1584 | * @prot: protection mask for the allocated pages | ||
| 1585 | * @node: node to use for allocation or -1 | ||
| 1586 | * @caller: caller's return address | ||
| 1587 | * | ||
| 1588 | * Allocate enough pages to cover @size from the page level | ||
| 1589 | * allocator with @gfp_mask flags. Map them into contiguous | ||
| 1590 | * kernel virtual space, using a pagetable protection of @prot. | ||
| 1591 | */ | ||
| 1592 | static void *__vmalloc_node(unsigned long size, unsigned long align, | ||
| 1593 | gfp_t gfp_mask, pgprot_t prot, | ||
| 1594 | int node, void *caller) | ||
| 1595 | { | ||
| 1596 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, | ||
| 1597 | gfp_mask, prot, node, caller); | ||
| 1598 | } | ||
| 1599 | |||
| 1592 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1600 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
| 1593 | { | 1601 | { |
| 1594 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, | 1602 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, |
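
Factoring __vmalloc_node() into __vmalloc_node_range() lets callers restrict the mapping to a sub-range of the vmalloc address space instead of the full VMALLOC_START..VMALLOC_END window. A hedged sketch of the kind of caller this enables (an architecture's module loader is the usual example; MODULES_VADDR/MODULES_END and the exact GFP/protection flags here are illustrative, not taken from this diff):

    void *module_alloc(unsigned long size)
    {
            if (!size)
                    return NULL;
            return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
                                        GFP_KERNEL | __GFP_HIGHMEM,
                                        PAGE_KERNEL_EXEC, -1,
                                        __builtin_return_address(0));
    }
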
| @@ -2197,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, | |||
| 2197 | * @sizes: array containing size of each area | 2205 | * @sizes: array containing size of each area |
| 2198 | * @nr_vms: the number of areas to allocate | 2206 | * @nr_vms: the number of areas to allocate |
| 2199 | * @align: alignment, all entries in @offsets and @sizes must be aligned to this | 2207 | * @align: alignment, all entries in @offsets and @sizes must be aligned to this |
| 2200 | * @gfp_mask: allocation mask | ||
| 2201 | * | 2208 | * |
| 2202 | * Returns: kmalloc'd vm_struct pointer array pointing to allocated | 2209 | * Returns: kmalloc'd vm_struct pointer array pointing to allocated |
| 2203 | * vm_structs on success, %NULL on failure | 2210 | * vm_structs on success, %NULL on failure |
| 2204 | * | 2211 | * |
| 2205 | * Percpu allocator wants to use congruent vm areas so that it can | 2212 | * Percpu allocator wants to use congruent vm areas so that it can |
| 2206 | * maintain the offsets among percpu areas. This function allocates | 2213 | * maintain the offsets among percpu areas. This function allocates |
| 2207 | * congruent vmalloc areas for it. These areas tend to be scattered | 2214 | * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to |
| 2208 | * pretty far, distance between two areas easily going up to | 2215 | * be scattered pretty far, distance between two areas easily going up |
| 2209 | * gigabytes. To avoid interacting with regular vmallocs, these areas | 2216 | * to gigabytes. To avoid interacting with regular vmallocs, these |
| 2210 | * are allocated from top. | 2217 | * areas are allocated from top. |
| 2211 | * | 2218 | * |
| 2212 | * Despite its complicated look, this allocator is rather simple. It | 2219 | * Despite its complicated look, this allocator is rather simple. It |
| 2213 | * does everything top-down and scans areas from the end looking for | 2220 | * does everything top-down and scans areas from the end looking for |
| @@ -2218,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, | |||
| 2218 | */ | 2225 | */ |
| 2219 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | 2226 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, |
| 2220 | const size_t *sizes, int nr_vms, | 2227 | const size_t *sizes, int nr_vms, |
| 2221 | size_t align, gfp_t gfp_mask) | 2228 | size_t align) |
| 2222 | { | 2229 | { |
| 2223 | const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); | 2230 | const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); |
| 2224 | const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); | 2231 | const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); |
| @@ -2228,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
| 2228 | unsigned long base, start, end, last_end; | 2235 | unsigned long base, start, end, last_end; |
| 2229 | bool purged = false; | 2236 | bool purged = false; |
| 2230 | 2237 | ||
| 2231 | gfp_mask &= GFP_RECLAIM_MASK; | ||
| 2232 | |||
| 2233 | /* verify parameters and allocate data structures */ | 2238 | /* verify parameters and allocate data structures */ |
| 2234 | BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); | 2239 | BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); |
| 2235 | for (last_area = 0, area = 0; area < nr_vms; area++) { | 2240 | for (last_area = 0, area = 0; area < nr_vms; area++) { |
| @@ -2262,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
| 2262 | return NULL; | 2267 | return NULL; |
| 2263 | } | 2268 | } |
| 2264 | 2269 | ||
| 2265 | vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); | 2270 | vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); |
| 2266 | vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); | 2271 | vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); |
| 2267 | if (!vas || !vms) | 2272 | if (!vas || !vms) |
| 2268 | goto err_free; | 2273 | goto err_free; |
| 2269 | 2274 | ||
| 2270 | for (area = 0; area < nr_vms; area++) { | 2275 | for (area = 0; area < nr_vms; area++) { |
| 2271 | vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); | 2276 | vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); |
| 2272 | vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); | 2277 | vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); |
| 2273 | if (!vas[area] || !vms[area]) | 2278 | if (!vas[area] || !vms[area]) |
| 2274 | goto err_free; | 2279 | goto err_free; |
| 2275 | } | 2280 | } |
| @@ -2450,13 +2455,8 @@ static int s_show(struct seq_file *m, void *p) | |||
| 2450 | seq_printf(m, "0x%p-0x%p %7ld", | 2455 | seq_printf(m, "0x%p-0x%p %7ld", |
| 2451 | v->addr, v->addr + v->size, v->size); | 2456 | v->addr, v->addr + v->size, v->size); |
| 2452 | 2457 | ||
| 2453 | if (v->caller) { | 2458 | if (v->caller) |
| 2454 | char buff[KSYM_SYMBOL_LEN]; | 2459 | seq_printf(m, " %pS", v->caller); |
| 2455 | |||
| 2456 | seq_putc(m, ' '); | ||
| 2457 | sprint_symbol(buff, (unsigned long)v->caller); | ||
| 2458 | seq_puts(m, buff); | ||
| 2459 | } | ||
| 2460 | 2460 | ||
| 2461 | if (v->nr_pages) | 2461 | if (v->nr_pages) |
| 2462 | seq_printf(m, " pages=%d", v->nr_pages); | 2462 | seq_printf(m, " pages=%d", v->nr_pages); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index d31d7ce52c0e..99999a9b2b0b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
| 33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
| 34 | #include <linux/cpuset.h> | 34 | #include <linux/cpuset.h> |
| 35 | #include <linux/compaction.h> | ||
| 35 | #include <linux/notifier.h> | 36 | #include <linux/notifier.h> |
| 36 | #include <linux/rwsem.h> | 37 | #include <linux/rwsem.h> |
| 37 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
| @@ -40,6 +41,7 @@ | |||
| 40 | #include <linux/memcontrol.h> | 41 | #include <linux/memcontrol.h> |
| 41 | #include <linux/delayacct.h> | 42 | #include <linux/delayacct.h> |
| 42 | #include <linux/sysctl.h> | 43 | #include <linux/sysctl.h> |
| 44 | #include <linux/compaction.h> | ||
| 43 | 45 | ||
| 44 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
| 45 | #include <asm/div64.h> | 47 | #include <asm/div64.h> |
| @@ -51,11 +53,23 @@ | |||
| 51 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
| 52 | #include <trace/events/vmscan.h> | 54 | #include <trace/events/vmscan.h> |
| 53 | 55 | ||
| 54 | enum lumpy_mode { | 56 | /* |
| 55 | LUMPY_MODE_NONE, | 57 | * reclaim_mode determines how the inactive list is shrunk |
| 56 | LUMPY_MODE_ASYNC, | 58 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages |
| 57 | LUMPY_MODE_SYNC, | 59 | * RECLAIM_MODE_ASYNC: Do not block |
| 58 | }; | 60 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback |
| 61 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | ||
| 62 | * page from the LRU and reclaim all pages within a | ||
| 63 | * naturally aligned range | ||
| 64 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | ||
| 65 | * order-0 pages and then compact the zone | ||
| 66 | */ | ||
| 67 | typedef unsigned __bitwise__ reclaim_mode_t; | ||
| 68 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | ||
| 69 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | ||
| 70 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | ||
| 71 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | ||
| 72 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | ||
| 59 | 73 | ||
| 60 | struct scan_control { | 74 | struct scan_control { |
| 61 | /* Incremented by the number of inactive pages that were scanned */ | 75 | /* Incremented by the number of inactive pages that were scanned */ |
| @@ -88,7 +102,7 @@ struct scan_control { | |||
| 88 | * Intend to reclaim enough continuous memory rather than reclaim | 102 | * Intend to reclaim enough continuous memory rather than reclaim |
| 89 | * enough amount of memory. i.e, mode for high order allocation. | 103 | * enough amount of memory. i.e, mode for high order allocation. |
| 90 | */ | 104 | */ |
| 91 | enum lumpy_mode lumpy_reclaim_mode; | 105 | reclaim_mode_t reclaim_mode; |
| 92 | 106 | ||
| 93 | /* Which cgroup do we reclaim from */ | 107 | /* Which cgroup do we reclaim from */ |
| 94 | struct mem_cgroup *mem_cgroup; | 108 | struct mem_cgroup *mem_cgroup; |
| @@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
| 271 | return ret; | 285 | return ret; |
| 272 | } | 286 | } |
| 273 | 287 | ||
| 274 | static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, | 288 | static void set_reclaim_mode(int priority, struct scan_control *sc, |
| 275 | bool sync) | 289 | bool sync) |
| 276 | { | 290 | { |
| 277 | enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; | 291 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; |
| 278 | 292 | ||
| 279 | /* | 293 | /* |
| 280 | * Some reclaim have alredy been failed. No worth to try synchronous | 294 | * Initially assume we are entering either lumpy reclaim or |
| 281 | * lumpy reclaim. | 295 | * reclaim/compaction.Depending on the order, we will either set the |
| 296 | * sync mode or just reclaim order-0 pages later. | ||
| 282 | */ | 297 | */ |
| 283 | if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 298 | if (COMPACTION_BUILD) |
| 284 | return; | 299 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; |
| 300 | else | ||
| 301 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | ||
| 285 | 302 | ||
| 286 | /* | 303 | /* |
| 287 | * If we need a large contiguous chunk of memory, or have | 304 | * Avoid using lumpy reclaim or reclaim/compaction if possible by |
| 288 | * trouble getting a small set of contiguous pages, we | 305 | * restricting when its set to either costly allocations or when |
| 289 | * will reclaim both active and inactive pages. | 306 | * under memory pressure |
| 290 | */ | 307 | */ |
| 291 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | 308 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) |
| 292 | sc->lumpy_reclaim_mode = mode; | 309 | sc->reclaim_mode |= syncmode; |
| 293 | else if (sc->order && priority < DEF_PRIORITY - 2) | 310 | else if (sc->order && priority < DEF_PRIORITY - 2) |
| 294 | sc->lumpy_reclaim_mode = mode; | 311 | sc->reclaim_mode |= syncmode; |
| 295 | else | 312 | else |
| 296 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 313 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
| 297 | } | 314 | } |
| 298 | 315 | ||
| 299 | static void disable_lumpy_reclaim_mode(struct scan_control *sc) | 316 | static void reset_reclaim_mode(struct scan_control *sc) |
| 300 | { | 317 | { |
| 301 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 318 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
| 302 | } | 319 | } |
| 303 | 320 | ||
| 304 | static inline int is_page_cache_freeable(struct page *page) | 321 | static inline int is_page_cache_freeable(struct page *page) |
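
The reclaim_mode_t bits defined at the top of this file are meant to be OR-ed together: set_reclaim_mode() first picks RECLAIM_MODE_COMPACTION or RECLAIM_MODE_LUMPYRECLAIM, then adds a sync/async bit, and falls back to SINGLE|ASYNC otherwise. A worked example of the resulting values (plain flag arithmetic, using the definitions above):

    /*
     * COMPACTION_BUILD, costly order, sync == true:
     *     RECLAIM_MODE_COMPACTION | RECLAIM_MODE_SYNC  = 0x10 | 0x04 = 0x14
     * order == 0 (or low order under light pressure):
     *     RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC     = 0x01 | 0x02 = 0x03
     *
     * so a test like (sc->reclaim_mode & RECLAIM_MODE_SYNC) selects the
     * blocking behaviour independently of how pages are being selected.
     */
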
| @@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
| 429 | * first attempt to free a range of pages fails. | 446 | * first attempt to free a range of pages fails. |
| 430 | */ | 447 | */ |
| 431 | if (PageWriteback(page) && | 448 | if (PageWriteback(page) && |
| 432 | sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) | 449 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) |
| 433 | wait_on_page_writeback(page); | 450 | wait_on_page_writeback(page); |
| 434 | 451 | ||
| 435 | if (!PageWriteback(page)) { | 452 | if (!PageWriteback(page)) { |
| @@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
| 437 | ClearPageReclaim(page); | 454 | ClearPageReclaim(page); |
| 438 | } | 455 | } |
| 439 | trace_mm_vmscan_writepage(page, | 456 | trace_mm_vmscan_writepage(page, |
| 440 | trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); | 457 | trace_reclaim_flags(page, sc->reclaim_mode)); |
| 441 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 458 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
| 442 | return PAGE_SUCCESS; | 459 | return PAGE_SUCCESS; |
| 443 | } | 460 | } |
| @@ -494,9 +511,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
| 494 | spin_unlock_irq(&mapping->tree_lock); | 511 | spin_unlock_irq(&mapping->tree_lock); |
| 495 | swapcache_free(swap, page); | 512 | swapcache_free(swap, page); |
| 496 | } else { | 513 | } else { |
| 514 | void (*freepage)(struct page *); | ||
| 515 | |||
| 516 | freepage = mapping->a_ops->freepage; | ||
| 517 | |||
| 497 | __remove_from_page_cache(page); | 518 | __remove_from_page_cache(page); |
| 498 | spin_unlock_irq(&mapping->tree_lock); | 519 | spin_unlock_irq(&mapping->tree_lock); |
| 499 | mem_cgroup_uncharge_cache_page(page); | 520 | mem_cgroup_uncharge_cache_page(page); |
| 521 | |||
| 522 | if (freepage != NULL) | ||
| 523 | freepage(page); | ||
| 500 | } | 524 | } |
| 501 | 525 | ||
| 502 | return 1; | 526 | return 1; |
| @@ -615,7 +639,7 @@ static enum page_references page_check_references(struct page *page, | |||
| 615 | referenced_page = TestClearPageReferenced(page); | 639 | referenced_page = TestClearPageReferenced(page); |
| 616 | 640 | ||
| 617 | /* Lumpy reclaim - ignore references */ | 641 | /* Lumpy reclaim - ignore references */ |
| 618 | if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) | 642 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
| 619 | return PAGEREF_RECLAIM; | 643 | return PAGEREF_RECLAIM; |
| 620 | 644 | ||
| 621 | /* | 645 | /* |
| @@ -732,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 732 | * for any page for which writeback has already | 756 | * for any page for which writeback has already |
| 733 | * started. | 757 | * started. |
| 734 | */ | 758 | */ |
| 735 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && | 759 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
| 736 | may_enter_fs) | 760 | may_enter_fs) |
| 737 | wait_on_page_writeback(page); | 761 | wait_on_page_writeback(page); |
| 738 | else { | 762 | else { |
| @@ -888,7 +912,7 @@ cull_mlocked: | |||
| 888 | try_to_free_swap(page); | 912 | try_to_free_swap(page); |
| 889 | unlock_page(page); | 913 | unlock_page(page); |
| 890 | putback_lru_page(page); | 914 | putback_lru_page(page); |
| 891 | disable_lumpy_reclaim_mode(sc); | 915 | reset_reclaim_mode(sc); |
| 892 | continue; | 916 | continue; |
| 893 | 917 | ||
| 894 | activate_locked: | 918 | activate_locked: |
| @@ -901,7 +925,7 @@ activate_locked: | |||
| 901 | keep_locked: | 925 | keep_locked: |
| 902 | unlock_page(page); | 926 | unlock_page(page); |
| 903 | keep: | 927 | keep: |
| 904 | disable_lumpy_reclaim_mode(sc); | 928 | reset_reclaim_mode(sc); |
| 905 | keep_lumpy: | 929 | keep_lumpy: |
| 906 | list_add(&page->lru, &ret_pages); | 930 | list_add(&page->lru, &ret_pages); |
| 907 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 931 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
| @@ -1021,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 1021 | case 0: | 1045 | case 0: |
| 1022 | list_move(&page->lru, dst); | 1046 | list_move(&page->lru, dst); |
| 1023 | mem_cgroup_del_lru(page); | 1047 | mem_cgroup_del_lru(page); |
| 1024 | nr_taken++; | 1048 | nr_taken += hpage_nr_pages(page); |
| 1025 | break; | 1049 | break; |
| 1026 | 1050 | ||
| 1027 | case -EBUSY: | 1051 | case -EBUSY: |
| @@ -1079,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 1079 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | 1103 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
| 1080 | list_move(&cursor_page->lru, dst); | 1104 | list_move(&cursor_page->lru, dst); |
| 1081 | mem_cgroup_del_lru(cursor_page); | 1105 | mem_cgroup_del_lru(cursor_page); |
| 1082 | nr_taken++; | 1106 | nr_taken += hpage_nr_pages(cursor_page); |
| 1083 | nr_lumpy_taken++; | 1107 | nr_lumpy_taken++; |
| 1084 | if (PageDirty(cursor_page)) | 1108 | if (PageDirty(cursor_page)) |
| 1085 | nr_lumpy_dirty++; | 1109 | nr_lumpy_dirty++; |
| @@ -1134,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, | |||
| 1134 | struct page *page; | 1158 | struct page *page; |
| 1135 | 1159 | ||
| 1136 | list_for_each_entry(page, page_list, lru) { | 1160 | list_for_each_entry(page, page_list, lru) { |
| 1161 | int numpages = hpage_nr_pages(page); | ||
| 1137 | lru = page_lru_base_type(page); | 1162 | lru = page_lru_base_type(page); |
| 1138 | if (PageActive(page)) { | 1163 | if (PageActive(page)) { |
| 1139 | lru += LRU_ACTIVE; | 1164 | lru += LRU_ACTIVE; |
| 1140 | ClearPageActive(page); | 1165 | ClearPageActive(page); |
| 1141 | nr_active++; | 1166 | nr_active += numpages; |
| 1142 | } | 1167 | } |
| 1143 | if (count) | 1168 | if (count) |
| 1144 | count[lru]++; | 1169 | count[lru] += numpages; |
| 1145 | } | 1170 | } |
| 1146 | 1171 | ||
| 1147 | return nr_active; | 1172 | return nr_active; |
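
Several hunks in this file switch LRU accounting from counting pages one at a time to adding hpage_nr_pages(page), so that a transparent huge page is counted as the number of base pages it spans. A small worked note on what that evaluates to (the 2MB/4KB figures assume the usual x86 configuration):

    /*
     * hpage_nr_pages(page) == 1                for a normal page
     * hpage_nr_pages(page) == HPAGE_PMD_NR     for a THP head
     *                        (512 with 2MB huge pages and 4KB base pages)
     *
     * so nr_taken, the per-LRU counts and recent_rotated[] stay in units
     * of base pages even when whole huge pages move through the lists.
     */
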
| @@ -1246,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
| 1246 | spin_lock_irq(&zone->lru_lock); | 1271 | spin_lock_irq(&zone->lru_lock); |
| 1247 | continue; | 1272 | continue; |
| 1248 | } | 1273 | } |
| 1249 | SetPageLRU(page); | ||
| 1250 | lru = page_lru(page); | 1274 | lru = page_lru(page); |
| 1251 | add_page_to_lru_list(zone, page, lru); | ||
| 1252 | if (is_active_lru(lru)) { | 1275 | if (is_active_lru(lru)) { |
| 1253 | int file = is_file_lru(lru); | 1276 | int file = is_file_lru(lru); |
| 1254 | reclaim_stat->recent_rotated[file]++; | 1277 | int numpages = hpage_nr_pages(page); |
| 1278 | reclaim_stat->recent_rotated[file] += numpages; | ||
| 1279 | if (putback_active_lru_page(zone, page)) | ||
| 1280 | continue; | ||
| 1255 | } | 1281 | } |
| 1282 | SetPageLRU(page); | ||
| 1283 | add_page_to_lru_list(zone, page, lru); | ||
| 1256 | if (!pagevec_add(&pvec, page)) { | 1284 | if (!pagevec_add(&pvec, page)) { |
| 1257 | spin_unlock_irq(&zone->lru_lock); | 1285 | spin_unlock_irq(&zone->lru_lock); |
| 1258 | __pagevec_release(&pvec); | 1286 | __pagevec_release(&pvec); |
| @@ -1317,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
| 1317 | return false; | 1345 | return false; |
| 1318 | 1346 | ||
| 1319 | /* Only stall on lumpy reclaim */ | 1347 | /* Only stall on lumpy reclaim */ |
| 1320 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 1348 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) |
| 1321 | return false; | 1349 | return false; |
| 1322 | 1350 | ||
| 1323 | /* If we have reclaimed everything on the isolated list, no stall */ | 1351 | /* If we have reclaimed everything on the isolated list, no stall */ |
| @@ -1361,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
| 1361 | return SWAP_CLUSTER_MAX; | 1389 | return SWAP_CLUSTER_MAX; |
| 1362 | } | 1390 | } |
| 1363 | 1391 | ||
| 1364 | set_lumpy_reclaim_mode(priority, sc, false); | 1392 | set_reclaim_mode(priority, sc, false); |
| 1365 | lru_add_drain(); | 1393 | lru_add_drain(); |
| 1366 | spin_lock_irq(&zone->lru_lock); | 1394 | spin_lock_irq(&zone->lru_lock); |
| 1367 | 1395 | ||
| 1368 | if (scanning_global_lru(sc)) { | 1396 | if (scanning_global_lru(sc)) { |
| 1369 | nr_taken = isolate_pages_global(nr_to_scan, | 1397 | nr_taken = isolate_pages_global(nr_to_scan, |
| 1370 | &page_list, &nr_scanned, sc->order, | 1398 | &page_list, &nr_scanned, sc->order, |
| 1371 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1399 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
| 1372 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1400 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
| 1373 | zone, 0, file); | 1401 | zone, 0, file); |
| 1374 | zone->pages_scanned += nr_scanned; | 1402 | zone->pages_scanned += nr_scanned; |
| 1375 | if (current_is_kswapd()) | 1403 | if (current_is_kswapd()) |
| @@ -1381,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
| 1381 | } else { | 1409 | } else { |
| 1382 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1410 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, |
| 1383 | &page_list, &nr_scanned, sc->order, | 1411 | &page_list, &nr_scanned, sc->order, |
| 1384 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1412 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
| 1385 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1413 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
| 1386 | zone, sc->mem_cgroup, | 1414 | zone, sc->mem_cgroup, |
| 1387 | 0, file); | 1415 | 0, file); |
| 1388 | /* | 1416 | /* |
| @@ -1404,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
| 1404 | 1432 | ||
| 1405 | /* Check if we should synchronously wait for writeback */ | 1433 | /* Check if we should synchronously wait for writeback */ |
| 1406 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1434 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
| 1407 | set_lumpy_reclaim_mode(priority, sc, true); | 1435 | set_reclaim_mode(priority, sc, true); |
| 1408 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1436 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); |
| 1409 | } | 1437 | } |
| 1410 | 1438 | ||
| @@ -1419,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
| 1419 | zone_idx(zone), | 1447 | zone_idx(zone), |
| 1420 | nr_scanned, nr_reclaimed, | 1448 | nr_scanned, nr_reclaimed, |
| 1421 | priority, | 1449 | priority, |
| 1422 | trace_shrink_flags(file, sc->lumpy_reclaim_mode)); | 1450 | trace_shrink_flags(file, sc->reclaim_mode)); |
| 1423 | return nr_reclaimed; | 1451 | return nr_reclaimed; |
| 1424 | } | 1452 | } |
| 1425 | 1453 | ||
| @@ -1459,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
| 1459 | 1487 | ||
| 1460 | list_move(&page->lru, &zone->lru[lru].list); | 1488 | list_move(&page->lru, &zone->lru[lru].list); |
| 1461 | mem_cgroup_add_lru_list(page, lru); | 1489 | mem_cgroup_add_lru_list(page, lru); |
| 1462 | pgmoved++; | 1490 | pgmoved += hpage_nr_pages(page); |
| 1463 | 1491 | ||
| 1464 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | 1492 | if (!pagevec_add(&pvec, page) || list_empty(list)) { |
| 1465 | spin_unlock_irq(&zone->lru_lock); | 1493 | spin_unlock_irq(&zone->lru_lock); |
| @@ -1527,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1527 | } | 1555 | } |
| 1528 | 1556 | ||
| 1529 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | 1557 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
| 1530 | nr_rotated++; | 1558 | nr_rotated += hpage_nr_pages(page); |
| 1531 | /* | 1559 | /* |
| 1532 | * Identify referenced, file-backed active pages and | 1560 | * Identify referenced, file-backed active pages and |
| 1533 | * give them one more trip around the active list. So | 1561 | * give them one more trip around the active list. So |
| @@ -1798,6 +1826,57 @@ out: | |||
| 1798 | } | 1826 | } |
| 1799 | 1827 | ||
| 1800 | /* | 1828 | /* |
| 1829 | * Reclaim/compaction depends on a number of pages being freed. To avoid | ||
| 1830 | * disruption to the system, a small number of order-0 pages continue to be | ||
| 1831 | * rotated and reclaimed in the normal fashion. However, by the time we get | ||
| 1832 | * back to the allocator and call try_to_compact_zone(), we ensure that | ||
| 1833 | * there are enough free pages for it to be likely successful | ||
| 1834 | */ | ||
| 1835 | static inline bool should_continue_reclaim(struct zone *zone, | ||
| 1836 | unsigned long nr_reclaimed, | ||
| 1837 | unsigned long nr_scanned, | ||
| 1838 | struct scan_control *sc) | ||
| 1839 | { | ||
| 1840 | unsigned long pages_for_compaction; | ||
| 1841 | unsigned long inactive_lru_pages; | ||
| 1842 | |||
| 1843 | /* If not in reclaim/compaction mode, stop */ | ||
| 1844 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | ||
| 1845 | return false; | ||
| 1846 | |||
| 1847 | /* | ||
| 1848 | * If we failed to reclaim and have scanned the full list, stop. | ||
| 1849 | * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far | ||
| 1850 | * faster but obviously would be less likely to satisfy the | ||
| 1851 | * allocation. If this is desirable, use __GFP_REPEAT to decide | ||
| 1852 | * if both reclaimed and scanned should be checked or just | ||
| 1853 | * reclaimed | ||
| 1854 | */ | ||
| 1855 | if (!nr_reclaimed && !nr_scanned) | ||
| 1856 | return false; | ||
| 1857 | |||
| 1858 | /* | ||
| 1859 | * If we have not reclaimed enough pages for compaction and the | ||
| 1860 | * inactive lists are large enough, continue reclaiming | ||
| 1861 | */ | ||
| 1862 | pages_for_compaction = (2UL << sc->order); | ||
| 1863 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | ||
| 1864 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
| 1865 | if (sc->nr_reclaimed < pages_for_compaction && | ||
| 1866 | inactive_lru_pages > pages_for_compaction) | ||
| 1867 | return true; | ||
| 1868 | |||
| 1869 | /* If compaction would go ahead or the allocation would succeed, stop */ | ||
| 1870 | switch (compaction_suitable(zone, sc->order)) { | ||
| 1871 | case COMPACT_PARTIAL: | ||
| 1872 | case COMPACT_CONTINUE: | ||
| 1873 | return false; | ||
| 1874 | default: | ||
| 1875 | return true; | ||
| 1876 | } | ||
| 1877 | } | ||
| 1878 | |||
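
For readers following the new reclaim/compaction coupling, the decision order in should_continue_reclaim() above can be summarised outside the kernel. Below is a minimal userspace sketch of that order; the enum values and the *_model names are illustrative stand-ins, not the kernel's API, and the real code reads the LRU sizes and compaction_suitable() itself.

	#include <stdbool.h>

	/* Illustrative stand-ins for the kernel's compaction verdicts. */
	enum compact_verdict { VERDICT_SKIPPED, VERDICT_CONTINUE, VERDICT_PARTIAL };

	/*
	 * Mirror of the checks above, in order:
	 *  1. bail out unless we are in reclaim/compaction mode,
	 *  2. bail out if a full pass reclaimed and scanned nothing,
	 *  3. keep going while fewer than 2^(order+1) pages have been reclaimed
	 *     and the inactive LRUs still hold more than that,
	 *  4. otherwise stop as soon as compaction says it could run.
	 */
	static bool should_continue_reclaim_model(bool compaction_mode, int order,
						  unsigned long nr_reclaimed,
						  unsigned long nr_scanned,
						  unsigned long total_reclaimed,
						  unsigned long inactive_lru_pages,
						  enum compact_verdict verdict)
	{
		unsigned long pages_for_compaction = 2UL << order;

		if (!compaction_mode)
			return false;
		if (!nr_reclaimed && !nr_scanned)
			return false;
		if (total_reclaimed < pages_for_compaction &&
		    inactive_lru_pages > pages_for_compaction)
			return true;
		return !(verdict == VERDICT_PARTIAL || verdict == VERDICT_CONTINUE);
	}

shrink_zone() loops back to its restart label while this returns true, which is why the hunk below resets nr_reclaimed at the top of each pass.
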
| 1879 | /* | ||
| 1801 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1880 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
| 1802 | */ | 1881 | */ |
| 1803 | static void shrink_zone(int priority, struct zone *zone, | 1882 | static void shrink_zone(int priority, struct zone *zone, |
| @@ -1806,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1806 | unsigned long nr[NR_LRU_LISTS]; | 1885 | unsigned long nr[NR_LRU_LISTS]; |
| 1807 | unsigned long nr_to_scan; | 1886 | unsigned long nr_to_scan; |
| 1808 | enum lru_list l; | 1887 | enum lru_list l; |
| 1809 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1888 | unsigned long nr_reclaimed; |
| 1810 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1889 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
| 1890 | unsigned long nr_scanned = sc->nr_scanned; | ||
| 1811 | 1891 | ||
| 1892 | restart: | ||
| 1893 | nr_reclaimed = 0; | ||
| 1812 | get_scan_count(zone, sc, nr, priority); | 1894 | get_scan_count(zone, sc, nr, priority); |
| 1813 | 1895 | ||
| 1814 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1896 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
| @@ -1834,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1834 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 1916 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
| 1835 | break; | 1917 | break; |
| 1836 | } | 1918 | } |
| 1837 | 1919 | sc->nr_reclaimed += nr_reclaimed; | |
| 1838 | sc->nr_reclaimed = nr_reclaimed; | ||
| 1839 | 1920 | ||
| 1840 | /* | 1921 | /* |
| 1841 | * Even if we did not try to evict anon pages at all, we want to | 1922 | * Even if we did not try to evict anon pages at all, we want to |
| @@ -1844,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1844 | if (inactive_anon_is_low(zone, sc)) | 1925 | if (inactive_anon_is_low(zone, sc)) |
| 1845 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1926 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
| 1846 | 1927 | ||
| 1928 | /* reclaim/compaction might need reclaim to continue */ | ||
| 1929 | if (should_continue_reclaim(zone, nr_reclaimed, | ||
| 1930 | sc->nr_scanned - nr_scanned, sc)) | ||
| 1931 | goto restart; | ||
| 1932 | |||
| 1847 | throttle_vm_writeout(sc->gfp_mask); | 1933 | throttle_vm_writeout(sc->gfp_mask); |
| 1848 | } | 1934 | } |
| 1849 | 1935 | ||
| @@ -2117,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
| 2117 | } | 2203 | } |
| 2118 | #endif | 2204 | #endif |
| 2119 | 2205 | ||
| 2206 | /* | ||
| 2207 | * pgdat_balanced is used when checking if a node is balanced for high-order | ||
| 2208 | * allocations. Only zones that meet watermarks and are in a zone allowed | ||
| 2209 | * by the callers classzone_idx are added to balanced_pages. The total of | ||
| 2210 | * balanced pages must be at least 25% of the zones allowed by classzone_idx | ||
| 2211 | * for the node to be considered balanced. Forcing all zones to be balanced | ||
| 2212 | * for high orders can cause excessive reclaim when there are imbalanced zones. | ||
| 2213 | * The choice of 25% is due to | ||
| 2214 | * o a 16M DMA zone that is balanced will not balance a zone on any | ||
| 2215 | * reasonable sized machine | ||
| 2216 | * o On all other machines, the top zone must be at least a reasonable | ||
| 2217 | * percentage of the middle zones. For example, on 32-bit x86, highmem | ||
| 2218 | * would need to be at least 256M for it to balance a whole node. | ||
| 2219 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | ||
| 2220 | * to balance a node on its own. These seemed like reasonable ratios. | ||
| 2221 | */ | ||
| 2222 | static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | ||
| 2223 | int classzone_idx) | ||
| 2224 | { | ||
| 2225 | unsigned long present_pages = 0; | ||
| 2226 | int i; | ||
| 2227 | |||
| 2228 | for (i = 0; i <= classzone_idx; i++) | ||
| 2229 | present_pages += pgdat->node_zones[i].present_pages; | ||
| 2230 | |||
| 2231 | return balanced_pages > (present_pages >> 2); | ||
| 2232 | } | ||
| 2233 | |||
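
The 25% rule in pgdat_balanced() is just a right shift by two of the pages present in the zones the caller cares about. A standalone sketch, assuming the per-zone present_pages counts are handed in as a plain array (the real code walks pgdat->node_zones):

	#include <stdbool.h>

	/*
	 * True when the already-balanced pages exceed a quarter of everything
	 * present in zones 0..classzone_idx, i.e. balanced > present / 4.
	 */
	static bool pgdat_balanced_model(const unsigned long *zone_present_pages,
					 int classzone_idx,
					 unsigned long balanced_pages)
	{
		unsigned long present_pages = 0;
		int i;

		for (i = 0; i <= classzone_idx; i++)
			present_pages += zone_present_pages[i];

		return balanced_pages > (present_pages >> 2);
	}

With a 16M DMA zone next to a 1G Normal zone, for instance, a balanced DMA zone alone contributes roughly 1.5% of the node and can never satisfy the check on its own, which is exactly the situation the comment above describes.
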
| 2120 | /* is kswapd sleeping prematurely? */ | 2234 | /* is kswapd sleeping prematurely? */ |
| 2121 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | 2235 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, |
| 2236 | int classzone_idx) | ||
| 2122 | { | 2237 | { |
| 2123 | int i; | 2238 | int i; |
| 2239 | unsigned long balanced = 0; | ||
| 2240 | bool all_zones_ok = true; | ||
| 2124 | 2241 | ||
| 2125 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2242 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
| 2126 | if (remaining) | 2243 | if (remaining) |
| 2127 | return 1; | 2244 | return true; |
| 2128 | 2245 | ||
| 2129 | /* If after HZ/10, a zone is below the high mark, it's premature */ | 2246 | /* Check the watermark levels */ |
| 2130 | for (i = 0; i < pgdat->nr_zones; i++) { | 2247 | for (i = 0; i < pgdat->nr_zones; i++) { |
| 2131 | struct zone *zone = pgdat->node_zones + i; | 2248 | struct zone *zone = pgdat->node_zones + i; |
| 2132 | 2249 | ||
| 2133 | if (!populated_zone(zone)) | 2250 | if (!populated_zone(zone)) |
| 2134 | continue; | 2251 | continue; |
| 2135 | 2252 | ||
| 2136 | if (zone->all_unreclaimable) | 2253 | /* |
| 2254 | * balance_pgdat() skips over all_unreclaimable after | ||
| 2255 | * DEF_PRIORITY. Effectively, it considers them balanced so | ||
| 2256 | * they must be considered balanced here as well if kswapd | ||
| 2257 | * is to sleep | ||
| 2258 | */ | ||
| 2259 | if (zone->all_unreclaimable) { | ||
| 2260 | balanced += zone->present_pages; | ||
| 2137 | continue; | 2261 | continue; |
| 2262 | } | ||
| 2138 | 2263 | ||
| 2139 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 2264 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
| 2140 | 0, 0)) | 2265 | classzone_idx, 0)) |
| 2141 | return 1; | 2266 | all_zones_ok = false; |
| 2267 | else | ||
| 2268 | balanced += zone->present_pages; | ||
| 2142 | } | 2269 | } |
| 2143 | 2270 | ||
| 2144 | return 0; | 2271 | /* |
| 2272 | * For high-order requests, the balanced zones must contain at least | ||
| 2273 | * 25% of the node's pages for kswapd to sleep. For order-0, all zones | ||
| 2274 | * must be balanced | ||
| 2275 | */ | ||
| 2276 | if (order) | ||
| 2277 | return pgdat_balanced(pgdat, balanced, classzone_idx); | ||
| 2278 | else | ||
| 2279 | return !all_zones_ok; | ||
| 2145 | } | 2280 | } |
| 2146 | 2281 | ||
| 2147 | /* | 2282 | /* |
| 2148 | * For kswapd, balance_pgdat() will work across all this node's zones until | 2283 | * For kswapd, balance_pgdat() will work across all this node's zones until |
| 2149 | * they are all at high_wmark_pages(zone). | 2284 | * they are all at high_wmark_pages(zone). |
| 2150 | * | 2285 | * |
| 2151 | * Returns the number of pages which were actually freed. | 2286 | * Returns the final order kswapd was reclaiming at |
| 2152 | * | 2287 | * |
| 2153 | * There is special handling here for zones which are full of pinned pages. | 2288 | * There is special handling here for zones which are full of pinned pages. |
| 2154 | * This can happen if the pages are all mlocked, or if they are all used by | 2289 | * This can happen if the pages are all mlocked, or if they are all used by |
| @@ -2165,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
| 2165 | * interoperates with the page allocator fallback scheme to ensure that aging | 2300 | * interoperates with the page allocator fallback scheme to ensure that aging |
| 2166 | * of pages is balanced across the zones. | 2301 | * of pages is balanced across the zones. |
| 2167 | */ | 2302 | */ |
| 2168 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 2303 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
| 2304 | int *classzone_idx) | ||
| 2169 | { | 2305 | { |
| 2170 | int all_zones_ok; | 2306 | int all_zones_ok; |
| 2307 | unsigned long balanced; | ||
| 2171 | int priority; | 2308 | int priority; |
| 2172 | int i; | 2309 | int i; |
| 2310 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
| 2173 | unsigned long total_scanned; | 2311 | unsigned long total_scanned; |
| 2174 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2312 | struct reclaim_state *reclaim_state = current->reclaim_state; |
| 2175 | struct scan_control sc = { | 2313 | struct scan_control sc = { |
| @@ -2192,7 +2330,6 @@ loop_again: | |||
| 2192 | count_vm_event(PAGEOUTRUN); | 2330 | count_vm_event(PAGEOUTRUN); |
| 2193 | 2331 | ||
| 2194 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2332 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
| 2195 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
| 2196 | unsigned long lru_pages = 0; | 2333 | unsigned long lru_pages = 0; |
| 2197 | int has_under_min_watermark_zone = 0; | 2334 | int has_under_min_watermark_zone = 0; |
| 2198 | 2335 | ||
| @@ -2201,6 +2338,7 @@ loop_again: | |||
| 2201 | disable_swap_token(); | 2338 | disable_swap_token(); |
| 2202 | 2339 | ||
| 2203 | all_zones_ok = 1; | 2340 | all_zones_ok = 1; |
| 2341 | balanced = 0; | ||
| 2204 | 2342 | ||
| 2205 | /* | 2343 | /* |
| 2206 | * Scan in the highmem->dma direction for the highest | 2344 | * Scan in the highmem->dma direction for the highest |
| @@ -2223,9 +2361,10 @@ loop_again: | |||
| 2223 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2361 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
| 2224 | &sc, priority, 0); | 2362 | &sc, priority, 0); |
| 2225 | 2363 | ||
| 2226 | if (!zone_watermark_ok(zone, order, | 2364 | if (!zone_watermark_ok_safe(zone, order, |
| 2227 | high_wmark_pages(zone), 0, 0)) { | 2365 | high_wmark_pages(zone), 0, 0)) { |
| 2228 | end_zone = i; | 2366 | end_zone = i; |
| 2367 | *classzone_idx = i; | ||
| 2229 | break; | 2368 | break; |
| 2230 | } | 2369 | } |
| 2231 | } | 2370 | } |
| @@ -2248,6 +2387,7 @@ loop_again: | |||
| 2248 | * cause too much scanning of the lower zones. | 2387 | * cause too much scanning of the lower zones. |
| 2249 | */ | 2388 | */ |
| 2250 | for (i = 0; i <= end_zone; i++) { | 2389 | for (i = 0; i <= end_zone; i++) { |
| 2390 | int compaction; | ||
| 2251 | struct zone *zone = pgdat->node_zones + i; | 2391 | struct zone *zone = pgdat->node_zones + i; |
| 2252 | int nr_slab; | 2392 | int nr_slab; |
| 2253 | 2393 | ||
| @@ -2269,7 +2409,7 @@ loop_again: | |||
| 2269 | * We put equal pressure on every zone, unless one | 2409 | * We put equal pressure on every zone, unless one |
| 2270 | * zone has way too many pages free already. | 2410 | * zone has way too many pages free already. |
| 2271 | */ | 2411 | */ |
| 2272 | if (!zone_watermark_ok(zone, order, | 2412 | if (!zone_watermark_ok_safe(zone, order, |
| 2273 | 8*high_wmark_pages(zone), end_zone, 0)) | 2413 | 8*high_wmark_pages(zone), end_zone, 0)) |
| 2274 | shrink_zone(priority, zone, &sc); | 2414 | shrink_zone(priority, zone, &sc); |
| 2275 | reclaim_state->reclaimed_slab = 0; | 2415 | reclaim_state->reclaimed_slab = 0; |
| @@ -2277,9 +2417,26 @@ loop_again: | |||
| 2277 | lru_pages); | 2417 | lru_pages); |
| 2278 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2418 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
| 2279 | total_scanned += sc.nr_scanned; | 2419 | total_scanned += sc.nr_scanned; |
| 2420 | |||
| 2421 | compaction = 0; | ||
| 2422 | if (order && | ||
| 2423 | zone_watermark_ok(zone, 0, | ||
| 2424 | high_wmark_pages(zone), | ||
| 2425 | end_zone, 0) && | ||
| 2426 | !zone_watermark_ok(zone, order, | ||
| 2427 | high_wmark_pages(zone), | ||
| 2428 | end_zone, 0)) { | ||
| 2429 | compact_zone_order(zone, | ||
| 2430 | order, | ||
| 2431 | sc.gfp_mask, false, | ||
| 2432 | COMPACT_MODE_KSWAPD); | ||
| 2433 | compaction = 1; | ||
| 2434 | } | ||
| 2435 | |||
| 2280 | if (zone->all_unreclaimable) | 2436 | if (zone->all_unreclaimable) |
| 2281 | continue; | 2437 | continue; |
| 2282 | if (nr_slab == 0 && !zone_reclaimable(zone)) | 2438 | if (!compaction && nr_slab == 0 && |
| 2439 | !zone_reclaimable(zone)) | ||
| 2283 | zone->all_unreclaimable = 1; | 2440 | zone->all_unreclaimable = 1; |
| 2284 | /* | 2441 | /* |
| 2285 | * If we've done a decent amount of scanning and | 2442 | * If we've done a decent amount of scanning and |
| @@ -2290,7 +2447,7 @@ loop_again: | |||
| 2290 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2447 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
| 2291 | sc.may_writepage = 1; | 2448 | sc.may_writepage = 1; |
| 2292 | 2449 | ||
| 2293 | if (!zone_watermark_ok(zone, order, | 2450 | if (!zone_watermark_ok_safe(zone, order, |
| 2294 | high_wmark_pages(zone), end_zone, 0)) { | 2451 | high_wmark_pages(zone), end_zone, 0)) { |
| 2295 | all_zones_ok = 0; | 2452 | all_zones_ok = 0; |
| 2296 | /* | 2453 | /* |
| @@ -2298,7 +2455,7 @@ loop_again: | |||
| 2298 | * means that we have a GFP_ATOMIC allocation | 2455 | * means that we have a GFP_ATOMIC allocation |
| 2299 | * failure risk. Hurry up! | 2456 | * failure risk. Hurry up! |
| 2300 | */ | 2457 | */ |
| 2301 | if (!zone_watermark_ok(zone, order, | 2458 | if (!zone_watermark_ok_safe(zone, order, |
| 2302 | min_wmark_pages(zone), end_zone, 0)) | 2459 | min_wmark_pages(zone), end_zone, 0)) |
| 2303 | has_under_min_watermark_zone = 1; | 2460 | has_under_min_watermark_zone = 1; |
| 2304 | } else { | 2461 | } else { |
| @@ -2310,10 +2467,12 @@ loop_again: | |||
| 2310 | * speculatively avoid congestion waits | 2467 | * speculatively avoid congestion waits |
| 2311 | */ | 2468 | */ |
| 2312 | zone_clear_flag(zone, ZONE_CONGESTED); | 2469 | zone_clear_flag(zone, ZONE_CONGESTED); |
| 2470 | if (i <= *classzone_idx) | ||
| 2471 | balanced += zone->present_pages; | ||
| 2313 | } | 2472 | } |
| 2314 | 2473 | ||
| 2315 | } | 2474 | } |
| 2316 | if (all_zones_ok) | 2475 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
| 2317 | break; /* kswapd: all done */ | 2476 | break; /* kswapd: all done */ |
| 2318 | /* | 2477 | /* |
| 2319 | * OK, kswapd is getting into trouble. Take a nap, then take | 2478 | * OK, kswapd is getting into trouble. Take a nap, then take |
| @@ -2336,7 +2495,13 @@ loop_again: | |||
| 2336 | break; | 2495 | break; |
| 2337 | } | 2496 | } |
| 2338 | out: | 2497 | out: |
| 2339 | if (!all_zones_ok) { | 2498 | |
| 2499 | /* | ||
| 2500 | * order-0: All zones must meet high watermark for a balanced node | ||
| 2501 | * high-order: Balanced zones must make up at least 25% of the node | ||
| 2502 | * for the node to be balanced | ||
| 2503 | */ | ||
| 2504 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { | ||
| 2340 | cond_resched(); | 2505 | cond_resched(); |
| 2341 | 2506 | ||
| 2342 | try_to_freeze(); | 2507 | try_to_freeze(); |
| @@ -2361,7 +2526,88 @@ out: | |||
| 2361 | goto loop_again; | 2526 | goto loop_again; |
| 2362 | } | 2527 | } |
| 2363 | 2528 | ||
| 2364 | return sc.nr_reclaimed; | 2529 | /* |
| 2530 | * If kswapd was reclaiming at a higher order, it has the option of | ||
| 2531 | * sleeping without all zones being balanced. Before it does, it must | ||
| 2532 | * ensure that the watermarks for order-0 on *all* zones are met and | ||
| 2533 | * that the congestion flags are cleared. The congestion flag must | ||
| 2534 | * be cleared as kswapd is the only mechanism that clears the flag | ||
| 2535 | * and it is potentially going to sleep here. | ||
| 2536 | */ | ||
| 2537 | if (order) { | ||
| 2538 | for (i = 0; i <= end_zone; i++) { | ||
| 2539 | struct zone *zone = pgdat->node_zones + i; | ||
| 2540 | |||
| 2541 | if (!populated_zone(zone)) | ||
| 2542 | continue; | ||
| 2543 | |||
| 2544 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | ||
| 2545 | continue; | ||
| 2546 | |||
| 2547 | /* Confirm the zone is balanced for order-0 */ | ||
| 2548 | if (!zone_watermark_ok(zone, 0, | ||
| 2549 | high_wmark_pages(zone), 0, 0)) { | ||
| 2550 | order = sc.order = 0; | ||
| 2551 | goto loop_again; | ||
| 2552 | } | ||
| 2553 | |||
| 2554 | /* If balanced, clear the congested flag */ | ||
| 2555 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
| 2556 | } | ||
| 2557 | } | ||
| 2558 | |||
| 2559 | /* | ||
| 2560 | * Return the order we were reclaiming at so sleeping_prematurely() | ||
| 2561 | * makes a decision on the order we were last reclaiming at. However, | ||
| 2562 | * if another caller entered the allocator slow path while kswapd | ||
| 2563 | * was awake, order will remain at the higher level | ||
| 2564 | */ | ||
| 2565 | *classzone_idx = end_zone; | ||
| 2566 | return order; | ||
| 2567 | } | ||
| 2568 | |||
| 2569 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | ||
| 2570 | { | ||
| 2571 | long remaining = 0; | ||
| 2572 | DEFINE_WAIT(wait); | ||
| 2573 | |||
| 2574 | if (freezing(current) || kthread_should_stop()) | ||
| 2575 | return; | ||
| 2576 | |||
| 2577 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
| 2578 | |||
| 2579 | /* Try to sleep for a short interval */ | ||
| 2580 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
| 2581 | remaining = schedule_timeout(HZ/10); | ||
| 2582 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
| 2583 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
| 2584 | } | ||
| 2585 | |||
| 2586 | /* | ||
| 2587 | * After a short sleep, check if it was a premature sleep. If not, then | ||
| 2588 | * go fully to sleep until explicitly woken up. | ||
| 2589 | */ | ||
| 2590 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
| 2591 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
| 2592 | |||
| 2593 | /* | ||
| 2594 | * vmstat counters are not perfectly accurate and the estimated | ||
| 2595 | * value for counters such as NR_FREE_PAGES can deviate from the | ||
| 2596 | * true value by nr_online_cpus * threshold. To avoid the zone | ||
| 2597 | * watermarks being breached while under pressure, we reduce the | ||
| 2598 | * per-cpu vmstat threshold while kswapd is awake and restore | ||
| 2599 | * them before going back to sleep. | ||
| 2600 | */ | ||
| 2601 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | ||
| 2602 | schedule(); | ||
| 2603 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | ||
| 2604 | } else { | ||
| 2605 | if (remaining) | ||
| 2606 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
| 2607 | else | ||
| 2608 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
| 2609 | } | ||
| 2610 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
| 2365 | } | 2611 | } |
| 2366 | 2612 | ||
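
kswapd_try_to_sleep() naps for HZ/10, re-checks the node, and only then commits to an indefinite sleep, switching the per-cpu vmstat thresholds back to their larger "normal" values for the duration of the sleep. A compile-only sketch of that two-stage structure follows; the no-op stubs stand in for sleeping_prematurely(), schedule_timeout(), schedule() and set_pgdat_percpu_threshold(), and every name ending in _stub or _model is invented for the sketch.

	#include <stdbool.h>

	/* No-op stand-ins for the kernel primitives used above. */
	static bool sleep_would_be_premature_stub(void) { return false; }
	static void nap_one_tenth_second_stub(void)     { /* schedule_timeout(HZ/10) */ }
	static void sleep_until_woken_stub(void)        { /* schedule() */ }
	static void use_normal_vmstat_thresholds_stub(void)   { }
	static void use_pressure_vmstat_thresholds_stub(void) { }

	static void kswapd_try_to_sleep_model(void)
	{
		/* Stage 1: short nap, so a quickly-returning imbalance is noticed. */
		if (!sleep_would_be_premature_stub())
			nap_one_tenth_second_stub();

		/* Stage 2: only sleep for real if the node still looks balanced. */
		if (!sleep_would_be_premature_stub()) {
			use_normal_vmstat_thresholds_stub();   /* relaxed counters while asleep */
			sleep_until_woken_stub();
			use_pressure_vmstat_thresholds_stub(); /* tight counters while awake */
		}
	}

The KSWAPD_LOW_WMARK_HIT_QUICKLY/KSWAPD_HIGH_WMARK_HIT_QUICKLY accounting in the else branch of the real function is deliberately left out of the sketch.
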
| 2367 | /* | 2613 | /* |
| @@ -2380,9 +2626,10 @@ out: | |||
| 2380 | static int kswapd(void *p) | 2626 | static int kswapd(void *p) |
| 2381 | { | 2627 | { |
| 2382 | unsigned long order; | 2628 | unsigned long order; |
| 2629 | int classzone_idx; | ||
| 2383 | pg_data_t *pgdat = (pg_data_t*)p; | 2630 | pg_data_t *pgdat = (pg_data_t*)p; |
| 2384 | struct task_struct *tsk = current; | 2631 | struct task_struct *tsk = current; |
| 2385 | DEFINE_WAIT(wait); | 2632 | |
| 2386 | struct reclaim_state reclaim_state = { | 2633 | struct reclaim_state reclaim_state = { |
| 2387 | .reclaimed_slab = 0, | 2634 | .reclaimed_slab = 0, |
| 2388 | }; | 2635 | }; |
| @@ -2410,49 +2657,30 @@ static int kswapd(void *p) | |||
| 2410 | set_freezable(); | 2657 | set_freezable(); |
| 2411 | 2658 | ||
| 2412 | order = 0; | 2659 | order = 0; |
| 2660 | classzone_idx = MAX_NR_ZONES - 1; | ||
| 2413 | for ( ; ; ) { | 2661 | for ( ; ; ) { |
| 2414 | unsigned long new_order; | 2662 | unsigned long new_order; |
| 2663 | int new_classzone_idx; | ||
| 2415 | int ret; | 2664 | int ret; |
| 2416 | 2665 | ||
| 2417 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
| 2418 | new_order = pgdat->kswapd_max_order; | 2666 | new_order = pgdat->kswapd_max_order; |
| 2667 | new_classzone_idx = pgdat->classzone_idx; | ||
| 2419 | pgdat->kswapd_max_order = 0; | 2668 | pgdat->kswapd_max_order = 0; |
| 2420 | if (order < new_order) { | 2669 | pgdat->classzone_idx = MAX_NR_ZONES - 1; |
| 2670 | if (order < new_order || classzone_idx > new_classzone_idx) { | ||
| 2421 | /* | 2671 | /* |
| 2422 | * Don't sleep if someone wants a larger 'order' | 2672 | * Don't sleep if someone wants a larger 'order' |
| 2423 | * allocation | 2673 | * allocation or has tighter zone constraints |
| 2424 | */ | 2674 | */ |
| 2425 | order = new_order; | 2675 | order = new_order; |
| 2676 | classzone_idx = new_classzone_idx; | ||
| 2426 | } else { | 2677 | } else { |
| 2427 | if (!freezing(current) && !kthread_should_stop()) { | 2678 | kswapd_try_to_sleep(pgdat, order, classzone_idx); |
| 2428 | long remaining = 0; | ||
| 2429 | |||
| 2430 | /* Try to sleep for a short interval */ | ||
| 2431 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
| 2432 | remaining = schedule_timeout(HZ/10); | ||
| 2433 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
| 2434 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
| 2435 | } | ||
| 2436 | |||
| 2437 | /* | ||
| 2438 | * After a short sleep, check if it was a | ||
| 2439 | * premature sleep. If not, then go fully | ||
| 2440 | * to sleep until explicitly woken up | ||
| 2441 | */ | ||
| 2442 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
| 2443 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
| 2444 | schedule(); | ||
| 2445 | } else { | ||
| 2446 | if (remaining) | ||
| 2447 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
| 2448 | else | ||
| 2449 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
| 2450 | } | ||
| 2451 | } | ||
| 2452 | |||
| 2453 | order = pgdat->kswapd_max_order; | 2679 | order = pgdat->kswapd_max_order; |
| 2680 | classzone_idx = pgdat->classzone_idx; | ||
| 2681 | pgdat->kswapd_max_order = 0; | ||
| 2682 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | ||
| 2454 | } | 2683 | } |
| 2455 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
| 2456 | 2684 | ||
| 2457 | ret = try_to_freeze(); | 2685 | ret = try_to_freeze(); |
| 2458 | if (kthread_should_stop()) | 2686 | if (kthread_should_stop()) |
| @@ -2464,7 +2692,7 @@ static int kswapd(void *p) | |||
| 2464 | */ | 2692 | */ |
| 2465 | if (!ret) { | 2693 | if (!ret) { |
| 2466 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2694 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
| 2467 | balance_pgdat(pgdat, order); | 2695 | order = balance_pgdat(pgdat, order, &classzone_idx); |
| 2468 | } | 2696 | } |
| 2469 | } | 2697 | } |
| 2470 | return 0; | 2698 | return 0; |
| @@ -2473,23 +2701,26 @@ static int kswapd(void *p) | |||
| 2473 | /* | 2701 | /* |
| 2474 | * A zone is low on free memory, so wake its kswapd task to service it. | 2702 | * A zone is low on free memory, so wake its kswapd task to service it. |
| 2475 | */ | 2703 | */ |
| 2476 | void wakeup_kswapd(struct zone *zone, int order) | 2704 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
| 2477 | { | 2705 | { |
| 2478 | pg_data_t *pgdat; | 2706 | pg_data_t *pgdat; |
| 2479 | 2707 | ||
| 2480 | if (!populated_zone(zone)) | 2708 | if (!populated_zone(zone)) |
| 2481 | return; | 2709 | return; |
| 2482 | 2710 | ||
| 2483 | pgdat = zone->zone_pgdat; | ||
| 2484 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
| 2485 | return; | ||
| 2486 | if (pgdat->kswapd_max_order < order) | ||
| 2487 | pgdat->kswapd_max_order = order; | ||
| 2488 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
| 2489 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2711 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
| 2490 | return; | 2712 | return; |
| 2713 | pgdat = zone->zone_pgdat; | ||
| 2714 | if (pgdat->kswapd_max_order < order) { | ||
| 2715 | pgdat->kswapd_max_order = order; | ||
| 2716 | pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); | ||
| 2717 | } | ||
| 2491 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2718 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
| 2492 | return; | 2719 | return; |
| 2720 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | ||
| 2721 | return; | ||
| 2722 | |||
| 2723 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
| 2493 | wake_up_interruptible(&pgdat->kswapd_wait); | 2724 | wake_up_interruptible(&pgdat->kswapd_wait); |
| 2494 | } | 2725 | } |
| 2495 | 2726 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 42eac4d33216..0c3b5048773e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat); | |||
| 83 | 83 | ||
| 84 | #ifdef CONFIG_SMP | 84 | #ifdef CONFIG_SMP |
| 85 | 85 | ||
| 86 | static int calculate_threshold(struct zone *zone) | 86 | int calculate_pressure_threshold(struct zone *zone) |
| 87 | { | ||
| 88 | int threshold; | ||
| 89 | int watermark_distance; | ||
| 90 | |||
| 91 | /* | ||
| 92 | * As vmstats are not up to date, there is drift between the estimated | ||
| 93 | * and real values. For high thresholds and a high number of CPUs, it | ||
| 94 | * is possible for the min watermark to be breached while the estimated | ||
| 95 | * value looks fine. The pressure threshold is a reduced value such | ||
| 96 | * that even the maximum amount of drift will not accidentally breach | ||
| 97 | * the min watermark | ||
| 98 | */ | ||
| 99 | watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); | ||
| 100 | threshold = max(1, (int)(watermark_distance / num_online_cpus())); | ||
| 101 | |||
| 102 | /* | ||
| 103 | * Maximum threshold is 125 | ||
| 104 | */ | ||
| 105 | threshold = min(125, threshold); | ||
| 106 | |||
| 107 | return threshold; | ||
| 108 | } | ||
| 109 | |||
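
calculate_pressure_threshold() sizes the per-cpu drift so that one worst-case deviation per online CPU still fits inside the low-to-min watermark gap. A minimal sketch of the same arithmetic, assuming the watermarks and CPU count are passed in directly (the function name is illustrative):

	/*
	 * Pick a per-cpu stat threshold small enough that even if every online
	 * CPU holds a full threshold of un-folded counts, the apparent free
	 * count cannot mask a breach of the min watermark. Capped at 125,
	 * matching the normal threshold's upper bound.
	 */
	static int pressure_threshold_model(unsigned long low_wmark,
					    unsigned long min_wmark,
					    int online_cpus)
	{
		int watermark_distance = (int)(low_wmark - min_wmark);
		int threshold = watermark_distance / online_cpus;

		if (threshold < 1)
			threshold = 1;
		if (threshold > 125)
			threshold = 125;
		return threshold;
	}

For instance, a 4096-page gap between the low and min watermarks shared across 64 online CPUs yields a threshold of 64, typically much tighter than the memory-size-based value calculate_normal_threshold() below would allow.
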
| 110 | int calculate_normal_threshold(struct zone *zone) | ||
| 87 | { | 111 | { |
| 88 | int threshold; | 112 | int threshold; |
| 89 | int mem; /* memory in 128 MB units */ | 113 | int mem; /* memory in 128 MB units */ |
| @@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) | |||
| 142 | for_each_populated_zone(zone) { | 166 | for_each_populated_zone(zone) { |
| 143 | unsigned long max_drift, tolerate_drift; | 167 | unsigned long max_drift, tolerate_drift; |
| 144 | 168 | ||
| 145 | threshold = calculate_threshold(zone); | 169 | threshold = calculate_normal_threshold(zone); |
| 146 | 170 | ||
| 147 | for_each_online_cpu(cpu) | 171 | for_each_online_cpu(cpu) |
| 148 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | 172 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
| @@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void) | |||
| 161 | } | 185 | } |
| 162 | } | 186 | } |
| 163 | 187 | ||
| 188 | void set_pgdat_percpu_threshold(pg_data_t *pgdat, | ||
| 189 | int (*calculate_pressure)(struct zone *)) | ||
| 190 | { | ||
| 191 | struct zone *zone; | ||
| 192 | int cpu; | ||
| 193 | int threshold; | ||
| 194 | int i; | ||
| 195 | |||
| 196 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
| 197 | zone = &pgdat->node_zones[i]; | ||
| 198 | if (!zone->percpu_drift_mark) | ||
| 199 | continue; | ||
| 200 | |||
| 201 | threshold = (*calculate_pressure)(zone); | ||
| 202 | for_each_possible_cpu(cpu) | ||
| 203 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | ||
| 204 | = threshold; | ||
| 205 | } | ||
| 206 | } | ||
| 207 | |||
| 164 | /* | 208 | /* |
| 165 | * For use when we know that interrupts are disabled. | 209 | * For use when we know that interrupts are disabled. |
| 166 | */ | 210 | */ |
| 167 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 211 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
| 168 | int delta) | 212 | int delta) |
| 169 | { | 213 | { |
| 170 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); | 214 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
| 171 | 215 | s8 __percpu *p = pcp->vm_stat_diff + item; | |
| 172 | s8 *p = pcp->vm_stat_diff + item; | ||
| 173 | long x; | 216 | long x; |
| 217 | long t; | ||
| 174 | 218 | ||
| 175 | x = delta + *p; | 219 | x = delta + __this_cpu_read(*p); |
| 176 | 220 | ||
| 177 | if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { | 221 | t = __this_cpu_read(pcp->stat_threshold); |
| 222 | |||
| 223 | if (unlikely(x > t || x < -t)) { | ||
| 178 | zone_page_state_add(x, zone, item); | 224 | zone_page_state_add(x, zone, item); |
| 179 | x = 0; | 225 | x = 0; |
| 180 | } | 226 | } |
| 181 | *p = x; | 227 | __this_cpu_write(*p, x); |
| 182 | } | 228 | } |
| 183 | EXPORT_SYMBOL(__mod_zone_page_state); | 229 | EXPORT_SYMBOL(__mod_zone_page_state); |
| 184 | 230 | ||
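
The rewritten __mod_zone_page_state() keeps a small signed per-cpu delta and folds it into the global zone counter only once it crosses +/- stat_threshold, now going through __this_cpu_read/__this_cpu_write rather than a pointer into the pageset. The folding rule itself can be modelled in plain, single-threaded C; counter_model and its fields are stand-ins for the per-cpu vm_stat_diff, stat_threshold and the zone-wide counter, not kernel types.

	struct counter_model {
		long global;            /* models the zone-wide vm_stat counter     */
		signed char diff;       /* models the per-cpu s8 vm_stat_diff entry */
		signed char threshold;  /* models the per-cpu stat_threshold        */
	};

	/* Apply a delta: buffer it locally, spill to the global counter on overflow. */
	static void mod_state_model(struct counter_model *c, int delta)
	{
		long x = delta + c->diff;

		if (x > c->threshold || x < -c->threshold) {
			c->global += x;
			x = 0;
		}
		c->diff = (signed char)x;
	}

Reads of the global value can therefore lag by up to nr_cpus * threshold, which is exactly the drift that calculate_pressure_threshold() above bounds while kswapd is awake.
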
| 185 | /* | 231 | /* |
| 186 | * For an unknown interrupt state | ||
| 187 | */ | ||
| 188 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
| 189 | int delta) | ||
| 190 | { | ||
| 191 | unsigned long flags; | ||
| 192 | |||
| 193 | local_irq_save(flags); | ||
| 194 | __mod_zone_page_state(zone, item, delta); | ||
| 195 | local_irq_restore(flags); | ||
| 196 | } | ||
| 197 | EXPORT_SYMBOL(mod_zone_page_state); | ||
| 198 | |||
| 199 | /* | ||
| 200 | * Optimized increment and decrement functions. | 232 | * Optimized increment and decrement functions. |
| 201 | * | 233 | * |
| 202 | * These are only for a single page and therefore can take a struct page * | 234 | * These are only for a single page and therefore can take a struct page * |
| @@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
| 221 | */ | 253 | */ |
| 222 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 254 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
| 223 | { | 255 | { |
| 224 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); | 256 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
| 225 | s8 *p = pcp->vm_stat_diff + item; | 257 | s8 __percpu *p = pcp->vm_stat_diff + item; |
| 226 | 258 | s8 v, t; | |
| 227 | (*p)++; | ||
| 228 | 259 | ||
| 229 | if (unlikely(*p > pcp->stat_threshold)) { | 260 | v = __this_cpu_inc_return(*p); |
| 230 | int overstep = pcp->stat_threshold / 2; | 261 | t = __this_cpu_read(pcp->stat_threshold); |
| 262 | if (unlikely(v > t)) { | ||
| 263 | s8 overstep = t >> 1; | ||
| 231 | 264 | ||
| 232 | zone_page_state_add(*p + overstep, zone, item); | 265 | zone_page_state_add(v + overstep, zone, item); |
| 233 | *p = -overstep; | 266 | __this_cpu_write(*p, -overstep); |
| 234 | } | 267 | } |
| 235 | } | 268 | } |
| 236 | 269 | ||
| @@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state); | |||
| 242 | 275 | ||
| 243 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 276 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
| 244 | { | 277 | { |
| 245 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); | 278 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
| 246 | s8 *p = pcp->vm_stat_diff + item; | 279 | s8 __percpu *p = pcp->vm_stat_diff + item; |
| 280 | s8 v, t; | ||
| 247 | 281 | ||
| 248 | (*p)--; | 282 | v = __this_cpu_dec_return(*p); |
| 283 | t = __this_cpu_read(pcp->stat_threshold); | ||
| 284 | if (unlikely(v < - t)) { | ||
| 285 | s8 overstep = t >> 1; | ||
| 249 | 286 | ||
| 250 | if (unlikely(*p < - pcp->stat_threshold)) { | 287 | zone_page_state_add(v - overstep, zone, item); |
| 251 | int overstep = pcp->stat_threshold / 2; | 288 | __this_cpu_write(*p, overstep); |
| 252 | |||
| 253 | zone_page_state_add(*p - overstep, zone, item); | ||
| 254 | *p = overstep; | ||
| 255 | } | 289 | } |
| 256 | } | 290 | } |
| 257 | 291 | ||
| @@ -261,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
| 261 | } | 295 | } |
| 262 | EXPORT_SYMBOL(__dec_zone_page_state); | 296 | EXPORT_SYMBOL(__dec_zone_page_state); |
| 263 | 297 | ||
| 298 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
| 299 | /* | ||
| 300 | * If we have cmpxchg_local support then we do not need to incur the overhead | ||
| 301 | * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. | ||
| 302 | * | ||
| 303 | * mod_state() modifies the zone counter state through atomic per cpu | ||
| 304 | * operations. | ||
| 305 | * | ||
| 306 | * Overstep mode specifies how overstep should handled: | ||
| 307 | * 0 No overstepping | ||
| 308 | * 1 Overstepping half of threshold | ||
| 309 | * -1 Overstepping minus half of threshold | ||
| 310 | */ | ||
| 311 | static inline void mod_state(struct zone *zone, | ||
| 312 | enum zone_stat_item item, int delta, int overstep_mode) | ||
| 313 | { | ||
| 314 | struct per_cpu_pageset __percpu *pcp = zone->pageset; | ||
| 315 | s8 __percpu *p = pcp->vm_stat_diff + item; | ||
| 316 | long o, n, t, z; | ||
| 317 | |||
| 318 | do { | ||
| 319 | z = 0; /* overflow to zone counters */ | ||
| 320 | |||
| 321 | /* | ||
| 322 | * The fetching of the stat_threshold is racy. We may apply | ||
| 323 | * a counter threshold to the wrong cpu if we get | ||
| 324 | * rescheduled while executing here. However, the following | ||
| 325 | * will apply the threshold again and therefore bring the | ||
| 326 | * counter under the threshold. | ||
| 327 | */ | ||
| 328 | t = this_cpu_read(pcp->stat_threshold); | ||
| 329 | |||
| 330 | o = this_cpu_read(*p); | ||
| 331 | n = delta + o; | ||
| 332 | |||
| 333 | if (n > t || n < -t) { | ||
| 334 | int os = overstep_mode * (t >> 1) ; | ||
| 335 | |||
| 336 | /* Overflow must be added to zone counters */ | ||
| 337 | z = n + os; | ||
| 338 | n = -os; | ||
| 339 | } | ||
| 340 | } while (this_cpu_cmpxchg(*p, o, n) != o); | ||
| 341 | |||
| 342 | if (z) | ||
| 343 | zone_page_state_add(z, zone, item); | ||
| 344 | } | ||
| 345 | |||
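
The new mod_state() avoids local_irq_save() entirely by retrying with this_cpu_cmpxchg() until the buffered delta is updated atomically. The same retry-until-unchanged shape is sketched below against C11 <stdatomic.h>; it uses a single shared atomic rather than a true per-cpu slot and ignores preemption and CPU migration, so it is only a shape-of-the-loop illustration, not the kernel code.

	#include <stdatomic.h>

	/*
	 * Compare-and-swap loop mirroring mod_state(): compute the new buffered
	 * value from a snapshot, retry if another update raced in, and spill any
	 * overflow to the global counter only after the CAS has succeeded.
	 * overstep_mode: 0 = none, 1 = overstep by +t/2, -1 = overstep by -t/2.
	 */
	static void mod_state_cas_model(_Atomic long *global, _Atomic int *diff,
					int threshold, int delta, int overstep_mode)
	{
		int o, n;
		long z;

		do {
			z = 0;
			o = atomic_load(diff);
			n = delta + o;

			if (n > threshold || n < -threshold) {
				int os = overstep_mode * (threshold / 2);

				z = n + os;	/* overflow destined for the global counter */
				n = -os;
			}
		} while (!atomic_compare_exchange_weak(diff, &o, n));

		if (z)
			atomic_fetch_add(global, z);
	}

The kernel version additionally re-reads pcp->stat_threshold inside the loop because, as the comment above notes, the task may be rescheduled onto another CPU between iterations.
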
| 346 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
| 347 | int delta) | ||
| 348 | { | ||
| 349 | mod_state(zone, item, delta, 0); | ||
| 350 | } | ||
| 351 | EXPORT_SYMBOL(mod_zone_page_state); | ||
| 352 | |||
| 353 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
| 354 | { | ||
| 355 | mod_state(zone, item, 1, 1); | ||
| 356 | } | ||
| 357 | |||
| 358 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) | ||
| 359 | { | ||
| 360 | mod_state(page_zone(page), item, 1, 1); | ||
| 361 | } | ||
| 362 | EXPORT_SYMBOL(inc_zone_page_state); | ||
| 363 | |||
| 364 | void dec_zone_page_state(struct page *page, enum zone_stat_item item) | ||
| 365 | { | ||
| 366 | mod_state(page_zone(page), item, -1, -1); | ||
| 367 | } | ||
| 368 | EXPORT_SYMBOL(dec_zone_page_state); | ||
| 369 | #else | ||
| 370 | /* | ||
| 371 | * Use interrupt disable to serialize counter updates | ||
| 372 | */ | ||
| 373 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
| 374 | int delta) | ||
| 375 | { | ||
| 376 | unsigned long flags; | ||
| 377 | |||
| 378 | local_irq_save(flags); | ||
| 379 | __mod_zone_page_state(zone, item, delta); | ||
| 380 | local_irq_restore(flags); | ||
| 381 | } | ||
| 382 | EXPORT_SYMBOL(mod_zone_page_state); | ||
| 383 | |||
| 264 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | 384 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) |
| 265 | { | 385 | { |
| 266 | unsigned long flags; | 386 | unsigned long flags; |
| @@ -291,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
| 291 | local_irq_restore(flags); | 411 | local_irq_restore(flags); |
| 292 | } | 412 | } |
| 293 | EXPORT_SYMBOL(dec_zone_page_state); | 413 | EXPORT_SYMBOL(dec_zone_page_state); |
| 414 | #endif | ||
| 294 | 415 | ||
| 295 | /* | 416 | /* |
| 296 | * Update the zone counters for one cpu. | 417 | * Update the zone counters for one cpu. |
| @@ -750,8 +871,6 @@ static const char * const vmstat_text[] = { | |||
| 750 | "nr_shmem", | 871 | "nr_shmem", |
| 751 | "nr_dirtied", | 872 | "nr_dirtied", |
| 752 | "nr_written", | 873 | "nr_written", |
| 753 | "nr_dirty_threshold", | ||
| 754 | "nr_dirty_background_threshold", | ||
| 755 | 874 | ||
| 756 | #ifdef CONFIG_NUMA | 875 | #ifdef CONFIG_NUMA |
| 757 | "numa_hit", | 876 | "numa_hit", |
| @@ -761,6 +880,9 @@ static const char * const vmstat_text[] = { | |||
| 761 | "numa_local", | 880 | "numa_local", |
| 762 | "numa_other", | 881 | "numa_other", |
| 763 | #endif | 882 | #endif |
| 883 | "nr_anon_transparent_hugepages", | ||
| 884 | "nr_dirty_threshold", | ||
| 885 | "nr_dirty_background_threshold", | ||
| 764 | 886 | ||
| 765 | #ifdef CONFIG_VM_EVENT_COUNTERS | 887 | #ifdef CONFIG_VM_EVENT_COUNTERS |
| 766 | "pgpgin", | 888 | "pgpgin", |
| @@ -834,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 834 | "\n scanned %lu" | 956 | "\n scanned %lu" |
| 835 | "\n spanned %lu" | 957 | "\n spanned %lu" |
| 836 | "\n present %lu", | 958 | "\n present %lu", |
| 837 | zone_nr_free_pages(zone), | 959 | zone_page_state(zone, NR_FREE_PAGES), |
| 838 | min_wmark_pages(zone), | 960 | min_wmark_pages(zone), |
| 839 | low_wmark_pages(zone), | 961 | low_wmark_pages(zone), |
| 840 | high_wmark_pages(zone), | 962 | high_wmark_pages(zone), |
| @@ -1033,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
| 1033 | break; | 1155 | break; |
| 1034 | case CPU_DOWN_PREPARE: | 1156 | case CPU_DOWN_PREPARE: |
| 1035 | case CPU_DOWN_PREPARE_FROZEN: | 1157 | case CPU_DOWN_PREPARE_FROZEN: |
| 1036 | cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); | 1158 | cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); |
| 1037 | per_cpu(vmstat_work, cpu).work.func = NULL; | 1159 | per_cpu(vmstat_work, cpu).work.func = NULL; |
| 1038 | break; | 1160 | break; |
| 1039 | case CPU_DOWN_FAILED: | 1161 | case CPU_DOWN_FAILED: |
