Diffstat (limited to 'mm')

-rw-r--r-- mm/Kconfig | 32
-rw-r--r-- mm/Makefile | 7
-rw-r--r-- mm/backing-dev.c | 117
-rw-r--r-- mm/cleancache.c | 2
-rw-r--r-- mm/cma.c | 2
-rw-r--r-- mm/compaction.c | 181
-rw-r--r-- mm/debug.c | 4
-rw-r--r-- mm/fadvise.c | 10
-rw-r--r-- mm/filemap.c | 30
-rw-r--r-- mm/filemap_xip.c | 478
-rw-r--r-- mm/fremap.c | 283
-rw-r--r-- mm/gup.c | 242
-rw-r--r-- mm/huge_memory.c | 156
-rw-r--r-- mm/hugetlb.c | 160
-rw-r--r-- mm/hugetlb_cgroup.c | 2
-rw-r--r-- mm/internal.h | 28
-rw-r--r-- mm/interval_tree.c | 34
-rw-r--r-- mm/iov_iter.c | 17
-rw-r--r-- mm/kasan/Makefile | 8
-rw-r--r-- mm/kasan/kasan.c | 516
-rw-r--r-- mm/kasan/kasan.h | 75
-rw-r--r-- mm/kasan/report.c | 269
-rw-r--r-- mm/kmemleak.c | 6
-rw-r--r-- mm/ksm.c | 2
-rw-r--r-- mm/list_lru.c | 467
-rw-r--r-- mm/madvise.c | 32
-rw-r--r-- mm/memcontrol.c | 1073
-rw-r--r-- mm/memory-failure.c | 13
-rw-r--r-- mm/memory.c | 355
-rw-r--r-- mm/mempolicy.c | 286
-rw-r--r-- mm/migrate.c | 45
-rw-r--r-- mm/mincore.c | 175
-rw-r--r-- mm/mm_init.c | 4
-rw-r--r-- mm/mmap.c | 100
-rw-r--r-- mm/mmzone.c | 4
-rw-r--r-- mm/mprotect.c | 50
-rw-r--r-- mm/mremap.c | 2
-rw-r--r-- mm/msync.c | 5
-rw-r--r-- mm/nommu.c | 118
-rw-r--r-- mm/oom_kill.c | 169
-rw-r--r-- mm/page-writeback.c | 46
-rw-r--r-- mm/page_alloc.c | 471
-rw-r--r-- mm/page_counter.c | 7
-rw-r--r-- mm/page_io.c | 9
-rw-r--r-- mm/page_owner.c | 26
-rw-r--r-- mm/pagewalk.c | 238
-rw-r--r-- mm/percpu.c | 6
-rw-r--r-- mm/pgtable-generic.c | 2
-rw-r--r-- mm/process_vm_access.c | 7
-rw-r--r-- mm/readahead.c | 4
-rw-r--r-- mm/rmap.c | 237
-rw-r--r-- mm/shmem.c | 34
-rw-r--r-- mm/slab.c | 17
-rw-r--r-- mm/slab.h | 67
-rw-r--r-- mm/slab_common.c | 323
-rw-r--r-- mm/slob.c | 2
-rw-r--r-- mm/slub.c | 232
-rw-r--r-- mm/swap.c | 6
-rw-r--r-- mm/swap_state.c | 6
-rw-r--r-- mm/truncate.c | 2
-rw-r--r-- mm/util.c | 48
-rw-r--r-- mm/vmalloc.c | 16
-rw-r--r-- mm/vmscan.c | 121
-rw-r--r-- mm/vmstat.c | 130
-rw-r--r-- mm/workingset.c | 9
-rw-r--r-- mm/zbud.c | 3
-rw-r--r-- mm/zpool.c | 6
-rw-r--r-- mm/zsmalloc.c | 239
-rw-r--r-- mm/zswap.c | 5

69 files changed, 4547 insertions(+), 3331 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4395b12869c8..a03131b6ba8e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -129,28 +129,28 @@ config SPARSEMEM_VMEMMAP | |||
| 129 | efficient option when sufficient kernel resources are available. | 129 | efficient option when sufficient kernel resources are available. |
| 130 | 130 | ||
| 131 | config HAVE_MEMBLOCK | 131 | config HAVE_MEMBLOCK |
| 132 | boolean | 132 | bool |
| 133 | 133 | ||
| 134 | config HAVE_MEMBLOCK_NODE_MAP | 134 | config HAVE_MEMBLOCK_NODE_MAP |
| 135 | boolean | 135 | bool |
| 136 | 136 | ||
| 137 | config HAVE_MEMBLOCK_PHYS_MAP | 137 | config HAVE_MEMBLOCK_PHYS_MAP |
| 138 | boolean | 138 | bool |
| 139 | 139 | ||
| 140 | config HAVE_GENERIC_RCU_GUP | 140 | config HAVE_GENERIC_RCU_GUP |
| 141 | boolean | 141 | bool |
| 142 | 142 | ||
| 143 | config ARCH_DISCARD_MEMBLOCK | 143 | config ARCH_DISCARD_MEMBLOCK |
| 144 | boolean | 144 | bool |
| 145 | 145 | ||
| 146 | config NO_BOOTMEM | 146 | config NO_BOOTMEM |
| 147 | boolean | 147 | bool |
| 148 | 148 | ||
| 149 | config MEMORY_ISOLATION | 149 | config MEMORY_ISOLATION |
| 150 | boolean | 150 | bool |
| 151 | 151 | ||
| 152 | config MOVABLE_NODE | 152 | config MOVABLE_NODE |
| 153 | boolean "Enable to assign a node which has only movable memory" | 153 | bool "Enable to assign a node which has only movable memory" |
| 154 | depends on HAVE_MEMBLOCK | 154 | depends on HAVE_MEMBLOCK |
| 155 | depends on NO_BOOTMEM | 155 | depends on NO_BOOTMEM |
| 156 | depends on X86_64 | 156 | depends on X86_64 |
| @@ -228,12 +228,12 @@ config SPLIT_PTLOCK_CPUS | |||
| 228 | default "4" | 228 | default "4" |
| 229 | 229 | ||
| 230 | config ARCH_ENABLE_SPLIT_PMD_PTLOCK | 230 | config ARCH_ENABLE_SPLIT_PMD_PTLOCK |
| 231 | boolean | 231 | bool |
| 232 | 232 | ||
| 233 | # | 233 | # |
| 234 | # support for memory balloon | 234 | # support for memory balloon |
| 235 | config MEMORY_BALLOON | 235 | config MEMORY_BALLOON |
| 236 | boolean | 236 | bool |
| 237 | 237 | ||
| 238 | # | 238 | # |
| 239 | # support for memory balloon compaction | 239 | # support for memory balloon compaction |
| @@ -276,7 +276,7 @@ config MIGRATION | |||
| 276 | allocation instead of reclaiming. | 276 | allocation instead of reclaiming. |
| 277 | 277 | ||
| 278 | config ARCH_ENABLE_HUGEPAGE_MIGRATION | 278 | config ARCH_ENABLE_HUGEPAGE_MIGRATION |
| 279 | boolean | 279 | bool |
| 280 | 280 | ||
| 281 | config PHYS_ADDR_T_64BIT | 281 | config PHYS_ADDR_T_64BIT |
| 282 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | 282 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT |
| @@ -602,6 +602,16 @@ config PGTABLE_MAPPING | |||
| 602 | You can check speed with zsmalloc benchmark: | 602 | You can check speed with zsmalloc benchmark: |
| 603 | https://github.com/spartacus06/zsmapbench | 603 | https://github.com/spartacus06/zsmapbench |
| 604 | 604 | ||
| 605 | config ZSMALLOC_STAT | ||
| 606 | bool "Export zsmalloc statistics" | ||
| 607 | depends on ZSMALLOC | ||
| 608 | select DEBUG_FS | ||
| 609 | help | ||
| 610 | This option enables code in the zsmalloc to collect various | ||
| 611 | statistics about whats happening in zsmalloc and exports that | ||
| 612 | information to userspace via debugfs. | ||
| 613 | If unsure, say N. | ||
| 614 | |||
| 605 | config GENERIC_EARLY_IOREMAP | 615 | config GENERIC_EARLY_IOREMAP |
| 606 | bool | 616 | bool |
| 607 | 617 | ||
diff --git a/mm/Makefile b/mm/Makefile
index 4bf586e66378..3c1caa2693bd 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -2,8 +2,11 @@ | |||
| 2 | # Makefile for the linux memory manager. | 2 | # Makefile for the linux memory manager. |
| 3 | # | 3 | # |
| 4 | 4 | ||
| 5 | KASAN_SANITIZE_slab_common.o := n | ||
| 6 | KASAN_SANITIZE_slub.o := n | ||
| 7 | |||
| 5 | mmu-y := nommu.o | 8 | mmu-y := nommu.o |
| 6 | mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ | 9 | mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ |
| 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 10 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
| 8 | vmalloc.o pagewalk.o pgtable-generic.o | 11 | vmalloc.o pagewalk.o pgtable-generic.o |
| 9 | 12 | ||
| @@ -49,9 +52,9 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | |||
| 49 | obj-$(CONFIG_SLAB) += slab.o | 52 | obj-$(CONFIG_SLAB) += slab.o |
| 50 | obj-$(CONFIG_SLUB) += slub.o | 53 | obj-$(CONFIG_SLUB) += slub.o |
| 51 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | 54 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o |
| 55 | obj-$(CONFIG_KASAN) += kasan/ | ||
| 52 | obj-$(CONFIG_FAILSLAB) += failslab.o | 56 | obj-$(CONFIG_FAILSLAB) += failslab.o |
| 53 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 57 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
| 54 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | ||
| 55 | obj-$(CONFIG_MIGRATION) += migrate.o | 58 | obj-$(CONFIG_MIGRATION) += migrate.o |
| 56 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 59 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
| 57 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 60 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df55000b..6dc4580df2af 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
| @@ -14,19 +14,10 @@ | |||
| 14 | 14 | ||
| 15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); | 15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); |
| 16 | 16 | ||
| 17 | struct backing_dev_info default_backing_dev_info = { | ||
| 18 | .name = "default", | ||
| 19 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, | ||
| 20 | .state = 0, | ||
| 21 | .capabilities = BDI_CAP_MAP_COPY, | ||
| 22 | }; | ||
| 23 | EXPORT_SYMBOL_GPL(default_backing_dev_info); | ||
| 24 | |||
| 25 | struct backing_dev_info noop_backing_dev_info = { | 17 | struct backing_dev_info noop_backing_dev_info = { |
| 26 | .name = "noop", | 18 | .name = "noop", |
| 27 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 19 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
| 28 | }; | 20 | }; |
| 29 | EXPORT_SYMBOL_GPL(noop_backing_dev_info); | ||
| 30 | 21 | ||
| 31 | static struct class *bdi_class; | 22 | static struct class *bdi_class; |
| 32 | 23 | ||
| @@ -40,17 +31,6 @@ LIST_HEAD(bdi_list); | |||
| 40 | /* bdi_wq serves all asynchronous writeback tasks */ | 31 | /* bdi_wq serves all asynchronous writeback tasks */ |
| 41 | struct workqueue_struct *bdi_wq; | 32 | struct workqueue_struct *bdi_wq; |
| 42 | 33 | ||
| 43 | static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
| 44 | { | ||
| 45 | if (wb1 < wb2) { | ||
| 46 | spin_lock(&wb1->list_lock); | ||
| 47 | spin_lock_nested(&wb2->list_lock, 1); | ||
| 48 | } else { | ||
| 49 | spin_lock(&wb2->list_lock); | ||
| 50 | spin_lock_nested(&wb1->list_lock, 1); | ||
| 51 | } | ||
| 52 | } | ||
| 53 | |||
| 54 | #ifdef CONFIG_DEBUG_FS | 34 | #ifdef CONFIG_DEBUG_FS |
| 55 | #include <linux/debugfs.h> | 35 | #include <linux/debugfs.h> |
| 56 | #include <linux/seq_file.h> | 36 | #include <linux/seq_file.h> |
| @@ -69,10 +49,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
| 69 | unsigned long background_thresh; | 49 | unsigned long background_thresh; |
| 70 | unsigned long dirty_thresh; | 50 | unsigned long dirty_thresh; |
| 71 | unsigned long bdi_thresh; | 51 | unsigned long bdi_thresh; |
| 72 | unsigned long nr_dirty, nr_io, nr_more_io; | 52 | unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; |
| 73 | struct inode *inode; | 53 | struct inode *inode; |
| 74 | 54 | ||
| 75 | nr_dirty = nr_io = nr_more_io = 0; | 55 | nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; |
| 76 | spin_lock(&wb->list_lock); | 56 | spin_lock(&wb->list_lock); |
| 77 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 57 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
| 78 | nr_dirty++; | 58 | nr_dirty++; |
| @@ -80,6 +60,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
| 80 | nr_io++; | 60 | nr_io++; |
| 81 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 61 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
| 82 | nr_more_io++; | 62 | nr_more_io++; |
| 63 | list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) | ||
| 64 | if (inode->i_state & I_DIRTY_TIME) | ||
| 65 | nr_dirty_time++; | ||
| 83 | spin_unlock(&wb->list_lock); | 66 | spin_unlock(&wb->list_lock); |
| 84 | 67 | ||
| 85 | global_dirty_limits(&background_thresh, &dirty_thresh); | 68 | global_dirty_limits(&background_thresh, &dirty_thresh); |
| @@ -98,6 +81,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
| 98 | "b_dirty: %10lu\n" | 81 | "b_dirty: %10lu\n" |
| 99 | "b_io: %10lu\n" | 82 | "b_io: %10lu\n" |
| 100 | "b_more_io: %10lu\n" | 83 | "b_more_io: %10lu\n" |
| 84 | "b_dirty_time: %10lu\n" | ||
| 101 | "bdi_list: %10u\n" | 85 | "bdi_list: %10u\n" |
| 102 | "state: %10lx\n", | 86 | "state: %10lx\n", |
| 103 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 87 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
| @@ -111,6 +95,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
| 111 | nr_dirty, | 95 | nr_dirty, |
| 112 | nr_io, | 96 | nr_io, |
| 113 | nr_more_io, | 97 | nr_more_io, |
| 98 | nr_dirty_time, | ||
| 114 | !list_empty(&bdi->bdi_list), bdi->state); | 99 | !list_empty(&bdi->bdi_list), bdi->state); |
| 115 | #undef K | 100 | #undef K |
| 116 | 101 | ||
| @@ -264,9 +249,6 @@ static int __init default_bdi_init(void) | |||
| 264 | if (!bdi_wq) | 249 | if (!bdi_wq) |
| 265 | return -ENOMEM; | 250 | return -ENOMEM; |
| 266 | 251 | ||
| 267 | err = bdi_init(&default_backing_dev_info); | ||
| 268 | if (!err) | ||
| 269 | bdi_register(&default_backing_dev_info, NULL, "default"); | ||
| 270 | err = bdi_init(&noop_backing_dev_info); | 252 | err = bdi_init(&noop_backing_dev_info); |
| 271 | 253 | ||
| 272 | return err; | 254 | return err; |
| @@ -355,19 +337,19 @@ EXPORT_SYMBOL(bdi_register_dev); | |||
| 355 | */ | 337 | */ |
| 356 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) | 338 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) |
| 357 | { | 339 | { |
| 358 | if (!bdi_cap_writeback_dirty(bdi)) | 340 | /* Make sure nobody queues further work */ |
| 341 | spin_lock_bh(&bdi->wb_lock); | ||
| 342 | if (!test_and_clear_bit(BDI_registered, &bdi->state)) { | ||
| 343 | spin_unlock_bh(&bdi->wb_lock); | ||
| 359 | return; | 344 | return; |
| 345 | } | ||
| 346 | spin_unlock_bh(&bdi->wb_lock); | ||
| 360 | 347 | ||
| 361 | /* | 348 | /* |
| 362 | * Make sure nobody finds us on the bdi_list anymore | 349 | * Make sure nobody finds us on the bdi_list anymore |
| 363 | */ | 350 | */ |
| 364 | bdi_remove_from_list(bdi); | 351 | bdi_remove_from_list(bdi); |
| 365 | 352 | ||
| 366 | /* Make sure nobody queues further work */ | ||
| 367 | spin_lock_bh(&bdi->wb_lock); | ||
| 368 | clear_bit(BDI_registered, &bdi->state); | ||
| 369 | spin_unlock_bh(&bdi->wb_lock); | ||
| 370 | |||
| 371 | /* | 353 | /* |
| 372 | * Drain work list and shutdown the delayed_work. At this point, | 354 | * Drain work list and shutdown the delayed_work. At this point, |
| 373 | * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi | 355 | * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi |
| @@ -375,37 +357,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) | |||
| 375 | */ | 357 | */ |
| 376 | mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); | 358 | mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); |
| 377 | flush_delayed_work(&bdi->wb.dwork); | 359 | flush_delayed_work(&bdi->wb.dwork); |
| 378 | WARN_ON(!list_empty(&bdi->work_list)); | ||
| 379 | WARN_ON(delayed_work_pending(&bdi->wb.dwork)); | ||
| 380 | } | 360 | } |
| 381 | 361 | ||
| 382 | /* | 362 | /* |
| 383 | * This bdi is going away now, make sure that no super_blocks point to it | 363 | * Called when the device behind @bdi has been removed or ejected. |
| 364 | * | ||
| 365 | * We can't really do much here except for reducing the dirty ratio at | ||
| 366 | * the moment. In the future we should be able to set a flag so that | ||
| 367 | * the filesystem can handle errors at mark_inode_dirty time instead | ||
| 368 | * of only at writeback time. | ||
| 384 | */ | 369 | */ |
| 385 | static void bdi_prune_sb(struct backing_dev_info *bdi) | ||
| 386 | { | ||
| 387 | struct super_block *sb; | ||
| 388 | |||
| 389 | spin_lock(&sb_lock); | ||
| 390 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
| 391 | if (sb->s_bdi == bdi) | ||
| 392 | sb->s_bdi = &default_backing_dev_info; | ||
| 393 | } | ||
| 394 | spin_unlock(&sb_lock); | ||
| 395 | } | ||
| 396 | |||
| 397 | void bdi_unregister(struct backing_dev_info *bdi) | 370 | void bdi_unregister(struct backing_dev_info *bdi) |
| 398 | { | 371 | { |
| 399 | if (bdi->dev) { | 372 | if (WARN_ON_ONCE(!bdi->dev)) |
| 400 | bdi_set_min_ratio(bdi, 0); | 373 | return; |
| 401 | trace_writeback_bdi_unregister(bdi); | ||
| 402 | bdi_prune_sb(bdi); | ||
| 403 | 374 | ||
| 404 | bdi_wb_shutdown(bdi); | 375 | bdi_set_min_ratio(bdi, 0); |
| 405 | bdi_debug_unregister(bdi); | ||
| 406 | device_unregister(bdi->dev); | ||
| 407 | bdi->dev = NULL; | ||
| 408 | } | ||
| 409 | } | 376 | } |
| 410 | EXPORT_SYMBOL(bdi_unregister); | 377 | EXPORT_SYMBOL(bdi_unregister); |
| 411 | 378 | ||
| @@ -418,6 +385,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | |||
| 418 | INIT_LIST_HEAD(&wb->b_dirty); | 385 | INIT_LIST_HEAD(&wb->b_dirty); |
| 419 | INIT_LIST_HEAD(&wb->b_io); | 386 | INIT_LIST_HEAD(&wb->b_io); |
| 420 | INIT_LIST_HEAD(&wb->b_more_io); | 387 | INIT_LIST_HEAD(&wb->b_more_io); |
| 388 | INIT_LIST_HEAD(&wb->b_dirty_time); | ||
| 421 | spin_lock_init(&wb->list_lock); | 389 | spin_lock_init(&wb->list_lock); |
| 422 | INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); | 390 | INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); |
| 423 | } | 391 | } |
| @@ -474,37 +442,19 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
| 474 | { | 442 | { |
| 475 | int i; | 443 | int i; |
| 476 | 444 | ||
| 477 | /* | 445 | bdi_wb_shutdown(bdi); |
| 478 | * Splice our entries to the default_backing_dev_info. This | ||
| 479 | * condition shouldn't happen. @wb must be empty at this point and | ||
| 480 | * dirty inodes on it might cause other issues. This workaround is | ||
| 481 | * added by ce5f8e779519 ("writeback: splice dirty inode entries to | ||
| 482 | * default bdi on bdi_destroy()") without root-causing the issue. | ||
| 483 | * | ||
| 484 | * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com | ||
| 485 | * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350 | ||
| 486 | * | ||
| 487 | * We should probably add WARN_ON() to find out whether it still | ||
| 488 | * happens and track it down if so. | ||
| 489 | */ | ||
| 490 | if (bdi_has_dirty_io(bdi)) { | ||
| 491 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | ||
| 492 | |||
| 493 | bdi_lock_two(&bdi->wb, dst); | ||
| 494 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | ||
| 495 | list_splice(&bdi->wb.b_io, &dst->b_io); | ||
| 496 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | ||
| 497 | spin_unlock(&bdi->wb.list_lock); | ||
| 498 | spin_unlock(&dst->list_lock); | ||
| 499 | } | ||
| 500 | |||
| 501 | bdi_unregister(bdi); | ||
| 502 | 446 | ||
| 447 | WARN_ON(!list_empty(&bdi->work_list)); | ||
| 503 | WARN_ON(delayed_work_pending(&bdi->wb.dwork)); | 448 | WARN_ON(delayed_work_pending(&bdi->wb.dwork)); |
| 504 | 449 | ||
| 450 | if (bdi->dev) { | ||
| 451 | bdi_debug_unregister(bdi); | ||
| 452 | device_unregister(bdi->dev); | ||
| 453 | bdi->dev = NULL; | ||
| 454 | } | ||
| 455 | |||
| 505 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 456 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
| 506 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 457 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
| 507 | |||
| 508 | fprop_local_destroy_percpu(&bdi->completions); | 458 | fprop_local_destroy_percpu(&bdi->completions); |
| 509 | } | 459 | } |
| 510 | EXPORT_SYMBOL(bdi_destroy); | 460 | EXPORT_SYMBOL(bdi_destroy); |
| @@ -513,13 +463,12 @@ EXPORT_SYMBOL(bdi_destroy); | |||
| 513 | * For use from filesystems to quickly init and register a bdi associated | 463 | * For use from filesystems to quickly init and register a bdi associated |
| 514 | * with dirty writeback | 464 | * with dirty writeback |
| 515 | */ | 465 | */ |
| 516 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, | 466 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) |
| 517 | unsigned int cap) | ||
| 518 | { | 467 | { |
| 519 | int err; | 468 | int err; |
| 520 | 469 | ||
| 521 | bdi->name = name; | 470 | bdi->name = name; |
| 522 | bdi->capabilities = cap; | 471 | bdi->capabilities = 0; |
| 523 | err = bdi_init(bdi); | 472 | err = bdi_init(bdi); |
| 524 | if (err) | 473 | if (err) |
| 525 | return err; | 474 | return err; |
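
The backing-dev.c hunks above add a b_dirty_time line to the per-BDI debugfs stats output, counting inodes that carry I_DIRTY_TIME on the new b_dirty_time list. A minimal user-space sketch for looking at that file follows; the debugfs mount point and the "8:0" BDI directory name are assumptions about the running system, not something this diff defines.

/* Sketch: dump a BDI's debugfs stats file, which after this patch also
 * carries a "b_dirty_time" line.  The path assumes debugfs is mounted at
 * /sys/kernel/debug and that the BDI directory is named "8:0" (first
 * SCSI/SATA disk); pass a different name as argv[1] if needed. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *bdi = (argc > 1) ? argv[1] : "8:0";	/* assumed default */
	char path[256];
	char line[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/debug/bdi/%s/stats", bdi);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* includes the new b_dirty_time line */
	fclose(f);
	return EXIT_SUCCESS;
}
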
diff --git a/mm/cleancache.c b/mm/cleancache.c
index d0eac4350403..053bcd8f12fb 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
| @@ -25,7 +25,7 @@ | |||
| 25 | static struct cleancache_ops *cleancache_ops __read_mostly; | 25 | static struct cleancache_ops *cleancache_ops __read_mostly; |
| 26 | 26 | ||
| 27 | /* | 27 | /* |
| 28 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | 28 | * Counters available via /sys/kernel/debug/cleancache (if debugfs is |
| 29 | * properly configured. These are for information only so are not protected | 29 | * properly configured. These are for information only so are not protected |
| 30 | * against increment races. | 30 | * against increment races. |
| 31 | */ | 31 | */ |
diff --git a/mm/cma.c b/mm/cma.c
--- a/mm/cma.c
+++ b/mm/cma.c
| @@ -199,6 +199,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | |||
| 199 | cma->order_per_bit = order_per_bit; | 199 | cma->order_per_bit = order_per_bit; |
| 200 | *res_cma = cma; | 200 | *res_cma = cma; |
| 201 | cma_area_count++; | 201 | cma_area_count++; |
| 202 | totalcma_pages += (size / PAGE_SIZE); | ||
| 202 | 203 | ||
| 203 | return 0; | 204 | return 0; |
| 204 | } | 205 | } |
| @@ -337,7 +338,6 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
| 337 | if (ret) | 338 | if (ret) |
| 338 | goto err; | 339 | goto err; |
| 339 | 340 | ||
| 340 | totalcma_pages += (size / PAGE_SIZE); | ||
| 341 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, | 341 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, |
| 342 | &base); | 342 | &base); |
| 343 | return 0; | 343 | return 0; |
diff --git a/mm/compaction.c b/mm/compaction.c
index 546e571e9d60..8c0d9459b54a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
| 17 | #include <linux/balloon_compaction.h> | 17 | #include <linux/balloon_compaction.h> |
| 18 | #include <linux/page-isolation.h> | 18 | #include <linux/page-isolation.h> |
| 19 | #include <linux/kasan.h> | ||
| 19 | #include "internal.h" | 20 | #include "internal.h" |
| 20 | 21 | ||
| 21 | #ifdef CONFIG_COMPACTION | 22 | #ifdef CONFIG_COMPACTION |
| @@ -34,6 +35,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta) | |||
| 34 | #endif | 35 | #endif |
| 35 | 36 | ||
| 36 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 37 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
| 38 | #ifdef CONFIG_TRACEPOINTS | ||
| 39 | static const char *const compaction_status_string[] = { | ||
| 40 | "deferred", | ||
| 41 | "skipped", | ||
| 42 | "continue", | ||
| 43 | "partial", | ||
| 44 | "complete", | ||
| 45 | "no_suitable_page", | ||
| 46 | "not_suitable_zone", | ||
| 47 | }; | ||
| 48 | #endif | ||
| 37 | 49 | ||
| 38 | #define CREATE_TRACE_POINTS | 50 | #define CREATE_TRACE_POINTS |
| 39 | #include <trace/events/compaction.h> | 51 | #include <trace/events/compaction.h> |
| @@ -61,6 +73,7 @@ static void map_pages(struct list_head *list) | |||
| 61 | list_for_each_entry(page, list, lru) { | 73 | list_for_each_entry(page, list, lru) { |
| 62 | arch_alloc_page(page, 0); | 74 | arch_alloc_page(page, 0); |
| 63 | kernel_map_pages(page, 1, 1); | 75 | kernel_map_pages(page, 1, 1); |
| 76 | kasan_alloc_pages(page, 0); | ||
| 64 | } | 77 | } |
| 65 | } | 78 | } |
| 66 | 79 | ||
| @@ -113,6 +126,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn, | |||
| 113 | } | 126 | } |
| 114 | 127 | ||
| 115 | #ifdef CONFIG_COMPACTION | 128 | #ifdef CONFIG_COMPACTION |
| 129 | |||
| 130 | /* Do not skip compaction more than 64 times */ | ||
| 131 | #define COMPACT_MAX_DEFER_SHIFT 6 | ||
| 132 | |||
| 133 | /* | ||
| 134 | * Compaction is deferred when compaction fails to result in a page | ||
| 135 | * allocation success. 1 << compact_defer_limit compactions are skipped up | ||
| 136 | * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT | ||
| 137 | */ | ||
| 138 | void defer_compaction(struct zone *zone, int order) | ||
| 139 | { | ||
| 140 | zone->compact_considered = 0; | ||
| 141 | zone->compact_defer_shift++; | ||
| 142 | |||
| 143 | if (order < zone->compact_order_failed) | ||
| 144 | zone->compact_order_failed = order; | ||
| 145 | |||
| 146 | if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) | ||
| 147 | zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; | ||
| 148 | |||
| 149 | trace_mm_compaction_defer_compaction(zone, order); | ||
| 150 | } | ||
| 151 | |||
| 152 | /* Returns true if compaction should be skipped this time */ | ||
| 153 | bool compaction_deferred(struct zone *zone, int order) | ||
| 154 | { | ||
| 155 | unsigned long defer_limit = 1UL << zone->compact_defer_shift; | ||
| 156 | |||
| 157 | if (order < zone->compact_order_failed) | ||
| 158 | return false; | ||
| 159 | |||
| 160 | /* Avoid possible overflow */ | ||
| 161 | if (++zone->compact_considered > defer_limit) | ||
| 162 | zone->compact_considered = defer_limit; | ||
| 163 | |||
| 164 | if (zone->compact_considered >= defer_limit) | ||
| 165 | return false; | ||
| 166 | |||
| 167 | trace_mm_compaction_deferred(zone, order); | ||
| 168 | |||
| 169 | return true; | ||
| 170 | } | ||
| 171 | |||
| 172 | /* | ||
| 173 | * Update defer tracking counters after successful compaction of given order, | ||
| 174 | * which means an allocation either succeeded (alloc_success == true) or is | ||
| 175 | * expected to succeed. | ||
| 176 | */ | ||
| 177 | void compaction_defer_reset(struct zone *zone, int order, | ||
| 178 | bool alloc_success) | ||
| 179 | { | ||
| 180 | if (alloc_success) { | ||
| 181 | zone->compact_considered = 0; | ||
| 182 | zone->compact_defer_shift = 0; | ||
| 183 | } | ||
| 184 | if (order >= zone->compact_order_failed) | ||
| 185 | zone->compact_order_failed = order + 1; | ||
| 186 | |||
| 187 | trace_mm_compaction_defer_reset(zone, order); | ||
| 188 | } | ||
| 189 | |||
| 190 | /* Returns true if restarting compaction after many failures */ | ||
| 191 | bool compaction_restarting(struct zone *zone, int order) | ||
| 192 | { | ||
| 193 | if (order < zone->compact_order_failed) | ||
| 194 | return false; | ||
| 195 | |||
| 196 | return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && | ||
| 197 | zone->compact_considered >= 1UL << zone->compact_defer_shift; | ||
| 198 | } | ||
| 199 | |||
| 116 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | 200 | /* Returns true if the pageblock should be scanned for pages to isolate. */ |
| 117 | static inline bool isolation_suitable(struct compact_control *cc, | 201 | static inline bool isolation_suitable(struct compact_control *cc, |
| 118 | struct page *page) | 202 | struct page *page) |
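
The deferral helpers moved into mm/compaction.c above implement an exponential backoff: every failed compaction bumps compact_defer_shift (capped at COMPACT_MAX_DEFER_SHIFT = 6) and the next 1 << compact_defer_shift attempts are skipped. The stand-alone sketch below mirrors only that counter arithmetic (the per-order compact_order_failed tracking and the tracepoints are left out) to show how quickly the skip window grows across repeated failures.

/* Stand-alone illustration of the deferral bookkeeping moved into
 * mm/compaction.c above: each failure widens the skip window to
 * 1 << compact_defer_shift attempts, capped at 1 << 6 = 64. */
#include <stdbool.h>
#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6

struct fake_zone {		/* only the fields the defer logic touches */
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
};

static void defer_compaction(struct fake_zone *z)
{
	z->compact_considered = 0;
	if (++z->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		z->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

static bool compaction_deferred(struct fake_zone *z)
{
	unsigned long limit = 1UL << z->compact_defer_shift;

	if (++z->compact_considered > limit)
		z->compact_considered = limit;
	return z->compact_considered < limit;	/* true: skip this attempt */
}

int main(void)
{
	struct fake_zone z = { 0, 0 };
	int failure;

	for (failure = 1; failure <= 8; failure++) {
		int skipped = 0;

		defer_compaction(&z);		/* pretend compaction just failed */
		while (compaction_deferred(&z))
			skipped++;		/* attempts skipped before retrying */
		printf("after failure %d: defer_shift=%u, skipped=%d\n",
		       failure, z.compact_defer_shift, skipped);
	}
	return 0;
}
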
| @@ -408,6 +492,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
| 408 | 492 | ||
| 409 | /* If a page was split, advance to the end of it */ | 493 | /* If a page was split, advance to the end of it */ |
| 410 | if (isolated) { | 494 | if (isolated) { |
| 495 | cc->nr_freepages += isolated; | ||
| 496 | if (!strict && | ||
| 497 | cc->nr_migratepages <= cc->nr_freepages) { | ||
| 498 | blockpfn += isolated; | ||
| 499 | break; | ||
| 500 | } | ||
| 501 | |||
| 411 | blockpfn += isolated - 1; | 502 | blockpfn += isolated - 1; |
| 412 | cursor += isolated - 1; | 503 | cursor += isolated - 1; |
| 413 | continue; | 504 | continue; |
| @@ -421,11 +512,12 @@ isolate_fail: | |||
| 421 | 512 | ||
| 422 | } | 513 | } |
| 423 | 514 | ||
| 515 | trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, | ||
| 516 | nr_scanned, total_isolated); | ||
| 517 | |||
| 424 | /* Record how far we have got within the block */ | 518 | /* Record how far we have got within the block */ |
| 425 | *start_pfn = blockpfn; | 519 | *start_pfn = blockpfn; |
| 426 | 520 | ||
| 427 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | ||
| 428 | |||
| 429 | /* | 521 | /* |
| 430 | * If strict isolation is requested by CMA then check that all the | 522 | * If strict isolation is requested by CMA then check that all the |
| 431 | * pages requested were isolated. If there were any failures, 0 is | 523 | * pages requested were isolated. If there were any failures, 0 is |
| @@ -581,6 +673,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
| 581 | unsigned long flags = 0; | 673 | unsigned long flags = 0; |
| 582 | bool locked = false; | 674 | bool locked = false; |
| 583 | struct page *page = NULL, *valid_page = NULL; | 675 | struct page *page = NULL, *valid_page = NULL; |
| 676 | unsigned long start_pfn = low_pfn; | ||
| 584 | 677 | ||
| 585 | /* | 678 | /* |
| 586 | * Ensure that there are not too many pages isolated from the LRU | 679 | * Ensure that there are not too many pages isolated from the LRU |
| @@ -741,7 +834,8 @@ isolate_success: | |||
| 741 | if (low_pfn == end_pfn) | 834 | if (low_pfn == end_pfn) |
| 742 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 835 | update_pageblock_skip(cc, valid_page, nr_isolated, true); |
| 743 | 836 | ||
| 744 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 837 | trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, |
| 838 | nr_scanned, nr_isolated); | ||
| 745 | 839 | ||
| 746 | count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); | 840 | count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); |
| 747 | if (nr_isolated) | 841 | if (nr_isolated) |
| @@ -814,7 +908,6 @@ static void isolate_freepages(struct compact_control *cc) | |||
| 814 | unsigned long isolate_start_pfn; /* exact pfn we start at */ | 908 | unsigned long isolate_start_pfn; /* exact pfn we start at */ |
| 815 | unsigned long block_end_pfn; /* end of current pageblock */ | 909 | unsigned long block_end_pfn; /* end of current pageblock */ |
| 816 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | 910 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ |
| 817 | int nr_freepages = cc->nr_freepages; | ||
| 818 | struct list_head *freelist = &cc->freepages; | 911 | struct list_head *freelist = &cc->freepages; |
| 819 | 912 | ||
| 820 | /* | 913 | /* |
| @@ -839,11 +932,11 @@ static void isolate_freepages(struct compact_control *cc) | |||
| 839 | * pages on cc->migratepages. We stop searching if the migrate | 932 | * pages on cc->migratepages. We stop searching if the migrate |
| 840 | * and free page scanners meet or enough free pages are isolated. | 933 | * and free page scanners meet or enough free pages are isolated. |
| 841 | */ | 934 | */ |
| 842 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; | 935 | for (; block_start_pfn >= low_pfn && |
| 936 | cc->nr_migratepages > cc->nr_freepages; | ||
| 843 | block_end_pfn = block_start_pfn, | 937 | block_end_pfn = block_start_pfn, |
| 844 | block_start_pfn -= pageblock_nr_pages, | 938 | block_start_pfn -= pageblock_nr_pages, |
| 845 | isolate_start_pfn = block_start_pfn) { | 939 | isolate_start_pfn = block_start_pfn) { |
| 846 | unsigned long isolated; | ||
| 847 | 940 | ||
| 848 | /* | 941 | /* |
| 849 | * This can iterate a massively long zone without finding any | 942 | * This can iterate a massively long zone without finding any |
| @@ -868,9 +961,8 @@ static void isolate_freepages(struct compact_control *cc) | |||
| 868 | continue; | 961 | continue; |
| 869 | 962 | ||
| 870 | /* Found a block suitable for isolating free pages from. */ | 963 | /* Found a block suitable for isolating free pages from. */ |
| 871 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, | 964 | isolate_freepages_block(cc, &isolate_start_pfn, |
| 872 | block_end_pfn, freelist, false); | 965 | block_end_pfn, freelist, false); |
| 873 | nr_freepages += isolated; | ||
| 874 | 966 | ||
| 875 | /* | 967 | /* |
| 876 | * Remember where the free scanner should restart next time, | 968 | * Remember where the free scanner should restart next time, |
| @@ -902,8 +994,6 @@ static void isolate_freepages(struct compact_control *cc) | |||
| 902 | */ | 994 | */ |
| 903 | if (block_start_pfn < low_pfn) | 995 | if (block_start_pfn < low_pfn) |
| 904 | cc->free_pfn = cc->migrate_pfn; | 996 | cc->free_pfn = cc->migrate_pfn; |
| 905 | |||
| 906 | cc->nr_freepages = nr_freepages; | ||
| 907 | } | 997 | } |
| 908 | 998 | ||
| 909 | /* | 999 | /* |
| @@ -1015,8 +1105,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
| 1015 | low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, | 1105 | low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, |
| 1016 | isolate_mode); | 1106 | isolate_mode); |
| 1017 | 1107 | ||
| 1018 | if (!low_pfn || cc->contended) | 1108 | if (!low_pfn || cc->contended) { |
| 1109 | acct_isolated(zone, cc); | ||
| 1019 | return ISOLATE_ABORT; | 1110 | return ISOLATE_ABORT; |
| 1111 | } | ||
| 1020 | 1112 | ||
| 1021 | /* | 1113 | /* |
| 1022 | * Either we isolated something and proceed with migration. Or | 1114 | * Either we isolated something and proceed with migration. Or |
| @@ -1037,7 +1129,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
| 1037 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; | 1129 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; |
| 1038 | } | 1130 | } |
| 1039 | 1131 | ||
| 1040 | static int compact_finished(struct zone *zone, struct compact_control *cc, | 1132 | static int __compact_finished(struct zone *zone, struct compact_control *cc, |
| 1041 | const int migratetype) | 1133 | const int migratetype) |
| 1042 | { | 1134 | { |
| 1043 | unsigned int order; | 1135 | unsigned int order; |
| @@ -1088,11 +1180,24 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
| 1088 | return COMPACT_PARTIAL; | 1180 | return COMPACT_PARTIAL; |
| 1089 | 1181 | ||
| 1090 | /* Job done if allocation would set block type */ | 1182 | /* Job done if allocation would set block type */ |
| 1091 | if (cc->order >= pageblock_order && area->nr_free) | 1183 | if (order >= pageblock_order && area->nr_free) |
| 1092 | return COMPACT_PARTIAL; | 1184 | return COMPACT_PARTIAL; |
| 1093 | } | 1185 | } |
| 1094 | 1186 | ||
| 1095 | return COMPACT_CONTINUE; | 1187 | return COMPACT_NO_SUITABLE_PAGE; |
| 1188 | } | ||
| 1189 | |||
| 1190 | static int compact_finished(struct zone *zone, struct compact_control *cc, | ||
| 1191 | const int migratetype) | ||
| 1192 | { | ||
| 1193 | int ret; | ||
| 1194 | |||
| 1195 | ret = __compact_finished(zone, cc, migratetype); | ||
| 1196 | trace_mm_compaction_finished(zone, cc->order, ret); | ||
| 1197 | if (ret == COMPACT_NO_SUITABLE_PAGE) | ||
| 1198 | ret = COMPACT_CONTINUE; | ||
| 1199 | |||
| 1200 | return ret; | ||
| 1096 | } | 1201 | } |
| 1097 | 1202 | ||
| 1098 | /* | 1203 | /* |
| @@ -1102,7 +1207,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
| 1102 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | 1207 | * COMPACT_PARTIAL - If the allocation would succeed without compaction |
| 1103 | * COMPACT_CONTINUE - If compaction should run now | 1208 | * COMPACT_CONTINUE - If compaction should run now |
| 1104 | */ | 1209 | */ |
| 1105 | unsigned long compaction_suitable(struct zone *zone, int order, | 1210 | static unsigned long __compaction_suitable(struct zone *zone, int order, |
| 1106 | int alloc_flags, int classzone_idx) | 1211 | int alloc_flags, int classzone_idx) |
| 1107 | { | 1212 | { |
| 1108 | int fragindex; | 1213 | int fragindex; |
| @@ -1146,11 +1251,24 @@ unsigned long compaction_suitable(struct zone *zone, int order, | |||
| 1146 | */ | 1251 | */ |
| 1147 | fragindex = fragmentation_index(zone, order); | 1252 | fragindex = fragmentation_index(zone, order); |
| 1148 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | 1253 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) |
| 1149 | return COMPACT_SKIPPED; | 1254 | return COMPACT_NOT_SUITABLE_ZONE; |
| 1150 | 1255 | ||
| 1151 | return COMPACT_CONTINUE; | 1256 | return COMPACT_CONTINUE; |
| 1152 | } | 1257 | } |
| 1153 | 1258 | ||
| 1259 | unsigned long compaction_suitable(struct zone *zone, int order, | ||
| 1260 | int alloc_flags, int classzone_idx) | ||
| 1261 | { | ||
| 1262 | unsigned long ret; | ||
| 1263 | |||
| 1264 | ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); | ||
| 1265 | trace_mm_compaction_suitable(zone, order, ret); | ||
| 1266 | if (ret == COMPACT_NOT_SUITABLE_ZONE) | ||
| 1267 | ret = COMPACT_SKIPPED; | ||
| 1268 | |||
| 1269 | return ret; | ||
| 1270 | } | ||
| 1271 | |||
| 1154 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 1272 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
| 1155 | { | 1273 | { |
| 1156 | int ret; | 1274 | int ret; |
| @@ -1197,7 +1315,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
| 1197 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | 1315 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; |
| 1198 | } | 1316 | } |
| 1199 | 1317 | ||
| 1200 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); | 1318 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, |
| 1319 | cc->free_pfn, end_pfn, sync); | ||
| 1201 | 1320 | ||
| 1202 | migrate_prep_local(); | 1321 | migrate_prep_local(); |
| 1203 | 1322 | ||
| @@ -1299,7 +1418,8 @@ out: | |||
| 1299 | zone->compact_cached_free_pfn = free_pfn; | 1418 | zone->compact_cached_free_pfn = free_pfn; |
| 1300 | } | 1419 | } |
| 1301 | 1420 | ||
| 1302 | trace_mm_compaction_end(ret); | 1421 | trace_mm_compaction_end(start_pfn, cc->migrate_pfn, |
| 1422 | cc->free_pfn, end_pfn, sync, ret); | ||
| 1303 | 1423 | ||
| 1304 | return ret; | 1424 | return ret; |
| 1305 | } | 1425 | } |
| @@ -1335,22 +1455,20 @@ int sysctl_extfrag_threshold = 500; | |||
| 1335 | 1455 | ||
| 1336 | /** | 1456 | /** |
| 1337 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation | 1457 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation |
| 1338 | * @zonelist: The zonelist used for the current allocation | ||
| 1339 | * @order: The order of the current allocation | ||
| 1340 | * @gfp_mask: The GFP mask of the current allocation | 1458 | * @gfp_mask: The GFP mask of the current allocation |
| 1341 | * @nodemask: The allowed nodes to allocate from | 1459 | * @order: The order of the current allocation |
| 1460 | * @alloc_flags: The allocation flags of the current allocation | ||
| 1461 | * @ac: The context of current allocation | ||
| 1342 | * @mode: The migration mode for async, sync light, or sync migration | 1462 | * @mode: The migration mode for async, sync light, or sync migration |
| 1343 | * @contended: Return value that determines if compaction was aborted due to | 1463 | * @contended: Return value that determines if compaction was aborted due to |
| 1344 | * need_resched() or lock contention | 1464 | * need_resched() or lock contention |
| 1345 | * | 1465 | * |
| 1346 | * This is the main entry point for direct page compaction. | 1466 | * This is the main entry point for direct page compaction. |
| 1347 | */ | 1467 | */ |
| 1348 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1468 | unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, |
| 1349 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1469 | int alloc_flags, const struct alloc_context *ac, |
| 1350 | enum migrate_mode mode, int *contended, | 1470 | enum migrate_mode mode, int *contended) |
| 1351 | int alloc_flags, int classzone_idx) | ||
| 1352 | { | 1471 | { |
| 1353 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
| 1354 | int may_enter_fs = gfp_mask & __GFP_FS; | 1472 | int may_enter_fs = gfp_mask & __GFP_FS; |
| 1355 | int may_perform_io = gfp_mask & __GFP_IO; | 1473 | int may_perform_io = gfp_mask & __GFP_IO; |
| 1356 | struct zoneref *z; | 1474 | struct zoneref *z; |
| @@ -1364,9 +1482,11 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 1364 | if (!order || !may_enter_fs || !may_perform_io) | 1482 | if (!order || !may_enter_fs || !may_perform_io) |
| 1365 | return COMPACT_SKIPPED; | 1483 | return COMPACT_SKIPPED; |
| 1366 | 1484 | ||
| 1485 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); | ||
| 1486 | |||
| 1367 | /* Compact each zone in the list */ | 1487 | /* Compact each zone in the list */ |
| 1368 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1488 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
| 1369 | nodemask) { | 1489 | ac->nodemask) { |
| 1370 | int status; | 1490 | int status; |
| 1371 | int zone_contended; | 1491 | int zone_contended; |
| 1372 | 1492 | ||
| @@ -1374,7 +1494,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 1374 | continue; | 1494 | continue; |
| 1375 | 1495 | ||
| 1376 | status = compact_zone_order(zone, order, gfp_mask, mode, | 1496 | status = compact_zone_order(zone, order, gfp_mask, mode, |
| 1377 | &zone_contended, alloc_flags, classzone_idx); | 1497 | &zone_contended, alloc_flags, |
| 1498 | ac->classzone_idx); | ||
| 1378 | rc = max(status, rc); | 1499 | rc = max(status, rc); |
| 1379 | /* | 1500 | /* |
| 1380 | * It takes at least one zone that wasn't lock contended | 1501 | * It takes at least one zone that wasn't lock contended |
| @@ -1384,7 +1505,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 1384 | 1505 | ||
| 1385 | /* If a normal allocation would succeed, stop compacting */ | 1506 | /* If a normal allocation would succeed, stop compacting */ |
| 1386 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), | 1507 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), |
| 1387 | classzone_idx, alloc_flags)) { | 1508 | ac->classzone_idx, alloc_flags)) { |
| 1388 | /* | 1509 | /* |
| 1389 | * We think the allocation will succeed in this zone, | 1510 | * We think the allocation will succeed in this zone, |
| 1390 | * but it is not certain, hence the false. The caller | 1511 | * but it is not certain, hence the false. The caller |
diff --git a/mm/debug.c b/mm/debug.c
index 0e58f3211f89..3eb3ac2fcee7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
| @@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = { | |||
| 130 | {VM_ACCOUNT, "account" }, | 130 | {VM_ACCOUNT, "account" }, |
| 131 | {VM_NORESERVE, "noreserve" }, | 131 | {VM_NORESERVE, "noreserve" }, |
| 132 | {VM_HUGETLB, "hugetlb" }, | 132 | {VM_HUGETLB, "hugetlb" }, |
| 133 | {VM_NONLINEAR, "nonlinear" }, | ||
| 134 | #if defined(CONFIG_X86) | 133 | #if defined(CONFIG_X86) |
| 135 | {VM_PAT, "pat" }, | 134 | {VM_PAT, "pat" }, |
| 136 | #elif defined(CONFIG_PPC) | 135 | #elif defined(CONFIG_PPC) |
| @@ -174,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) | |||
| 174 | "get_unmapped_area %p\n" | 173 | "get_unmapped_area %p\n" |
| 175 | #endif | 174 | #endif |
| 176 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" | 175 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" |
| 177 | "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" | 176 | "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" |
| 178 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" | 177 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" |
| 179 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" | 178 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" |
| 180 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" | 179 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" |
| @@ -207,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) | |||
| 207 | mm->pgd, atomic_read(&mm->mm_users), | 206 | mm->pgd, atomic_read(&mm->mm_users), |
| 208 | atomic_read(&mm->mm_count), | 207 | atomic_read(&mm->mm_count), |
| 209 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), | 208 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), |
| 209 | mm_nr_pmds((struct mm_struct *)mm), | ||
| 210 | mm->map_count, | 210 | mm->map_count, |
| 211 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, | 211 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, |
| 212 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, | 212 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 2ad7adf4f0a4..4a3907cf79f8 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
| @@ -28,6 +28,7 @@ | |||
| 28 | SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | 28 | SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) |
| 29 | { | 29 | { |
| 30 | struct fd f = fdget(fd); | 30 | struct fd f = fdget(fd); |
| 31 | struct inode *inode; | ||
| 31 | struct address_space *mapping; | 32 | struct address_space *mapping; |
| 32 | struct backing_dev_info *bdi; | 33 | struct backing_dev_info *bdi; |
| 33 | loff_t endbyte; /* inclusive */ | 34 | loff_t endbyte; /* inclusive */ |
| @@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
| 39 | if (!f.file) | 40 | if (!f.file) |
| 40 | return -EBADF; | 41 | return -EBADF; |
| 41 | 42 | ||
| 42 | if (S_ISFIFO(file_inode(f.file)->i_mode)) { | 43 | inode = file_inode(f.file); |
| 44 | if (S_ISFIFO(inode->i_mode)) { | ||
| 43 | ret = -ESPIPE; | 45 | ret = -ESPIPE; |
| 44 | goto out; | 46 | goto out; |
| 45 | } | 47 | } |
| @@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
| 50 | goto out; | 52 | goto out; |
| 51 | } | 53 | } |
| 52 | 54 | ||
| 53 | if (mapping->a_ops->get_xip_mem) { | 55 | if (IS_DAX(inode)) { |
| 54 | switch (advice) { | 56 | switch (advice) { |
| 55 | case POSIX_FADV_NORMAL: | 57 | case POSIX_FADV_NORMAL: |
| 56 | case POSIX_FADV_RANDOM: | 58 | case POSIX_FADV_RANDOM: |
| @@ -73,7 +75,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
| 73 | else | 75 | else |
| 74 | endbyte--; /* inclusive */ | 76 | endbyte--; /* inclusive */ |
| 75 | 77 | ||
| 76 | bdi = mapping->backing_dev_info; | 78 | bdi = inode_to_bdi(mapping->host); |
| 77 | 79 | ||
| 78 | switch (advice) { | 80 | switch (advice) { |
| 79 | case POSIX_FADV_NORMAL: | 81 | case POSIX_FADV_NORMAL: |
| @@ -113,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
| 113 | case POSIX_FADV_NOREUSE: | 115 | case POSIX_FADV_NOREUSE: |
| 114 | break; | 116 | break; |
| 115 | case POSIX_FADV_DONTNEED: | 117 | case POSIX_FADV_DONTNEED: |
| 116 | if (!bdi_write_congested(mapping->backing_dev_info)) | 118 | if (!bdi_write_congested(bdi)) |
| 117 | __filemap_fdatawrite_range(mapping, offset, endbyte, | 119 | __filemap_fdatawrite_range(mapping, offset, endbyte, |
| 118 | WB_SYNC_NONE); | 120 | WB_SYNC_NONE); |
| 119 | 121 | ||
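
With the change above, fadvise on a DAX-backed inode returns success for most advice values without doing anything, so callers cannot tell from the return code whether the hint was acted on. A small user-space sketch of issuing such a hint; the file name is a placeholder, and note that posix_fadvise() reports failure by returning an error number rather than -1/errno.

/* Issue POSIX_FADV_DONTNEED against a file.  posix_fadvise() returns the
 * error number directly (0 on success), and per the hunk above the kernel
 * may treat the hint as a no-op (e.g. on DAX-backed files) while still
 * reporting success. */
#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = (argc > 1) ? argv[1] : "testfile";	/* placeholder */
	int fd = open(path, O_RDONLY);
	int err;

	if (fd < 0) {
		perror(path);
		return 1;
	}
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);	/* len 0: whole file */
	if (err)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	close(fd);
	return err ? 1 : 0;
}
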
diff --git a/mm/filemap.c b/mm/filemap.c
index 673e4581a2e5..ad7242043bdb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
| 211 | */ | 211 | */ |
| 212 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { | 212 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { |
| 213 | dec_zone_page_state(page, NR_FILE_DIRTY); | 213 | dec_zone_page_state(page, NR_FILE_DIRTY); |
| 214 | dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 214 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); |
| 215 | } | 215 | } |
| 216 | } | 216 | } |
| 217 | 217 | ||
| @@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | |||
| 1695 | loff_t *ppos = &iocb->ki_pos; | 1695 | loff_t *ppos = &iocb->ki_pos; |
| 1696 | loff_t pos = *ppos; | 1696 | loff_t pos = *ppos; |
| 1697 | 1697 | ||
| 1698 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 1698 | if (io_is_direct(file)) { |
| 1699 | if (file->f_flags & O_DIRECT) { | ||
| 1700 | struct address_space *mapping = file->f_mapping; | 1699 | struct address_space *mapping = file->f_mapping; |
| 1701 | struct inode *inode = mapping->host; | 1700 | struct inode *inode = mapping->host; |
| 1702 | size_t count = iov_iter_count(iter); | 1701 | size_t count = iov_iter_count(iter); |
| @@ -1723,9 +1722,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | |||
| 1723 | * we've already read everything we wanted to, or if | 1722 | * we've already read everything we wanted to, or if |
| 1724 | * there was a short read because we hit EOF, go ahead | 1723 | * there was a short read because we hit EOF, go ahead |
| 1725 | * and return. Otherwise fallthrough to buffered io for | 1724 | * and return. Otherwise fallthrough to buffered io for |
| 1726 | * the rest of the read. | 1725 | * the rest of the read. Buffered reads will not work for |
| 1726 | * DAX files, so don't bother trying. | ||
| 1727 | */ | 1727 | */ |
| 1728 | if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { | 1728 | if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || |
| 1729 | IS_DAX(inode)) { | ||
| 1729 | file_accessed(file); | 1730 | file_accessed(file); |
| 1730 | goto out; | 1731 | goto out; |
| 1731 | } | 1732 | } |
| @@ -2087,7 +2088,6 @@ const struct vm_operations_struct generic_file_vm_ops = { | |||
| 2087 | .fault = filemap_fault, | 2088 | .fault = filemap_fault, |
| 2088 | .map_pages = filemap_map_pages, | 2089 | .map_pages = filemap_map_pages, |
| 2089 | .page_mkwrite = filemap_page_mkwrite, | 2090 | .page_mkwrite = filemap_page_mkwrite, |
| 2090 | .remap_pages = generic_file_remap_pages, | ||
| 2091 | }; | 2091 | }; |
| 2092 | 2092 | ||
| 2093 | /* This is used for a general mmap of a disk file */ | 2093 | /* This is used for a general mmap of a disk file */ |
| @@ -2565,7 +2565,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2565 | size_t count = iov_iter_count(from); | 2565 | size_t count = iov_iter_count(from); |
| 2566 | 2566 | ||
| 2567 | /* We can write back this queue in page reclaim */ | 2567 | /* We can write back this queue in page reclaim */ |
| 2568 | current->backing_dev_info = mapping->backing_dev_info; | 2568 | current->backing_dev_info = inode_to_bdi(inode); |
| 2569 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); | 2569 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); |
| 2570 | if (err) | 2570 | if (err) |
| 2571 | goto out; | 2571 | goto out; |
| @@ -2583,18 +2583,20 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2583 | if (err) | 2583 | if (err) |
| 2584 | goto out; | 2584 | goto out; |
| 2585 | 2585 | ||
| 2586 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 2586 | if (io_is_direct(file)) { |
| 2587 | if (unlikely(file->f_flags & O_DIRECT)) { | ||
| 2588 | loff_t endbyte; | 2587 | loff_t endbyte; |
| 2589 | 2588 | ||
| 2590 | written = generic_file_direct_write(iocb, from, pos); | 2589 | written = generic_file_direct_write(iocb, from, pos); |
| 2591 | if (written < 0 || written == count) | ||
| 2592 | goto out; | ||
| 2593 | |||
| 2594 | /* | 2590 | /* |
| 2595 | * direct-io write to a hole: fall through to buffered I/O | 2591 | * If the write stopped short of completing, fall back to |
| 2596 | * for completing the rest of the request. | 2592 | * buffered writes. Some filesystems do this for writes to |
| 2593 | * holes, for example. For DAX files, a buffered write will | ||
| 2594 | * not succeed (even if it did, DAX does not handle dirty | ||
| 2595 | * page-cache pages correctly). | ||
| 2597 | */ | 2596 | */ |
| 2597 | if (written < 0 || written == count || IS_DAX(inode)) | ||
| 2598 | goto out; | ||
| 2599 | |||
| 2598 | pos += written; | 2600 | pos += written; |
| 2599 | count -= written; | 2601 | count -= written; |
| 2600 | 2602 | ||
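
io_is_direct() makes the filemap read and write paths treat DAX inodes like O_DIRECT ones, and the rewritten comment documents the fall-back to buffered I/O when a direct write stops short (except on DAX). For reference, a user-space sketch of the kind of O_DIRECT write that takes this path; the 4096-byte alignment and the output file name are assumptions, since O_DIRECT alignment requirements depend on the underlying device and filesystem.

/* Minimal O_DIRECT write: the buffer, file offset and length must all meet
 * the device's alignment rules (4096 is used here as a commonly safe value).
 * A short return from such a write is what the filemap.c hunk above falls
 * back to buffered I/O for, except on DAX files. */
#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define ALIGN_SIZE 4096		/* assumption: matches the device's requirement */

int main(int argc, char **argv)
{
	const char *path = (argc > 1) ? argv[1] : "direct.dat";	/* placeholder */
	void *buf;
	ssize_t written;
	int fd;

	if (posix_memalign(&buf, ALIGN_SIZE, ALIGN_SIZE)) {
		fprintf(stderr, "posix_memalign failed\n");
		return 1;
	}
	memset(buf, 'x', ALIGN_SIZE);

	fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	written = write(fd, buf, ALIGN_SIZE);	/* aligned length and offset */
	if (written < 0)
		perror("write");
	else
		printf("wrote %zd bytes with O_DIRECT\n", written);

	close(fd);
	free(buf);
	return written == ALIGN_SIZE ? 0 : 1;
}
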
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
deleted file mode 100644
index 0d105aeff82f..000000000000
--- a/mm/filemap_xip.c
+++ /dev/null
| @@ -1,478 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/filemap_xip.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2005 IBM Corporation | ||
| 5 | * Author: Carsten Otte <cotte@de.ibm.com> | ||
| 6 | * | ||
| 7 | * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds | ||
| 8 | * | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/fs.h> | ||
| 12 | #include <linux/pagemap.h> | ||
| 13 | #include <linux/export.h> | ||
| 14 | #include <linux/uio.h> | ||
| 15 | #include <linux/rmap.h> | ||
| 16 | #include <linux/mmu_notifier.h> | ||
| 17 | #include <linux/sched.h> | ||
| 18 | #include <linux/seqlock.h> | ||
| 19 | #include <linux/mutex.h> | ||
| 20 | #include <linux/gfp.h> | ||
| 21 | #include <asm/tlbflush.h> | ||
| 22 | #include <asm/io.h> | ||
| 23 | |||
| 24 | /* | ||
| 25 | * We do use our own empty page to avoid interference with other users | ||
| 26 | * of ZERO_PAGE(), such as /dev/zero | ||
| 27 | */ | ||
| 28 | static DEFINE_MUTEX(xip_sparse_mutex); | ||
| 29 | static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); | ||
| 30 | static struct page *__xip_sparse_page; | ||
| 31 | |||
| 32 | /* called under xip_sparse_mutex */ | ||
| 33 | static struct page *xip_sparse_page(void) | ||
| 34 | { | ||
| 35 | if (!__xip_sparse_page) { | ||
| 36 | struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); | ||
| 37 | |||
| 38 | if (page) | ||
| 39 | __xip_sparse_page = page; | ||
| 40 | } | ||
| 41 | return __xip_sparse_page; | ||
| 42 | } | ||
| 43 | |||
| 44 | /* | ||
| 45 | * This is a file read routine for execute in place files, and uses | ||
| 46 | * the mapping->a_ops->get_xip_mem() function for the actual low-level | ||
| 47 | * stuff. | ||
| 48 | * | ||
| 49 | * Note the struct file* is not used at all. It may be NULL. | ||
| 50 | */ | ||
| 51 | static ssize_t | ||
| 52 | do_xip_mapping_read(struct address_space *mapping, | ||
| 53 | struct file_ra_state *_ra, | ||
| 54 | struct file *filp, | ||
| 55 | char __user *buf, | ||
| 56 | size_t len, | ||
| 57 | loff_t *ppos) | ||
| 58 | { | ||
| 59 | struct inode *inode = mapping->host; | ||
| 60 | pgoff_t index, end_index; | ||
| 61 | unsigned long offset; | ||
| 62 | loff_t isize, pos; | ||
| 63 | size_t copied = 0, error = 0; | ||
| 64 | |||
| 65 | BUG_ON(!mapping->a_ops->get_xip_mem); | ||
| 66 | |||
| 67 | pos = *ppos; | ||
| 68 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 69 | offset = pos & ~PAGE_CACHE_MASK; | ||
| 70 | |||
| 71 | isize = i_size_read(inode); | ||
| 72 | if (!isize) | ||
| 73 | goto out; | ||
| 74 | |||
| 75 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | ||
| 76 | do { | ||
| 77 | unsigned long nr, left; | ||
| 78 | void *xip_mem; | ||
| 79 | unsigned long xip_pfn; | ||
| 80 | int zero = 0; | ||
| 81 | |||
| 82 | /* nr is the maximum number of bytes to copy from this page */ | ||
| 83 | nr = PAGE_CACHE_SIZE; | ||
| 84 | if (index >= end_index) { | ||
| 85 | if (index > end_index) | ||
| 86 | goto out; | ||
| 87 | nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | ||
| 88 | if (nr <= offset) { | ||
| 89 | goto out; | ||
| 90 | } | ||
| 91 | } | ||
| 92 | nr = nr - offset; | ||
| 93 | if (nr > len - copied) | ||
| 94 | nr = len - copied; | ||
| 95 | |||
| 96 | error = mapping->a_ops->get_xip_mem(mapping, index, 0, | ||
| 97 | &xip_mem, &xip_pfn); | ||
| 98 | if (unlikely(error)) { | ||
| 99 | if (error == -ENODATA) { | ||
| 100 | /* sparse */ | ||
| 101 | zero = 1; | ||
| 102 | } else | ||
| 103 | goto out; | ||
| 104 | } | ||
| 105 | |||
| 106 | /* If users can be writing to this page using arbitrary | ||
| 107 | * virtual addresses, take care about potential aliasing | ||
| 108 | * before reading the page on the kernel side. | ||
| 109 | */ | ||
| 110 | if (mapping_writably_mapped(mapping)) | ||
| 111 | /* address based flush */ ; | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Ok, we have the mem, so now we can copy it to user space... | ||
| 115 | * | ||
| 116 | * The actor routine returns how many bytes were actually used.. | ||
| 117 | * NOTE! This may not be the same as how much of a user buffer | ||
| 118 | * we filled up (we may be padding etc), so we can only update | ||
| 119 | * "pos" here (the actor routine has to update the user buffer | ||
| 120 | * pointers and the remaining count). | ||
| 121 | */ | ||
| 122 | if (!zero) | ||
| 123 | left = __copy_to_user(buf+copied, xip_mem+offset, nr); | ||
| 124 | else | ||
| 125 | left = __clear_user(buf + copied, nr); | ||
| 126 | |||
| 127 | if (left) { | ||
| 128 | error = -EFAULT; | ||
| 129 | goto out; | ||
| 130 | } | ||
| 131 | |||
| 132 | copied += (nr - left); | ||
| 133 | offset += (nr - left); | ||
| 134 | index += offset >> PAGE_CACHE_SHIFT; | ||
| 135 | offset &= ~PAGE_CACHE_MASK; | ||
| 136 | } while (copied < len); | ||
| 137 | |||
| 138 | out: | ||
| 139 | *ppos = pos + copied; | ||
| 140 | if (filp) | ||
| 141 | file_accessed(filp); | ||
| 142 | |||
| 143 | return (copied ? copied : error); | ||
| 144 | } | ||
| 145 | |||
| 146 | ssize_t | ||
| 147 | xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) | ||
| 148 | { | ||
| 149 | if (!access_ok(VERIFY_WRITE, buf, len)) | ||
| 150 | return -EFAULT; | ||
| 151 | |||
| 152 | return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, | ||
| 153 | buf, len, ppos); | ||
| 154 | } | ||
| 155 | EXPORT_SYMBOL_GPL(xip_file_read); | ||
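
do_xip_mapping_read() splits the request into page-sized pieces: index selects the page, offset the byte within it, and nr is clamped both to the end of the page and to the end of file, so a read that crosses page boundaries or EOF simply degrades into a short read. A small user-space sketch of the same arithmetic; the page size and the example values below are assumptions, not taken from the kernel code:

    #include <stdio.h>

    #define MY_PAGE_SIZE   4096UL
    #define MY_PAGE_SHIFT  12
    #define MY_PAGE_MASK   (~(MY_PAGE_SIZE - 1))

    /* Mimic the chunking done by do_xip_mapping_read() for a read of
     * 'len' bytes at offset 'pos' from a file of size 'isize'. */
    static void show_chunks(unsigned long pos, unsigned long len, unsigned long isize)
    {
        unsigned long index  = pos >> MY_PAGE_SHIFT;      /* page number  */
        unsigned long offset = pos & ~MY_PAGE_MASK;       /* byte in page */
        unsigned long end_index = (isize - 1) >> MY_PAGE_SHIFT;
        unsigned long copied = 0;

        while (copied < len) {
            unsigned long nr = MY_PAGE_SIZE;

            if (index >= end_index) {
                if (index > end_index)
                    break;                                /* past EOF */
                nr = ((isize - 1) & ~MY_PAGE_MASK) + 1;   /* bytes in last page */
                if (nr <= offset)
                    break;
            }
            nr -= offset;
            if (nr > len - copied)
                nr = len - copied;

            printf("copy %5lu bytes from page %lu at offset %lu\n",
                   nr, index, offset);

            copied += nr;
            offset += nr;
            index  += offset >> MY_PAGE_SHIFT;
            offset &= ~MY_PAGE_MASK;
        }
        printf("total copied: %lu of %lu requested\n", copied, len);
    }

    int main(void)
    {
        /* a 10000-byte read starting at byte 4000 of a 12288-byte file */
        show_chunks(4000, 10000, 12288);
        return 0;
    }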
| 156 | |||
| 157 | /* | ||
| 158 | * __xip_unmap is invoked from xip_unmap and xip_write | ||
| 159 | * | ||
| 160 | * This function walks all vmas of the address_space and unmaps the | ||
| 161 | * __xip_sparse_page when found at pgoff. | ||
| 162 | */ | ||
| 163 | static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) | ||
| 164 | { | ||
| 165 | struct vm_area_struct *vma; | ||
| 166 | struct page *page; | ||
| 167 | unsigned count; | ||
| 168 | int locked = 0; | ||
| 169 | |||
| 170 | count = read_seqcount_begin(&xip_sparse_seq); | ||
| 171 | |||
| 172 | page = __xip_sparse_page; | ||
| 173 | if (!page) | ||
| 174 | return; | ||
| 175 | |||
| 176 | retry: | ||
| 177 | i_mmap_lock_read(mapping); | ||
| 178 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
| 179 | pte_t *pte, pteval; | ||
| 180 | spinlock_t *ptl; | ||
| 181 | struct mm_struct *mm = vma->vm_mm; | ||
| 182 | unsigned long address = vma->vm_start + | ||
| 183 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
| 184 | |||
| 185 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
| 186 | pte = page_check_address(page, mm, address, &ptl, 1); | ||
| 187 | if (pte) { | ||
| 188 | /* Nuke the page table entry. */ | ||
| 189 | flush_cache_page(vma, address, pte_pfn(*pte)); | ||
| 190 | pteval = ptep_clear_flush(vma, address, pte); | ||
| 191 | page_remove_rmap(page); | ||
| 192 | dec_mm_counter(mm, MM_FILEPAGES); | ||
| 193 | BUG_ON(pte_dirty(pteval)); | ||
| 194 | pte_unmap_unlock(pte, ptl); | ||
| 195 | /* must invalidate_page _before_ freeing the page */ | ||
| 196 | mmu_notifier_invalidate_page(mm, address); | ||
| 197 | page_cache_release(page); | ||
| 198 | } | ||
| 199 | } | ||
| 200 | i_mmap_unlock_read(mapping); | ||
| 201 | |||
| 202 | if (locked) { | ||
| 203 | mutex_unlock(&xip_sparse_mutex); | ||
| 204 | } else if (read_seqcount_retry(&xip_sparse_seq, count)) { | ||
| 205 | mutex_lock(&xip_sparse_mutex); | ||
| 206 | locked = 1; | ||
| 207 | goto retry; | ||
| 208 | } | ||
| 209 | } | ||
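
__xip_unmap() walks the mapping optimistically: it samples xip_sparse_seq without taking xip_sparse_mutex, and only when a writer raced with the walk (the seqcount changed) does it repeat the walk once more under the mutex, where the result can no longer be invalidated. A bare-bones kernel-context sketch of that retry pattern; my_seq, my_lock and do_walk are illustrative names, not from this file:

    #include <linux/mutex.h>
    #include <linux/seqlock.h>

    static DEFINE_MUTEX(my_lock);
    static seqcount_t my_seq = SEQCNT_ZERO(my_seq);

    /* Optimistic walk: run once without the mutex; only if a writer
     * raced with us (the seqcount moved) take the mutex and walk again,
     * so the second pass cannot be invalidated.  Writers would wrap
     * their updates in mutex_lock(&my_lock) plus
     * write_seqcount_begin()/write_seqcount_end() on my_seq. */
    static void walk_stable(void (*do_walk)(void))
    {
        unsigned seq;
        int locked = 0;

        seq = read_seqcount_begin(&my_seq);
    retry:
        do_walk();

        if (locked) {
            mutex_unlock(&my_lock);
        } else if (read_seqcount_retry(&my_seq, seq)) {
            mutex_lock(&my_lock);
            locked = 1;
            goto retry;
        }
    }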
| 210 | |||
| 211 | /* | ||
| 212 | * xip_file_fault() is invoked via the vma operations vector for a | ||
| 213 | * mapped memory region to read in file data during a page fault. | ||
| 214 | * | ||
| 215 | * This function is derived from filemap_fault, but used for execute in place | ||
| 216 | */ | ||
| 217 | static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
| 218 | { | ||
| 219 | struct file *file = vma->vm_file; | ||
| 220 | struct address_space *mapping = file->f_mapping; | ||
| 221 | struct inode *inode = mapping->host; | ||
| 222 | pgoff_t size; | ||
| 223 | void *xip_mem; | ||
| 224 | unsigned long xip_pfn; | ||
| 225 | struct page *page; | ||
| 226 | int error; | ||
| 227 | |||
| 228 | /* XXX: are VM_FAULT_ codes OK? */ | ||
| 229 | again: | ||
| 230 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 231 | if (vmf->pgoff >= size) | ||
| 232 | return VM_FAULT_SIGBUS; | ||
| 233 | |||
| 234 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, | ||
| 235 | &xip_mem, &xip_pfn); | ||
| 236 | if (likely(!error)) | ||
| 237 | goto found; | ||
| 238 | if (error != -ENODATA) | ||
| 239 | return VM_FAULT_OOM; | ||
| 240 | |||
| 241 | /* sparse block */ | ||
| 242 | if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && | ||
| 243 | (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && | ||
| 244 | (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { | ||
| 245 | int err; | ||
| 246 | |||
| 247 | /* maybe shared writable, allocate new block */ | ||
| 248 | mutex_lock(&xip_sparse_mutex); | ||
| 249 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, | ||
| 250 | &xip_mem, &xip_pfn); | ||
| 251 | mutex_unlock(&xip_sparse_mutex); | ||
| 252 | if (error) | ||
| 253 | return VM_FAULT_SIGBUS; | ||
| 254 | /* unmap sparse mappings at pgoff from all other vmas */ | ||
| 255 | __xip_unmap(mapping, vmf->pgoff); | ||
| 256 | |||
| 257 | found: | ||
| 258 | err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, | ||
| 259 | xip_pfn); | ||
| 260 | if (err == -ENOMEM) | ||
| 261 | return VM_FAULT_OOM; | ||
| 262 | /* | ||
| 263 | * err == -EBUSY is fine, we've raced against another thread | ||
| 264 | * that faulted-in the same page | ||
| 265 | */ | ||
| 266 | if (err != -EBUSY) | ||
| 267 | BUG_ON(err); | ||
| 268 | return VM_FAULT_NOPAGE; | ||
| 269 | } else { | ||
| 270 | int err, ret = VM_FAULT_OOM; | ||
| 271 | |||
| 272 | mutex_lock(&xip_sparse_mutex); | ||
| 273 | write_seqcount_begin(&xip_sparse_seq); | ||
| 274 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, | ||
| 275 | &xip_mem, &xip_pfn); | ||
| 276 | if (unlikely(!error)) { | ||
| 277 | write_seqcount_end(&xip_sparse_seq); | ||
| 278 | mutex_unlock(&xip_sparse_mutex); | ||
| 279 | goto again; | ||
| 280 | } | ||
| 281 | if (error != -ENODATA) | ||
| 282 | goto out; | ||
| 283 | /* not shared and writable, use xip_sparse_page() */ | ||
| 284 | page = xip_sparse_page(); | ||
| 285 | if (!page) | ||
| 286 | goto out; | ||
| 287 | err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, | ||
| 288 | page); | ||
| 289 | if (err == -ENOMEM) | ||
| 290 | goto out; | ||
| 291 | |||
| 292 | ret = VM_FAULT_NOPAGE; | ||
| 293 | out: | ||
| 294 | write_seqcount_end(&xip_sparse_seq); | ||
| 295 | mutex_unlock(&xip_sparse_mutex); | ||
| 296 | |||
| 297 | return ret; | ||
| 298 | } | ||
| 299 | } | ||
| 300 | |||
| 301 | static const struct vm_operations_struct xip_file_vm_ops = { | ||
| 302 | .fault = xip_file_fault, | ||
| 303 | .page_mkwrite = filemap_page_mkwrite, | ||
| 304 | .remap_pages = generic_file_remap_pages, | ||
| 305 | }; | ||
| 306 | |||
| 307 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | ||
| 308 | { | ||
| 309 | BUG_ON(!file->f_mapping->a_ops->get_xip_mem); | ||
| 310 | |||
| 311 | file_accessed(file); | ||
| 312 | vma->vm_ops = &xip_file_vm_ops; | ||
| 313 | vma->vm_flags |= VM_MIXEDMAP; | ||
| 314 | return 0; | ||
| 315 | } | ||
| 316 | EXPORT_SYMBOL_GPL(xip_file_mmap); | ||
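
Before this removal, a filesystem opted into these helpers by providing get_xip_mem() in its address_space_operations and pointing its file operations at the exported xip_* routines (ext2 mounted with -o xip worked this way). A hypothetical sketch of that wiring, assuming the get_xip_mem prototype implied by the calls in this file; the myfs_* names are invented:

    #include <linux/fs.h>
    #include <linux/mm.h>

    /* Hypothetical backend: translate (mapping, pgoff) into a kernel
     * virtual address and pfn of directly addressable media; 'create'
     * asks for a hole to be filled with a newly allocated block. */
    static int myfs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff,
                                int create, void **kmem, unsigned long *pfn)
    {
        /* filesystem-specific block lookup / allocation would go here */
        return -ENODATA;        /* report a hole for this sketch */
    }

    static const struct address_space_operations myfs_xip_aops = {
        .get_xip_mem    = myfs_get_xip_mem,
    };

    static const struct file_operations myfs_xip_file_ops = {
        .read   = xip_file_read,        /* copies straight from media  */
        .write  = xip_file_write,
        .mmap   = xip_file_mmap,        /* installs xip_file_vm_ops    */
    };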
| 317 | |||
| 318 | static ssize_t | ||
| 319 | __xip_file_write(struct file *filp, const char __user *buf, | ||
| 320 | size_t count, loff_t pos, loff_t *ppos) | ||
| 321 | { | ||
| 322 | struct address_space * mapping = filp->f_mapping; | ||
| 323 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 324 | struct inode *inode = mapping->host; | ||
| 325 | long status = 0; | ||
| 326 | size_t bytes; | ||
| 327 | ssize_t written = 0; | ||
| 328 | |||
| 329 | BUG_ON(!mapping->a_ops->get_xip_mem); | ||
| 330 | |||
| 331 | do { | ||
| 332 | unsigned long index; | ||
| 333 | unsigned long offset; | ||
| 334 | size_t copied; | ||
| 335 | void *xip_mem; | ||
| 336 | unsigned long xip_pfn; | ||
| 337 | |||
| 338 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | ||
| 339 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 340 | bytes = PAGE_CACHE_SIZE - offset; | ||
| 341 | if (bytes > count) | ||
| 342 | bytes = count; | ||
| 343 | |||
| 344 | status = a_ops->get_xip_mem(mapping, index, 0, | ||
| 345 | &xip_mem, &xip_pfn); | ||
| 346 | if (status == -ENODATA) { | ||
| 347 | /* sparse hole: allocate a new block, then unmap the sparse page */ | ||
| 348 | mutex_lock(&xip_sparse_mutex); | ||
| 349 | status = a_ops->get_xip_mem(mapping, index, 1, | ||
| 350 | &xip_mem, &xip_pfn); | ||
| 351 | mutex_unlock(&xip_sparse_mutex); | ||
| 352 | if (!status) | ||
| 353 | /* unmap page at pgoff from all other vmas */ | ||
| 354 | __xip_unmap(mapping, index); | ||
| 355 | } | ||
| 356 | |||
| 357 | if (status) | ||
| 358 | break; | ||
| 359 | |||
| 360 | copied = bytes - | ||
| 361 | __copy_from_user_nocache(xip_mem + offset, buf, bytes); | ||
| 362 | |||
| 363 | if (likely(copied > 0)) { | ||
| 364 | status = copied; | ||
| 365 | |||
| 366 | if (status >= 0) { | ||
| 367 | written += status; | ||
| 368 | count -= status; | ||
| 369 | pos += status; | ||
| 370 | buf += status; | ||
| 371 | } | ||
| 372 | } | ||
| 373 | if (unlikely(copied != bytes)) | ||
| 374 | if (status >= 0) | ||
| 375 | status = -EFAULT; | ||
| 376 | if (status < 0) | ||
| 377 | break; | ||
| 378 | } while (count); | ||
| 379 | *ppos = pos; | ||
| 380 | /* | ||
| 381 | * No need to use i_size_read() here, the i_size | ||
| 382 | * cannot change under us because we hold i_mutex. | ||
| 383 | */ | ||
| 384 | if (pos > inode->i_size) { | ||
| 385 | i_size_write(inode, pos); | ||
| 386 | mark_inode_dirty(inode); | ||
| 387 | } | ||
| 388 | |||
| 389 | return written ? written : status; | ||
| 390 | } | ||
| 391 | |||
| 392 | ssize_t | ||
| 393 | xip_file_write(struct file *filp, const char __user *buf, size_t len, | ||
| 394 | loff_t *ppos) | ||
| 395 | { | ||
| 396 | struct address_space *mapping = filp->f_mapping; | ||
| 397 | struct inode *inode = mapping->host; | ||
| 398 | size_t count; | ||
| 399 | loff_t pos; | ||
| 400 | ssize_t ret; | ||
| 401 | |||
| 402 | mutex_lock(&inode->i_mutex); | ||
| 403 | |||
| 404 | if (!access_ok(VERIFY_READ, buf, len)) { | ||
| 405 | ret=-EFAULT; | ||
| 406 | goto out_up; | ||
| 407 | } | ||
| 408 | |||
| 409 | pos = *ppos; | ||
| 410 | count = len; | ||
| 411 | |||
| 412 | /* We can write back this queue in page reclaim */ | ||
| 413 | current->backing_dev_info = mapping->backing_dev_info; | ||
| 414 | |||
| 415 | ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); | ||
| 416 | if (ret) | ||
| 417 | goto out_backing; | ||
| 418 | if (count == 0) | ||
| 419 | goto out_backing; | ||
| 420 | |||
| 421 | ret = file_remove_suid(filp); | ||
| 422 | if (ret) | ||
| 423 | goto out_backing; | ||
| 424 | |||
| 425 | ret = file_update_time(filp); | ||
| 426 | if (ret) | ||
| 427 | goto out_backing; | ||
| 428 | |||
| 429 | ret = __xip_file_write (filp, buf, count, pos, ppos); | ||
| 430 | |||
| 431 | out_backing: | ||
| 432 | current->backing_dev_info = NULL; | ||
| 433 | out_up: | ||
| 434 | mutex_unlock(&inode->i_mutex); | ||
| 435 | return ret; | ||
| 436 | } | ||
| 437 | EXPORT_SYMBOL_GPL(xip_file_write); | ||
| 438 | |||
| 439 | /* | ||
| 440 | * truncate a page used for execute in place; | ||
| 441 | * functionality is analogous to block_truncate_page but uses get_xip_mem | ||
| 442 | * to get the page instead of the page cache | ||
| 443 | */ | ||
| 444 | int | ||
| 445 | xip_truncate_page(struct address_space *mapping, loff_t from) | ||
| 446 | { | ||
| 447 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | ||
| 448 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | ||
| 449 | unsigned blocksize; | ||
| 450 | unsigned length; | ||
| 451 | void *xip_mem; | ||
| 452 | unsigned long xip_pfn; | ||
| 453 | int err; | ||
| 454 | |||
| 455 | BUG_ON(!mapping->a_ops->get_xip_mem); | ||
| 456 | |||
| 457 | blocksize = 1 << mapping->host->i_blkbits; | ||
| 458 | length = offset & (blocksize - 1); | ||
| 459 | |||
| 460 | /* Block boundary? Nothing to do */ | ||
| 461 | if (!length) | ||
| 462 | return 0; | ||
| 463 | |||
| 464 | length = blocksize - length; | ||
| 465 | |||
| 466 | err = mapping->a_ops->get_xip_mem(mapping, index, 0, | ||
| 467 | &xip_mem, &xip_pfn); | ||
| 468 | if (unlikely(err)) { | ||
| 469 | if (err == -ENODATA) | ||
| 470 | /* Hole? No need to truncate */ | ||
| 471 | return 0; | ||
| 472 | else | ||
| 473 | return err; | ||
| 474 | } | ||
| 475 | memset(xip_mem + offset, 0, length); | ||
| 476 | return 0; | ||
| 477 | } | ||
| 478 | EXPORT_SYMBOL_GPL(xip_truncate_page); | ||
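
xip_truncate_page() zeroes only the tail of the filesystem block that contains the new end of file, and only when the cut is not block-aligned. The same arithmetic replayed in user space, with assumed example numbers:

    #include <stdio.h>

    /* Replay the xip_truncate_page() arithmetic for a truncation point
     * 'from' and a filesystem block size 'blocksize' (example values). */
    int main(void)
    {
        unsigned long page_size = 4096;
        unsigned long from = 10000;          /* new file size  */
        unsigned int blocksize = 1024;       /* 1 << i_blkbits */

        unsigned long index = from / page_size;            /* page holding EOF  */
        unsigned int offset = from & (page_size - 1);      /* byte inside page  */
        unsigned int length = offset & (blocksize - 1);    /* bytes used in blk */

        if (!length) {
            printf("block-aligned truncate: nothing to zero\n");
            return 0;
        }
        length = blocksize - length;                       /* tail to clear */

        printf("zero %u bytes of page %lu starting at offset %u\n",
               length, index, offset);
        return 0;
    }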
diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 2805d71cf476..000000000000 --- a/mm/fremap.c +++ /dev/null | |||
| @@ -1,283 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/fremap.c | ||
| 3 | * | ||
| 4 | * Explicit pagetable population and nonlinear (random) mappings support. | ||
| 5 | * | ||
| 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | ||
| 7 | */ | ||
| 8 | #include <linux/export.h> | ||
| 9 | #include <linux/backing-dev.h> | ||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/swap.h> | ||
| 12 | #include <linux/file.h> | ||
| 13 | #include <linux/mman.h> | ||
| 14 | #include <linux/pagemap.h> | ||
| 15 | #include <linux/swapops.h> | ||
| 16 | #include <linux/rmap.h> | ||
| 17 | #include <linux/syscalls.h> | ||
| 18 | #include <linux/mmu_notifier.h> | ||
| 19 | |||
| 20 | #include <asm/mmu_context.h> | ||
| 21 | #include <asm/cacheflush.h> | ||
| 22 | #include <asm/tlbflush.h> | ||
| 23 | |||
| 24 | #include "internal.h" | ||
| 25 | |||
| 26 | static int mm_counter(struct page *page) | ||
| 27 | { | ||
| 28 | return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; | ||
| 29 | } | ||
| 30 | |||
| 31 | static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 32 | unsigned long addr, pte_t *ptep) | ||
| 33 | { | ||
| 34 | pte_t pte = *ptep; | ||
| 35 | struct page *page; | ||
| 36 | swp_entry_t entry; | ||
| 37 | |||
| 38 | if (pte_present(pte)) { | ||
| 39 | flush_cache_page(vma, addr, pte_pfn(pte)); | ||
| 40 | pte = ptep_clear_flush_notify(vma, addr, ptep); | ||
| 41 | page = vm_normal_page(vma, addr, pte); | ||
| 42 | if (page) { | ||
| 43 | if (pte_dirty(pte)) | ||
| 44 | set_page_dirty(page); | ||
| 45 | update_hiwater_rss(mm); | ||
| 46 | dec_mm_counter(mm, mm_counter(page)); | ||
| 47 | page_remove_rmap(page); | ||
| 48 | page_cache_release(page); | ||
| 49 | } | ||
| 50 | } else { /* zap_pte() is not called when pte_none() */ | ||
| 51 | if (!pte_file(pte)) { | ||
| 52 | update_hiwater_rss(mm); | ||
| 53 | entry = pte_to_swp_entry(pte); | ||
| 54 | if (non_swap_entry(entry)) { | ||
| 55 | if (is_migration_entry(entry)) { | ||
| 56 | page = migration_entry_to_page(entry); | ||
| 57 | dec_mm_counter(mm, mm_counter(page)); | ||
| 58 | } | ||
| 59 | } else { | ||
| 60 | free_swap_and_cache(entry); | ||
| 61 | dec_mm_counter(mm, MM_SWAPENTS); | ||
| 62 | } | ||
| 63 | } | ||
| 64 | pte_clear_not_present_full(mm, addr, ptep, 0); | ||
| 65 | } | ||
| 66 | } | ||
| 67 | |||
| 68 | /* | ||
| 69 | * Install a file pte to a given virtual memory address, release any | ||
| 70 | * previously existing mapping. | ||
| 71 | */ | ||
| 72 | static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 73 | unsigned long addr, unsigned long pgoff, pgprot_t prot) | ||
| 74 | { | ||
| 75 | int err = -ENOMEM; | ||
| 76 | pte_t *pte, ptfile; | ||
| 77 | spinlock_t *ptl; | ||
| 78 | |||
| 79 | pte = get_locked_pte(mm, addr, &ptl); | ||
| 80 | if (!pte) | ||
| 81 | goto out; | ||
| 82 | |||
| 83 | ptfile = pgoff_to_pte(pgoff); | ||
| 84 | |||
| 85 | if (!pte_none(*pte)) | ||
| 86 | zap_pte(mm, vma, addr, pte); | ||
| 87 | |||
| 88 | set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); | ||
| 89 | /* | ||
| 90 | * We don't need to run update_mmu_cache() here because the "file pte" | ||
| 91 | * being installed by install_file_pte() is not a real pte - it's a | ||
| 92 | * non-present entry (like a swap entry), noting what file offset should | ||
| 93 | * be mapped there when there's a fault (in a non-linear vma where | ||
| 94 | * that's not obvious). | ||
| 95 | */ | ||
| 96 | pte_unmap_unlock(pte, ptl); | ||
| 97 | err = 0; | ||
| 98 | out: | ||
| 99 | return err; | ||
| 100 | } | ||
| 101 | |||
| 102 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
| 103 | unsigned long size, pgoff_t pgoff) | ||
| 104 | { | ||
| 105 | struct mm_struct *mm = vma->vm_mm; | ||
| 106 | int err; | ||
| 107 | |||
| 108 | do { | ||
| 109 | err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); | ||
| 110 | if (err) | ||
| 111 | return err; | ||
| 112 | |||
| 113 | size -= PAGE_SIZE; | ||
| 114 | addr += PAGE_SIZE; | ||
| 115 | pgoff++; | ||
| 116 | } while (size); | ||
| 117 | |||
| 118 | return 0; | ||
| 119 | } | ||
| 120 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
| 121 | |||
| 122 | /** | ||
| 123 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma | ||
| 124 | * @start: start of the remapped virtual memory range | ||
| 125 | * @size: size of the remapped virtual memory range | ||
| 126 | * @prot: new protection bits of the range (see NOTE) | ||
| 127 | * @pgoff: to-be-mapped page of the backing store file | ||
| 128 | * @flags: 0 or MAP_NONBLOCK - the latter causes no I/O. | ||
| 129 | * | ||
| 130 | * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma | ||
| 131 | * (shared backing store file). | ||
| 132 | * | ||
| 133 | * This syscall works purely via pagetables, so it's the most efficient | ||
| 134 | * way to map the same (large) file into a given virtual window. Unlike | ||
| 135 | * mmap()/mremap() it does not create any new vmas. The new mappings are | ||
| 136 | * also safe across swapout. | ||
| 137 | * | ||
| 138 | * NOTE: the @prot parameter right now is ignored (but must be zero), | ||
| 139 | * and the vma's default protection is used. Arbitrary protections | ||
| 140 | * might be implemented in the future. | ||
| 141 | */ | ||
| 142 | SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | ||
| 143 | unsigned long, prot, unsigned long, pgoff, unsigned long, flags) | ||
| 144 | { | ||
| 145 | struct mm_struct *mm = current->mm; | ||
| 146 | struct address_space *mapping; | ||
| 147 | struct vm_area_struct *vma; | ||
| 148 | int err = -EINVAL; | ||
| 149 | int has_write_lock = 0; | ||
| 150 | vm_flags_t vm_flags = 0; | ||
| 151 | |||
| 152 | pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " | ||
| 153 | "See Documentation/vm/remap_file_pages.txt.\n", | ||
| 154 | current->comm, current->pid); | ||
| 155 | |||
| 156 | if (prot) | ||
| 157 | return err; | ||
| 158 | /* | ||
| 159 | * Sanitize the syscall parameters: | ||
| 160 | */ | ||
| 161 | start = start & PAGE_MASK; | ||
| 162 | size = size & PAGE_MASK; | ||
| 163 | |||
| 164 | /* Does the address range wrap, or is the span zero-sized? */ | ||
| 165 | if (start + size <= start) | ||
| 166 | return err; | ||
| 167 | |||
| 168 | /* Does pgoff wrap? */ | ||
| 169 | if (pgoff + (size >> PAGE_SHIFT) < pgoff) | ||
| 170 | return err; | ||
| 171 | |||
| 172 | /* Can we represent this offset inside this architecture's pte's? */ | ||
| 173 | #if PTE_FILE_MAX_BITS < BITS_PER_LONG | ||
| 174 | if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) | ||
| 175 | return err; | ||
| 176 | #endif | ||
| 177 | |||
| 178 | /* We need down_write() to change vma->vm_flags. */ | ||
| 179 | down_read(&mm->mmap_sem); | ||
| 180 | retry: | ||
| 181 | vma = find_vma(mm, start); | ||
| 182 | |||
| 183 | /* | ||
| 184 | * Make sure the vma is shared, that it supports prefaulting, | ||
| 185 | * and that the remapped range is valid and fully within | ||
| 186 | * the single existing vma. | ||
| 187 | */ | ||
| 188 | if (!vma || !(vma->vm_flags & VM_SHARED)) | ||
| 189 | goto out; | ||
| 190 | |||
| 191 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) | ||
| 192 | goto out; | ||
| 193 | |||
| 194 | if (start < vma->vm_start || start + size > vma->vm_end) | ||
| 195 | goto out; | ||
| 196 | |||
| 197 | /* Must set VM_NONLINEAR before any pages are populated. */ | ||
| 198 | if (!(vma->vm_flags & VM_NONLINEAR)) { | ||
| 199 | /* | ||
| 200 | * vm_private_data is used as a swapout cursor | ||
| 201 | * in a VM_NONLINEAR vma. | ||
| 202 | */ | ||
| 203 | if (vma->vm_private_data) | ||
| 204 | goto out; | ||
| 205 | |||
| 206 | /* Don't need a nonlinear mapping, exit success */ | ||
| 207 | if (pgoff == linear_page_index(vma, start)) { | ||
| 208 | err = 0; | ||
| 209 | goto out; | ||
| 210 | } | ||
| 211 | |||
| 212 | if (!has_write_lock) { | ||
| 213 | get_write_lock: | ||
| 214 | up_read(&mm->mmap_sem); | ||
| 215 | down_write(&mm->mmap_sem); | ||
| 216 | has_write_lock = 1; | ||
| 217 | goto retry; | ||
| 218 | } | ||
| 219 | mapping = vma->vm_file->f_mapping; | ||
| 220 | /* | ||
| 221 | * page_mkclean doesn't work on nonlinear vmas, so if | ||
| 222 | * dirty pages need to be accounted, emulate with linear | ||
| 223 | * vmas. | ||
| 224 | */ | ||
| 225 | if (mapping_cap_account_dirty(mapping)) { | ||
| 226 | unsigned long addr; | ||
| 227 | struct file *file = get_file(vma->vm_file); | ||
| 228 | /* mmap_region may free vma; grab the info now */ | ||
| 229 | vm_flags = vma->vm_flags; | ||
| 230 | |||
| 231 | addr = mmap_region(file, start, size, vm_flags, pgoff); | ||
| 232 | fput(file); | ||
| 233 | if (IS_ERR_VALUE(addr)) { | ||
| 234 | err = addr; | ||
| 235 | } else { | ||
| 236 | BUG_ON(addr != start); | ||
| 237 | err = 0; | ||
| 238 | } | ||
| 239 | goto out_freed; | ||
| 240 | } | ||
| 241 | i_mmap_lock_write(mapping); | ||
| 242 | flush_dcache_mmap_lock(mapping); | ||
| 243 | vma->vm_flags |= VM_NONLINEAR; | ||
| 244 | vma_interval_tree_remove(vma, &mapping->i_mmap); | ||
| 245 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
| 246 | flush_dcache_mmap_unlock(mapping); | ||
| 247 | i_mmap_unlock_write(mapping); | ||
| 248 | } | ||
| 249 | |||
| 250 | if (vma->vm_flags & VM_LOCKED) { | ||
| 251 | /* | ||
| 252 | * drop PG_Mlocked flag for over-mapped range | ||
| 253 | */ | ||
| 254 | if (!has_write_lock) | ||
| 255 | goto get_write_lock; | ||
| 256 | vm_flags = vma->vm_flags; | ||
| 257 | munlock_vma_pages_range(vma, start, start + size); | ||
| 258 | vma->vm_flags = vm_flags; | ||
| 259 | } | ||
| 260 | |||
| 261 | mmu_notifier_invalidate_range_start(mm, start, start + size); | ||
| 262 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); | ||
| 263 | mmu_notifier_invalidate_range_end(mm, start, start + size); | ||
| 264 | |||
| 265 | /* | ||
| 266 | * We can't clear VM_NONLINEAR because we'd have to do | ||
| 267 | * it after ->populate completes, and that would prevent | ||
| 268 | * downgrading the lock. (Locks can't be upgraded). | ||
| 269 | */ | ||
| 270 | |||
| 271 | out: | ||
| 272 | if (vma) | ||
| 273 | vm_flags = vma->vm_flags; | ||
| 274 | out_freed: | ||
| 275 | if (likely(!has_write_lock)) | ||
| 276 | up_read(&mm->mmap_sem); | ||
| 277 | else | ||
| 278 | up_write(&mm->mmap_sem); | ||
| 279 | if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) | ||
| 280 | mm_populate(start, size); | ||
| 281 | |||
| 282 | return err; | ||
| 283 | } | ||
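
The nonlinear mapping this file implemented can be exercised from user space through the glibc wrapper; a minimal demonstration that rewires page 0 of a shared mapping to file page 2 might look like the sketch below (the file path and sizes are examples, error handling is trimmed). On kernels carrying this patch the pr_warn_once() above fires, and kernels that drop the implementation entirely are expected to emulate the call with ordinary mappings, so the observable effect stays the same:

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        int fd = open("/tmp/datafile", O_RDWR);  /* >= 4 pages; path is an example */
        char *map;

        if (fd < 0)
            return 1;

        map = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
                   MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
            return 1;

        /* Rewire the first page of the window to file page 2:
         * prot must be 0, pgoff is in pages, flags may be 0. */
        if (remap_file_pages(map, page, 0, 2, 0) != 0) {
            perror("remap_file_pages");
            return 1;
        }

        printf("first byte now reads from file offset %ld: %c\n",
               2 * page, map[0]);
        munmap(map, 4 * page);
        close(fd);
        return 0;
    }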
| @@ -55,7 +55,7 @@ retry: | |||
| 55 | */ | 55 | */ |
| 56 | if (likely(!(flags & FOLL_MIGRATION))) | 56 | if (likely(!(flags & FOLL_MIGRATION))) |
| 57 | goto no_page; | 57 | goto no_page; |
| 58 | if (pte_none(pte) || pte_file(pte)) | 58 | if (pte_none(pte)) |
| 59 | goto no_page; | 59 | goto no_page; |
| 60 | entry = pte_to_swp_entry(pte); | 60 | entry = pte_to_swp_entry(pte); |
| 61 | if (!is_migration_entry(entry)) | 61 | if (!is_migration_entry(entry)) |
| @@ -64,7 +64,7 @@ retry: | |||
| 64 | migration_entry_wait(mm, pmd, address); | 64 | migration_entry_wait(mm, pmd, address); |
| 65 | goto retry; | 65 | goto retry; |
| 66 | } | 66 | } |
| 67 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | 67 | if ((flags & FOLL_NUMA) && pte_protnone(pte)) |
| 68 | goto no_page; | 68 | goto no_page; |
| 69 | if ((flags & FOLL_WRITE) && !pte_write(pte)) { | 69 | if ((flags & FOLL_WRITE) && !pte_write(pte)) { |
| 70 | pte_unmap_unlock(ptep, ptl); | 70 | pte_unmap_unlock(ptep, ptl); |
| @@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
| 167 | if (pud_none(*pud)) | 167 | if (pud_none(*pud)) |
| 168 | return no_page_table(vma, flags); | 168 | return no_page_table(vma, flags); |
| 169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | 169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
| 170 | if (flags & FOLL_GET) | 170 | page = follow_huge_pud(mm, address, pud, flags); |
| 171 | return NULL; | 171 | if (page) |
| 172 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 172 | return page; |
| 173 | return page; | 173 | return no_page_table(vma, flags); |
| 174 | } | 174 | } |
| 175 | if (unlikely(pud_bad(*pud))) | 175 | if (unlikely(pud_bad(*pud))) |
| 176 | return no_page_table(vma, flags); | 176 | return no_page_table(vma, flags); |
| @@ -179,21 +179,12 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
| 179 | if (pmd_none(*pmd)) | 179 | if (pmd_none(*pmd)) |
| 180 | return no_page_table(vma, flags); | 180 | return no_page_table(vma, flags); |
| 181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | 181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
| 182 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 182 | page = follow_huge_pmd(mm, address, pmd, flags); |
| 183 | if (flags & FOLL_GET) { | 183 | if (page) |
| 184 | /* | 184 | return page; |
| 185 | * Refcount on tail pages are not well-defined and | 185 | return no_page_table(vma, flags); |
| 186 | * shouldn't be taken. The caller should handle a NULL | ||
| 187 | * return when trying to follow tail pages. | ||
| 188 | */ | ||
| 189 | if (PageHead(page)) | ||
| 190 | get_page(page); | ||
| 191 | else | ||
| 192 | page = NULL; | ||
| 193 | } | ||
| 194 | return page; | ||
| 195 | } | 186 | } |
| 196 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 187 | if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) |
| 197 | return no_page_table(vma, flags); | 188 | return no_page_table(vma, flags); |
| 198 | if (pmd_trans_huge(*pmd)) { | 189 | if (pmd_trans_huge(*pmd)) { |
| 199 | if (flags & FOLL_SPLIT) { | 190 | if (flags & FOLL_SPLIT) { |
| @@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
| 584 | return 0; | 575 | return 0; |
| 585 | } | 576 | } |
| 586 | 577 | ||
| 578 | static __always_inline long __get_user_pages_locked(struct task_struct *tsk, | ||
| 579 | struct mm_struct *mm, | ||
| 580 | unsigned long start, | ||
| 581 | unsigned long nr_pages, | ||
| 582 | int write, int force, | ||
| 583 | struct page **pages, | ||
| 584 | struct vm_area_struct **vmas, | ||
| 585 | int *locked, bool notify_drop, | ||
| 586 | unsigned int flags) | ||
| 587 | { | ||
| 588 | long ret, pages_done; | ||
| 589 | bool lock_dropped; | ||
| 590 | |||
| 591 | if (locked) { | ||
| 592 | /* if VM_FAULT_RETRY can be returned, vmas become invalid */ | ||
| 593 | BUG_ON(vmas); | ||
| 594 | /* check caller initialized locked */ | ||
| 595 | BUG_ON(*locked != 1); | ||
| 596 | } | ||
| 597 | |||
| 598 | if (pages) | ||
| 599 | flags |= FOLL_GET; | ||
| 600 | if (write) | ||
| 601 | flags |= FOLL_WRITE; | ||
| 602 | if (force) | ||
| 603 | flags |= FOLL_FORCE; | ||
| 604 | |||
| 605 | pages_done = 0; | ||
| 606 | lock_dropped = false; | ||
| 607 | for (;;) { | ||
| 608 | ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, | ||
| 609 | vmas, locked); | ||
| 610 | if (!locked) | ||
| 611 | /* VM_FAULT_RETRY couldn't trigger, bypass */ | ||
| 612 | return ret; | ||
| 613 | |||
| 614 | /* VM_FAULT_RETRY cannot return errors */ | ||
| 615 | if (!*locked) { | ||
| 616 | BUG_ON(ret < 0); | ||
| 617 | BUG_ON(ret >= nr_pages); | ||
| 618 | } | ||
| 619 | |||
| 620 | if (!pages) | ||
| 621 | /* If it's a prefault don't insist harder */ | ||
| 622 | return ret; | ||
| 623 | |||
| 624 | if (ret > 0) { | ||
| 625 | nr_pages -= ret; | ||
| 626 | pages_done += ret; | ||
| 627 | if (!nr_pages) | ||
| 628 | break; | ||
| 629 | } | ||
| 630 | if (*locked) { | ||
| 631 | /* VM_FAULT_RETRY didn't trigger */ | ||
| 632 | if (!pages_done) | ||
| 633 | pages_done = ret; | ||
| 634 | break; | ||
| 635 | } | ||
| 636 | /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ | ||
| 637 | pages += ret; | ||
| 638 | start += ret << PAGE_SHIFT; | ||
| 639 | |||
| 640 | /* | ||
| 641 | * Repeat on the address that fired VM_FAULT_RETRY | ||
| 642 | * without FAULT_FLAG_ALLOW_RETRY but with | ||
| 643 | * FAULT_FLAG_TRIED. | ||
| 644 | */ | ||
| 645 | *locked = 1; | ||
| 646 | lock_dropped = true; | ||
| 647 | down_read(&mm->mmap_sem); | ||
| 648 | ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, | ||
| 649 | pages, NULL, NULL); | ||
| 650 | if (ret != 1) { | ||
| 651 | BUG_ON(ret > 1); | ||
| 652 | if (!pages_done) | ||
| 653 | pages_done = ret; | ||
| 654 | break; | ||
| 655 | } | ||
| 656 | nr_pages--; | ||
| 657 | pages_done++; | ||
| 658 | if (!nr_pages) | ||
| 659 | break; | ||
| 660 | pages++; | ||
| 661 | start += PAGE_SIZE; | ||
| 662 | } | ||
| 663 | if (notify_drop && lock_dropped && *locked) { | ||
| 664 | /* | ||
| 665 | * We must let the caller know we temporarily dropped the lock | ||
| 666 | * and so the critical section protected by it was lost. | ||
| 667 | */ | ||
| 668 | up_read(&mm->mmap_sem); | ||
| 669 | *locked = 0; | ||
| 670 | } | ||
| 671 | return pages_done; | ||
| 672 | } | ||
| 673 | |||
| 674 | /* | ||
| 675 | * We can leverage the VM_FAULT_RETRY functionality in the page fault | ||
| 676 | * paths better by using either get_user_pages_locked() or | ||
| 677 | * get_user_pages_unlocked(). | ||
| 678 | * | ||
| 679 | * get_user_pages_locked() is suitable to replace the form: | ||
| 680 | * | ||
| 681 | * down_read(&mm->mmap_sem); | ||
| 682 | * do_something() | ||
| 683 | * get_user_pages(tsk, mm, ..., pages, NULL); | ||
| 684 | * up_read(&mm->mmap_sem); | ||
| 685 | * | ||
| 686 | * to: | ||
| 687 | * | ||
| 688 | * int locked = 1; | ||
| 689 | * down_read(&mm->mmap_sem); | ||
| 690 | * do_something() | ||
| 691 | * get_user_pages_locked(tsk, mm, ..., pages, &locked); | ||
| 692 | * if (locked) | ||
| 693 | * up_read(&mm->mmap_sem); | ||
| 694 | */ | ||
| 695 | long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 696 | unsigned long start, unsigned long nr_pages, | ||
| 697 | int write, int force, struct page **pages, | ||
| 698 | int *locked) | ||
| 699 | { | ||
| 700 | return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, | ||
| 701 | pages, NULL, locked, true, FOLL_TOUCH); | ||
| 702 | } | ||
| 703 | EXPORT_SYMBOL(get_user_pages_locked); | ||
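
The block comment above gives the conversion in shorthand; a slightly more concrete kernel-context sketch follows. pin_bufs() and its parameters are invented for illustration, only the get_user_pages_locked() call itself comes from this patch, and the sketch assumes the mmap_sem naming used throughout this file:

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* Pin 'nr' user pages starting at 'start' for reading, letting the
     * fault handler drop mmap_sem (VM_FAULT_RETRY) while it sleeps. */
    static long pin_bufs(unsigned long start, unsigned long nr,
                         struct page **pages)
    {
        int locked = 1;
        long ret;

        down_read(&current->mm->mmap_sem);
        ret = get_user_pages_locked(current, current->mm, start, nr,
                                    0 /* write */, 0 /* force */,
                                    pages, &locked);
        if (locked)
            up_read(&current->mm->mmap_sem);  /* otherwise already dropped */
        return ret;
    }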
| 704 | |||
| 705 | /* | ||
| 706 | * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows passing | ||
| 707 | * additional gup_flags as the last parameter (like FOLL_HWPOISON). | ||
| 708 | * | ||
| 709 | * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the | ||
| 710 | * caller if required (just like with __get_user_pages). "FOLL_GET", | ||
| 711 | * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed | ||
| 712 | * according to the parameters "pages", "write", "force" | ||
| 713 | * respectively. | ||
| 714 | */ | ||
| 715 | __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 716 | unsigned long start, unsigned long nr_pages, | ||
| 717 | int write, int force, struct page **pages, | ||
| 718 | unsigned int gup_flags) | ||
| 719 | { | ||
| 720 | long ret; | ||
| 721 | int locked = 1; | ||
| 722 | down_read(&mm->mmap_sem); | ||
| 723 | ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, | ||
| 724 | pages, NULL, &locked, false, gup_flags); | ||
| 725 | if (locked) | ||
| 726 | up_read(&mm->mmap_sem); | ||
| 727 | return ret; | ||
| 728 | } | ||
| 729 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
| 730 | |||
| 731 | /* | ||
| 732 | * get_user_pages_unlocked() is suitable to replace the form: | ||
| 733 | * | ||
| 734 | * down_read(&mm->mmap_sem); | ||
| 735 | * get_user_pages(tsk, mm, ..., pages, NULL); | ||
| 736 | * up_read(&mm->mmap_sem); | ||
| 737 | * | ||
| 738 | * with: | ||
| 739 | * | ||
| 740 | * get_user_pages_unlocked(tsk, mm, ..., pages); | ||
| 741 | * | ||
| 742 | * It is functionally equivalent to get_user_pages_fast, so | ||
| 743 | * get_user_pages_fast should be used instead whenever the two | ||
| 744 | * parameters "tsk" and "mm" are current and current->mm respectively | ||
| 745 | * and "force" would be 0 (get_user_pages_fast has no "force" | ||
| 746 | * parameter). | ||
| 747 | */ | ||
| 748 | long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 749 | unsigned long start, unsigned long nr_pages, | ||
| 750 | int write, int force, struct page **pages) | ||
| 751 | { | ||
| 752 | return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
| 753 | force, pages, FOLL_TOUCH); | ||
| 754 | } | ||
| 755 | EXPORT_SYMBOL(get_user_pages_unlocked); | ||
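
When nothing else has to run under mmap_sem, the unlocked variant removes the explicit locking completely. Below is a hedged sketch of a caller pinning a read-only user buffer and dropping the references afterwards; with_pinned_pages() and its arguments are invented, while get_user_pages_unlocked() and page_cache_release() are the calls visible in this diff:

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/sched.h>

    /* Pin a user buffer for a read-only transfer and drop the references
     * again once the pages have been consumed. */
    static long with_pinned_pages(unsigned long start, unsigned long nr,
                                  struct page **pages)
    {
        long got, i;

        got = get_user_pages_unlocked(current, current->mm, start, nr,
                                      0 /* write */, 0 /* force */, pages);
        if (got <= 0)
            return got;

        /* ... access the page contents, e.g. via kmap() ... */

        for (i = 0; i < got; i++)
            page_cache_release(pages[i]);  /* undo the FOLL_GET reference */
        return got;
    }

As the comment above notes, when tsk is current, mm is current->mm and force is 0, get_user_pages_fast(start, nr, write, pages) is the shorter route to the same result.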
| 756 | |||
| 587 | /* | 757 | /* |
| 588 | * get_user_pages() - pin user pages in memory | 758 | * get_user_pages() - pin user pages in memory |
| 589 | * @tsk: the task_struct to use for page fault accounting, or | 759 | * @tsk: the task_struct to use for page fault accounting, or |
| @@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
| 633 | * use the correct cache flushing APIs. | 803 | * use the correct cache flushing APIs. |
| 634 | * | 804 | * |
| 635 | * See also get_user_pages_fast, for performance critical applications. | 805 | * See also get_user_pages_fast, for performance critical applications. |
| 806 | * | ||
| 807 | * get_user_pages should be phased out in favor of | ||
| 808 | * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing | ||
| 809 | * should use get_user_pages because it cannot pass | ||
| 810 | * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. | ||
| 636 | */ | 811 | */ |
| 637 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 812 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 638 | unsigned long start, unsigned long nr_pages, int write, | 813 | unsigned long start, unsigned long nr_pages, int write, |
| 639 | int force, struct page **pages, struct vm_area_struct **vmas) | 814 | int force, struct page **pages, struct vm_area_struct **vmas) |
| 640 | { | 815 | { |
| 641 | int flags = FOLL_TOUCH; | 816 | return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, |
| 642 | 817 | pages, vmas, NULL, false, FOLL_TOUCH); | |
| 643 | if (pages) | ||
| 644 | flags |= FOLL_GET; | ||
| 645 | if (write) | ||
| 646 | flags |= FOLL_WRITE; | ||
| 647 | if (force) | ||
| 648 | flags |= FOLL_FORCE; | ||
| 649 | |||
| 650 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
| 651 | NULL); | ||
| 652 | } | 818 | } |
| 653 | EXPORT_SYMBOL(get_user_pages); | 819 | EXPORT_SYMBOL(get_user_pages); |
| 654 | 820 | ||
| @@ -740,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
| 740 | 906 | ||
| 741 | /* | 907 | /* |
| 742 | * Similar to the PMD case below, NUMA hinting must take slow | 908 | * Similar to the PMD case below, NUMA hinting must take slow |
| 743 | * path | 909 | * path using the pte_protnone check. |
| 744 | */ | 910 | */ |
| 745 | if (!pte_present(pte) || pte_special(pte) || | 911 | if (!pte_present(pte) || pte_special(pte) || |
| 746 | pte_numa(pte) || (write && !pte_write(pte))) | 912 | pte_protnone(pte) || (write && !pte_write(pte))) |
| 747 | goto pte_unmap; | 913 | goto pte_unmap; |
| 748 | 914 | ||
| 749 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 915 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
| @@ -926,7 +1092,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
| 926 | 1092 | ||
| 927 | pmdp = pmd_offset(&pud, addr); | 1093 | pmdp = pmd_offset(&pud, addr); |
| 928 | do { | 1094 | do { |
| 929 | pmd_t pmd = ACCESS_ONCE(*pmdp); | 1095 | pmd_t pmd = READ_ONCE(*pmdp); |
| 930 | 1096 | ||
| 931 | next = pmd_addr_end(addr, end); | 1097 | next = pmd_addr_end(addr, end); |
| 932 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | 1098 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) |
| @@ -938,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
| 938 | * slowpath for accounting purposes and so that they | 1104 | * slowpath for accounting purposes and so that they |
| 939 | * can be serialised against THP migration. | 1105 | * can be serialised against THP migration. |
| 940 | */ | 1106 | */ |
| 941 | if (pmd_numa(pmd)) | 1107 | if (pmd_protnone(pmd)) |
| 942 | return 0; | 1108 | return 0; |
| 943 | 1109 | ||
| 944 | if (!gup_huge_pmd(pmd, pmdp, addr, next, write, | 1110 | if (!gup_huge_pmd(pmd, pmdp, addr, next, write, |
| @@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
| 1077 | start += nr << PAGE_SHIFT; | 1243 | start += nr << PAGE_SHIFT; |
| 1078 | pages += nr; | 1244 | pages += nr; |
| 1079 | 1245 | ||
| 1080 | down_read(&mm->mmap_sem); | 1246 | ret = get_user_pages_unlocked(current, mm, start, |
| 1081 | ret = get_user_pages(current, mm, start, | 1247 | nr_pages - nr, write, 0, pages); |
| 1082 | nr_pages - nr, write, 0, pages, NULL); | ||
| 1083 | up_read(&mm->mmap_sem); | ||
| 1084 | 1248 | ||
| 1085 | /* Have to be a bit careful with return values */ | 1249 | /* Have to be a bit careful with return values */ |
| 1086 | if (nr > 0) { | 1250 | if (nr > 0) { |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 817a875f2b8c..fc00c8cb5a82 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -171,12 +171,7 @@ static int start_khugepaged(void) | |||
| 171 | } | 171 | } |
| 172 | 172 | ||
| 173 | static atomic_t huge_zero_refcount; | 173 | static atomic_t huge_zero_refcount; |
| 174 | static struct page *huge_zero_page __read_mostly; | 174 | struct page *huge_zero_page __read_mostly; |
| 175 | |||
| 176 | static inline bool is_huge_zero_page(struct page *page) | ||
| 177 | { | ||
| 178 | return ACCESS_ONCE(huge_zero_page) == page; | ||
| 179 | } | ||
| 180 | 175 | ||
| 181 | static inline bool is_huge_zero_pmd(pmd_t pmd) | 176 | static inline bool is_huge_zero_pmd(pmd_t pmd) |
| 182 | { | 177 | { |
| @@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | |||
| 766 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; | 761 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; |
| 767 | } | 762 | } |
| 768 | 763 | ||
| 769 | static inline struct page *alloc_hugepage_vma(int defrag, | ||
| 770 | struct vm_area_struct *vma, | ||
| 771 | unsigned long haddr, int nd, | ||
| 772 | gfp_t extra_gfp) | ||
| 773 | { | ||
| 774 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), | ||
| 775 | HPAGE_PMD_ORDER, vma, haddr, nd); | ||
| 776 | } | ||
| 777 | |||
| 778 | /* Caller must hold page table lock. */ | 764 | /* Caller must hold page table lock. */ |
| 779 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 765 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
| 780 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 766 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
| @@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 795 | unsigned long address, pmd_t *pmd, | 781 | unsigned long address, pmd_t *pmd, |
| 796 | unsigned int flags) | 782 | unsigned int flags) |
| 797 | { | 783 | { |
| 784 | gfp_t gfp; | ||
| 798 | struct page *page; | 785 | struct page *page; |
| 799 | unsigned long haddr = address & HPAGE_PMD_MASK; | 786 | unsigned long haddr = address & HPAGE_PMD_MASK; |
| 800 | 787 | ||
| @@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 829 | } | 816 | } |
| 830 | return 0; | 817 | return 0; |
| 831 | } | 818 | } |
| 832 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 819 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
| 833 | vma, haddr, numa_node_id(), 0); | 820 | page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); |
| 834 | if (unlikely(!page)) { | 821 | if (unlikely(!page)) { |
| 835 | count_vm_event(THP_FAULT_FALLBACK); | 822 | count_vm_event(THP_FAULT_FALLBACK); |
| 836 | return VM_FAULT_FALLBACK; | 823 | return VM_FAULT_FALLBACK; |
| @@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1118 | spin_unlock(ptl); | 1105 | spin_unlock(ptl); |
| 1119 | alloc: | 1106 | alloc: |
| 1120 | if (transparent_hugepage_enabled(vma) && | 1107 | if (transparent_hugepage_enabled(vma) && |
| 1121 | !transparent_hugepage_debug_cow()) | 1108 | !transparent_hugepage_debug_cow()) { |
| 1122 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 1109 | gfp_t gfp; |
| 1123 | vma, haddr, numa_node_id(), 0); | 1110 | |
| 1124 | else | 1111 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
| 1112 | new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); | ||
| 1113 | } else | ||
| 1125 | new_page = NULL; | 1114 | new_page = NULL; |
| 1126 | 1115 | ||
| 1127 | if (unlikely(!new_page)) { | 1116 | if (unlikely(!new_page)) { |
| @@ -1222,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
| 1222 | return ERR_PTR(-EFAULT); | 1211 | return ERR_PTR(-EFAULT); |
| 1223 | 1212 | ||
| 1224 | /* Full NUMA hinting faults to serialise migration in fault paths */ | 1213 | /* Full NUMA hinting faults to serialise migration in fault paths */ |
| 1225 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 1214 | if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) |
| 1226 | goto out; | 1215 | goto out; |
| 1227 | 1216 | ||
| 1228 | page = pmd_page(*pmd); | 1217 | page = pmd_page(*pmd); |
| @@ -1273,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1273 | bool migrated = false; | 1262 | bool migrated = false; |
| 1274 | int flags = 0; | 1263 | int flags = 0; |
| 1275 | 1264 | ||
| 1265 | /* A PROT_NONE fault should not end up here */ | ||
| 1266 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); | ||
| 1267 | |||
| 1276 | ptl = pmd_lock(mm, pmdp); | 1268 | ptl = pmd_lock(mm, pmdp); |
| 1277 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1269 | if (unlikely(!pmd_same(pmd, *pmdp))) |
| 1278 | goto out_unlock; | 1270 | goto out_unlock; |
| @@ -1283,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1283 | * check_same as the page may no longer be mapped. | 1275 | * check_same as the page may no longer be mapped. |
| 1284 | */ | 1276 | */ |
| 1285 | if (unlikely(pmd_trans_migrating(*pmdp))) { | 1277 | if (unlikely(pmd_trans_migrating(*pmdp))) { |
| 1278 | page = pmd_page(*pmdp); | ||
| 1286 | spin_unlock(ptl); | 1279 | spin_unlock(ptl); |
| 1287 | wait_migrate_huge_page(vma->anon_vma, pmdp); | 1280 | wait_on_page_locked(page); |
| 1288 | goto out; | 1281 | goto out; |
| 1289 | } | 1282 | } |
| 1290 | 1283 | ||
| @@ -1352,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1352 | 1345 | ||
| 1353 | /* | 1346 | /* |
| 1354 | * Migrate the THP to the requested node, returns with page unlocked | 1347 | * Migrate the THP to the requested node, returns with page unlocked |
| 1355 | * and pmd_numa cleared. | 1348 | * and access rights restored. |
| 1356 | */ | 1349 | */ |
| 1357 | spin_unlock(ptl); | 1350 | spin_unlock(ptl); |
| 1358 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1351 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
| @@ -1365,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1365 | goto out; | 1358 | goto out; |
| 1366 | clear_pmdnuma: | 1359 | clear_pmdnuma: |
| 1367 | BUG_ON(!PageLocked(page)); | 1360 | BUG_ON(!PageLocked(page)); |
| 1368 | pmd = pmd_mknonnuma(pmd); | 1361 | pmd = pmd_modify(pmd, vma->vm_page_prot); |
| 1369 | set_pmd_at(mm, haddr, pmdp, pmd); | 1362 | set_pmd_at(mm, haddr, pmdp, pmd); |
| 1370 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
| 1371 | update_mmu_cache_pmd(vma, addr, pmdp); | 1363 | update_mmu_cache_pmd(vma, addr, pmdp); |
| 1372 | unlock_page(page); | 1364 | unlock_page(page); |
| 1373 | out_unlock: | 1365 | out_unlock: |
| @@ -1423,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 1423 | return ret; | 1415 | return ret; |
| 1424 | } | 1416 | } |
| 1425 | 1417 | ||
| 1426 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
| 1427 | unsigned long addr, unsigned long end, | ||
| 1428 | unsigned char *vec) | ||
| 1429 | { | ||
| 1430 | spinlock_t *ptl; | ||
| 1431 | int ret = 0; | ||
| 1432 | |||
| 1433 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
| 1434 | /* | ||
| 1435 | * All logical pages in the range are present | ||
| 1436 | * if backed by a huge page. | ||
| 1437 | */ | ||
| 1438 | spin_unlock(ptl); | ||
| 1439 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
| 1440 | ret = 1; | ||
| 1441 | } | ||
| 1442 | |||
| 1443 | return ret; | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | 1418 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, |
| 1447 | unsigned long old_addr, | 1419 | unsigned long old_addr, |
| 1448 | unsigned long new_addr, unsigned long old_end, | 1420 | unsigned long new_addr, unsigned long old_end, |
| @@ -1510,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 1510 | 1482 | ||
| 1511 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 1483 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
| 1512 | pmd_t entry; | 1484 | pmd_t entry; |
| 1513 | ret = 1; | 1485 | |
| 1514 | if (!prot_numa) { | 1486 | /* |
| 1487 | * Avoid trapping faults against the zero page. The read-only | ||
| 1488 | * data is likely to be read-cached on the local CPU and | ||
| 1489 | * local/remote hits to the zero page are not interesting. | ||
| 1490 | */ | ||
| 1491 | if (prot_numa && is_huge_zero_pmd(*pmd)) { | ||
| 1492 | spin_unlock(ptl); | ||
| 1493 | return 0; | ||
| 1494 | } | ||
| 1495 | |||
| 1496 | if (!prot_numa || !pmd_protnone(*pmd)) { | ||
| 1497 | ret = 1; | ||
| 1515 | entry = pmdp_get_and_clear_notify(mm, addr, pmd); | 1498 | entry = pmdp_get_and_clear_notify(mm, addr, pmd); |
| 1516 | if (pmd_numa(entry)) | ||
| 1517 | entry = pmd_mknonnuma(entry); | ||
| 1518 | entry = pmd_modify(entry, newprot); | 1499 | entry = pmd_modify(entry, newprot); |
| 1519 | ret = HPAGE_PMD_NR; | 1500 | ret = HPAGE_PMD_NR; |
| 1520 | set_pmd_at(mm, addr, pmd, entry); | 1501 | set_pmd_at(mm, addr, pmd, entry); |
| 1521 | BUG_ON(pmd_write(entry)); | 1502 | BUG_ON(pmd_write(entry)); |
| 1522 | } else { | ||
| 1523 | struct page *page = pmd_page(*pmd); | ||
| 1524 | |||
| 1525 | /* | ||
| 1526 | * Do not trap faults against the zero page. The | ||
| 1527 | * read-only data is likely to be read-cached on the | ||
| 1528 | * local CPU cache and it is less useful to know about | ||
| 1529 | * local vs remote hits on the zero page. | ||
| 1530 | */ | ||
| 1531 | if (!is_huge_zero_page(page) && | ||
| 1532 | !pmd_numa(*pmd)) { | ||
| 1533 | pmdp_set_numa(mm, addr, pmd); | ||
| 1534 | ret = HPAGE_PMD_NR; | ||
| 1535 | } | ||
| 1536 | } | 1503 | } |
| 1537 | spin_unlock(ptl); | 1504 | spin_unlock(ptl); |
| 1538 | } | 1505 | } |
| @@ -1797,9 +1764,9 @@ static int __split_huge_page_map(struct page *page, | |||
| 1797 | pte_t *pte, entry; | 1764 | pte_t *pte, entry; |
| 1798 | BUG_ON(PageCompound(page+i)); | 1765 | BUG_ON(PageCompound(page+i)); |
| 1799 | /* | 1766 | /* |
| 1800 | * Note that pmd_numa is not transferred deliberately | 1767 | * Note that NUMA hinting access restrictions are not |
| 1801 | * to avoid any possibility that pte_numa leaks to | 1768 | * transferred to avoid any possibility of altering |
| 1802 | * a PROT_NONE VMA by accident. | 1769 | * permissions across VMAs. |
| 1803 | */ | 1770 | */ |
| 1804 | entry = mk_pte(page + i, vma->vm_page_prot); | 1771 | entry = mk_pte(page + i, vma->vm_page_prot); |
| 1805 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1772 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| @@ -2148,7 +2115,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2148 | { | 2115 | { |
| 2149 | struct page *page; | 2116 | struct page *page; |
| 2150 | pte_t *_pte; | 2117 | pte_t *_pte; |
| 2151 | int referenced = 0, none = 0; | 2118 | int none = 0; |
| 2119 | bool referenced = false, writable = false; | ||
| 2152 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2120 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
| 2153 | _pte++, address += PAGE_SIZE) { | 2121 | _pte++, address += PAGE_SIZE) { |
| 2154 | pte_t pteval = *_pte; | 2122 | pte_t pteval = *_pte; |
| @@ -2158,7 +2126,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2158 | else | 2126 | else |
| 2159 | goto out; | 2127 | goto out; |
| 2160 | } | 2128 | } |
| 2161 | if (!pte_present(pteval) || !pte_write(pteval)) | 2129 | if (!pte_present(pteval)) |
| 2162 | goto out; | 2130 | goto out; |
| 2163 | page = vm_normal_page(vma, address, pteval); | 2131 | page = vm_normal_page(vma, address, pteval); |
| 2164 | if (unlikely(!page)) | 2132 | if (unlikely(!page)) |
| @@ -2168,9 +2136,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2168 | VM_BUG_ON_PAGE(!PageAnon(page), page); | 2136 | VM_BUG_ON_PAGE(!PageAnon(page), page); |
| 2169 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 2137 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
| 2170 | 2138 | ||
| 2171 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
| 2172 | if (page_count(page) != 1) | ||
| 2173 | goto out; | ||
| 2174 | /* | 2139 | /* |
| 2175 | * We can do it before isolate_lru_page because the | 2140 | * We can do it before isolate_lru_page because the |
| 2176 | * page can't be freed from under us. NOTE: PG_lock | 2141 | * page can't be freed from under us. NOTE: PG_lock |
| @@ -2179,6 +2144,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2179 | */ | 2144 | */ |
| 2180 | if (!trylock_page(page)) | 2145 | if (!trylock_page(page)) |
| 2181 | goto out; | 2146 | goto out; |
| 2147 | |||
| 2148 | /* | ||
| 2149 | * cannot use mapcount: can't collapse if there's a gup pin. | ||
| 2150 | * The page must only be referenced by the scanned process | ||
| 2151 | * and page swap cache. | ||
| 2152 | */ | ||
| 2153 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | ||
| 2154 | unlock_page(page); | ||
| 2155 | goto out; | ||
| 2156 | } | ||
| 2157 | if (pte_write(pteval)) { | ||
| 2158 | writable = true; | ||
| 2159 | } else { | ||
| 2160 | if (PageSwapCache(page) && !reuse_swap_page(page)) { | ||
| 2161 | unlock_page(page); | ||
| 2162 | goto out; | ||
| 2163 | } | ||
| 2164 | /* | ||
| 2165 | * Page is not in the swap cache. It can be collapsed | ||
| 2166 | * into a THP. | ||
| 2167 | */ | ||
| 2168 | } | ||
| 2169 | |||
| 2182 | /* | 2170 | /* |
| 2183 | * Isolate the page to avoid collapsing an hugepage | 2171 | * Isolate the page to avoid collapsing an hugepage |
| 2184 | * currently in use by the VM. | 2172 | * currently in use by the VM. |
| @@ -2195,9 +2183,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2195 | /* If there is no mapped pte young don't collapse the page */ | 2183 | /* If there is no mapped pte young don't collapse the page */ |
| 2196 | if (pte_young(pteval) || PageReferenced(page) || | 2184 | if (pte_young(pteval) || PageReferenced(page) || |
| 2197 | mmu_notifier_test_young(vma->vm_mm, address)) | 2185 | mmu_notifier_test_young(vma->vm_mm, address)) |
| 2198 | referenced = 1; | 2186 | referenced = true; |
| 2199 | } | 2187 | } |
| 2200 | if (likely(referenced)) | 2188 | if (likely(referenced && writable)) |
| 2201 | return 1; | 2189 | return 1; |
| 2202 | out: | 2190 | out: |
| 2203 | release_pte_pages(pte, _pte); | 2191 | release_pte_pages(pte, _pte); |
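
The reworked pin check encodes a simple expectation: a collapsible page should be referenced once by the scanned mapping, plus once more if it also sits in the swap cache; any extra reference is assumed to be a gup pin and aborts the collapse. Spelled out as an illustrative helper that is not part of the patch:

    #include <linux/mm.h>

    /* Illustrative only: the reference count khugepaged tolerates before
     * refusing to collapse a page, namely one for the scanned mapping
     * plus one if the page is also in the swap cache.  Anything higher
     * is treated as a gup pin. */
    static inline int collapse_expected_refcount(struct page *page)
    {
        return 1 + !!PageSwapCache(page);
    }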
| @@ -2550,11 +2538,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2550 | { | 2538 | { |
| 2551 | pmd_t *pmd; | 2539 | pmd_t *pmd; |
| 2552 | pte_t *pte, *_pte; | 2540 | pte_t *pte, *_pte; |
| 2553 | int ret = 0, referenced = 0, none = 0; | 2541 | int ret = 0, none = 0; |
| 2554 | struct page *page; | 2542 | struct page *page; |
| 2555 | unsigned long _address; | 2543 | unsigned long _address; |
| 2556 | spinlock_t *ptl; | 2544 | spinlock_t *ptl; |
| 2557 | int node = NUMA_NO_NODE; | 2545 | int node = NUMA_NO_NODE; |
| 2546 | bool writable = false, referenced = false; | ||
| 2558 | 2547 | ||
| 2559 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2548 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
| 2560 | 2549 | ||
| @@ -2573,8 +2562,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2573 | else | 2562 | else |
| 2574 | goto out_unmap; | 2563 | goto out_unmap; |
| 2575 | } | 2564 | } |
| 2576 | if (!pte_present(pteval) || !pte_write(pteval)) | 2565 | if (!pte_present(pteval)) |
| 2577 | goto out_unmap; | 2566 | goto out_unmap; |
| 2567 | if (pte_write(pteval)) | ||
| 2568 | writable = true; | ||
| 2569 | |||
| 2578 | page = vm_normal_page(vma, _address, pteval); | 2570 | page = vm_normal_page(vma, _address, pteval); |
| 2579 | if (unlikely(!page)) | 2571 | if (unlikely(!page)) |
| 2580 | goto out_unmap; | 2572 | goto out_unmap; |
| @@ -2591,14 +2583,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2591 | VM_BUG_ON_PAGE(PageCompound(page), page); | 2583 | VM_BUG_ON_PAGE(PageCompound(page), page); |
| 2592 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2584 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
| 2593 | goto out_unmap; | 2585 | goto out_unmap; |
| 2594 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2586 | /* |
| 2595 | if (page_count(page) != 1) | 2587 | * cannot use mapcount: can't collapse if there's a gup pin. |
| 2588 | * The page must only be referenced by the scanned process | ||
| 2589 | * and page swap cache. | ||
| 2590 | */ | ||
| 2591 | if (page_count(page) != 1 + !!PageSwapCache(page)) | ||
| 2596 | goto out_unmap; | 2592 | goto out_unmap; |
| 2597 | if (pte_young(pteval) || PageReferenced(page) || | 2593 | if (pte_young(pteval) || PageReferenced(page) || |
| 2598 | mmu_notifier_test_young(vma->vm_mm, address)) | 2594 | mmu_notifier_test_young(vma->vm_mm, address)) |
| 2599 | referenced = 1; | 2595 | referenced = true; |
| 2600 | } | 2596 | } |
| 2601 | if (referenced) | 2597 | if (referenced && writable) |
| 2602 | ret = 1; | 2598 | ret = 1; |
| 2603 | out_unmap: | 2599 | out_unmap: |
| 2604 | pte_unmap_unlock(pte, ptl); | 2600 | pte_unmap_unlock(pte, ptl); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85032de5e20f..0a9ac6c26832 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -35,7 +35,7 @@ | |||
| 35 | #include <linux/node.h> | 35 | #include <linux/node.h> |
| 36 | #include "internal.h" | 36 | #include "internal.h" |
| 37 | 37 | ||
| 38 | unsigned long hugepages_treat_as_movable; | 38 | int hugepages_treat_as_movable; |
| 39 | 39 | ||
| 40 | int hugetlb_max_hstate __read_mostly; | 40 | int hugetlb_max_hstate __read_mostly; |
| 41 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
| @@ -2657,9 +2657,10 @@ again: | |||
| 2657 | goto unlock; | 2657 | goto unlock; |
| 2658 | 2658 | ||
| 2659 | /* | 2659 | /* |
| 2660 | * HWPoisoned hugepage is already unmapped and dropped reference | 2660 | * Migrating hugepage or HWPoisoned hugepage is already |
| 2661 | * unmapped and its refcount is dropped, so just clear pte here. | ||
| 2661 | */ | 2662 | */ |
| 2662 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | 2663 | if (unlikely(!pte_present(pte))) { |
| 2663 | huge_pte_clear(mm, address, ptep); | 2664 | huge_pte_clear(mm, address, ptep); |
| 2664 | goto unlock; | 2665 | goto unlock; |
| 2665 | } | 2666 | } |
| @@ -3134,6 +3135,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3134 | struct page *pagecache_page = NULL; | 3135 | struct page *pagecache_page = NULL; |
| 3135 | struct hstate *h = hstate_vma(vma); | 3136 | struct hstate *h = hstate_vma(vma); |
| 3136 | struct address_space *mapping; | 3137 | struct address_space *mapping; |
| 3138 | int need_wait_lock = 0; | ||
| 3137 | 3139 | ||
| 3138 | address &= huge_page_mask(h); | 3140 | address &= huge_page_mask(h); |
| 3139 | 3141 | ||
| @@ -3172,6 +3174,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3172 | ret = 0; | 3174 | ret = 0; |
| 3173 | 3175 | ||
| 3174 | /* | 3176 | /* |
| 3177 | * entry could be a migration/hwpoison entry at this point, so this | ||
| 3178 | * check prevents the kernel from proceeding below on the assumption | ||
| 3179 | * that we have an active hugepage in the pagecache. This goto expects | ||
| 3180 | * the 2nd page fault, where the is_hugetlb_entry_(migration|hwpoisoned) | ||
| 3181 | * checks will handle it properly. | ||
| 3182 | */ | ||
| 3183 | if (!pte_present(entry)) | ||
| 3184 | goto out_mutex; | ||
| 3185 | |||
| 3186 | /* | ||
| 3175 | * If we are going to COW the mapping later, we examine the pending | 3187 | * If we are going to COW the mapping later, we examine the pending |
| 3176 | * reservations for this page now. This will ensure that any | 3188 | * reservations for this page now. This will ensure that any |
| 3177 | * allocations necessary to record that reservation occur outside the | 3189 | * allocations necessary to record that reservation occur outside the |
| @@ -3190,30 +3202,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3190 | vma, address); | 3202 | vma, address); |
| 3191 | } | 3203 | } |
| 3192 | 3204 | ||
| 3205 | ptl = huge_pte_lock(h, mm, ptep); | ||
| 3206 | |||
| 3207 | /* Check for a racing update before calling hugetlb_cow */ | ||
| 3208 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | ||
| 3209 | goto out_ptl; | ||
| 3210 | |||
| 3193 | /* | 3211 | /* |
| 3194 | * hugetlb_cow() requires page locks of pte_page(entry) and | 3212 | * hugetlb_cow() requires page locks of pte_page(entry) and |
| 3195 | * pagecache_page, so here we need to take the former one | 3213 | * pagecache_page, so here we need to take the former one |
| 3196 | * when page != pagecache_page or !pagecache_page. | 3214 | * when page != pagecache_page or !pagecache_page. |
| 3197 | * Note that locking order is always pagecache_page -> page, | ||
| 3198 | * so no worry about deadlock. | ||
| 3199 | */ | 3215 | */ |
| 3200 | page = pte_page(entry); | 3216 | page = pte_page(entry); |
| 3201 | get_page(page); | ||
| 3202 | if (page != pagecache_page) | 3217 | if (page != pagecache_page) |
| 3203 | lock_page(page); | 3218 | if (!trylock_page(page)) { |
| 3204 | 3219 | need_wait_lock = 1; | |
| 3205 | ptl = huge_pte_lockptr(h, mm, ptep); | 3220 | goto out_ptl; |
| 3206 | spin_lock(ptl); | 3221 | } |
| 3207 | /* Check for a racing update before calling hugetlb_cow */ | ||
| 3208 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | ||
| 3209 | goto out_ptl; | ||
| 3210 | 3222 | ||
| 3223 | get_page(page); | ||
| 3211 | 3224 | ||
| 3212 | if (flags & FAULT_FLAG_WRITE) { | 3225 | if (flags & FAULT_FLAG_WRITE) { |
| 3213 | if (!huge_pte_write(entry)) { | 3226 | if (!huge_pte_write(entry)) { |
| 3214 | ret = hugetlb_cow(mm, vma, address, ptep, entry, | 3227 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
| 3215 | pagecache_page, ptl); | 3228 | pagecache_page, ptl); |
| 3216 | goto out_ptl; | 3229 | goto out_put_page; |
| 3217 | } | 3230 | } |
| 3218 | entry = huge_pte_mkdirty(entry); | 3231 | entry = huge_pte_mkdirty(entry); |
| 3219 | } | 3232 | } |
| @@ -3221,7 +3234,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3221 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 3234 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
| 3222 | flags & FAULT_FLAG_WRITE)) | 3235 | flags & FAULT_FLAG_WRITE)) |
| 3223 | update_mmu_cache(vma, address, ptep); | 3236 | update_mmu_cache(vma, address, ptep); |
| 3224 | 3237 | out_put_page: | |
| 3238 | if (page != pagecache_page) | ||
| 3239 | unlock_page(page); | ||
| 3240 | put_page(page); | ||
| 3225 | out_ptl: | 3241 | out_ptl: |
| 3226 | spin_unlock(ptl); | 3242 | spin_unlock(ptl); |
| 3227 | 3243 | ||
| @@ -3229,12 +3245,17 @@ out_ptl: | |||
| 3229 | unlock_page(pagecache_page); | 3245 | unlock_page(pagecache_page); |
| 3230 | put_page(pagecache_page); | 3246 | put_page(pagecache_page); |
| 3231 | } | 3247 | } |
| 3232 | if (page != pagecache_page) | ||
| 3233 | unlock_page(page); | ||
| 3234 | put_page(page); | ||
| 3235 | |||
| 3236 | out_mutex: | 3248 | out_mutex: |
| 3237 | mutex_unlock(&htlb_fault_mutex_table[hash]); | 3249 | mutex_unlock(&htlb_fault_mutex_table[hash]); |
| 3250 | /* | ||
| 3251 | * Generally it's safe to hold a refcount while waiting for the page lock. But | ||
| 3252 | * here we only wait to defer the next page fault and avoid a busy loop; the | ||
| 3253 | * page is not used after it is unlocked and before the current page fault | ||
| 3254 | * returns. So we are safe from accessing a freed page, even if we wait | ||
| 3255 | * here without taking a refcount. | ||
| 3256 | */ | ||
| 3257 | if (need_wait_lock) | ||
| 3258 | wait_on_page_locked(page); | ||
| 3238 | return ret; | 3259 | return ret; |
| 3239 | } | 3260 | } |
| 3240 | 3261 | ||
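hugetlb_fault() now takes the page table lock first and only trylocks the faulted page; on contention it sets need_wait_lock, backs out of all locks, and waits for the page lock only after the fault returns, so the next fault retries instead of sleeping under the PTL. A rough userspace sketch of that trylock-or-defer shape, with pthread mutexes standing in for the page lock and the PTL (all names here are illustrative, not kernel API):

    #include <pthread.h>
    #include <stdbool.h>

    /* Stand-ins for the page table lock and the page lock. */
    static pthread_mutex_t ptl       = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Returns true when the fault made progress, false when the caller
     * should wait for the page lock and then retry the whole fault. */
    static bool fault_once(void)
    {
        bool need_wait_lock = false;

        pthread_mutex_lock(&ptl);
        if (pthread_mutex_trylock(&page_lock) != 0) {
            /* Contended: do not sleep under the PTL, defer instead. */
            need_wait_lock = true;
            goto out_ptl;
        }
        /* ... the COW / dirty handling would go here ... */
        pthread_mutex_unlock(&page_lock);
    out_ptl:
        pthread_mutex_unlock(&ptl);

        if (need_wait_lock) {
            /* Models wait_on_page_locked(): block until the holder is done. */
            pthread_mutex_lock(&page_lock);
            pthread_mutex_unlock(&page_lock);
            return false;   /* caller retries, as the next page fault would */
        }
        return true;
    }

    int main(void) { return fault_once() ? 0 : 1; }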
| @@ -3364,7 +3385,26 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
| 3364 | spin_unlock(ptl); | 3385 | spin_unlock(ptl); |
| 3365 | continue; | 3386 | continue; |
| 3366 | } | 3387 | } |
| 3367 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3388 | pte = huge_ptep_get(ptep); |
| 3389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | ||
| 3390 | spin_unlock(ptl); | ||
| 3391 | continue; | ||
| 3392 | } | ||
| 3393 | if (unlikely(is_hugetlb_entry_migration(pte))) { | ||
| 3394 | swp_entry_t entry = pte_to_swp_entry(pte); | ||
| 3395 | |||
| 3396 | if (is_write_migration_entry(entry)) { | ||
| 3397 | pte_t newpte; | ||
| 3398 | |||
| 3399 | make_migration_entry_read(&entry); | ||
| 3400 | newpte = swp_entry_to_pte(entry); | ||
| 3401 | set_huge_pte_at(mm, address, ptep, newpte); | ||
| 3402 | pages++; | ||
| 3403 | } | ||
| 3404 | spin_unlock(ptl); | ||
| 3405 | continue; | ||
| 3406 | } | ||
| 3407 | if (!huge_pte_none(pte)) { | ||
| 3368 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3408 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
| 3369 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); | 3409 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); |
| 3370 | pte = arch_make_huge_pte(pte, vma, NULL, 0); | 3410 | pte = arch_make_huge_pte(pte, vma, NULL, 0); |
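In hugetlb_change_protection() above, a huge PTE that is really a migration entry is no longer fed through huge_pte_modify(); instead a write migration entry is downgraded to a read one, so the new protection takes effect once migration completes. A small sketch of that bit manipulation using a toy swap-entry encoding (the real swp_entry_t layout is architecture specific):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy migration-entry encoding: low bit = "write" flag, rest = pfn.
     * Purely illustrative; real swp_entry_t layouts differ. */
    typedef uint64_t swp_entry_t;
    #define MIGRATION_WRITE 0x1ull

    static bool is_write_migration_entry(swp_entry_t e) { return e & MIGRATION_WRITE; }
    static void make_migration_entry_read(swp_entry_t *e) { *e &= ~MIGRATION_WRITE; }

    int main(void)
    {
        swp_entry_t entry = (42ull << 1) | MIGRATION_WRITE; /* writable entry for pfn 42 */

        if (is_write_migration_entry(entry))
            make_migration_entry_read(&entry);   /* write-protect while it migrates */

        printf("write flag now: %d\n", is_write_migration_entry(entry)); /* 0 */
        return 0;
    }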
| @@ -3558,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
| 3558 | if (saddr) { | 3598 | if (saddr) { |
| 3559 | spte = huge_pte_offset(svma->vm_mm, saddr); | 3599 | spte = huge_pte_offset(svma->vm_mm, saddr); |
| 3560 | if (spte) { | 3600 | if (spte) { |
| 3601 | mm_inc_nr_pmds(mm); | ||
| 3561 | get_page(virt_to_page(spte)); | 3602 | get_page(virt_to_page(spte)); |
| 3562 | break; | 3603 | break; |
| 3563 | } | 3604 | } |
| @@ -3569,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
| 3569 | 3610 | ||
| 3570 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); | 3611 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); |
| 3571 | spin_lock(ptl); | 3612 | spin_lock(ptl); |
| 3572 | if (pud_none(*pud)) | 3613 | if (pud_none(*pud)) { |
| 3573 | pud_populate(mm, pud, | 3614 | pud_populate(mm, pud, |
| 3574 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); | 3615 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); |
| 3575 | else | 3616 | } else { |
| 3576 | put_page(virt_to_page(spte)); | 3617 | put_page(virt_to_page(spte)); |
| 3618 | mm_inc_nr_pmds(mm); | ||
| 3619 | } | ||
| 3577 | spin_unlock(ptl); | 3620 | spin_unlock(ptl); |
| 3578 | out: | 3621 | out: |
| 3579 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | 3622 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
| @@ -3604,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |||
| 3604 | 3647 | ||
| 3605 | pud_clear(pud); | 3648 | pud_clear(pud); |
| 3606 | put_page(virt_to_page(ptep)); | 3649 | put_page(virt_to_page(ptep)); |
| 3650 | mm_dec_nr_pmds(mm); | ||
| 3607 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; | 3651 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; |
| 3608 | return 1; | 3652 | return 1; |
| 3609 | } | 3653 | } |
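The two hugetlb hunks above keep mm->nr_pmds consistent for shared page tables: huge_pmd_share() charges a PMD to the mm when it attaches to a shared PMD page, and huge_pmd_unshare() drops the charge again, so the counter matches what teardown will free. A tiny model of that intended accounting invariant (fake_mm and the helpers are stand-ins, not the kernel structures):

    #include <assert.h>

    /* Minimal stand-in for the accounting side of pmd sharing. */
    struct fake_mm { long nr_pmds; };

    static void share_pmd(struct fake_mm *mm)   { mm->nr_pmds++; } /* models mm_inc_nr_pmds */
    static void unshare_pmd(struct fake_mm *mm) { mm->nr_pmds--; } /* models mm_dec_nr_pmds */

    int main(void)
    {
        struct fake_mm a = { 0 }, b = { 0 };

        share_pmd(&a);      /* a attaches to the shared PMD page      */
        share_pmd(&b);      /* b attaches to the same PMD page        */
        unshare_pmd(&b);    /* b detaches                             */
        unshare_pmd(&a);

        assert(a.nr_pmds == 0 && b.nr_pmds == 0); /* nothing left at exit */
        return 0;
    }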
| @@ -3660,42 +3704,64 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
| 3660 | return (pte_t *) pmd; | 3704 | return (pte_t *) pmd; |
| 3661 | } | 3705 | } |
| 3662 | 3706 | ||
| 3663 | struct page * | 3707 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ |
| 3664 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
| 3665 | pmd_t *pmd, int write) | ||
| 3666 | { | ||
| 3667 | struct page *page; | ||
| 3668 | 3708 | ||
| 3669 | page = pte_page(*(pte_t *)pmd); | 3709 | /* |
| 3670 | if (page) | 3710 | * These functions are overridable if your architecture needs its own |
| 3671 | page += ((address & ~PMD_MASK) >> PAGE_SHIFT); | 3711 | * behavior. |
| 3672 | return page; | 3712 | */ |
| 3713 | struct page * __weak | ||
| 3714 | follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
| 3715 | int write) | ||
| 3716 | { | ||
| 3717 | return ERR_PTR(-EINVAL); | ||
| 3673 | } | 3718 | } |
| 3674 | 3719 | ||
| 3675 | struct page * | 3720 | struct page * __weak |
| 3676 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3721 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
| 3677 | pud_t *pud, int write) | 3722 | pmd_t *pmd, int flags) |
| 3678 | { | 3723 | { |
| 3679 | struct page *page; | 3724 | struct page *page = NULL; |
| 3680 | 3725 | spinlock_t *ptl; | |
| 3681 | page = pte_page(*(pte_t *)pud); | 3726 | retry: |
| 3682 | if (page) | 3727 | ptl = pmd_lockptr(mm, pmd); |
| 3683 | page += ((address & ~PUD_MASK) >> PAGE_SHIFT); | 3728 | spin_lock(ptl); |
| 3729 | /* | ||
| 3730 | * make sure that the address range covered by this pmd is not | ||
| 3731 | * unmapped from other threads. | ||
| 3732 | */ | ||
| 3733 | if (!pmd_huge(*pmd)) | ||
| 3734 | goto out; | ||
| 3735 | if (pmd_present(*pmd)) { | ||
| 3736 | page = pte_page(*(pte_t *)pmd) + | ||
| 3737 | ((address & ~PMD_MASK) >> PAGE_SHIFT); | ||
| 3738 | if (flags & FOLL_GET) | ||
| 3739 | get_page(page); | ||
| 3740 | } else { | ||
| 3741 | if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { | ||
| 3742 | spin_unlock(ptl); | ||
| 3743 | __migration_entry_wait(mm, (pte_t *)pmd, ptl); | ||
| 3744 | goto retry; | ||
| 3745 | } | ||
| 3746 | /* | ||
| 3747 | * hwpoisoned entry is treated as no_page_table in | ||
| 3748 | * follow_page_mask(). | ||
| 3749 | */ | ||
| 3750 | } | ||
| 3751 | out: | ||
| 3752 | spin_unlock(ptl); | ||
| 3684 | return page; | 3753 | return page; |
| 3685 | } | 3754 | } |
| 3686 | 3755 | ||
| 3687 | #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | ||
| 3688 | |||
| 3689 | /* Can be overriden by architectures */ | ||
| 3690 | struct page * __weak | 3756 | struct page * __weak |
| 3691 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3757 | follow_huge_pud(struct mm_struct *mm, unsigned long address, |
| 3692 | pud_t *pud, int write) | 3758 | pud_t *pud, int flags) |
| 3693 | { | 3759 | { |
| 3694 | BUG(); | 3760 | if (flags & FOLL_GET) |
| 3695 | return NULL; | 3761 | return NULL; |
| 3696 | } | ||
| 3697 | 3762 | ||
| 3698 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | 3763 | return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); |
| 3764 | } | ||
| 3699 | 3765 | ||
| 3700 | #ifdef CONFIG_MEMORY_FAILURE | 3766 | #ifdef CONFIG_MEMORY_FAILURE |
| 3701 | 3767 | ||
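The reworked follow_huge_pmd() above takes the PMD lock, and when the entry turns out to be a migration entry it drops the lock, waits for the migration to finish, and retries rather than handing back a stale page. A condensed userspace model of that lock/check/retry loop, with pthread primitives standing in for the spinlock and the migration wait (toy_* names are illustrative only):

    #include <pthread.h>
    #include <stddef.h>

    /* Toy PMD states, for illustration only. */
    enum pmd_state { PMD_NONE, PMD_PRESENT_HUGE, PMD_MIGRATING };

    struct toy_pmd {
        enum pmd_state  state;
        void           *page;           /* what a successful lookup returns    */
        pthread_mutex_t lock;           /* stands in for pmd_lockptr()         */
        pthread_cond_t  migration_done; /* stands in for migration_entry_wait  */
    };

    static void *follow_toy_pmd(struct toy_pmd *pmd)
    {
        void *page = NULL;

        pthread_mutex_lock(&pmd->lock);
    retry:
        if (pmd->state == PMD_PRESENT_HUGE) {
            page = pmd->page;                    /* found a mapped huge page */
        } else if (pmd->state == PMD_MIGRATING) {
            /* Drop the lock while waiting, then re-check from scratch. */
            pthread_cond_wait(&pmd->migration_done, &pmd->lock);
            goto retry;
        }
        /* PMD_NONE (or a hwpoisoned entry in the kernel) falls through as NULL. */
        pthread_mutex_unlock(&pmd->lock);
        return page;
    }

    int main(void)
    {
        int dummy_page;
        struct toy_pmd pmd = {
            .state = PMD_PRESENT_HUGE, .page = &dummy_page,
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .migration_done = PTHREAD_COND_INITIALIZER,
        };
        return follow_toy_pmd(&pmd) == &dummy_page ? 0 : 1;
    }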
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 037e1c00a5b7..6e0057439a46 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
| @@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, | |||
| 279 | return -EINVAL; | 279 | return -EINVAL; |
| 280 | 280 | ||
| 281 | buf = strstrip(buf); | 281 | buf = strstrip(buf); |
| 282 | ret = page_counter_memparse(buf, &nr_pages); | 282 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
| 283 | if (ret) | 283 | if (ret) |
| 284 | return ret; | 284 | return ret; |
| 285 | 285 | ||
diff --git a/mm/internal.h b/mm/internal.h index efad241f7014..a96da5b0029d 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -110,6 +110,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); | |||
| 110 | */ | 110 | */ |
| 111 | 111 | ||
| 112 | /* | 112 | /* |
| 113 | * Structure for holding the mostly immutable allocation parameters passed | ||
| 114 | * between functions involved in allocations, including the alloc_pages* | ||
| 115 | * family of functions. | ||
| 116 | * | ||
| 117 | * nodemask, migratetype and high_zoneidx are initialized only once in | ||
| 118 | * __alloc_pages_nodemask() and then never change. | ||
| 119 | * | ||
| 120 | * zonelist, preferred_zone and classzone_idx are set first in | ||
| 121 | * __alloc_pages_nodemask() for the fast path, and might be later changed | ||
| 122 | * in __alloc_pages_slowpath(). All other functions pass the whole structure | ||
| 123 | * by a const pointer. | ||
| 124 | */ | ||
| 125 | struct alloc_context { | ||
| 126 | struct zonelist *zonelist; | ||
| 127 | nodemask_t *nodemask; | ||
| 128 | struct zone *preferred_zone; | ||
| 129 | int classzone_idx; | ||
| 130 | int migratetype; | ||
| 131 | enum zone_type high_zoneidx; | ||
| 132 | }; | ||
| 133 | |||
| 134 | /* | ||
| 113 | * Locate the struct page for both the matching buddy in our | 135 | * Locate the struct page for both the matching buddy in our |
| 114 | * pair (buddy1) and the combined O(n+1) page they form (page). | 136 | * pair (buddy1) and the combined O(n+1) page they form (page). |
| 115 | * | 137 | * |
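The struct alloc_context comment above describes the refactoring pattern: parameters that stay fixed for one allocation attempt are collected once and handed to every helper by const pointer instead of as a long argument list. A hedged, self-contained sketch of that calling convention (toy_* names are illustrative, not the real page allocator):

    #include <stdio.h>

    /* Immutable-per-allocation parameters, passed by const pointer. */
    struct toy_alloc_ctx {
        int preferred_node;
        int migratetype;
        int high_zoneidx;
    };

    /* Helpers take the whole context instead of a long argument list. */
    static void *toy_fast_path(const struct toy_alloc_ctx *ac, unsigned order)
    {
        printf("fast path: node=%d order=%u\n", ac->preferred_node, order);
        return NULL; /* pretend the free lists were empty */
    }

    static void *toy_slow_path(const struct toy_alloc_ctx *ac, unsigned order)
    {
        static int reclaimed_page;   /* stands in for a struct page */
        printf("slow path: migratetype=%d order=%u\n", ac->migratetype, order);
        return &reclaimed_page;
    }

    static void *toy_alloc_pages(unsigned order, int node)
    {
        struct toy_alloc_ctx ac = {
            .preferred_node = node, .migratetype = 0, .high_zoneidx = 2,
        };
        void *page = toy_fast_path(&ac, order);
        return page ? page : toy_slow_path(&ac, order);
    }

    int main(void) { return toy_alloc_pages(0, 0) ? 0 : 1; }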
| @@ -329,8 +351,10 @@ extern int mminit_loglevel; | |||
| 329 | #define mminit_dprintk(level, prefix, fmt, arg...) \ | 351 | #define mminit_dprintk(level, prefix, fmt, arg...) \ |
| 330 | do { \ | 352 | do { \ |
| 331 | if (level < mminit_loglevel) { \ | 353 | if (level < mminit_loglevel) { \ |
| 332 | printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ | 354 | if (level <= MMINIT_WARNING) \ |
| 333 | printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ | 355 | printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ |
| 356 | else \ | ||
| 357 | printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ | ||
| 334 | } \ | 358 | } \ |
| 335 | } while (0) | 359 | } while (0) |
| 336 | 360 | ||
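The mminit_dprintk() change above stops splitting one log line across two printk() calls (a bare level prefix followed by a KERN_CONT continuation), which could be interleaved or mis-levelled; the level is now chosen up front and the whole message goes out in a single call. A userspace analogue of the fixed macro shape (toy constants, stderr instead of the kernel log):

    #include <stdio.h>

    #define LOGLEVEL   1
    #define LVL_WARN   0
    #define LVL_DEBUG  1

    /* Single-call form, mirroring the fixed mminit_dprintk(): the level is
     * selected first and the message is emitted in one piece, so it cannot
     * be split by concurrent output. */
    #define toy_dprintk(level, prefix, fmt, ...)                                      \
        do {                                                                          \
            if ((level) < LOGLEVEL) {                                                 \
                if ((level) <= LVL_WARN)                                              \
                    fprintf(stderr, "warn:  mminit::" prefix " " fmt, ##__VA_ARGS__); \
                else                                                                  \
                    fprintf(stderr, "debug: mminit::" prefix " " fmt, ##__VA_ARGS__); \
            }                                                                         \
        } while (0)

    int main(void)
    {
        toy_dprintk(LVL_WARN, "pageflags_layout", "section %d node %d\n", 1, 0);
        return 0;
    }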
diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 8da581fa9060..f2c2492681bf 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c | |||
| @@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) | |||
| 21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; | 21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; |
| 22 | } | 22 | } |
| 23 | 23 | ||
| 24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, | 24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, |
| 25 | unsigned long, shared.linear.rb_subtree_last, | 25 | unsigned long, shared.rb_subtree_last, |
| 26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) | 26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) |
| 27 | 27 | ||
| 28 | /* Insert node immediately after prev in the interval tree */ | 28 | /* Insert node immediately after prev in the interval tree */ |
| @@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, | |||
| 36 | 36 | ||
| 37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); | 37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); |
| 38 | 38 | ||
| 39 | if (!prev->shared.linear.rb.rb_right) { | 39 | if (!prev->shared.rb.rb_right) { |
| 40 | parent = prev; | 40 | parent = prev; |
| 41 | link = &prev->shared.linear.rb.rb_right; | 41 | link = &prev->shared.rb.rb_right; |
| 42 | } else { | 42 | } else { |
| 43 | parent = rb_entry(prev->shared.linear.rb.rb_right, | 43 | parent = rb_entry(prev->shared.rb.rb_right, |
| 44 | struct vm_area_struct, shared.linear.rb); | 44 | struct vm_area_struct, shared.rb); |
| 45 | if (parent->shared.linear.rb_subtree_last < last) | 45 | if (parent->shared.rb_subtree_last < last) |
| 46 | parent->shared.linear.rb_subtree_last = last; | 46 | parent->shared.rb_subtree_last = last; |
| 47 | while (parent->shared.linear.rb.rb_left) { | 47 | while (parent->shared.rb.rb_left) { |
| 48 | parent = rb_entry(parent->shared.linear.rb.rb_left, | 48 | parent = rb_entry(parent->shared.rb.rb_left, |
| 49 | struct vm_area_struct, shared.linear.rb); | 49 | struct vm_area_struct, shared.rb); |
| 50 | if (parent->shared.linear.rb_subtree_last < last) | 50 | if (parent->shared.rb_subtree_last < last) |
| 51 | parent->shared.linear.rb_subtree_last = last; | 51 | parent->shared.rb_subtree_last = last; |
| 52 | } | 52 | } |
| 53 | link = &parent->shared.linear.rb.rb_left; | 53 | link = &parent->shared.rb.rb_left; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | node->shared.linear.rb_subtree_last = last; | 56 | node->shared.rb_subtree_last = last; |
| 57 | rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); | 57 | rb_link_node(&node->shared.rb, &parent->shared.rb, link); |
| 58 | rb_insert_augmented(&node->shared.linear.rb, root, | 58 | rb_insert_augmented(&node->shared.rb, root, |
| 59 | &vma_interval_tree_augment); | 59 | &vma_interval_tree_augment); |
| 60 | } | 60 | } |
| 61 | 61 | ||
diff --git a/mm/iov_iter.c b/mm/iov_iter.c index a1599ca4ab0e..827732047da1 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c | |||
| @@ -501,18 +501,31 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) | |||
| 501 | EXPORT_SYMBOL(iov_iter_single_seg_count); | 501 | EXPORT_SYMBOL(iov_iter_single_seg_count); |
| 502 | 502 | ||
| 503 | void iov_iter_kvec(struct iov_iter *i, int direction, | 503 | void iov_iter_kvec(struct iov_iter *i, int direction, |
| 504 | const struct kvec *iov, unsigned long nr_segs, | 504 | const struct kvec *kvec, unsigned long nr_segs, |
| 505 | size_t count) | 505 | size_t count) |
| 506 | { | 506 | { |
| 507 | BUG_ON(!(direction & ITER_KVEC)); | 507 | BUG_ON(!(direction & ITER_KVEC)); |
| 508 | i->type = direction; | 508 | i->type = direction; |
| 509 | i->kvec = (struct kvec *)iov; | 509 | i->kvec = kvec; |
| 510 | i->nr_segs = nr_segs; | 510 | i->nr_segs = nr_segs; |
| 511 | i->iov_offset = 0; | 511 | i->iov_offset = 0; |
| 512 | i->count = count; | 512 | i->count = count; |
| 513 | } | 513 | } |
| 514 | EXPORT_SYMBOL(iov_iter_kvec); | 514 | EXPORT_SYMBOL(iov_iter_kvec); |
| 515 | 515 | ||
| 516 | void iov_iter_bvec(struct iov_iter *i, int direction, | ||
| 517 | const struct bio_vec *bvec, unsigned long nr_segs, | ||
| 518 | size_t count) | ||
| 519 | { | ||
| 520 | BUG_ON(!(direction & ITER_BVEC)); | ||
| 521 | i->type = direction; | ||
| 522 | i->bvec = bvec; | ||
| 523 | i->nr_segs = nr_segs; | ||
| 524 | i->iov_offset = 0; | ||
| 525 | i->count = count; | ||
| 526 | } | ||
| 527 | EXPORT_SYMBOL(iov_iter_bvec); | ||
| 528 | |||
| 516 | unsigned long iov_iter_alignment(const struct iov_iter *i) | 529 | unsigned long iov_iter_alignment(const struct iov_iter *i) |
| 517 | { | 530 | { |
| 518 | unsigned long res = 0; | 531 | unsigned long res = 0; |
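iov_iter_bvec() added above mirrors iov_iter_kvec(): record the direction, the segment array, the number of segments, a zero starting offset and the total byte count. A sketch of that initializer with toy structures (fields trimmed to the ones the function touches; these are not the kernel definitions):

    #include <assert.h>
    #include <stddef.h>

    /* Toy versions of the kernel structures. */
    struct toy_bvec { void *page; unsigned int len, offset; };

    struct toy_iter {
        int                    type;       /* direction | ITER_BVEC         */
        const struct toy_bvec *bvec;       /* segment array                 */
        unsigned long          nr_segs;
        size_t                 iov_offset; /* offset into the first segment */
        size_t                 count;      /* total bytes left              */
    };

    static void toy_iter_bvec(struct toy_iter *i, int direction,
                              const struct toy_bvec *bvec,
                              unsigned long nr_segs, size_t count)
    {
        i->type = direction;
        i->bvec = bvec;
        i->nr_segs = nr_segs;
        i->iov_offset = 0;
        i->count = count;
    }

    int main(void)
    {
        char buf[64];
        struct toy_bvec vec[1] = { { buf, sizeof(buf), 0 } };
        struct toy_iter it;

        toy_iter_bvec(&it, 0x4 /* pretend ITER_BVEC */, vec, 1, sizeof(buf));
        assert(it.count == 64 && it.nr_segs == 1);
        return 0;
    }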
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile new file mode 100644 index 000000000000..bd837b8c2f41 --- /dev/null +++ b/mm/kasan/Makefile | |||
| @@ -0,0 +1,8 @@ | |||
| 1 | KASAN_SANITIZE := n | ||
| 2 | |||
| 3 | CFLAGS_REMOVE_kasan.o = -pg | ||
| 4 | # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 | ||
| 5 | # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 | ||
| 6 | CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) | ||
| 7 | |||
| 8 | obj-y := kasan.o report.o | ||
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c new file mode 100644 index 000000000000..78fee632a7ee --- /dev/null +++ b/mm/kasan/kasan.c | |||
| @@ -0,0 +1,516 @@ | |||
| 1 | /* | ||
| 2 | * This file contains shadow memory manipulation code. | ||
| 3 | * | ||
| 4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | ||
| 5 | * Author: Andrey Ryabinin <a.ryabinin@samsung.com> | ||
| 6 | * | ||
| 7 | * Some of code borrowed from https://github.com/xairy/linux by | ||
| 8 | * Andrey Konovalov <adech.fo@gmail.com> | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or modify | ||
| 11 | * it under the terms of the GNU General Public License version 2 as | ||
| 12 | * published by the Free Software Foundation. | ||
| 13 | * | ||
| 14 | */ | ||
| 15 | |||
| 16 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 17 | #define DISABLE_BRANCH_PROFILING | ||
| 18 | |||
| 19 | #include <linux/export.h> | ||
| 20 | #include <linux/init.h> | ||
| 21 | #include <linux/kernel.h> | ||
| 22 | #include <linux/memblock.h> | ||
| 23 | #include <linux/memory.h> | ||
| 24 | #include <linux/mm.h> | ||
| 25 | #include <linux/module.h> | ||
| 26 | #include <linux/printk.h> | ||
| 27 | #include <linux/sched.h> | ||
| 28 | #include <linux/slab.h> | ||
| 29 | #include <linux/stacktrace.h> | ||
| 30 | #include <linux/string.h> | ||
| 31 | #include <linux/types.h> | ||
| 32 | #include <linux/kasan.h> | ||
| 33 | |||
| 34 | #include "kasan.h" | ||
| 35 | #include "../slab.h" | ||
| 36 | |||
| 37 | /* | ||
| 38 | * Poisons the shadow memory for 'size' bytes starting from 'addr'. | ||
| 39 | * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. | ||
| 40 | */ | ||
| 41 | static void kasan_poison_shadow(const void *address, size_t size, u8 value) | ||
| 42 | { | ||
| 43 | void *shadow_start, *shadow_end; | ||
| 44 | |||
| 45 | shadow_start = kasan_mem_to_shadow(address); | ||
| 46 | shadow_end = kasan_mem_to_shadow(address + size); | ||
| 47 | |||
| 48 | memset(shadow_start, value, shadow_end - shadow_start); | ||
| 49 | } | ||
| 50 | |||
| 51 | void kasan_unpoison_shadow(const void *address, size_t size) | ||
| 52 | { | ||
| 53 | kasan_poison_shadow(address, size, 0); | ||
| 54 | |||
| 55 | if (size & KASAN_SHADOW_MASK) { | ||
| 56 | u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); | ||
| 57 | *shadow = size & KASAN_SHADOW_MASK; | ||
| 58 | } | ||
| 59 | } | ||
| 60 | |||
| 61 | |||
| 62 | /* | ||
| 63 | * All functions below are always inlined so the compiler can | ||
| 64 | * perform better optimizations in each of __asan_loadX/__asan_storeX | ||
| 65 | * depending on memory access size X. | ||
| 66 | */ | ||
| 67 | |||
| 68 | static __always_inline bool memory_is_poisoned_1(unsigned long addr) | ||
| 69 | { | ||
| 70 | s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); | ||
| 71 | |||
| 72 | if (unlikely(shadow_value)) { | ||
| 73 | s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; | ||
| 74 | return unlikely(last_accessible_byte >= shadow_value); | ||
| 75 | } | ||
| 76 | |||
| 77 | return false; | ||
| 78 | } | ||
| 79 | |||
| 80 | static __always_inline bool memory_is_poisoned_2(unsigned long addr) | ||
| 81 | { | ||
| 82 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
| 83 | |||
| 84 | if (unlikely(*shadow_addr)) { | ||
| 85 | if (memory_is_poisoned_1(addr + 1)) | ||
| 86 | return true; | ||
| 87 | |||
| 88 | if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) | ||
| 89 | return false; | ||
| 90 | |||
| 91 | return unlikely(*(u8 *)shadow_addr); | ||
| 92 | } | ||
| 93 | |||
| 94 | return false; | ||
| 95 | } | ||
| 96 | |||
| 97 | static __always_inline bool memory_is_poisoned_4(unsigned long addr) | ||
| 98 | { | ||
| 99 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
| 100 | |||
| 101 | if (unlikely(*shadow_addr)) { | ||
| 102 | if (memory_is_poisoned_1(addr + 3)) | ||
| 103 | return true; | ||
| 104 | |||
| 105 | if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) | ||
| 106 | return false; | ||
| 107 | |||
| 108 | return unlikely(*(u8 *)shadow_addr); | ||
| 109 | } | ||
| 110 | |||
| 111 | return false; | ||
| 112 | } | ||
| 113 | |||
| 114 | static __always_inline bool memory_is_poisoned_8(unsigned long addr) | ||
| 115 | { | ||
| 116 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
| 117 | |||
| 118 | if (unlikely(*shadow_addr)) { | ||
| 119 | if (memory_is_poisoned_1(addr + 7)) | ||
| 120 | return true; | ||
| 121 | |||
| 122 | if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) | ||
| 123 | return false; | ||
| 124 | |||
| 125 | return unlikely(*(u8 *)shadow_addr); | ||
| 126 | } | ||
| 127 | |||
| 128 | return false; | ||
| 129 | } | ||
| 130 | |||
| 131 | static __always_inline bool memory_is_poisoned_16(unsigned long addr) | ||
| 132 | { | ||
| 133 | u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr); | ||
| 134 | |||
| 135 | if (unlikely(*shadow_addr)) { | ||
| 136 | u16 shadow_first_bytes = *(u16 *)shadow_addr; | ||
| 137 | s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK; | ||
| 138 | |||
| 139 | if (unlikely(shadow_first_bytes)) | ||
| 140 | return true; | ||
| 141 | |||
| 142 | if (likely(!last_byte)) | ||
| 143 | return false; | ||
| 144 | |||
| 145 | return memory_is_poisoned_1(addr + 15); | ||
| 146 | } | ||
| 147 | |||
| 148 | return false; | ||
| 149 | } | ||
| 150 | |||
| 151 | static __always_inline unsigned long bytes_is_zero(const u8 *start, | ||
| 152 | size_t size) | ||
| 153 | { | ||
| 154 | while (size) { | ||
| 155 | if (unlikely(*start)) | ||
| 156 | return (unsigned long)start; | ||
| 157 | start++; | ||
| 158 | size--; | ||
| 159 | } | ||
| 160 | |||
| 161 | return 0; | ||
| 162 | } | ||
| 163 | |||
| 164 | static __always_inline unsigned long memory_is_zero(const void *start, | ||
| 165 | const void *end) | ||
| 166 | { | ||
| 167 | unsigned int words; | ||
| 168 | unsigned long ret; | ||
| 169 | unsigned int prefix = (unsigned long)start % 8; | ||
| 170 | |||
| 171 | if (end - start <= 16) | ||
| 172 | return bytes_is_zero(start, end - start); | ||
| 173 | |||
| 174 | if (prefix) { | ||
| 175 | prefix = 8 - prefix; | ||
| 176 | ret = bytes_is_zero(start, prefix); | ||
| 177 | if (unlikely(ret)) | ||
| 178 | return ret; | ||
| 179 | start += prefix; | ||
| 180 | } | ||
| 181 | |||
| 182 | words = (end - start) / 8; | ||
| 183 | while (words) { | ||
| 184 | if (unlikely(*(u64 *)start)) | ||
| 185 | return bytes_is_zero(start, 8); | ||
| 186 | start += 8; | ||
| 187 | words--; | ||
| 188 | } | ||
| 189 | |||
| 190 | return bytes_is_zero(start, (end - start) % 8); | ||
| 191 | } | ||
| 192 | |||
| 193 | static __always_inline bool memory_is_poisoned_n(unsigned long addr, | ||
| 194 | size_t size) | ||
| 195 | { | ||
| 196 | unsigned long ret; | ||
| 197 | |||
| 198 | ret = memory_is_zero(kasan_mem_to_shadow((void *)addr), | ||
| 199 | kasan_mem_to_shadow((void *)addr + size - 1) + 1); | ||
| 200 | |||
| 201 | if (unlikely(ret)) { | ||
| 202 | unsigned long last_byte = addr + size - 1; | ||
| 203 | s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); | ||
| 204 | |||
| 205 | if (unlikely(ret != (unsigned long)last_shadow || | ||
| 206 | ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) | ||
| 207 | return true; | ||
| 208 | } | ||
| 209 | return false; | ||
| 210 | } | ||
| 211 | |||
| 212 | static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) | ||
| 213 | { | ||
| 214 | if (__builtin_constant_p(size)) { | ||
| 215 | switch (size) { | ||
| 216 | case 1: | ||
| 217 | return memory_is_poisoned_1(addr); | ||
| 218 | case 2: | ||
| 219 | return memory_is_poisoned_2(addr); | ||
| 220 | case 4: | ||
| 221 | return memory_is_poisoned_4(addr); | ||
| 222 | case 8: | ||
| 223 | return memory_is_poisoned_8(addr); | ||
| 224 | case 16: | ||
| 225 | return memory_is_poisoned_16(addr); | ||
| 226 | default: | ||
| 227 | BUILD_BUG(); | ||
| 228 | } | ||
| 229 | } | ||
| 230 | |||
| 231 | return memory_is_poisoned_n(addr, size); | ||
| 232 | } | ||
| 233 | |||
| 234 | |||
| 235 | static __always_inline void check_memory_region(unsigned long addr, | ||
| 236 | size_t size, bool write) | ||
| 237 | { | ||
| 238 | struct kasan_access_info info; | ||
| 239 | |||
| 240 | if (unlikely(size == 0)) | ||
| 241 | return; | ||
| 242 | |||
| 243 | if (unlikely((void *)addr < | ||
| 244 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { | ||
| 245 | info.access_addr = (void *)addr; | ||
| 246 | info.access_size = size; | ||
| 247 | info.is_write = write; | ||
| 248 | info.ip = _RET_IP_; | ||
| 249 | kasan_report_user_access(&info); | ||
| 250 | return; | ||
| 251 | } | ||
| 252 | |||
| 253 | if (likely(!memory_is_poisoned(addr, size))) | ||
| 254 | return; | ||
| 255 | |||
| 256 | kasan_report(addr, size, write, _RET_IP_); | ||
| 257 | } | ||
| 258 | |||
| 259 | void __asan_loadN(unsigned long addr, size_t size); | ||
| 260 | void __asan_storeN(unsigned long addr, size_t size); | ||
| 261 | |||
| 262 | #undef memset | ||
| 263 | void *memset(void *addr, int c, size_t len) | ||
| 264 | { | ||
| 265 | __asan_storeN((unsigned long)addr, len); | ||
| 266 | |||
| 267 | return __memset(addr, c, len); | ||
| 268 | } | ||
| 269 | |||
| 270 | #undef memmove | ||
| 271 | void *memmove(void *dest, const void *src, size_t len) | ||
| 272 | { | ||
| 273 | __asan_loadN((unsigned long)src, len); | ||
| 274 | __asan_storeN((unsigned long)dest, len); | ||
| 275 | |||
| 276 | return __memmove(dest, src, len); | ||
| 277 | } | ||
| 278 | |||
| 279 | #undef memcpy | ||
| 280 | void *memcpy(void *dest, const void *src, size_t len) | ||
| 281 | { | ||
| 282 | __asan_loadN((unsigned long)src, len); | ||
| 283 | __asan_storeN((unsigned long)dest, len); | ||
| 284 | |||
| 285 | return __memcpy(dest, src, len); | ||
| 286 | } | ||
| 287 | |||
| 288 | void kasan_alloc_pages(struct page *page, unsigned int order) | ||
| 289 | { | ||
| 290 | if (likely(!PageHighMem(page))) | ||
| 291 | kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); | ||
| 292 | } | ||
| 293 | |||
| 294 | void kasan_free_pages(struct page *page, unsigned int order) | ||
| 295 | { | ||
| 296 | if (likely(!PageHighMem(page))) | ||
| 297 | kasan_poison_shadow(page_address(page), | ||
| 298 | PAGE_SIZE << order, | ||
| 299 | KASAN_FREE_PAGE); | ||
| 300 | } | ||
| 301 | |||
| 302 | void kasan_poison_slab(struct page *page) | ||
| 303 | { | ||
| 304 | kasan_poison_shadow(page_address(page), | ||
| 305 | PAGE_SIZE << compound_order(page), | ||
| 306 | KASAN_KMALLOC_REDZONE); | ||
| 307 | } | ||
| 308 | |||
| 309 | void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) | ||
| 310 | { | ||
| 311 | kasan_unpoison_shadow(object, cache->object_size); | ||
| 312 | } | ||
| 313 | |||
| 314 | void kasan_poison_object_data(struct kmem_cache *cache, void *object) | ||
| 315 | { | ||
| 316 | kasan_poison_shadow(object, | ||
| 317 | round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), | ||
| 318 | KASAN_KMALLOC_REDZONE); | ||
| 319 | } | ||
| 320 | |||
| 321 | void kasan_slab_alloc(struct kmem_cache *cache, void *object) | ||
| 322 | { | ||
| 323 | kasan_kmalloc(cache, object, cache->object_size); | ||
| 324 | } | ||
| 325 | |||
| 326 | void kasan_slab_free(struct kmem_cache *cache, void *object) | ||
| 327 | { | ||
| 328 | unsigned long size = cache->object_size; | ||
| 329 | unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); | ||
| 330 | |||
| 331 | /* RCU slabs could be legally used after free within the RCU period */ | ||
| 332 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | ||
| 333 | return; | ||
| 334 | |||
| 335 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); | ||
| 336 | } | ||
| 337 | |||
| 338 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) | ||
| 339 | { | ||
| 340 | unsigned long redzone_start; | ||
| 341 | unsigned long redzone_end; | ||
| 342 | |||
| 343 | if (unlikely(object == NULL)) | ||
| 344 | return; | ||
| 345 | |||
| 346 | redzone_start = round_up((unsigned long)(object + size), | ||
| 347 | KASAN_SHADOW_SCALE_SIZE); | ||
| 348 | redzone_end = round_up((unsigned long)object + cache->object_size, | ||
| 349 | KASAN_SHADOW_SCALE_SIZE); | ||
| 350 | |||
| 351 | kasan_unpoison_shadow(object, size); | ||
| 352 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | ||
| 353 | KASAN_KMALLOC_REDZONE); | ||
| 354 | } | ||
| 355 | EXPORT_SYMBOL(kasan_kmalloc); | ||
| 356 | |||
| 357 | void kasan_kmalloc_large(const void *ptr, size_t size) | ||
| 358 | { | ||
| 359 | struct page *page; | ||
| 360 | unsigned long redzone_start; | ||
| 361 | unsigned long redzone_end; | ||
| 362 | |||
| 363 | if (unlikely(ptr == NULL)) | ||
| 364 | return; | ||
| 365 | |||
| 366 | page = virt_to_page(ptr); | ||
| 367 | redzone_start = round_up((unsigned long)(ptr + size), | ||
| 368 | KASAN_SHADOW_SCALE_SIZE); | ||
| 369 | redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); | ||
| 370 | |||
| 371 | kasan_unpoison_shadow(ptr, size); | ||
| 372 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | ||
| 373 | KASAN_PAGE_REDZONE); | ||
| 374 | } | ||
| 375 | |||
| 376 | void kasan_krealloc(const void *object, size_t size) | ||
| 377 | { | ||
| 378 | struct page *page; | ||
| 379 | |||
| 380 | if (unlikely(object == ZERO_SIZE_PTR)) | ||
| 381 | return; | ||
| 382 | |||
| 383 | page = virt_to_head_page(object); | ||
| 384 | |||
| 385 | if (unlikely(!PageSlab(page))) | ||
| 386 | kasan_kmalloc_large(object, size); | ||
| 387 | else | ||
| 388 | kasan_kmalloc(page->slab_cache, object, size); | ||
| 389 | } | ||
| 390 | |||
| 391 | void kasan_kfree_large(const void *ptr) | ||
| 392 | { | ||
| 393 | struct page *page = virt_to_page(ptr); | ||
| 394 | |||
| 395 | kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), | ||
| 396 | KASAN_FREE_PAGE); | ||
| 397 | } | ||
| 398 | |||
| 399 | int kasan_module_alloc(void *addr, size_t size) | ||
| 400 | { | ||
| 401 | void *ret; | ||
| 402 | size_t shadow_size; | ||
| 403 | unsigned long shadow_start; | ||
| 404 | |||
| 405 | shadow_start = (unsigned long)kasan_mem_to_shadow(addr); | ||
| 406 | shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, | ||
| 407 | PAGE_SIZE); | ||
| 408 | |||
| 409 | if (WARN_ON(!PAGE_ALIGNED(shadow_start))) | ||
| 410 | return -EINVAL; | ||
| 411 | |||
| 412 | ret = __vmalloc_node_range(shadow_size, 1, shadow_start, | ||
| 413 | shadow_start + shadow_size, | ||
| 414 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | ||
| 415 | PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, | ||
| 416 | __builtin_return_address(0)); | ||
| 417 | return ret ? 0 : -ENOMEM; | ||
| 418 | } | ||
| 419 | |||
| 420 | void kasan_module_free(void *addr) | ||
| 421 | { | ||
| 422 | vfree(kasan_mem_to_shadow(addr)); | ||
| 423 | } | ||
| 424 | |||
| 425 | static void register_global(struct kasan_global *global) | ||
| 426 | { | ||
| 427 | size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); | ||
| 428 | |||
| 429 | kasan_unpoison_shadow(global->beg, global->size); | ||
| 430 | |||
| 431 | kasan_poison_shadow(global->beg + aligned_size, | ||
| 432 | global->size_with_redzone - aligned_size, | ||
| 433 | KASAN_GLOBAL_REDZONE); | ||
| 434 | } | ||
| 435 | |||
| 436 | void __asan_register_globals(struct kasan_global *globals, size_t size) | ||
| 437 | { | ||
| 438 | int i; | ||
| 439 | |||
| 440 | for (i = 0; i < size; i++) | ||
| 441 | register_global(&globals[i]); | ||
| 442 | } | ||
| 443 | EXPORT_SYMBOL(__asan_register_globals); | ||
| 444 | |||
| 445 | void __asan_unregister_globals(struct kasan_global *globals, size_t size) | ||
| 446 | { | ||
| 447 | } | ||
| 448 | EXPORT_SYMBOL(__asan_unregister_globals); | ||
| 449 | |||
| 450 | #define DEFINE_ASAN_LOAD_STORE(size) \ | ||
| 451 | void __asan_load##size(unsigned long addr) \ | ||
| 452 | { \ | ||
| 453 | check_memory_region(addr, size, false); \ | ||
| 454 | } \ | ||
| 455 | EXPORT_SYMBOL(__asan_load##size); \ | ||
| 456 | __alias(__asan_load##size) \ | ||
| 457 | void __asan_load##size##_noabort(unsigned long); \ | ||
| 458 | EXPORT_SYMBOL(__asan_load##size##_noabort); \ | ||
| 459 | void __asan_store##size(unsigned long addr) \ | ||
| 460 | { \ | ||
| 461 | check_memory_region(addr, size, true); \ | ||
| 462 | } \ | ||
| 463 | EXPORT_SYMBOL(__asan_store##size); \ | ||
| 464 | __alias(__asan_store##size) \ | ||
| 465 | void __asan_store##size##_noabort(unsigned long); \ | ||
| 466 | EXPORT_SYMBOL(__asan_store##size##_noabort) | ||
| 467 | |||
| 468 | DEFINE_ASAN_LOAD_STORE(1); | ||
| 469 | DEFINE_ASAN_LOAD_STORE(2); | ||
| 470 | DEFINE_ASAN_LOAD_STORE(4); | ||
| 471 | DEFINE_ASAN_LOAD_STORE(8); | ||
| 472 | DEFINE_ASAN_LOAD_STORE(16); | ||
| 473 | |||
| 474 | void __asan_loadN(unsigned long addr, size_t size) | ||
| 475 | { | ||
| 476 | check_memory_region(addr, size, false); | ||
| 477 | } | ||
| 478 | EXPORT_SYMBOL(__asan_loadN); | ||
| 479 | |||
| 480 | __alias(__asan_loadN) | ||
| 481 | void __asan_loadN_noabort(unsigned long, size_t); | ||
| 482 | EXPORT_SYMBOL(__asan_loadN_noabort); | ||
| 483 | |||
| 484 | void __asan_storeN(unsigned long addr, size_t size) | ||
| 485 | { | ||
| 486 | check_memory_region(addr, size, true); | ||
| 487 | } | ||
| 488 | EXPORT_SYMBOL(__asan_storeN); | ||
| 489 | |||
| 490 | __alias(__asan_storeN) | ||
| 491 | void __asan_storeN_noabort(unsigned long, size_t); | ||
| 492 | EXPORT_SYMBOL(__asan_storeN_noabort); | ||
| 493 | |||
| 494 | /* to shut up compiler complaints */ | ||
| 495 | void __asan_handle_no_return(void) {} | ||
| 496 | EXPORT_SYMBOL(__asan_handle_no_return); | ||
| 497 | |||
| 498 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 499 | static int kasan_mem_notifier(struct notifier_block *nb, | ||
| 500 | unsigned long action, void *data) | ||
| 501 | { | ||
| 502 | return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK; | ||
| 503 | } | ||
| 504 | |||
| 505 | static int __init kasan_memhotplug_init(void) | ||
| 506 | { | ||
| 507 | pr_err("WARNING: KASan doesn't support memory hot-add\n"); | ||
| 508 | pr_err("Memory hot-add will be disabled\n"); | ||
| 509 | |||
| 510 | hotplug_memory_notifier(kasan_mem_notifier, 0); | ||
| 511 | |||
| 512 | return 0; | ||
| 513 | } | ||
| 514 | |||
| 515 | module_init(kasan_memhotplug_init); | ||
| 516 | #endif | ||
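kasan.c above is built around one mapping: every KASAN_SHADOW_SCALE_SIZE (8) bytes of kernel memory are described by one shadow byte, and kasan_unpoison_shadow() stores the number of valid bytes into the last, partially covered shadow cell. A userspace model of the address arithmetic and the partial-granule encoding (the toy_* region and its shadow are local arrays, not the kernel's real shadow offset):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SHADOW_SCALE_SHIFT 3                        /* 8 bytes per shadow byte */
    #define SHADOW_GRANULE     (1u << SHADOW_SCALE_SHIFT)

    /* Toy shadow for a 256-byte "kernel" region starting at toy_mem[0]. */
    static uint8_t toy_mem[256];
    static int8_t  toy_shadow[256 / SHADOW_GRANULE];

    static int8_t *mem_to_shadow(const void *addr)
    {
        size_t off = (const uint8_t *)addr - toy_mem;
        return &toy_shadow[off >> SHADOW_SCALE_SHIFT];
    }

    /* Mark [addr, addr+size) accessible; a trailing partial granule stores
     * how many of its 8 bytes are valid, like kasan_unpoison_shadow(). */
    static void unpoison(const void *addr, size_t size)
    {
        memset(mem_to_shadow(addr), 0, size >> SHADOW_SCALE_SHIFT);
        if (size & (SHADOW_GRANULE - 1))
            *mem_to_shadow((const uint8_t *)addr + size) = size & (SHADOW_GRANULE - 1);
    }

    int main(void)
    {
        memset(toy_shadow, 0xFF, sizeof(toy_shadow));        /* everything poisoned */
        unpoison(toy_mem, 13);                               /* 8 valid + 5 valid   */
        printf("shadow[0]=%d shadow[1]=%d shadow[2]=%d\n",
               toy_shadow[0], toy_shadow[1], toy_shadow[2]); /* 0 5 -1 */
        return 0;
    }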
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h new file mode 100644 index 000000000000..4986b0acab21 --- /dev/null +++ b/mm/kasan/kasan.h | |||
| @@ -0,0 +1,75 @@ | |||
| 1 | #ifndef __MM_KASAN_KASAN_H | ||
| 2 | #define __MM_KASAN_KASAN_H | ||
| 3 | |||
| 4 | #include <linux/kasan.h> | ||
| 5 | |||
| 6 | #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) | ||
| 7 | #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) | ||
| 8 | |||
| 9 | #define KASAN_FREE_PAGE 0xFF /* page was freed */ | ||
| 11 | #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ | ||
| 12 | #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ | ||
| 13 | #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ | ||
| 14 | #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ | ||
| 15 | |||
| 16 | /* | ||
| 17 | * Stack redzone shadow values | ||
| 18 | * (Those are compiler's ABI, don't change them) | ||
| 19 | */ | ||
| 20 | #define KASAN_STACK_LEFT 0xF1 | ||
| 21 | #define KASAN_STACK_MID 0xF2 | ||
| 22 | #define KASAN_STACK_RIGHT 0xF3 | ||
| 23 | #define KASAN_STACK_PARTIAL 0xF4 | ||
| 24 | |||
| 25 | /* Don't break randconfig/all*config builds */ | ||
| 26 | #ifndef KASAN_ABI_VERSION | ||
| 27 | #define KASAN_ABI_VERSION 1 | ||
| 28 | #endif | ||
| 29 | |||
| 30 | struct kasan_access_info { | ||
| 31 | const void *access_addr; | ||
| 32 | const void *first_bad_addr; | ||
| 33 | size_t access_size; | ||
| 34 | bool is_write; | ||
| 35 | unsigned long ip; | ||
| 36 | }; | ||
| 37 | |||
| 38 | /* The layout of this struct is dictated by the compiler */ | ||
| 39 | struct kasan_source_location { | ||
| 40 | const char *filename; | ||
| 41 | int line_no; | ||
| 42 | int column_no; | ||
| 43 | }; | ||
| 44 | |||
| 45 | /* The layout of this struct is dictated by the compiler */ | ||
| 46 | struct kasan_global { | ||
| 47 | const void *beg; /* Address of the beginning of the global variable. */ | ||
| 48 | size_t size; /* Size of the global variable. */ | ||
| 49 | size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */ | ||
| 50 | const void *name; | ||
| 51 | const void *module_name; /* Name of the module where the global variable is declared. */ | ||
| 52 | unsigned long has_dynamic_init; /* This is needed for C++ */ | ||
| 53 | #if KASAN_ABI_VERSION >= 4 | ||
| 54 | struct kasan_source_location *location; | ||
| 55 | #endif | ||
| 56 | }; | ||
| 57 | |||
| 58 | void kasan_report_error(struct kasan_access_info *info); | ||
| 59 | void kasan_report_user_access(struct kasan_access_info *info); | ||
| 60 | |||
| 61 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) | ||
| 62 | { | ||
| 63 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) | ||
| 64 | << KASAN_SHADOW_SCALE_SHIFT); | ||
| 65 | } | ||
| 66 | |||
| 67 | static inline bool kasan_enabled(void) | ||
| 68 | { | ||
| 69 | return !current->kasan_depth; | ||
| 70 | } | ||
| 71 | |||
| 72 | void kasan_report(unsigned long addr, size_t size, | ||
| 73 | bool is_write, unsigned long ip); | ||
| 74 | |||
| 75 | #endif | ||
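The shadow values defined above are read back by checks such as memory_is_poisoned_1(): 0 means the whole 8-byte granule is addressable, 1..7 means only the first N bytes are, and the negative magic values (KASAN_FREE_PAGE, the redzones, ...) poison the whole granule. A compact model of that single-byte test:

    #include <stdbool.h>
    #include <stdio.h>

    #define SHADOW_MASK 7   /* byte offset within an 8-byte granule */

    /* shadow == 0    : whole granule addressable
     * shadow in 1..7 : only the first 'shadow' bytes are addressable
     * shadow < 0     : the whole granule is poisoned (redzone, freed, ...) */
    static bool byte_is_poisoned(unsigned long addr, signed char shadow)
    {
        if (shadow == 0)
            return false;
        return (signed char)(addr & SHADOW_MASK) >= shadow;
    }

    int main(void)
    {
        printf("%d\n", byte_is_poisoned(0x1003, 5));                 /* 0: byte 3 within 5 valid bytes */
        printf("%d\n", byte_is_poisoned(0x1006, 5));                 /* 1: byte 6 past the 5 valid     */
        printf("%d\n", byte_is_poisoned(0x1000, (signed char)0xFB)); /* 1: freed object                */
        return 0;
    }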
diff --git a/mm/kasan/report.c b/mm/kasan/report.c new file mode 100644 index 000000000000..680ceedf810a --- /dev/null +++ b/mm/kasan/report.c | |||
| @@ -0,0 +1,269 @@ | |||
| 1 | /* | ||
| 2 | * This file contains error reporting code. | ||
| 3 | * | ||
| 4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | ||
| 5 | * Author: Andrey Ryabinin <a.ryabinin@samsung.com> | ||
| 6 | * | ||
| 7 | * Some of code borrowed from https://github.com/xairy/linux by | ||
| 8 | * Andrey Konovalov <adech.fo@gmail.com> | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or modify | ||
| 11 | * it under the terms of the GNU General Public License version 2 as | ||
| 12 | * published by the Free Software Foundation. | ||
| 13 | * | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/kernel.h> | ||
| 17 | #include <linux/mm.h> | ||
| 18 | #include <linux/printk.h> | ||
| 19 | #include <linux/sched.h> | ||
| 20 | #include <linux/slab.h> | ||
| 21 | #include <linux/stacktrace.h> | ||
| 22 | #include <linux/string.h> | ||
| 23 | #include <linux/types.h> | ||
| 24 | #include <linux/kasan.h> | ||
| 25 | |||
| 26 | #include <asm/sections.h> | ||
| 27 | |||
| 28 | #include "kasan.h" | ||
| 29 | #include "../slab.h" | ||
| 30 | |||
| 31 | /* Shadow layout customization. */ | ||
| 32 | #define SHADOW_BYTES_PER_BLOCK 1 | ||
| 33 | #define SHADOW_BLOCKS_PER_ROW 16 | ||
| 34 | #define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) | ||
| 35 | #define SHADOW_ROWS_AROUND_ADDR 2 | ||
| 36 | |||
| 37 | static const void *find_first_bad_addr(const void *addr, size_t size) | ||
| 38 | { | ||
| 39 | u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr); | ||
| 40 | const void *first_bad_addr = addr; | ||
| 41 | |||
| 42 | while (!shadow_val && first_bad_addr < addr + size) { | ||
| 43 | first_bad_addr += KASAN_SHADOW_SCALE_SIZE; | ||
| 44 | shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr); | ||
| 45 | } | ||
| 46 | return first_bad_addr; | ||
| 47 | } | ||
| 48 | |||
| 49 | static void print_error_description(struct kasan_access_info *info) | ||
| 50 | { | ||
| 51 | const char *bug_type = "unknown crash"; | ||
| 52 | u8 shadow_val; | ||
| 53 | |||
| 54 | info->first_bad_addr = find_first_bad_addr(info->access_addr, | ||
| 55 | info->access_size); | ||
| 56 | |||
| 57 | shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); | ||
| 58 | |||
| 59 | switch (shadow_val) { | ||
| 60 | case KASAN_FREE_PAGE: | ||
| 61 | case KASAN_KMALLOC_FREE: | ||
| 62 | bug_type = "use after free"; | ||
| 63 | break; | ||
| 64 | case KASAN_PAGE_REDZONE: | ||
| 65 | case KASAN_KMALLOC_REDZONE: | ||
| 66 | case KASAN_GLOBAL_REDZONE: | ||
| 67 | case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: | ||
| 68 | bug_type = "out of bounds access"; | ||
| 69 | break; | ||
| 70 | case KASAN_STACK_LEFT: | ||
| 71 | case KASAN_STACK_MID: | ||
| 72 | case KASAN_STACK_RIGHT: | ||
| 73 | case KASAN_STACK_PARTIAL: | ||
| 74 | bug_type = "out of bounds on stack"; | ||
| 75 | break; | ||
| 76 | } | ||
| 77 | |||
| 78 | pr_err("BUG: KASan: %s in %pS at addr %p\n", | ||
| 79 | bug_type, (void *)info->ip, | ||
| 80 | info->access_addr); | ||
| 81 | pr_err("%s of size %zu by task %s/%d\n", | ||
| 82 | info->is_write ? "Write" : "Read", | ||
| 83 | info->access_size, current->comm, task_pid_nr(current)); | ||
| 84 | } | ||
| 85 | |||
| 86 | static inline bool kernel_or_module_addr(const void *addr) | ||
| 87 | { | ||
| 88 | return (addr >= (void *)_stext && addr < (void *)_end) | ||
| 89 | || (addr >= (void *)MODULES_VADDR | ||
| 90 | && addr < (void *)MODULES_END); | ||
| 91 | } | ||
| 92 | |||
| 93 | static inline bool init_task_stack_addr(const void *addr) | ||
| 94 | { | ||
| 95 | return addr >= (void *)&init_thread_union.stack && | ||
| 96 | (addr <= (void *)&init_thread_union.stack + | ||
| 97 | sizeof(init_thread_union.stack)); | ||
| 98 | } | ||
| 99 | |||
| 100 | static void print_address_description(struct kasan_access_info *info) | ||
| 101 | { | ||
| 102 | const void *addr = info->access_addr; | ||
| 103 | |||
| 104 | if ((addr >= (void *)PAGE_OFFSET) && | ||
| 105 | (addr < high_memory)) { | ||
| 106 | struct page *page = virt_to_head_page(addr); | ||
| 107 | |||
| 108 | if (PageSlab(page)) { | ||
| 109 | void *object; | ||
| 110 | struct kmem_cache *cache = page->slab_cache; | ||
| 111 | void *last_object; | ||
| 112 | |||
| 113 | object = virt_to_obj(cache, page_address(page), addr); | ||
| 114 | last_object = page_address(page) + | ||
| 115 | page->objects * cache->size; | ||
| 116 | |||
| 117 | if (unlikely(object > last_object)) | ||
| 118 | object = last_object; /* we hit into padding */ | ||
| 119 | |||
| 120 | object_err(cache, page, object, | ||
| 121 | "kasan: bad access detected"); | ||
| 122 | return; | ||
| 123 | } | ||
| 124 | dump_page(page, "kasan: bad access detected"); | ||
| 125 | } | ||
| 126 | |||
| 127 | if (kernel_or_module_addr(addr)) { | ||
| 128 | if (!init_task_stack_addr(addr)) | ||
| 129 | pr_err("Address belongs to variable %pS\n", addr); | ||
| 130 | } | ||
| 131 | |||
| 132 | dump_stack(); | ||
| 133 | } | ||
| 134 | |||
| 135 | static bool row_is_guilty(const void *row, const void *guilty) | ||
| 136 | { | ||
| 137 | return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW); | ||
| 138 | } | ||
| 139 | |||
| 140 | static int shadow_pointer_offset(const void *row, const void *shadow) | ||
| 141 | { | ||
| 142 | /* The length of ">ff00ff00ff00ff00: " is | ||
| 143 | * 3 + (BITS_PER_LONG/8)*2 chars. | ||
| 144 | */ | ||
| 145 | return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 + | ||
| 146 | (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1; | ||
| 147 | } | ||
| 148 | |||
| 149 | static void print_shadow_for_address(const void *addr) | ||
| 150 | { | ||
| 151 | int i; | ||
| 152 | const void *shadow = kasan_mem_to_shadow(addr); | ||
| 153 | const void *shadow_row; | ||
| 154 | |||
| 155 | shadow_row = (void *)round_down((unsigned long)shadow, | ||
| 156 | SHADOW_BYTES_PER_ROW) | ||
| 157 | - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW; | ||
| 158 | |||
| 159 | pr_err("Memory state around the buggy address:\n"); | ||
| 160 | |||
| 161 | for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { | ||
| 162 | const void *kaddr = kasan_shadow_to_mem(shadow_row); | ||
| 163 | char buffer[4 + (BITS_PER_LONG/8)*2]; | ||
| 164 | |||
| 165 | snprintf(buffer, sizeof(buffer), | ||
| 166 | (i == 0) ? ">%p: " : " %p: ", kaddr); | ||
| 167 | |||
| 168 | kasan_disable_current(); | ||
| 169 | print_hex_dump(KERN_ERR, buffer, | ||
| 170 | DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, | ||
| 171 | shadow_row, SHADOW_BYTES_PER_ROW, 0); | ||
| 172 | kasan_enable_current(); | ||
| 173 | |||
| 174 | if (row_is_guilty(shadow_row, shadow)) | ||
| 175 | pr_err("%*c\n", | ||
| 176 | shadow_pointer_offset(shadow_row, shadow), | ||
| 177 | '^'); | ||
| 178 | |||
| 179 | shadow_row += SHADOW_BYTES_PER_ROW; | ||
| 180 | } | ||
| 181 | } | ||
| 182 | |||
| 183 | static DEFINE_SPINLOCK(report_lock); | ||
| 184 | |||
| 185 | void kasan_report_error(struct kasan_access_info *info) | ||
| 186 | { | ||
| 187 | unsigned long flags; | ||
| 188 | |||
| 189 | spin_lock_irqsave(&report_lock, flags); | ||
| 190 | pr_err("=================================" | ||
| 191 | "=================================\n"); | ||
| 192 | print_error_description(info); | ||
| 193 | print_address_description(info); | ||
| 194 | print_shadow_for_address(info->first_bad_addr); | ||
| 195 | pr_err("=================================" | ||
| 196 | "=================================\n"); | ||
| 197 | spin_unlock_irqrestore(&report_lock, flags); | ||
| 198 | } | ||
| 199 | |||
| 200 | void kasan_report_user_access(struct kasan_access_info *info) | ||
| 201 | { | ||
| 202 | unsigned long flags; | ||
| 203 | |||
| 204 | spin_lock_irqsave(&report_lock, flags); | ||
| 205 | pr_err("=================================" | ||
| 206 | "=================================\n"); | ||
| 207 | pr_err("BUG: KASan: user-memory-access on address %p\n", | ||
| 208 | info->access_addr); | ||
| 209 | pr_err("%s of size %zu by task %s/%d\n", | ||
| 210 | info->is_write ? "Write" : "Read", | ||
| 211 | info->access_size, current->comm, task_pid_nr(current)); | ||
| 212 | dump_stack(); | ||
| 213 | pr_err("=================================" | ||
| 214 | "=================================\n"); | ||
| 215 | spin_unlock_irqrestore(&report_lock, flags); | ||
| 216 | } | ||
| 217 | |||
| 218 | void kasan_report(unsigned long addr, size_t size, | ||
| 219 | bool is_write, unsigned long ip) | ||
| 220 | { | ||
| 221 | struct kasan_access_info info; | ||
| 222 | |||
| 223 | if (likely(!kasan_enabled())) | ||
| 224 | return; | ||
| 225 | |||
| 226 | info.access_addr = (void *)addr; | ||
| 227 | info.access_size = size; | ||
| 228 | info.is_write = is_write; | ||
| 229 | info.ip = ip; | ||
| 230 | kasan_report_error(&info); | ||
| 231 | } | ||
| 232 | |||
| 233 | |||
| 234 | #define DEFINE_ASAN_REPORT_LOAD(size) \ | ||
| 235 | void __asan_report_load##size##_noabort(unsigned long addr) \ | ||
| 236 | { \ | ||
| 237 | kasan_report(addr, size, false, _RET_IP_); \ | ||
| 238 | } \ | ||
| 239 | EXPORT_SYMBOL(__asan_report_load##size##_noabort) | ||
| 240 | |||
| 241 | #define DEFINE_ASAN_REPORT_STORE(size) \ | ||
| 242 | void __asan_report_store##size##_noabort(unsigned long addr) \ | ||
| 243 | { \ | ||
| 244 | kasan_report(addr, size, true, _RET_IP_); \ | ||
| 245 | } \ | ||
| 246 | EXPORT_SYMBOL(__asan_report_store##size##_noabort) | ||
| 247 | |||
| 248 | DEFINE_ASAN_REPORT_LOAD(1); | ||
| 249 | DEFINE_ASAN_REPORT_LOAD(2); | ||
| 250 | DEFINE_ASAN_REPORT_LOAD(4); | ||
| 251 | DEFINE_ASAN_REPORT_LOAD(8); | ||
| 252 | DEFINE_ASAN_REPORT_LOAD(16); | ||
| 253 | DEFINE_ASAN_REPORT_STORE(1); | ||
| 254 | DEFINE_ASAN_REPORT_STORE(2); | ||
| 255 | DEFINE_ASAN_REPORT_STORE(4); | ||
| 256 | DEFINE_ASAN_REPORT_STORE(8); | ||
| 257 | DEFINE_ASAN_REPORT_STORE(16); | ||
| 258 | |||
| 259 | void __asan_report_load_n_noabort(unsigned long addr, size_t size) | ||
| 260 | { | ||
| 261 | kasan_report(addr, size, false, _RET_IP_); | ||
| 262 | } | ||
| 263 | EXPORT_SYMBOL(__asan_report_load_n_noabort); | ||
| 264 | |||
| 265 | void __asan_report_store_n_noabort(unsigned long addr, size_t size) | ||
| 266 | { | ||
| 267 | kasan_report(addr, size, true, _RET_IP_); | ||
| 268 | } | ||
| 269 | EXPORT_SYMBOL(__asan_report_store_n_noabort); | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3cda50c1e394..5405aff5a590 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
| @@ -98,6 +98,7 @@ | |||
| 98 | #include <asm/processor.h> | 98 | #include <asm/processor.h> |
| 99 | #include <linux/atomic.h> | 99 | #include <linux/atomic.h> |
| 100 | 100 | ||
| 101 | #include <linux/kasan.h> | ||
| 101 | #include <linux/kmemcheck.h> | 102 | #include <linux/kmemcheck.h> |
| 102 | #include <linux/kmemleak.h> | 103 | #include <linux/kmemleak.h> |
| 103 | #include <linux/memory_hotplug.h> | 104 | #include <linux/memory_hotplug.h> |
| @@ -1113,7 +1114,10 @@ static bool update_checksum(struct kmemleak_object *object) | |||
| 1113 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) | 1114 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) |
| 1114 | return false; | 1115 | return false; |
| 1115 | 1116 | ||
| 1117 | kasan_disable_current(); | ||
| 1116 | object->checksum = crc32(0, (void *)object->pointer, object->size); | 1118 | object->checksum = crc32(0, (void *)object->pointer, object->size); |
| 1119 | kasan_enable_current(); | ||
| 1120 | |||
| 1117 | return object->checksum != old_csum; | 1121 | return object->checksum != old_csum; |
| 1118 | } | 1122 | } |
| 1119 | 1123 | ||
| @@ -1164,7 +1168,9 @@ static void scan_block(void *_start, void *_end, | |||
| 1164 | BYTES_PER_POINTER)) | 1168 | BYTES_PER_POINTER)) |
| 1165 | continue; | 1169 | continue; |
| 1166 | 1170 | ||
| 1171 | kasan_disable_current(); | ||
| 1167 | pointer = *ptr; | 1172 | pointer = *ptr; |
| 1173 | kasan_enable_current(); | ||
| 1168 | 1174 | ||
| 1169 | object = find_and_get_object(pointer, 1); | 1175 | object = find_and_get_object(pointer, 1); |
| 1170 | if (!object) | 1176 | if (!object) |
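kmemleak deliberately reads memory that KASan may consider poisoned (checksumming objects, scanning blocks), so those reads are now wrapped in kasan_disable_current()/kasan_enable_current(), which only adjust the per-task depth counter consulted by kasan_enabled(). A sketch of that depth-counter pattern (toy_ names are stand-ins, not the kernel helpers):

    #include <stdbool.h>
    #include <stdio.h>

    /* Per-task nesting counter; checks are skipped while it is non-zero,
     * mirroring current->kasan_depth and kasan_enabled(). */
    static __thread int kasan_depth;

    static void toy_kasan_disable(void) { kasan_depth++; }
    static void toy_kasan_enable(void)  { kasan_depth--; }
    static bool toy_kasan_enabled(void) { return kasan_depth == 0; }

    static void checked_read(const char *p)
    {
        if (toy_kasan_enabled())
            printf("would check shadow for %p\n", (void *)p);
        (void)*p;   /* the actual access */
    }

    int main(void)
    {
        char obj[16] = "kmemleak";

        toy_kasan_disable();        /* e.g. around kmemleak's checksum scan */
        checked_read(&obj[0]);      /* no report even if obj were poisoned  */
        toy_kasan_enable();

        checked_read(&obj[0]);      /* back to normal checking */
        return 0;
    }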
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
| @@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
| 1748 | */ | 1748 | */ |
| 1749 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1749 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
| 1750 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1750 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
| 1751 | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) | 1751 | VM_HUGETLB | VM_MIXEDMAP)) |
| 1752 | return 0; /* just ignore the advice */ | 1752 | return 0; /* just ignore the advice */ |
| 1753 | 1753 | ||
| 1754 | #ifdef VM_SAO | 1754 | #ifdef VM_SAO |
diff --git a/mm/list_lru.c b/mm/list_lru.c index f1a0db194173..909eca2c820e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c | |||
| @@ -9,18 +9,100 @@ | |||
| 9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
| 10 | #include <linux/list_lru.h> | 10 | #include <linux/list_lru.h> |
| 11 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
| 12 | #include <linux/mutex.h> | ||
| 13 | #include <linux/memcontrol.h> | ||
| 14 | |||
| 15 | #ifdef CONFIG_MEMCG_KMEM | ||
| 16 | static LIST_HEAD(list_lrus); | ||
| 17 | static DEFINE_MUTEX(list_lrus_mutex); | ||
| 18 | |||
| 19 | static void list_lru_register(struct list_lru *lru) | ||
| 20 | { | ||
| 21 | mutex_lock(&list_lrus_mutex); | ||
| 22 | list_add(&lru->list, &list_lrus); | ||
| 23 | mutex_unlock(&list_lrus_mutex); | ||
| 24 | } | ||
| 25 | |||
| 26 | static void list_lru_unregister(struct list_lru *lru) | ||
| 27 | { | ||
| 28 | mutex_lock(&list_lrus_mutex); | ||
| 29 | list_del(&lru->list); | ||
| 30 | mutex_unlock(&list_lrus_mutex); | ||
| 31 | } | ||
| 32 | #else | ||
| 33 | static void list_lru_register(struct list_lru *lru) | ||
| 34 | { | ||
| 35 | } | ||
| 36 | |||
| 37 | static void list_lru_unregister(struct list_lru *lru) | ||
| 38 | { | ||
| 39 | } | ||
| 40 | #endif /* CONFIG_MEMCG_KMEM */ | ||
| 41 | |||
| 42 | #ifdef CONFIG_MEMCG_KMEM | ||
| 43 | static inline bool list_lru_memcg_aware(struct list_lru *lru) | ||
| 44 | { | ||
| 45 | return !!lru->node[0].memcg_lrus; | ||
| 46 | } | ||
| 47 | |||
| 48 | static inline struct list_lru_one * | ||
| 49 | list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) | ||
| 50 | { | ||
| 51 | /* | ||
| 52 | * The lock protects the array of per cgroup lists from relocation | ||
| 53 | * (see memcg_update_list_lru_node). | ||
| 54 | */ | ||
| 55 | lockdep_assert_held(&nlru->lock); | ||
| 56 | if (nlru->memcg_lrus && idx >= 0) | ||
| 57 | return nlru->memcg_lrus->lru[idx]; | ||
| 58 | |||
| 59 | return &nlru->lru; | ||
| 60 | } | ||
| 61 | |||
| 62 | static inline struct list_lru_one * | ||
| 63 | list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) | ||
| 64 | { | ||
| 65 | struct mem_cgroup *memcg; | ||
| 66 | |||
| 67 | if (!nlru->memcg_lrus) | ||
| 68 | return &nlru->lru; | ||
| 69 | |||
| 70 | memcg = mem_cgroup_from_kmem(ptr); | ||
| 71 | if (!memcg) | ||
| 72 | return &nlru->lru; | ||
| 73 | |||
| 74 | return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); | ||
| 75 | } | ||
| 76 | #else | ||
| 77 | static inline bool list_lru_memcg_aware(struct list_lru *lru) | ||
| 78 | { | ||
| 79 | return false; | ||
| 80 | } | ||
| 81 | |||
| 82 | static inline struct list_lru_one * | ||
| 83 | list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) | ||
| 84 | { | ||
| 85 | return &nlru->lru; | ||
| 86 | } | ||
| 87 | |||
| 88 | static inline struct list_lru_one * | ||
| 89 | list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) | ||
| 90 | { | ||
| 91 | return &nlru->lru; | ||
| 92 | } | ||
| 93 | #endif /* CONFIG_MEMCG_KMEM */ | ||
| 12 | 94 | ||
| 13 | bool list_lru_add(struct list_lru *lru, struct list_head *item) | 95 | bool list_lru_add(struct list_lru *lru, struct list_head *item) |
| 14 | { | 96 | { |
| 15 | int nid = page_to_nid(virt_to_page(item)); | 97 | int nid = page_to_nid(virt_to_page(item)); |
| 16 | struct list_lru_node *nlru = &lru->node[nid]; | 98 | struct list_lru_node *nlru = &lru->node[nid]; |
| 99 | struct list_lru_one *l; | ||
| 17 | 100 | ||
| 18 | spin_lock(&nlru->lock); | 101 | spin_lock(&nlru->lock); |
| 19 | WARN_ON_ONCE(nlru->nr_items < 0); | 102 | l = list_lru_from_kmem(nlru, item); |
| 20 | if (list_empty(item)) { | 103 | if (list_empty(item)) { |
| 21 | list_add_tail(item, &nlru->list); | 104 | list_add_tail(item, &l->list); |
| 22 | if (nlru->nr_items++ == 0) | 105 | l->nr_items++; |
| 23 | node_set(nid, lru->active_nodes); | ||
| 24 | spin_unlock(&nlru->lock); | 106 | spin_unlock(&nlru->lock); |
| 25 | return true; | 107 | return true; |
| 26 | } | 108 | } |
| @@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) | |||
| 33 | { | 115 | { |
| 34 | int nid = page_to_nid(virt_to_page(item)); | 116 | int nid = page_to_nid(virt_to_page(item)); |
| 35 | struct list_lru_node *nlru = &lru->node[nid]; | 117 | struct list_lru_node *nlru = &lru->node[nid]; |
| 118 | struct list_lru_one *l; | ||
| 36 | 119 | ||
| 37 | spin_lock(&nlru->lock); | 120 | spin_lock(&nlru->lock); |
| 121 | l = list_lru_from_kmem(nlru, item); | ||
| 38 | if (!list_empty(item)) { | 122 | if (!list_empty(item)) { |
| 39 | list_del_init(item); | 123 | list_del_init(item); |
| 40 | if (--nlru->nr_items == 0) | 124 | l->nr_items--; |
| 41 | node_clear(nid, lru->active_nodes); | ||
| 42 | WARN_ON_ONCE(nlru->nr_items < 0); | ||
| 43 | spin_unlock(&nlru->lock); | 125 | spin_unlock(&nlru->lock); |
| 44 | return true; | 126 | return true; |
| 45 | } | 127 | } |
| @@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) | |||
| 48 | } | 130 | } |
| 49 | EXPORT_SYMBOL_GPL(list_lru_del); | 131 | EXPORT_SYMBOL_GPL(list_lru_del); |
| 50 | 132 | ||
| 51 | unsigned long | 133 | void list_lru_isolate(struct list_lru_one *list, struct list_head *item) |
| 52 | list_lru_count_node(struct list_lru *lru, int nid) | 134 | { |
| 135 | list_del_init(item); | ||
| 136 | list->nr_items--; | ||
| 137 | } | ||
| 138 | EXPORT_SYMBOL_GPL(list_lru_isolate); | ||
| 139 | |||
| 140 | void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, | ||
| 141 | struct list_head *head) | ||
| 142 | { | ||
| 143 | list_move(item, head); | ||
| 144 | list->nr_items--; | ||
| 145 | } | ||
| 146 | EXPORT_SYMBOL_GPL(list_lru_isolate_move); | ||
| 147 | |||
| 148 | static unsigned long __list_lru_count_one(struct list_lru *lru, | ||
| 149 | int nid, int memcg_idx) | ||
| 53 | { | 150 | { |
| 54 | unsigned long count = 0; | ||
| 55 | struct list_lru_node *nlru = &lru->node[nid]; | 151 | struct list_lru_node *nlru = &lru->node[nid]; |
| 152 | struct list_lru_one *l; | ||
| 153 | unsigned long count; | ||
| 56 | 154 | ||
| 57 | spin_lock(&nlru->lock); | 155 | spin_lock(&nlru->lock); |
| 58 | WARN_ON_ONCE(nlru->nr_items < 0); | 156 | l = list_lru_from_memcg_idx(nlru, memcg_idx); |
| 59 | count += nlru->nr_items; | 157 | count = l->nr_items; |
| 60 | spin_unlock(&nlru->lock); | 158 | spin_unlock(&nlru->lock); |
| 61 | 159 | ||
| 62 | return count; | 160 | return count; |
| 63 | } | 161 | } |
| 162 | |||
| 163 | unsigned long list_lru_count_one(struct list_lru *lru, | ||
| 164 | int nid, struct mem_cgroup *memcg) | ||
| 165 | { | ||
| 166 | return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); | ||
| 167 | } | ||
| 168 | EXPORT_SYMBOL_GPL(list_lru_count_one); | ||
| 169 | |||
| 170 | unsigned long list_lru_count_node(struct list_lru *lru, int nid) | ||
| 171 | { | ||
| 172 | long count = 0; | ||
| 173 | int memcg_idx; | ||
| 174 | |||
| 175 | count += __list_lru_count_one(lru, nid, -1); | ||
| 176 | if (list_lru_memcg_aware(lru)) { | ||
| 177 | for_each_memcg_cache_index(memcg_idx) | ||
| 178 | count += __list_lru_count_one(lru, nid, memcg_idx); | ||
| 179 | } | ||
| 180 | return count; | ||
| 181 | } | ||
| 64 | EXPORT_SYMBOL_GPL(list_lru_count_node); | 182 | EXPORT_SYMBOL_GPL(list_lru_count_node); |
| 65 | 183 | ||
| 66 | unsigned long | 184 | static unsigned long |
| 67 | list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, | 185 | __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, |
| 68 | void *cb_arg, unsigned long *nr_to_walk) | 186 | list_lru_walk_cb isolate, void *cb_arg, |
| 187 | unsigned long *nr_to_walk) | ||
| 69 | { | 188 | { |
| 70 | 189 | ||
| 71 | struct list_lru_node *nlru = &lru->node[nid]; | 190 | struct list_lru_node *nlru = &lru->node[nid]; |
| 191 | struct list_lru_one *l; | ||
| 72 | struct list_head *item, *n; | 192 | struct list_head *item, *n; |
| 73 | unsigned long isolated = 0; | 193 | unsigned long isolated = 0; |
| 74 | 194 | ||
| 75 | spin_lock(&nlru->lock); | 195 | spin_lock(&nlru->lock); |
| 196 | l = list_lru_from_memcg_idx(nlru, memcg_idx); | ||
| 76 | restart: | 197 | restart: |
| 77 | list_for_each_safe(item, n, &nlru->list) { | 198 | list_for_each_safe(item, n, &l->list) { |
| 78 | enum lru_status ret; | 199 | enum lru_status ret; |
| 79 | 200 | ||
| 80 | /* | 201 | /* |
| @@ -85,14 +206,11 @@ restart: | |||
| 85 | break; | 206 | break; |
| 86 | --*nr_to_walk; | 207 | --*nr_to_walk; |
| 87 | 208 | ||
| 88 | ret = isolate(item, &nlru->lock, cb_arg); | 209 | ret = isolate(item, l, &nlru->lock, cb_arg); |
| 89 | switch (ret) { | 210 | switch (ret) { |
| 90 | case LRU_REMOVED_RETRY: | 211 | case LRU_REMOVED_RETRY: |
| 91 | assert_spin_locked(&nlru->lock); | 212 | assert_spin_locked(&nlru->lock); |
| 92 | case LRU_REMOVED: | 213 | case LRU_REMOVED: |
| 93 | if (--nlru->nr_items == 0) | ||
| 94 | node_clear(nid, lru->active_nodes); | ||
| 95 | WARN_ON_ONCE(nlru->nr_items < 0); | ||
| 96 | isolated++; | 214 | isolated++; |
| 97 | /* | 215 | /* |
| 98 | * If the lru lock has been dropped, our list | 216 | * If the lru lock has been dropped, our list |
| @@ -103,7 +221,7 @@ restart: | |||
| 103 | goto restart; | 221 | goto restart; |
| 104 | break; | 222 | break; |
| 105 | case LRU_ROTATE: | 223 | case LRU_ROTATE: |
| 106 | list_move_tail(item, &nlru->list); | 224 | list_move_tail(item, &l->list); |
| 107 | break; | 225 | break; |
| 108 | case LRU_SKIP: | 226 | case LRU_SKIP: |
| 109 | break; | 227 | break; |
| @@ -122,31 +240,322 @@ restart: | |||
| 122 | spin_unlock(&nlru->lock); | 240 | spin_unlock(&nlru->lock); |
| 123 | return isolated; | 241 | return isolated; |
| 124 | } | 242 | } |
| 243 | |||
| 244 | unsigned long | ||
| 245 | list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, | ||
| 246 | list_lru_walk_cb isolate, void *cb_arg, | ||
| 247 | unsigned long *nr_to_walk) | ||
| 248 | { | ||
| 249 | return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), | ||
| 250 | isolate, cb_arg, nr_to_walk); | ||
| 251 | } | ||
| 252 | EXPORT_SYMBOL_GPL(list_lru_walk_one); | ||
| 253 | |||
| 254 | unsigned long list_lru_walk_node(struct list_lru *lru, int nid, | ||
| 255 | list_lru_walk_cb isolate, void *cb_arg, | ||
| 256 | unsigned long *nr_to_walk) | ||
| 257 | { | ||
| 258 | long isolated = 0; | ||
| 259 | int memcg_idx; | ||
| 260 | |||
| 261 | isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, | ||
| 262 | nr_to_walk); | ||
| 263 | if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { | ||
| 264 | for_each_memcg_cache_index(memcg_idx) { | ||
| 265 | isolated += __list_lru_walk_one(lru, nid, memcg_idx, | ||
| 266 | isolate, cb_arg, nr_to_walk); | ||
| 267 | if (*nr_to_walk <= 0) | ||
| 268 | break; | ||
| 269 | } | ||
| 270 | } | ||
| 271 | return isolated; | ||
| 272 | } | ||
| 125 | EXPORT_SYMBOL_GPL(list_lru_walk_node); | 273 | EXPORT_SYMBOL_GPL(list_lru_walk_node); |
| 126 | 274 | ||
| 127 | int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) | 275 | static void init_one_lru(struct list_lru_one *l) |
| 276 | { | ||
| 277 | INIT_LIST_HEAD(&l->list); | ||
| 278 | l->nr_items = 0; | ||
| 279 | } | ||
| 280 | |||
| 281 | #ifdef CONFIG_MEMCG_KMEM | ||
| 282 | static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, | ||
| 283 | int begin, int end) | ||
| 284 | { | ||
| 285 | int i; | ||
| 286 | |||
| 287 | for (i = begin; i < end; i++) | ||
| 288 | kfree(memcg_lrus->lru[i]); | ||
| 289 | } | ||
| 290 | |||
| 291 | static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, | ||
| 292 | int begin, int end) | ||
| 293 | { | ||
| 294 | int i; | ||
| 295 | |||
| 296 | for (i = begin; i < end; i++) { | ||
| 297 | struct list_lru_one *l; | ||
| 298 | |||
| 299 | l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); | ||
| 300 | if (!l) | ||
| 301 | goto fail; | ||
| 302 | |||
| 303 | init_one_lru(l); | ||
| 304 | memcg_lrus->lru[i] = l; | ||
| 305 | } | ||
| 306 | return 0; | ||
| 307 | fail: | ||
| 308 | __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); | ||
| 309 | return -ENOMEM; | ||
| 310 | } | ||
| 311 | |||
| 312 | static int memcg_init_list_lru_node(struct list_lru_node *nlru) | ||
| 313 | { | ||
| 314 | int size = memcg_nr_cache_ids; | ||
| 315 | |||
| 316 | nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); | ||
| 317 | if (!nlru->memcg_lrus) | ||
| 318 | return -ENOMEM; | ||
| 319 | |||
| 320 | if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { | ||
| 321 | kfree(nlru->memcg_lrus); | ||
| 322 | return -ENOMEM; | ||
| 323 | } | ||
| 324 | |||
| 325 | return 0; | ||
| 326 | } | ||
| 327 | |||
| 328 | static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) | ||
| 329 | { | ||
| 330 | __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); | ||
| 331 | kfree(nlru->memcg_lrus); | ||
| 332 | } | ||
| 333 | |||
| 334 | static int memcg_update_list_lru_node(struct list_lru_node *nlru, | ||
| 335 | int old_size, int new_size) | ||
| 336 | { | ||
| 337 | struct list_lru_memcg *old, *new; | ||
| 338 | |||
| 339 | BUG_ON(old_size > new_size); | ||
| 340 | |||
| 341 | old = nlru->memcg_lrus; | ||
| 342 | new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); | ||
| 343 | if (!new) | ||
| 344 | return -ENOMEM; | ||
| 345 | |||
| 346 | if (__memcg_init_list_lru_node(new, old_size, new_size)) { | ||
| 347 | kfree(new); | ||
| 348 | return -ENOMEM; | ||
| 349 | } | ||
| 350 | |||
| 351 | memcpy(new, old, old_size * sizeof(void *)); | ||
| 352 | |||
| 353 | /* | ||
| 354 | * The lock guarantees that we won't race with a reader | ||
| 355 | * (see list_lru_from_memcg_idx). | ||
| 356 | * | ||
| 357 | * Since list_lru_{add,del} may be called under an IRQ-safe lock, | ||
| 358 | * we have to use IRQ-safe primitives here to avoid deadlock. | ||
| 359 | */ | ||
| 360 | spin_lock_irq(&nlru->lock); | ||
| 361 | nlru->memcg_lrus = new; | ||
| 362 | spin_unlock_irq(&nlru->lock); | ||
| 363 | |||
| 364 | kfree(old); | ||
| 365 | return 0; | ||
| 366 | } | ||
| 367 | |||
| 368 | static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, | ||
| 369 | int old_size, int new_size) | ||
| 370 | { | ||
| 371 | /* do not bother shrinking the array back to the old size, because we | ||
| 372 | * cannot handle allocation failures here */ | ||
| 373 | __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); | ||
| 374 | } | ||
| 375 | |||
| 376 | static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) | ||
| 377 | { | ||
| 378 | int i; | ||
| 379 | |||
| 380 | for (i = 0; i < nr_node_ids; i++) { | ||
| 381 | if (!memcg_aware) | ||
| 382 | lru->node[i].memcg_lrus = NULL; | ||
| 383 | else if (memcg_init_list_lru_node(&lru->node[i])) | ||
| 384 | goto fail; | ||
| 385 | } | ||
| 386 | return 0; | ||
| 387 | fail: | ||
| 388 | for (i = i - 1; i >= 0; i--) | ||
| 389 | memcg_destroy_list_lru_node(&lru->node[i]); | ||
| 390 | return -ENOMEM; | ||
| 391 | } | ||
| 392 | |||
| 393 | static void memcg_destroy_list_lru(struct list_lru *lru) | ||
| 394 | { | ||
| 395 | int i; | ||
| 396 | |||
| 397 | if (!list_lru_memcg_aware(lru)) | ||
| 398 | return; | ||
| 399 | |||
| 400 | for (i = 0; i < nr_node_ids; i++) | ||
| 401 | memcg_destroy_list_lru_node(&lru->node[i]); | ||
| 402 | } | ||
| 403 | |||
| 404 | static int memcg_update_list_lru(struct list_lru *lru, | ||
| 405 | int old_size, int new_size) | ||
| 406 | { | ||
| 407 | int i; | ||
| 408 | |||
| 409 | if (!list_lru_memcg_aware(lru)) | ||
| 410 | return 0; | ||
| 411 | |||
| 412 | for (i = 0; i < nr_node_ids; i++) { | ||
| 413 | if (memcg_update_list_lru_node(&lru->node[i], | ||
| 414 | old_size, new_size)) | ||
| 415 | goto fail; | ||
| 416 | } | ||
| 417 | return 0; | ||
| 418 | fail: | ||
| 419 | for (i = i - 1; i >= 0; i--) | ||
| 420 | memcg_cancel_update_list_lru_node(&lru->node[i], | ||
| 421 | old_size, new_size); | ||
| 422 | return -ENOMEM; | ||
| 423 | } | ||
| 424 | |||
| 425 | static void memcg_cancel_update_list_lru(struct list_lru *lru, | ||
| 426 | int old_size, int new_size) | ||
| 427 | { | ||
| 428 | int i; | ||
| 429 | |||
| 430 | if (!list_lru_memcg_aware(lru)) | ||
| 431 | return; | ||
| 432 | |||
| 433 | for (i = 0; i < nr_node_ids; i++) | ||
| 434 | memcg_cancel_update_list_lru_node(&lru->node[i], | ||
| 435 | old_size, new_size); | ||
| 436 | } | ||
| 437 | |||
| 438 | int memcg_update_all_list_lrus(int new_size) | ||
| 439 | { | ||
| 440 | int ret = 0; | ||
| 441 | struct list_lru *lru; | ||
| 442 | int old_size = memcg_nr_cache_ids; | ||
| 443 | |||
| 444 | mutex_lock(&list_lrus_mutex); | ||
| 445 | list_for_each_entry(lru, &list_lrus, list) { | ||
| 446 | ret = memcg_update_list_lru(lru, old_size, new_size); | ||
| 447 | if (ret) | ||
| 448 | goto fail; | ||
| 449 | } | ||
| 450 | out: | ||
| 451 | mutex_unlock(&list_lrus_mutex); | ||
| 452 | return ret; | ||
| 453 | fail: | ||
| 454 | list_for_each_entry_continue_reverse(lru, &list_lrus, list) | ||
| 455 | memcg_cancel_update_list_lru(lru, old_size, new_size); | ||
| 456 | goto out; | ||
| 457 | } | ||
| 458 | |||
| 459 | static void memcg_drain_list_lru_node(struct list_lru_node *nlru, | ||
| 460 | int src_idx, int dst_idx) | ||
| 461 | { | ||
| 462 | struct list_lru_one *src, *dst; | ||
| 463 | |||
| 464 | /* | ||
| 465 | * Since list_lru_{add,del} may be called under an IRQ-safe lock, | ||
| 466 | * we have to use IRQ-safe primitives here to avoid deadlock. | ||
| 467 | */ | ||
| 468 | spin_lock_irq(&nlru->lock); | ||
| 469 | |||
| 470 | src = list_lru_from_memcg_idx(nlru, src_idx); | ||
| 471 | dst = list_lru_from_memcg_idx(nlru, dst_idx); | ||
| 472 | |||
| 473 | list_splice_init(&src->list, &dst->list); | ||
| 474 | dst->nr_items += src->nr_items; | ||
| 475 | src->nr_items = 0; | ||
| 476 | |||
| 477 | spin_unlock_irq(&nlru->lock); | ||
| 478 | } | ||
| 479 | |||
| 480 | static void memcg_drain_list_lru(struct list_lru *lru, | ||
| 481 | int src_idx, int dst_idx) | ||
| 482 | { | ||
| 483 | int i; | ||
| 484 | |||
| 485 | if (!list_lru_memcg_aware(lru)) | ||
| 486 | return; | ||
| 487 | |||
| 488 | for (i = 0; i < nr_node_ids; i++) | ||
| 489 | memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); | ||
| 490 | } | ||
| 491 | |||
| 492 | void memcg_drain_all_list_lrus(int src_idx, int dst_idx) | ||
| 493 | { | ||
| 494 | struct list_lru *lru; | ||
| 495 | |||
| 496 | mutex_lock(&list_lrus_mutex); | ||
| 497 | list_for_each_entry(lru, &list_lrus, list) | ||
| 498 | memcg_drain_list_lru(lru, src_idx, dst_idx); | ||
| 499 | mutex_unlock(&list_lrus_mutex); | ||
| 500 | } | ||
| 501 | #else | ||
| 502 | static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) | ||
| 503 | { | ||
| 504 | return 0; | ||
| 505 | } | ||
| 506 | |||
| 507 | static void memcg_destroy_list_lru(struct list_lru *lru) | ||
| 508 | { | ||
| 509 | } | ||
| 510 | #endif /* CONFIG_MEMCG_KMEM */ | ||
| 511 | |||
| 512 | int __list_lru_init(struct list_lru *lru, bool memcg_aware, | ||
| 513 | struct lock_class_key *key) | ||
| 128 | { | 514 | { |
| 129 | int i; | 515 | int i; |
| 130 | size_t size = sizeof(*lru->node) * nr_node_ids; | 516 | size_t size = sizeof(*lru->node) * nr_node_ids; |
| 517 | int err = -ENOMEM; | ||
| 518 | |||
| 519 | memcg_get_cache_ids(); | ||
| 131 | 520 | ||
| 132 | lru->node = kzalloc(size, GFP_KERNEL); | 521 | lru->node = kzalloc(size, GFP_KERNEL); |
| 133 | if (!lru->node) | 522 | if (!lru->node) |
| 134 | return -ENOMEM; | 523 | goto out; |
| 135 | 524 | ||
| 136 | nodes_clear(lru->active_nodes); | ||
| 137 | for (i = 0; i < nr_node_ids; i++) { | 525 | for (i = 0; i < nr_node_ids; i++) { |
| 138 | spin_lock_init(&lru->node[i].lock); | 526 | spin_lock_init(&lru->node[i].lock); |
| 139 | if (key) | 527 | if (key) |
| 140 | lockdep_set_class(&lru->node[i].lock, key); | 528 | lockdep_set_class(&lru->node[i].lock, key); |
| 141 | INIT_LIST_HEAD(&lru->node[i].list); | 529 | init_one_lru(&lru->node[i].lru); |
| 142 | lru->node[i].nr_items = 0; | ||
| 143 | } | 530 | } |
| 144 | return 0; | 531 | |
| 532 | err = memcg_init_list_lru(lru, memcg_aware); | ||
| 533 | if (err) { | ||
| 534 | kfree(lru->node); | ||
| 535 | goto out; | ||
| 536 | } | ||
| 537 | |||
| 538 | list_lru_register(lru); | ||
| 539 | out: | ||
| 540 | memcg_put_cache_ids(); | ||
| 541 | return err; | ||
| 145 | } | 542 | } |
| 146 | EXPORT_SYMBOL_GPL(list_lru_init_key); | 543 | EXPORT_SYMBOL_GPL(__list_lru_init); |
| 147 | 544 | ||
| 148 | void list_lru_destroy(struct list_lru *lru) | 545 | void list_lru_destroy(struct list_lru *lru) |
| 149 | { | 546 | { |
| 547 | /* Already destroyed or not yet initialized? */ | ||
| 548 | if (!lru->node) | ||
| 549 | return; | ||
| 550 | |||
| 551 | memcg_get_cache_ids(); | ||
| 552 | |||
| 553 | list_lru_unregister(lru); | ||
| 554 | |||
| 555 | memcg_destroy_list_lru(lru); | ||
| 150 | kfree(lru->node); | 556 | kfree(lru->node); |
| 557 | lru->node = NULL; | ||
| 558 | |||
| 559 | memcg_put_cache_ids(); | ||
| 151 | } | 560 | } |
| 152 | EXPORT_SYMBOL_GPL(list_lru_destroy); | 561 | EXPORT_SYMBOL_GPL(list_lru_destroy); |
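
The list_lru.c hunks above make each list_lru optionally memcg aware: a node's single list becomes one struct list_lru_one per kmem-active memcg plus a global one, selected through list_lru_from_kmem()/list_lru_from_memcg_idx(), and walk callbacks now receive the list_lru_one so they can drop items with list_lru_isolate()/list_lru_isolate_move(). Below is a minimal sketch, not part of the patch, of a caller under the reworked API; the exact list_lru_walk_cb typedef and any list_lru_init_memcg()-style wrapper are assumed to come from include/linux/list_lru.h, which this diff does not show.

#include <linux/list_lru.h>
#include <linux/memcontrol.h>
#include <linux/spinlock.h>

static struct list_lru my_lru;

/* Walk callback: the per-memcg/per-node list is now passed in explicitly. */
static enum lru_status my_isolate(struct list_head *item,
				  struct list_lru_one *list,
				  spinlock_t *lru_lock, void *cb_arg)
{
	/* Unlink the item and decrement that list's nr_items. */
	list_lru_isolate(list, item);
	return LRU_REMOVED;
}

static int my_setup(void)
{
	/* true = memcg aware: allocates the per-memcg list_lru_one arrays. */
	return __list_lru_init(&my_lru, true, NULL);
}

static void my_shrink(int nid, struct mem_cgroup *memcg)
{
	unsigned long nr_to_walk = 128;

	/* Scans only the given memcg's list on this node. */
	list_lru_walk_one(&my_lru, nid, memcg, my_isolate, NULL, &nr_to_walk);
}

static void my_teardown(void)
{
	list_lru_destroy(&my_lru);
}
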
diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289..d551475517bf 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, | |||
| 155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); | 155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); |
| 156 | pte_unmap_unlock(orig_pte, ptl); | 156 | pte_unmap_unlock(orig_pte, ptl); |
| 157 | 157 | ||
| 158 | if (pte_present(pte) || pte_none(pte) || pte_file(pte)) | 158 | if (pte_present(pte) || pte_none(pte)) |
| 159 | continue; | 159 | continue; |
| 160 | entry = pte_to_swp_entry(pte); | 160 | entry = pte_to_swp_entry(pte); |
| 161 | if (unlikely(non_swap_entry(entry))) | 161 | if (unlikely(non_swap_entry(entry))) |
| @@ -222,21 +222,24 @@ static long madvise_willneed(struct vm_area_struct *vma, | |||
| 222 | struct file *file = vma->vm_file; | 222 | struct file *file = vma->vm_file; |
| 223 | 223 | ||
| 224 | #ifdef CONFIG_SWAP | 224 | #ifdef CONFIG_SWAP |
| 225 | if (!file || mapping_cap_swap_backed(file->f_mapping)) { | 225 | if (!file) { |
| 226 | *prev = vma; | 226 | *prev = vma; |
| 227 | if (!file) | 227 | force_swapin_readahead(vma, start, end); |
| 228 | force_swapin_readahead(vma, start, end); | ||
| 229 | else | ||
| 230 | force_shm_swapin_readahead(vma, start, end, | ||
| 231 | file->f_mapping); | ||
| 232 | return 0; | 228 | return 0; |
| 233 | } | 229 | } |
| 234 | #endif | ||
| 235 | 230 | ||
| 231 | if (shmem_mapping(file->f_mapping)) { | ||
| 232 | *prev = vma; | ||
| 233 | force_shm_swapin_readahead(vma, start, end, | ||
| 234 | file->f_mapping); | ||
| 235 | return 0; | ||
| 236 | } | ||
| 237 | #else | ||
| 236 | if (!file) | 238 | if (!file) |
| 237 | return -EBADF; | 239 | return -EBADF; |
| 240 | #endif | ||
| 238 | 241 | ||
| 239 | if (file->f_mapping->a_ops->get_xip_mem) { | 242 | if (IS_DAX(file_inode(file))) { |
| 240 | /* no bad return value, but ignore advice */ | 243 | /* no bad return value, but ignore advice */ |
| 241 | return 0; | 244 | return 0; |
| 242 | } | 245 | } |
| @@ -278,14 +281,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, | |||
| 278 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) | 281 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) |
| 279 | return -EINVAL; | 282 | return -EINVAL; |
| 280 | 283 | ||
| 281 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | 284 | zap_page_range(vma, start, end - start, NULL); |
| 282 | struct zap_details details = { | ||
| 283 | .nonlinear_vma = vma, | ||
| 284 | .last_index = ULONG_MAX, | ||
| 285 | }; | ||
| 286 | zap_page_range(vma, start, end - start, &details); | ||
| 287 | } else | ||
| 288 | zap_page_range(vma, start, end - start, NULL); | ||
| 289 | return 0; | 285 | return 0; |
| 290 | } | 286 | } |
| 291 | 287 | ||
| @@ -303,7 +299,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
| 303 | 299 | ||
| 304 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 300 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
| 305 | 301 | ||
| 306 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 302 | if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) |
| 307 | return -EINVAL; | 303 | return -EINVAL; |
| 308 | 304 | ||
| 309 | f = vma->vm_file; | 305 | f = vma->vm_file; |
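
In madvise.c the VM_NONLINEAR special cases are gone: MADV_DONTNEED now always takes the plain zap_page_range() path, MADV_WILLNEED splits swap readahead between anonymous VMAs and shmem mappings (the shmem_mapping() check), and the old get_xip_mem test becomes IS_DAX(). For reference, a small userspace sketch, not part of the patch, of the two advice calls these paths service:

#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	/* Hint: bring the range in soon (force_swapin_readahead() path). */
	madvise(buf, len, MADV_WILLNEED);

	/* Drop the range; in the kernel this is now a plain zap_page_range(). */
	madvise(buf, len, MADV_DONTNEED);

	munmap(buf, len);
	return 0;
}
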
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2f6893c2f01b..9fe07692eaad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys); | |||
| 72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
| 73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
| 74 | 74 | ||
| 75 | /* Whether the swap controller is active */ | ||
| 75 | #ifdef CONFIG_MEMCG_SWAP | 76 | #ifdef CONFIG_MEMCG_SWAP |
| 76 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | ||
| 77 | int do_swap_account __read_mostly; | 77 | int do_swap_account __read_mostly; |
| 78 | |||
| 79 | /* for remember boot option*/ | ||
| 80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | ||
| 81 | static int really_do_swap_account __initdata = 1; | ||
| 82 | #else | ||
| 83 | static int really_do_swap_account __initdata; | ||
| 84 | #endif | ||
| 85 | |||
| 86 | #else | 78 | #else |
| 87 | #define do_swap_account 0 | 79 | #define do_swap_account 0 |
| 88 | #endif | 80 | #endif |
| 89 | 81 | ||
| 90 | |||
| 91 | static const char * const mem_cgroup_stat_names[] = { | 82 | static const char * const mem_cgroup_stat_names[] = { |
| 92 | "cache", | 83 | "cache", |
| 93 | "rss", | 84 | "rss", |
| @@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = { | |||
| 97 | "swap", | 88 | "swap", |
| 98 | }; | 89 | }; |
| 99 | 90 | ||
| 100 | enum mem_cgroup_events_index { | ||
| 101 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | ||
| 102 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | ||
| 103 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ | ||
| 104 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ | ||
| 105 | MEM_CGROUP_EVENTS_NSTATS, | ||
| 106 | }; | ||
| 107 | |||
| 108 | static const char * const mem_cgroup_events_names[] = { | 91 | static const char * const mem_cgroup_events_names[] = { |
| 109 | "pgpgin", | 92 | "pgpgin", |
| 110 | "pgpgout", | 93 | "pgpgout", |
| @@ -138,7 +121,7 @@ enum mem_cgroup_events_target { | |||
| 138 | 121 | ||
| 139 | struct mem_cgroup_stat_cpu { | 122 | struct mem_cgroup_stat_cpu { |
| 140 | long count[MEM_CGROUP_STAT_NSTATS]; | 123 | long count[MEM_CGROUP_STAT_NSTATS]; |
| 141 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | 124 | unsigned long events[MEMCG_NR_EVENTS]; |
| 142 | unsigned long nr_page_events; | 125 | unsigned long nr_page_events; |
| 143 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 126 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
| 144 | }; | 127 | }; |
| @@ -284,6 +267,10 @@ struct mem_cgroup { | |||
| 284 | struct page_counter memsw; | 267 | struct page_counter memsw; |
| 285 | struct page_counter kmem; | 268 | struct page_counter kmem; |
| 286 | 269 | ||
| 270 | /* Normal memory consumption range */ | ||
| 271 | unsigned long low; | ||
| 272 | unsigned long high; | ||
| 273 | |||
| 287 | unsigned long soft_limit; | 274 | unsigned long soft_limit; |
| 288 | 275 | ||
| 289 | /* vmpressure notifications */ | 276 | /* vmpressure notifications */ |
| @@ -325,9 +312,11 @@ struct mem_cgroup { | |||
| 325 | /* | 312 | /* |
| 326 | * set > 0 if pages under this cgroup are moving to other cgroup. | 313 | * set > 0 if pages under this cgroup are moving to other cgroup. |
| 327 | */ | 314 | */ |
| 328 | atomic_t moving_account; | 315 | atomic_t moving_account; |
| 329 | /* taken only while moving_account > 0 */ | 316 | /* taken only while moving_account > 0 */ |
| 330 | spinlock_t move_lock; | 317 | spinlock_t move_lock; |
| 318 | struct task_struct *move_lock_task; | ||
| 319 | unsigned long move_lock_flags; | ||
| 331 | /* | 320 | /* |
| 332 | * percpu counter. | 321 | * percpu counter. |
| 333 | */ | 322 | */ |
| @@ -343,11 +332,10 @@ struct mem_cgroup { | |||
| 343 | struct cg_proto tcp_mem; | 332 | struct cg_proto tcp_mem; |
| 344 | #endif | 333 | #endif |
| 345 | #if defined(CONFIG_MEMCG_KMEM) | 334 | #if defined(CONFIG_MEMCG_KMEM) |
| 346 | /* analogous to slab_common's slab_caches list, but per-memcg; | 335 | /* Index in the kmem_cache->memcg_params.memcg_caches array */ |
| 347 | * protected by memcg_slab_mutex */ | ||
| 348 | struct list_head memcg_slab_caches; | ||
| 349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | ||
| 350 | int kmemcg_id; | 336 | int kmemcg_id; |
| 337 | bool kmem_acct_activated; | ||
| 338 | bool kmem_acct_active; | ||
| 351 | #endif | 339 | #endif |
| 352 | 340 | ||
| 353 | int last_scanned_node; | 341 | int last_scanned_node; |
| @@ -366,29 +354,26 @@ struct mem_cgroup { | |||
| 366 | }; | 354 | }; |
| 367 | 355 | ||
| 368 | #ifdef CONFIG_MEMCG_KMEM | 356 | #ifdef CONFIG_MEMCG_KMEM |
| 369 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | 357 | bool memcg_kmem_is_active(struct mem_cgroup *memcg) |
| 370 | { | 358 | { |
| 371 | return memcg->kmemcg_id >= 0; | 359 | return memcg->kmem_acct_active; |
| 372 | } | 360 | } |
| 373 | #endif | 361 | #endif |
| 374 | 362 | ||
| 375 | /* Stuffs for move charges at task migration. */ | 363 | /* Stuffs for move charges at task migration. */ |
| 376 | /* | 364 | /* |
| 377 | * Types of charges to be moved. "move_charge_at_immitgrate" and | 365 | * Types of charges to be moved. |
| 378 | * "immigrate_flags" are treated as a left-shifted bitmap of these types. | ||
| 379 | */ | 366 | */ |
| 380 | enum move_type { | 367 | #define MOVE_ANON 0x1U |
| 381 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 368 | #define MOVE_FILE 0x2U |
| 382 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | 369 | #define MOVE_MASK (MOVE_ANON | MOVE_FILE) |
| 383 | NR_MOVE_TYPE, | ||
| 384 | }; | ||
| 385 | 370 | ||
| 386 | /* "mc" and its members are protected by cgroup_mutex */ | 371 | /* "mc" and its members are protected by cgroup_mutex */ |
| 387 | static struct move_charge_struct { | 372 | static struct move_charge_struct { |
| 388 | spinlock_t lock; /* for from, to */ | 373 | spinlock_t lock; /* for from, to */ |
| 389 | struct mem_cgroup *from; | 374 | struct mem_cgroup *from; |
| 390 | struct mem_cgroup *to; | 375 | struct mem_cgroup *to; |
| 391 | unsigned long immigrate_flags; | 376 | unsigned long flags; |
| 392 | unsigned long precharge; | 377 | unsigned long precharge; |
| 393 | unsigned long moved_charge; | 378 | unsigned long moved_charge; |
| 394 | unsigned long moved_swap; | 379 | unsigned long moved_swap; |
| @@ -399,16 +384,6 @@ static struct move_charge_struct { | |||
| 399 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 384 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
| 400 | }; | 385 | }; |
| 401 | 386 | ||
| 402 | static bool move_anon(void) | ||
| 403 | { | ||
| 404 | return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); | ||
| 405 | } | ||
| 406 | |||
| 407 | static bool move_file(void) | ||
| 408 | { | ||
| 409 | return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); | ||
| 410 | } | ||
| 411 | |||
| 412 | /* | 387 | /* |
| 413 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 388 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
| 414 | * limit reclaim to prevent infinite loops, if they ever occur. | 389 | * limit reclaim to prevent infinite loops, if they ever occur. |
| @@ -544,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
| 544 | } | 519 | } |
| 545 | EXPORT_SYMBOL(tcp_proto_cgroup); | 520 | EXPORT_SYMBOL(tcp_proto_cgroup); |
| 546 | 521 | ||
| 547 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
| 548 | { | ||
| 549 | if (!memcg_proto_activated(&memcg->tcp_mem)) | ||
| 550 | return; | ||
| 551 | static_key_slow_dec(&memcg_socket_limit_enabled); | ||
| 552 | } | ||
| 553 | #else | ||
| 554 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
| 555 | { | ||
| 556 | } | ||
| 557 | #endif | 522 | #endif |
| 558 | 523 | ||
| 559 | #ifdef CONFIG_MEMCG_KMEM | 524 | #ifdef CONFIG_MEMCG_KMEM |
| 560 | /* | 525 | /* |
| 561 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. | 526 | * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. |
| 562 | * The main reason for not using cgroup id for this: | 527 | * The main reason for not using cgroup id for this: |
| 563 | * this works better in sparse environments, where we have a lot of memcgs, | 528 | * this works better in sparse environments, where we have a lot of memcgs, |
| 564 | * but only a few kmem-limited. Or also, if we have, for instance, 200 | 529 | * but only a few kmem-limited. Or also, if we have, for instance, 200 |
| 565 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a | 530 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a |
| 566 | * 200 entry array for that. | 531 | * 200 entry array for that. |
| 567 | * | 532 | * |
| 568 | * The current size of the caches array is stored in | 533 | * The current size of the caches array is stored in memcg_nr_cache_ids. It |
| 569 | * memcg_limited_groups_array_size. It will double each time we have to | 534 | * will double each time we have to increase it. |
| 570 | * increase it. | ||
| 571 | */ | 535 | */ |
| 572 | static DEFINE_IDA(kmem_limited_groups); | 536 | static DEFINE_IDA(memcg_cache_ida); |
| 573 | int memcg_limited_groups_array_size; | 537 | int memcg_nr_cache_ids; |
| 538 | |||
| 539 | /* Protects memcg_nr_cache_ids */ | ||
| 540 | static DECLARE_RWSEM(memcg_cache_ids_sem); | ||
| 541 | |||
| 542 | void memcg_get_cache_ids(void) | ||
| 543 | { | ||
| 544 | down_read(&memcg_cache_ids_sem); | ||
| 545 | } | ||
| 546 | |||
| 547 | void memcg_put_cache_ids(void) | ||
| 548 | { | ||
| 549 | up_read(&memcg_cache_ids_sem); | ||
| 550 | } | ||
| 574 | 551 | ||
| 575 | /* | 552 | /* |
| 576 | * MIN_SIZE is different than 1, because we would like to avoid going through | 553 | * MIN_SIZE is different than 1, because we would like to avoid going through |
| @@ -596,32 +573,8 @@ int memcg_limited_groups_array_size; | |||
| 596 | struct static_key memcg_kmem_enabled_key; | 573 | struct static_key memcg_kmem_enabled_key; |
| 597 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | 574 | EXPORT_SYMBOL(memcg_kmem_enabled_key); |
| 598 | 575 | ||
| 599 | static void memcg_free_cache_id(int id); | ||
| 600 | |||
| 601 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
| 602 | { | ||
| 603 | if (memcg_kmem_is_active(memcg)) { | ||
| 604 | static_key_slow_dec(&memcg_kmem_enabled_key); | ||
| 605 | memcg_free_cache_id(memcg->kmemcg_id); | ||
| 606 | } | ||
| 607 | /* | ||
| 608 | * This check can't live in kmem destruction function, | ||
| 609 | * since the charges will outlive the cgroup | ||
| 610 | */ | ||
| 611 | WARN_ON(page_counter_read(&memcg->kmem)); | ||
| 612 | } | ||
| 613 | #else | ||
| 614 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
| 615 | { | ||
| 616 | } | ||
| 617 | #endif /* CONFIG_MEMCG_KMEM */ | 576 | #endif /* CONFIG_MEMCG_KMEM */ |
| 618 | 577 | ||
| 619 | static void disarm_static_keys(struct mem_cgroup *memcg) | ||
| 620 | { | ||
| 621 | disarm_sock_keys(memcg); | ||
| 622 | disarm_kmem_keys(memcg); | ||
| 623 | } | ||
| 624 | |||
| 625 | static struct mem_cgroup_per_zone * | 578 | static struct mem_cgroup_per_zone * |
| 626 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) | 579 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) |
| 627 | { | 580 | { |
| @@ -1368,6 +1321,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
| 1368 | return inactive * inactive_ratio < active; | 1321 | return inactive * inactive_ratio < active; |
| 1369 | } | 1322 | } |
| 1370 | 1323 | ||
| 1324 | bool mem_cgroup_lruvec_online(struct lruvec *lruvec) | ||
| 1325 | { | ||
| 1326 | struct mem_cgroup_per_zone *mz; | ||
| 1327 | struct mem_cgroup *memcg; | ||
| 1328 | |||
| 1329 | if (mem_cgroup_disabled()) | ||
| 1330 | return true; | ||
| 1331 | |||
| 1332 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | ||
| 1333 | memcg = mz->memcg; | ||
| 1334 | |||
| 1335 | return !!(memcg->css.flags & CSS_ONLINE); | ||
| 1336 | } | ||
| 1337 | |||
| 1371 | #define mem_cgroup_from_counter(counter, member) \ | 1338 | #define mem_cgroup_from_counter(counter, member) \ |
| 1372 | container_of(counter, struct mem_cgroup, member) | 1339 | container_of(counter, struct mem_cgroup, member) |
| 1373 | 1340 | ||
| @@ -1560,7 +1527,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1560 | * quickly exit and free its memory. | 1527 | * quickly exit and free its memory. |
| 1561 | */ | 1528 | */ |
| 1562 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 1529 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
| 1563 | set_thread_flag(TIF_MEMDIE); | 1530 | mark_tsk_oom_victim(current); |
| 1564 | return; | 1531 | return; |
| 1565 | } | 1532 | } |
| 1566 | 1533 | ||
| @@ -1934,7 +1901,7 @@ bool mem_cgroup_oom_synchronize(bool handle) | |||
| 1934 | if (!memcg) | 1901 | if (!memcg) |
| 1935 | return false; | 1902 | return false; |
| 1936 | 1903 | ||
| 1937 | if (!handle) | 1904 | if (!handle || oom_killer_disabled) |
| 1938 | goto cleanup; | 1905 | goto cleanup; |
| 1939 | 1906 | ||
| 1940 | owait.memcg = memcg; | 1907 | owait.memcg = memcg; |
| @@ -1980,34 +1947,33 @@ cleanup: | |||
| 1980 | /** | 1947 | /** |
| 1981 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction | 1948 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction |
| 1982 | * @page: page that is going to change accounted state | 1949 | * @page: page that is going to change accounted state |
| 1983 | * @locked: &memcg->move_lock slowpath was taken | ||
| 1984 | * @flags: IRQ-state flags for &memcg->move_lock | ||
| 1985 | * | 1950 | * |
| 1986 | * This function must mark the beginning of an accounted page state | 1951 | * This function must mark the beginning of an accounted page state |
| 1987 | * change to prevent double accounting when the page is concurrently | 1952 | * change to prevent double accounting when the page is concurrently |
| 1988 | * being moved to another memcg: | 1953 | * being moved to another memcg: |
| 1989 | * | 1954 | * |
| 1990 | * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1955 | * memcg = mem_cgroup_begin_page_stat(page); |
| 1991 | * if (TestClearPageState(page)) | 1956 | * if (TestClearPageState(page)) |
| 1992 | * mem_cgroup_update_page_stat(memcg, state, -1); | 1957 | * mem_cgroup_update_page_stat(memcg, state, -1); |
| 1993 | * mem_cgroup_end_page_stat(memcg, locked, flags); | 1958 | * mem_cgroup_end_page_stat(memcg); |
| 1994 | * | ||
| 1995 | * The RCU lock is held throughout the transaction. The fast path can | ||
| 1996 | * get away without acquiring the memcg->move_lock (@locked is false) | ||
| 1997 | * because page moving starts with an RCU grace period. | ||
| 1998 | * | ||
| 1999 | * The RCU lock also protects the memcg from being freed when the page | ||
| 2000 | * state that is going to change is the only thing preventing the page | ||
| 2001 | * from being uncharged. E.g. end-writeback clearing PageWriteback(), | ||
| 2002 | * which allows migration to go ahead and uncharge the page before the | ||
| 2003 | * account transaction might be complete. | ||
| 2004 | */ | 1959 | */ |
| 2005 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, | 1960 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) |
| 2006 | bool *locked, | ||
| 2007 | unsigned long *flags) | ||
| 2008 | { | 1961 | { |
| 2009 | struct mem_cgroup *memcg; | 1962 | struct mem_cgroup *memcg; |
| 1963 | unsigned long flags; | ||
| 2010 | 1964 | ||
| 1965 | /* | ||
| 1966 | * The RCU lock is held throughout the transaction. The fast | ||
| 1967 | * path can get away without acquiring the memcg->move_lock | ||
| 1968 | * because page moving starts with an RCU grace period. | ||
| 1969 | * | ||
| 1970 | * The RCU lock also protects the memcg from being freed when | ||
| 1971 | * the page state that is going to change is the only thing | ||
| 1972 | * preventing the page from being uncharged. | ||
| 1973 | * E.g. end-writeback clearing PageWriteback(), which allows | ||
| 1974 | * migration to go ahead and uncharge the page before the | ||
| 1975 | * account transaction might be complete. | ||
| 1976 | */ | ||
| 2011 | rcu_read_lock(); | 1977 | rcu_read_lock(); |
| 2012 | 1978 | ||
| 2013 | if (mem_cgroup_disabled()) | 1979 | if (mem_cgroup_disabled()) |
| @@ -2017,16 +1983,22 @@ again: | |||
| 2017 | if (unlikely(!memcg)) | 1983 | if (unlikely(!memcg)) |
| 2018 | return NULL; | 1984 | return NULL; |
| 2019 | 1985 | ||
| 2020 | *locked = false; | ||
| 2021 | if (atomic_read(&memcg->moving_account) <= 0) | 1986 | if (atomic_read(&memcg->moving_account) <= 0) |
| 2022 | return memcg; | 1987 | return memcg; |
| 2023 | 1988 | ||
| 2024 | spin_lock_irqsave(&memcg->move_lock, *flags); | 1989 | spin_lock_irqsave(&memcg->move_lock, flags); |
| 2025 | if (memcg != page->mem_cgroup) { | 1990 | if (memcg != page->mem_cgroup) { |
| 2026 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 1991 | spin_unlock_irqrestore(&memcg->move_lock, flags); |
| 2027 | goto again; | 1992 | goto again; |
| 2028 | } | 1993 | } |
| 2029 | *locked = true; | 1994 | |
| 1995 | /* | ||
| 1996 | * When charge migration first begins, we can have locked and | ||
| 1997 | * unlocked page stat updates happening concurrently. Track | ||
| 1998 | * the task who has the lock for mem_cgroup_end_page_stat(). | ||
| 1999 | */ | ||
| 2000 | memcg->move_lock_task = current; | ||
| 2001 | memcg->move_lock_flags = flags; | ||
| 2030 | 2002 | ||
| 2031 | return memcg; | 2003 | return memcg; |
| 2032 | } | 2004 | } |
| @@ -2034,14 +2006,17 @@ again: | |||
| 2034 | /** | 2006 | /** |
| 2035 | * mem_cgroup_end_page_stat - finish a page state statistics transaction | 2007 | * mem_cgroup_end_page_stat - finish a page state statistics transaction |
| 2036 | * @memcg: the memcg that was accounted against | 2008 | * @memcg: the memcg that was accounted against |
| 2037 | * @locked: value received from mem_cgroup_begin_page_stat() | ||
| 2038 | * @flags: value received from mem_cgroup_begin_page_stat() | ||
| 2039 | */ | 2009 | */ |
| 2040 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, | 2010 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) |
| 2041 | unsigned long *flags) | ||
| 2042 | { | 2011 | { |
| 2043 | if (memcg && *locked) | 2012 | if (memcg && memcg->move_lock_task == current) { |
| 2044 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 2013 | unsigned long flags = memcg->move_lock_flags; |
| 2014 | |||
| 2015 | memcg->move_lock_task = NULL; | ||
| 2016 | memcg->move_lock_flags = 0; | ||
| 2017 | |||
| 2018 | spin_unlock_irqrestore(&memcg->move_lock, flags); | ||
| 2019 | } | ||
| 2045 | 2020 | ||
| 2046 | rcu_read_unlock(); | 2021 | rcu_read_unlock(); |
| 2047 | } | 2022 | } |
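
The page-stat transaction no longer hands a locked/flags pair back to the caller: mem_cgroup_begin_page_stat() records the lock owner in memcg->move_lock_task/move_lock_flags, and mem_cgroup_end_page_stat() drops move_lock only if current is that owner. A caller sketch modelled on the usage shown in the comment above; mem_cgroup_dec_page_stat() and MEM_CGROUP_STAT_FILE_MAPPED are assumed from include/linux/memcontrol.h and are not part of this hunk.

#include <linux/memcontrol.h>
#include <linux/mm.h>

static void example_account_unmap(struct page *page)
{
	struct mem_cgroup *memcg;

	/* RCU-protected; takes memcg->move_lock only while charges move. */
	memcg = mem_cgroup_begin_page_stat(page);
	if (atomic_add_negative(-1, &page->_mapcount))
		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
	/* Unlocks only if current is the recorded move_lock owner. */
	mem_cgroup_end_page_stat(memcg);
}
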
| @@ -2134,17 +2109,6 @@ static void drain_local_stock(struct work_struct *dummy) | |||
| 2134 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2109 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
| 2135 | } | 2110 | } |
| 2136 | 2111 | ||
| 2137 | static void __init memcg_stock_init(void) | ||
| 2138 | { | ||
| 2139 | int cpu; | ||
| 2140 | |||
| 2141 | for_each_possible_cpu(cpu) { | ||
| 2142 | struct memcg_stock_pcp *stock = | ||
| 2143 | &per_cpu(memcg_stock, cpu); | ||
| 2144 | INIT_WORK(&stock->work, drain_local_stock); | ||
| 2145 | } | ||
| 2146 | } | ||
| 2147 | |||
| 2148 | /* | 2112 | /* |
| 2149 | * Cache charges(val) to local per_cpu area. | 2113 | * Cache charges(val) to local per_cpu area. |
| 2150 | * This will be consumed by consume_stock() function, later. | 2114 | * This will be consumed by consume_stock() function, later. |
| @@ -2294,6 +2258,8 @@ retry: | |||
| 2294 | if (!(gfp_mask & __GFP_WAIT)) | 2258 | if (!(gfp_mask & __GFP_WAIT)) |
| 2295 | goto nomem; | 2259 | goto nomem; |
| 2296 | 2260 | ||
| 2261 | mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); | ||
| 2262 | |||
| 2297 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, | 2263 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, |
| 2298 | gfp_mask, may_swap); | 2264 | gfp_mask, may_swap); |
| 2299 | 2265 | ||
| @@ -2335,6 +2301,8 @@ retry: | |||
| 2335 | if (fatal_signal_pending(current)) | 2301 | if (fatal_signal_pending(current)) |
| 2336 | goto bypass; | 2302 | goto bypass; |
| 2337 | 2303 | ||
| 2304 | mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); | ||
| 2305 | |||
| 2338 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); | 2306 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); |
| 2339 | nomem: | 2307 | nomem: |
| 2340 | if (!(gfp_mask & __GFP_NOFAIL)) | 2308 | if (!(gfp_mask & __GFP_NOFAIL)) |
| @@ -2346,6 +2314,16 @@ done_restock: | |||
| 2346 | css_get_many(&memcg->css, batch); | 2314 | css_get_many(&memcg->css, batch); |
| 2347 | if (batch > nr_pages) | 2315 | if (batch > nr_pages) |
| 2348 | refill_stock(memcg, batch - nr_pages); | 2316 | refill_stock(memcg, batch - nr_pages); |
| 2317 | /* | ||
| 2318 | * If the hierarchy is above the normal consumption range, | ||
| 2319 | * make the charging task trim their excess contribution. | ||
| 2320 | */ | ||
| 2321 | do { | ||
| 2322 | if (page_counter_read(&memcg->memory) <= memcg->high) | ||
| 2323 | continue; | ||
| 2324 | mem_cgroup_events(memcg, MEMCG_HIGH, 1); | ||
| 2325 | try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); | ||
| 2326 | } while ((memcg = parent_mem_cgroup(memcg))); | ||
| 2349 | done: | 2327 | done: |
| 2350 | return ret; | 2328 | return ret; |
| 2351 | } | 2329 | } |
| @@ -2476,27 +2454,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
| 2476 | } | 2454 | } |
| 2477 | 2455 | ||
| 2478 | #ifdef CONFIG_MEMCG_KMEM | 2456 | #ifdef CONFIG_MEMCG_KMEM |
| 2479 | /* | 2457 | int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, |
| 2480 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | 2458 | unsigned long nr_pages) |
| 2481 | * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. | ||
| 2482 | */ | ||
| 2483 | static DEFINE_MUTEX(memcg_slab_mutex); | ||
| 2484 | |||
| 2485 | /* | ||
| 2486 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | ||
| 2487 | * in the memcg_cache_params struct. | ||
| 2488 | */ | ||
| 2489 | static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | ||
| 2490 | { | ||
| 2491 | struct kmem_cache *cachep; | ||
| 2492 | |||
| 2493 | VM_BUG_ON(p->is_root_cache); | ||
| 2494 | cachep = p->root_cache; | ||
| 2495 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); | ||
| 2496 | } | ||
| 2497 | |||
| 2498 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | ||
| 2499 | unsigned long nr_pages) | ||
| 2500 | { | 2459 | { |
| 2501 | struct page_counter *counter; | 2460 | struct page_counter *counter; |
| 2502 | int ret = 0; | 2461 | int ret = 0; |
| @@ -2533,8 +2492,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | |||
| 2533 | return ret; | 2492 | return ret; |
| 2534 | } | 2493 | } |
| 2535 | 2494 | ||
| 2536 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, | 2495 | void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) |
| 2537 | unsigned long nr_pages) | ||
| 2538 | { | 2496 | { |
| 2539 | page_counter_uncharge(&memcg->memory, nr_pages); | 2497 | page_counter_uncharge(&memcg->memory, nr_pages); |
| 2540 | if (do_swap_account) | 2498 | if (do_swap_account) |
| @@ -2560,18 +2518,19 @@ static int memcg_alloc_cache_id(void) | |||
| 2560 | int id, size; | 2518 | int id, size; |
| 2561 | int err; | 2519 | int err; |
| 2562 | 2520 | ||
| 2563 | id = ida_simple_get(&kmem_limited_groups, | 2521 | id = ida_simple_get(&memcg_cache_ida, |
| 2564 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | 2522 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); |
| 2565 | if (id < 0) | 2523 | if (id < 0) |
| 2566 | return id; | 2524 | return id; |
| 2567 | 2525 | ||
| 2568 | if (id < memcg_limited_groups_array_size) | 2526 | if (id < memcg_nr_cache_ids) |
| 2569 | return id; | 2527 | return id; |
| 2570 | 2528 | ||
| 2571 | /* | 2529 | /* |
| 2572 | * There's no space for the new id in memcg_caches arrays, | 2530 | * There's no space for the new id in memcg_caches arrays, |
| 2573 | * so we have to grow them. | 2531 | * so we have to grow them. |
| 2574 | */ | 2532 | */ |
| 2533 | down_write(&memcg_cache_ids_sem); | ||
| 2575 | 2534 | ||
| 2576 | size = 2 * (id + 1); | 2535 | size = 2 * (id + 1); |
| 2577 | if (size < MEMCG_CACHES_MIN_SIZE) | 2536 | if (size < MEMCG_CACHES_MIN_SIZE) |
| @@ -2579,12 +2538,16 @@ static int memcg_alloc_cache_id(void) | |||
| 2579 | else if (size > MEMCG_CACHES_MAX_SIZE) | 2538 | else if (size > MEMCG_CACHES_MAX_SIZE) |
| 2580 | size = MEMCG_CACHES_MAX_SIZE; | 2539 | size = MEMCG_CACHES_MAX_SIZE; |
| 2581 | 2540 | ||
| 2582 | mutex_lock(&memcg_slab_mutex); | ||
| 2583 | err = memcg_update_all_caches(size); | 2541 | err = memcg_update_all_caches(size); |
| 2584 | mutex_unlock(&memcg_slab_mutex); | 2542 | if (!err) |
| 2543 | err = memcg_update_all_list_lrus(size); | ||
| 2544 | if (!err) | ||
| 2545 | memcg_nr_cache_ids = size; | ||
| 2546 | |||
| 2547 | up_write(&memcg_cache_ids_sem); | ||
| 2585 | 2548 | ||
| 2586 | if (err) { | 2549 | if (err) { |
| 2587 | ida_simple_remove(&kmem_limited_groups, id); | 2550 | ida_simple_remove(&memcg_cache_ida, id); |
| 2588 | return err; | 2551 | return err; |
| 2589 | } | 2552 | } |
| 2590 | return id; | 2553 | return id; |
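
Growing the kmem cache-id space is now serialized by memcg_cache_ids_sem: memcg_alloc_cache_id() takes it for write, resizes every root cache and every memcg-aware list_lru, and only then publishes the larger memcg_nr_cache_ids. Readers that size per-memcg arrays bracket the allocation with memcg_get_cache_ids()/memcg_put_cache_ids(), as __list_lru_init() does in the list_lru.c diff; that caller also registers the lru before dropping the rwsem so a later resize can find and grow its arrays. A sketch of the read-side pattern, not part of the patch:

#include <linux/memcontrol.h>
#include <linux/slab.h>

/* Size a per-memcg pointer array against a stable memcg_nr_cache_ids. */
static void **example_alloc_per_memcg_array(void)
{
	void **arr;

	memcg_get_cache_ids();		/* down_read(&memcg_cache_ids_sem) */
	arr = kcalloc(memcg_nr_cache_ids, sizeof(void *), GFP_KERNEL);
	memcg_put_cache_ids();		/* up_read(&memcg_cache_ids_sem) */

	return arr;			/* NULL on allocation failure */
}
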
| @@ -2592,136 +2555,23 @@ static int memcg_alloc_cache_id(void) | |||
| 2592 | 2555 | ||
| 2593 | static void memcg_free_cache_id(int id) | 2556 | static void memcg_free_cache_id(int id) |
| 2594 | { | 2557 | { |
| 2595 | ida_simple_remove(&kmem_limited_groups, id); | 2558 | ida_simple_remove(&memcg_cache_ida, id); |
| 2596 | } | 2559 | } |
| 2597 | 2560 | ||
| 2598 | /* | 2561 | struct memcg_kmem_cache_create_work { |
| 2599 | * We should update the current array size iff all caches updates succeed. This | ||
| 2600 | * can only be done from the slab side. The slab mutex needs to be held when | ||
| 2601 | * calling this. | ||
| 2602 | */ | ||
| 2603 | void memcg_update_array_size(int num) | ||
| 2604 | { | ||
| 2605 | memcg_limited_groups_array_size = num; | ||
| 2606 | } | ||
| 2607 | |||
| 2608 | static void memcg_register_cache(struct mem_cgroup *memcg, | ||
| 2609 | struct kmem_cache *root_cache) | ||
| 2610 | { | ||
| 2611 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by | ||
| 2612 | memcg_slab_mutex */ | ||
| 2613 | struct kmem_cache *cachep; | ||
| 2614 | int id; | ||
| 2615 | |||
| 2616 | lockdep_assert_held(&memcg_slab_mutex); | ||
| 2617 | |||
| 2618 | id = memcg_cache_id(memcg); | ||
| 2619 | |||
| 2620 | /* | ||
| 2621 | * Since per-memcg caches are created asynchronously on first | ||
| 2622 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
| 2623 | * create the same cache, but only one of them may succeed. | ||
| 2624 | */ | ||
| 2625 | if (cache_from_memcg_idx(root_cache, id)) | ||
| 2626 | return; | ||
| 2627 | |||
| 2628 | cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); | ||
| 2629 | cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); | ||
| 2630 | /* | ||
| 2631 | * If we could not create a memcg cache, do not complain, because | ||
| 2632 | * that's not critical at all as we can always proceed with the root | ||
| 2633 | * cache. | ||
| 2634 | */ | ||
| 2635 | if (!cachep) | ||
| 2636 | return; | ||
| 2637 | |||
| 2638 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | ||
| 2639 | |||
| 2640 | /* | ||
| 2641 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
| 2642 | * barrier here to ensure nobody will see the kmem_cache partially | ||
| 2643 | * initialized. | ||
| 2644 | */ | ||
| 2645 | smp_wmb(); | ||
| 2646 | |||
| 2647 | BUG_ON(root_cache->memcg_params->memcg_caches[id]); | ||
| 2648 | root_cache->memcg_params->memcg_caches[id] = cachep; | ||
| 2649 | } | ||
| 2650 | |||
| 2651 | static void memcg_unregister_cache(struct kmem_cache *cachep) | ||
| 2652 | { | ||
| 2653 | struct kmem_cache *root_cache; | ||
| 2654 | struct mem_cgroup *memcg; | ||
| 2655 | int id; | ||
| 2656 | |||
| 2657 | lockdep_assert_held(&memcg_slab_mutex); | ||
| 2658 | |||
| 2659 | BUG_ON(is_root_cache(cachep)); | ||
| 2660 | |||
| 2661 | root_cache = cachep->memcg_params->root_cache; | ||
| 2662 | memcg = cachep->memcg_params->memcg; | ||
| 2663 | id = memcg_cache_id(memcg); | ||
| 2664 | |||
| 2665 | BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); | ||
| 2666 | root_cache->memcg_params->memcg_caches[id] = NULL; | ||
| 2667 | |||
| 2668 | list_del(&cachep->memcg_params->list); | ||
| 2669 | |||
| 2670 | kmem_cache_destroy(cachep); | ||
| 2671 | } | ||
| 2672 | |||
| 2673 | int __memcg_cleanup_cache_params(struct kmem_cache *s) | ||
| 2674 | { | ||
| 2675 | struct kmem_cache *c; | ||
| 2676 | int i, failed = 0; | ||
| 2677 | |||
| 2678 | mutex_lock(&memcg_slab_mutex); | ||
| 2679 | for_each_memcg_cache_index(i) { | ||
| 2680 | c = cache_from_memcg_idx(s, i); | ||
| 2681 | if (!c) | ||
| 2682 | continue; | ||
| 2683 | |||
| 2684 | memcg_unregister_cache(c); | ||
| 2685 | |||
| 2686 | if (cache_from_memcg_idx(s, i)) | ||
| 2687 | failed++; | ||
| 2688 | } | ||
| 2689 | mutex_unlock(&memcg_slab_mutex); | ||
| 2690 | return failed; | ||
| 2691 | } | ||
| 2692 | |||
| 2693 | static void memcg_unregister_all_caches(struct mem_cgroup *memcg) | ||
| 2694 | { | ||
| 2695 | struct kmem_cache *cachep; | ||
| 2696 | struct memcg_cache_params *params, *tmp; | ||
| 2697 | |||
| 2698 | if (!memcg_kmem_is_active(memcg)) | ||
| 2699 | return; | ||
| 2700 | |||
| 2701 | mutex_lock(&memcg_slab_mutex); | ||
| 2702 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { | ||
| 2703 | cachep = memcg_params_to_cache(params); | ||
| 2704 | memcg_unregister_cache(cachep); | ||
| 2705 | } | ||
| 2706 | mutex_unlock(&memcg_slab_mutex); | ||
| 2707 | } | ||
| 2708 | |||
| 2709 | struct memcg_register_cache_work { | ||
| 2710 | struct mem_cgroup *memcg; | 2562 | struct mem_cgroup *memcg; |
| 2711 | struct kmem_cache *cachep; | 2563 | struct kmem_cache *cachep; |
| 2712 | struct work_struct work; | 2564 | struct work_struct work; |
| 2713 | }; | 2565 | }; |
| 2714 | 2566 | ||
| 2715 | static void memcg_register_cache_func(struct work_struct *w) | 2567 | static void memcg_kmem_cache_create_func(struct work_struct *w) |
| 2716 | { | 2568 | { |
| 2717 | struct memcg_register_cache_work *cw = | 2569 | struct memcg_kmem_cache_create_work *cw = |
| 2718 | container_of(w, struct memcg_register_cache_work, work); | 2570 | container_of(w, struct memcg_kmem_cache_create_work, work); |
| 2719 | struct mem_cgroup *memcg = cw->memcg; | 2571 | struct mem_cgroup *memcg = cw->memcg; |
| 2720 | struct kmem_cache *cachep = cw->cachep; | 2572 | struct kmem_cache *cachep = cw->cachep; |
| 2721 | 2573 | ||
| 2722 | mutex_lock(&memcg_slab_mutex); | 2574 | memcg_create_kmem_cache(memcg, cachep); |
| 2723 | memcg_register_cache(memcg, cachep); | ||
| 2724 | mutex_unlock(&memcg_slab_mutex); | ||
| 2725 | 2575 | ||
| 2726 | css_put(&memcg->css); | 2576 | css_put(&memcg->css); |
| 2727 | kfree(cw); | 2577 | kfree(cw); |
| @@ -2730,10 +2580,10 @@ static void memcg_register_cache_func(struct work_struct *w) | |||
| 2730 | /* | 2580 | /* |
| 2731 | * Enqueue the creation of a per-memcg kmem_cache. | 2581 | * Enqueue the creation of a per-memcg kmem_cache. |
| 2732 | */ | 2582 | */ |
| 2733 | static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | 2583 | static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, |
| 2734 | struct kmem_cache *cachep) | 2584 | struct kmem_cache *cachep) |
| 2735 | { | 2585 | { |
| 2736 | struct memcg_register_cache_work *cw; | 2586 | struct memcg_kmem_cache_create_work *cw; |
| 2737 | 2587 | ||
| 2738 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); | 2588 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
| 2739 | if (!cw) | 2589 | if (!cw) |
| @@ -2743,18 +2593,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
| 2743 | 2593 | ||
| 2744 | cw->memcg = memcg; | 2594 | cw->memcg = memcg; |
| 2745 | cw->cachep = cachep; | 2595 | cw->cachep = cachep; |
| 2596 | INIT_WORK(&cw->work, memcg_kmem_cache_create_func); | ||
| 2746 | 2597 | ||
| 2747 | INIT_WORK(&cw->work, memcg_register_cache_func); | ||
| 2748 | schedule_work(&cw->work); | 2598 | schedule_work(&cw->work); |
| 2749 | } | 2599 | } |
| 2750 | 2600 | ||
| 2751 | static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | 2601 | static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, |
| 2752 | struct kmem_cache *cachep) | 2602 | struct kmem_cache *cachep) |
| 2753 | { | 2603 | { |
| 2754 | /* | 2604 | /* |
| 2755 | * We need to stop accounting when we kmalloc, because if the | 2605 | * We need to stop accounting when we kmalloc, because if the |
| 2756 | * corresponding kmalloc cache is not yet created, the first allocation | 2606 | * corresponding kmalloc cache is not yet created, the first allocation |
| 2757 | * in __memcg_schedule_register_cache will recurse. | 2607 | * in __memcg_schedule_kmem_cache_create will recurse. |
| 2758 | * | 2608 | * |
| 2759 | * However, it is better to enclose the whole function. Depending on | 2609 | * However, it is better to enclose the whole function. Depending on |
| 2760 | * the debugging options enabled, INIT_WORK(), for instance, can | 2610 | * the debugging options enabled, INIT_WORK(), for instance, can |
| @@ -2763,24 +2613,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
| 2763 | * the safest choice is to do it like this, wrapping the whole function. | 2613 | * the safest choice is to do it like this, wrapping the whole function. |
| 2764 | */ | 2614 | */ |
| 2765 | current->memcg_kmem_skip_account = 1; | 2615 | current->memcg_kmem_skip_account = 1; |
| 2766 | __memcg_schedule_register_cache(memcg, cachep); | 2616 | __memcg_schedule_kmem_cache_create(memcg, cachep); |
| 2767 | current->memcg_kmem_skip_account = 0; | 2617 | current->memcg_kmem_skip_account = 0; |
| 2768 | } | 2618 | } |
| 2769 | 2619 | ||
| 2770 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | ||
| 2771 | { | ||
| 2772 | unsigned int nr_pages = 1 << order; | ||
| 2773 | |||
| 2774 | return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); | ||
| 2775 | } | ||
| 2776 | |||
| 2777 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | ||
| 2778 | { | ||
| 2779 | unsigned int nr_pages = 1 << order; | ||
| 2780 | |||
| 2781 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); | ||
| 2782 | } | ||
| 2783 | |||
| 2784 | /* | 2620 | /* |
| 2785 | * Return the kmem_cache we're supposed to use for a slab allocation. | 2621 | * Return the kmem_cache we're supposed to use for a slab allocation. |
| 2786 | * We try to use the current memcg's version of the cache. | 2622 | * We try to use the current memcg's version of the cache. |
| @@ -2798,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
| 2798 | { | 2634 | { |
| 2799 | struct mem_cgroup *memcg; | 2635 | struct mem_cgroup *memcg; |
| 2800 | struct kmem_cache *memcg_cachep; | 2636 | struct kmem_cache *memcg_cachep; |
| 2637 | int kmemcg_id; | ||
| 2801 | 2638 | ||
| 2802 | VM_BUG_ON(!cachep->memcg_params); | 2639 | VM_BUG_ON(!is_root_cache(cachep)); |
| 2803 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | ||
| 2804 | 2640 | ||
| 2805 | if (current->memcg_kmem_skip_account) | 2641 | if (current->memcg_kmem_skip_account) |
| 2806 | return cachep; | 2642 | return cachep; |
| 2807 | 2643 | ||
| 2808 | memcg = get_mem_cgroup_from_mm(current->mm); | 2644 | memcg = get_mem_cgroup_from_mm(current->mm); |
| 2809 | if (!memcg_kmem_is_active(memcg)) | 2645 | kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); |
| 2646 | if (kmemcg_id < 0) | ||
| 2810 | goto out; | 2647 | goto out; |
| 2811 | 2648 | ||
| 2812 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); | 2649 | memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); |
| 2813 | if (likely(memcg_cachep)) | 2650 | if (likely(memcg_cachep)) |
| 2814 | return memcg_cachep; | 2651 | return memcg_cachep; |
| 2815 | 2652 | ||
| @@ -2825,7 +2662,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
| 2825 | * could happen with the slab_mutex held. So it's better to | 2662 | * could happen with the slab_mutex held. So it's better to |
| 2826 | * defer everything. | 2663 | * defer everything. |
| 2827 | */ | 2664 | */ |
| 2828 | memcg_schedule_register_cache(memcg, cachep); | 2665 | memcg_schedule_kmem_cache_create(memcg, cachep); |
| 2829 | out: | 2666 | out: |
| 2830 | css_put(&memcg->css); | 2667 | css_put(&memcg->css); |
| 2831 | return cachep; | 2668 | return cachep; |
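
The lookup above reduces to: take the current task's memcg, read its kmemcg_id, and index the root cache's per-memcg array with it. A negative id means kmem accounting was never activated, and an empty slot means the per-memcg clone has not been created yet, so the allocation falls back to the root cache while creation is scheduled asynchronously. A toy userspace model of that control flow (invented names, not kernel code):

    #include <stdio.h>
    #include <stddef.h>

    #define NR_MEMCG_IDS 4

    struct toy_cache {
        const char *name;
        struct toy_cache *per_memcg[NR_MEMCG_IDS]; /* clones, indexed by kmemcg_id */
    };

    struct toy_memcg {
        int kmemcg_id;                  /* < 0: kmem accounting inactive */
    };

    /* Mirrors the shape of the lookup above: fall back to the root cache
     * when accounting is off or the clone does not exist yet. */
    static struct toy_cache *pick_cache(struct toy_cache *root, struct toy_memcg *memcg)
    {
        int id = memcg->kmemcg_id;

        if (id < 0)
            return root;                /* kmem accounting not activated */
        if (root->per_memcg[id])
            return root->per_memcg[id];
        /* the real code would schedule asynchronous clone creation here */
        return root;
    }

    int main(void)
    {
        struct toy_cache clone = { .name = "dentry(memcg:2)" };
        struct toy_cache root = { .name = "dentry" };
        struct toy_memcg off = { .kmemcg_id = -1 }, on = { .kmemcg_id = 2 };

        root.per_memcg[2] = &clone;
        printf("%s\n", pick_cache(&root, &off)->name);  /* dentry */
        printf("%s\n", pick_cache(&root, &on)->name);   /* dentry(memcg:2) */
        return 0;
    }
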
| @@ -2834,7 +2671,7 @@ out: | |||
| 2834 | void __memcg_kmem_put_cache(struct kmem_cache *cachep) | 2671 | void __memcg_kmem_put_cache(struct kmem_cache *cachep) |
| 2835 | { | 2672 | { |
| 2836 | if (!is_root_cache(cachep)) | 2673 | if (!is_root_cache(cachep)) |
| 2837 | css_put(&cachep->memcg_params->memcg->css); | 2674 | css_put(&cachep->memcg_params.memcg->css); |
| 2838 | } | 2675 | } |
| 2839 | 2676 | ||
| 2840 | /* | 2677 | /* |
| @@ -2899,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
| 2899 | memcg_uncharge_kmem(memcg, 1 << order); | 2736 | memcg_uncharge_kmem(memcg, 1 << order); |
| 2900 | page->mem_cgroup = NULL; | 2737 | page->mem_cgroup = NULL; |
| 2901 | } | 2738 | } |
| 2739 | |||
| 2740 | struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) | ||
| 2741 | { | ||
| 2742 | struct mem_cgroup *memcg = NULL; | ||
| 2743 | struct kmem_cache *cachep; | ||
| 2744 | struct page *page; | ||
| 2745 | |||
| 2746 | page = virt_to_head_page(ptr); | ||
| 2747 | if (PageSlab(page)) { | ||
| 2748 | cachep = page->slab_cache; | ||
| 2749 | if (!is_root_cache(cachep)) | ||
| 2750 | memcg = cachep->memcg_params.memcg; | ||
| 2751 | } else | ||
| 2752 | /* page allocated by alloc_kmem_pages */ | ||
| 2753 | memcg = page->mem_cgroup; | ||
| 2754 | |||
| 2755 | return memcg; | ||
| 2756 | } | ||
| 2902 | #endif /* CONFIG_MEMCG_KMEM */ | 2757 | #endif /* CONFIG_MEMCG_KMEM */ |
| 2903 | 2758 | ||
| 2904 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2759 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| @@ -3433,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
| 3433 | int err = 0; | 3288 | int err = 0; |
| 3434 | int memcg_id; | 3289 | int memcg_id; |
| 3435 | 3290 | ||
| 3436 | if (memcg_kmem_is_active(memcg)) | 3291 | BUG_ON(memcg->kmemcg_id >= 0); |
| 3437 | return 0; | 3292 | BUG_ON(memcg->kmem_acct_activated); |
| 3293 | BUG_ON(memcg->kmem_acct_active); | ||
| 3438 | 3294 | ||
| 3439 | /* | 3295 | /* |
| 3440 | * For simplicity, we won't allow this to be disabled. It also can't | 3296 | * For simplicity, we won't allow this to be disabled. It also can't |
| @@ -3477,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
| 3477 | * patched. | 3333 | * patched. |
| 3478 | */ | 3334 | */ |
| 3479 | memcg->kmemcg_id = memcg_id; | 3335 | memcg->kmemcg_id = memcg_id; |
| 3336 | memcg->kmem_acct_activated = true; | ||
| 3337 | memcg->kmem_acct_active = true; | ||
| 3480 | out: | 3338 | out: |
| 3481 | return err; | 3339 | return err; |
| 3482 | } | 3340 | } |
| @@ -3533,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | |||
| 3533 | int ret; | 3391 | int ret; |
| 3534 | 3392 | ||
| 3535 | buf = strstrip(buf); | 3393 | buf = strstrip(buf); |
| 3536 | ret = page_counter_memparse(buf, &nr_pages); | 3394 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
| 3537 | if (ret) | 3395 | if (ret) |
| 3538 | return ret; | 3396 | return ret; |
| 3539 | 3397 | ||
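
page_counter_memparse() now takes the keyword that stands for "unlimited" as its second argument: the legacy v1 files shown here keep accepting "-1", while the unified-hierarchy files added further down accept "max"; anything else is parsed as a byte value with the usual K/M/G suffixes. A simplified, runnable sketch of that convention (illustration only, not the kernel helper itself):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <limits.h>

    #define PAGE_SIZE   4096UL
    #define COUNTER_MAX ULONG_MAX      /* stands in for PAGE_COUNTER_MAX */

    /* Parse "<number>[KMG]" or the given "unlimited" keyword into pages. */
    static int parse_limit(const char *buf, const char *max_word, unsigned long *pages)
    {
        char *end;
        unsigned long long bytes;

        if (strcmp(buf, max_word) == 0) {
            *pages = COUNTER_MAX;
            return 0;
        }
        bytes = strtoull(buf, &end, 10);
        switch (*end) {
        case 'G': bytes <<= 10;  /* fall through */
        case 'M': bytes <<= 10;  /* fall through */
        case 'K': bytes <<= 10;  end++; break;
        }
        if (*end != '\0')
            return -1;
        *pages = bytes / PAGE_SIZE;
        return 0;
    }

    int main(void)
    {
        unsigned long pages;

        if (parse_limit("512M", "max", &pages) == 0)
            printf("512M -> %lu pages\n", pages);
        if (parse_limit("max", "max", &pages) == 0)
            printf("max  -> unlimited (%lu)\n", pages);
        if (parse_limit("-1", "-1", &pages) == 0)
            printf("-1   -> unlimited (%lu)\n", pages);
        return 0;
    }
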
| @@ -3609,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | |||
| 3609 | { | 3467 | { |
| 3610 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3468 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 3611 | 3469 | ||
| 3612 | if (val >= (1 << NR_MOVE_TYPE)) | 3470 | if (val & ~MOVE_MASK) |
| 3613 | return -EINVAL; | 3471 | return -EINVAL; |
| 3614 | 3472 | ||
| 3615 | /* | 3473 | /* |
| @@ -3687,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
| 3687 | struct mem_cgroup *mi; | 3545 | struct mem_cgroup *mi; |
| 3688 | unsigned int i; | 3546 | unsigned int i; |
| 3689 | 3547 | ||
| 3548 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != | ||
| 3549 | MEM_CGROUP_STAT_NSTATS); | ||
| 3550 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != | ||
| 3551 | MEM_CGROUP_EVENTS_NSTATS); | ||
| 3690 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 3552 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
| 3691 | 3553 | ||
| 3692 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3554 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
| @@ -3901,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
| 3901 | unsigned long usage; | 3763 | unsigned long usage; |
| 3902 | int i, size, ret; | 3764 | int i, size, ret; |
| 3903 | 3765 | ||
| 3904 | ret = page_counter_memparse(args, &threshold); | 3766 | ret = page_counter_memparse(args, "-1", &threshold); |
| 3905 | if (ret) | 3767 | if (ret) |
| 3906 | return ret; | 3768 | return ret; |
| 3907 | 3769 | ||
| @@ -4152,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
| 4152 | return mem_cgroup_sockets_init(memcg, ss); | 4014 | return mem_cgroup_sockets_init(memcg, ss); |
| 4153 | } | 4015 | } |
| 4154 | 4016 | ||
| 4017 | static void memcg_deactivate_kmem(struct mem_cgroup *memcg) | ||
| 4018 | { | ||
| 4019 | struct cgroup_subsys_state *css; | ||
| 4020 | struct mem_cgroup *parent, *child; | ||
| 4021 | int kmemcg_id; | ||
| 4022 | |||
| 4023 | if (!memcg->kmem_acct_active) | ||
| 4024 | return; | ||
| 4025 | |||
| 4026 | /* | ||
| 4027 | * Clear the 'active' flag before clearing memcg_caches arrays entries. | ||
| 4028 | * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it | ||
| 4029 | * guarantees no cache will be created for this cgroup after we are | ||
| 4030 | * done (see memcg_create_kmem_cache()). | ||
| 4031 | */ | ||
| 4032 | memcg->kmem_acct_active = false; | ||
| 4033 | |||
| 4034 | memcg_deactivate_kmem_caches(memcg); | ||
| 4035 | |||
| 4036 | kmemcg_id = memcg->kmemcg_id; | ||
| 4037 | BUG_ON(kmemcg_id < 0); | ||
| 4038 | |||
| 4039 | parent = parent_mem_cgroup(memcg); | ||
| 4040 | if (!parent) | ||
| 4041 | parent = root_mem_cgroup; | ||
| 4042 | |||
| 4043 | /* | ||
| 4044 | * Change kmemcg_id of this cgroup and all its descendants to the | ||
| 4045 | * parent's id, and then move all entries from this cgroup's list_lrus | ||
| 4046 | * to ones of the parent. After we have finished, all list_lrus | ||
| 4047 | * corresponding to this cgroup are guaranteed to remain empty. The | ||
| 4048 | * ordering is imposed by list_lru_node->lock taken by | ||
| 4049 | * memcg_drain_all_list_lrus(). | ||
| 4050 | */ | ||
| 4051 | css_for_each_descendant_pre(css, &memcg->css) { | ||
| 4052 | child = mem_cgroup_from_css(css); | ||
| 4053 | BUG_ON(child->kmemcg_id != kmemcg_id); | ||
| 4054 | child->kmemcg_id = parent->kmemcg_id; | ||
| 4055 | if (!memcg->use_hierarchy) | ||
| 4056 | break; | ||
| 4057 | } | ||
| 4058 | memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); | ||
| 4059 | |||
| 4060 | memcg_free_cache_id(kmemcg_id); | ||
| 4061 | } | ||
| 4062 | |||
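
memcg_deactivate_kmem() above re-points the kmemcg_id of the group being taken offline, and of all its descendants, at the parent's id before the old id is released, so that objects sitting on per-memcg list_lru lists can be drained toward the parent. A toy userspace model of just that reparenting step (toy structs, not kernel types):

    #include <stdio.h>

    struct toy_memcg {
        const char *name;
        int kmemcg_id;
        struct toy_memcg *children[4];      /* NULL-terminated */
    };

    /* Mirror of the css_for_each_descendant_pre() loop above: give the
     * group and every descendant the parent's id. */
    static void reparent_ids(struct toy_memcg *memcg, int parent_id)
    {
        memcg->kmemcg_id = parent_id;
        for (int i = 0; memcg->children[i]; i++)
            reparent_ids(memcg->children[i], parent_id);
    }

    int main(void)
    {
        struct toy_memcg b    = { "a/b",  1, { NULL } };       /* shares a's id */
        struct toy_memcg a    = { "a",    1, { &b, NULL } };
        struct toy_memcg root = { "root", 0, { &a, NULL } };

        reparent_ids(&a, root.kmemcg_id);   /* "a" goes offline */
        printf("a -> %d, a/b -> %d\n", a.kmemcg_id, b.kmemcg_id);   /* 0, 0 */
        /* id 1 is now free for reuse; the list_lrus drain into the parent's */
        return 0;
    }
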
| 4155 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4063 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
| 4156 | { | 4064 | { |
| 4157 | memcg_unregister_all_caches(memcg); | 4065 | if (memcg->kmem_acct_activated) { |
| 4066 | memcg_destroy_kmem_caches(memcg); | ||
| 4067 | static_key_slow_dec(&memcg_kmem_enabled_key); | ||
| 4068 | WARN_ON(page_counter_read(&memcg->kmem)); | ||
| 4069 | } | ||
| 4158 | mem_cgroup_sockets_destroy(memcg); | 4070 | mem_cgroup_sockets_destroy(memcg); |
| 4159 | } | 4071 | } |
| 4160 | #else | 4072 | #else |
| @@ -4163,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
| 4163 | return 0; | 4075 | return 0; |
| 4164 | } | 4076 | } |
| 4165 | 4077 | ||
| 4078 | static void memcg_deactivate_kmem(struct mem_cgroup *memcg) | ||
| 4079 | { | ||
| 4080 | } | ||
| 4081 | |||
| 4166 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4082 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
| 4167 | { | 4083 | { |
| 4168 | } | 4084 | } |
| @@ -4391,7 +4307,7 @@ out_kfree: | |||
| 4391 | return ret; | 4307 | return ret; |
| 4392 | } | 4308 | } |
| 4393 | 4309 | ||
| 4394 | static struct cftype mem_cgroup_files[] = { | 4310 | static struct cftype mem_cgroup_legacy_files[] = { |
| 4395 | { | 4311 | { |
| 4396 | .name = "usage_in_bytes", | 4312 | .name = "usage_in_bytes", |
| 4397 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 4313 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
| @@ -4502,34 +4418,6 @@ static struct cftype mem_cgroup_files[] = { | |||
| 4502 | { }, /* terminate */ | 4418 | { }, /* terminate */ |
| 4503 | }; | 4419 | }; |
| 4504 | 4420 | ||
| 4505 | #ifdef CONFIG_MEMCG_SWAP | ||
| 4506 | static struct cftype memsw_cgroup_files[] = { | ||
| 4507 | { | ||
| 4508 | .name = "memsw.usage_in_bytes", | ||
| 4509 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
| 4510 | .read_u64 = mem_cgroup_read_u64, | ||
| 4511 | }, | ||
| 4512 | { | ||
| 4513 | .name = "memsw.max_usage_in_bytes", | ||
| 4514 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
| 4515 | .write = mem_cgroup_reset, | ||
| 4516 | .read_u64 = mem_cgroup_read_u64, | ||
| 4517 | }, | ||
| 4518 | { | ||
| 4519 | .name = "memsw.limit_in_bytes", | ||
| 4520 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
| 4521 | .write = mem_cgroup_write, | ||
| 4522 | .read_u64 = mem_cgroup_read_u64, | ||
| 4523 | }, | ||
| 4524 | { | ||
| 4525 | .name = "memsw.failcnt", | ||
| 4526 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
| 4527 | .write = mem_cgroup_reset, | ||
| 4528 | .read_u64 = mem_cgroup_read_u64, | ||
| 4529 | }, | ||
| 4530 | { }, /* terminate */ | ||
| 4531 | }; | ||
| 4532 | #endif | ||
| 4533 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 4421 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
| 4534 | { | 4422 | { |
| 4535 | struct mem_cgroup_per_node *pn; | 4423 | struct mem_cgroup_per_node *pn; |
| @@ -4609,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
| 4609 | free_mem_cgroup_per_zone_info(memcg, node); | 4497 | free_mem_cgroup_per_zone_info(memcg, node); |
| 4610 | 4498 | ||
| 4611 | free_percpu(memcg->stat); | 4499 | free_percpu(memcg->stat); |
| 4612 | |||
| 4613 | disarm_static_keys(memcg); | ||
| 4614 | kfree(memcg); | 4500 | kfree(memcg); |
| 4615 | } | 4501 | } |
| 4616 | 4502 | ||
| @@ -4625,29 +4511,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
| 4625 | } | 4511 | } |
| 4626 | EXPORT_SYMBOL(parent_mem_cgroup); | 4512 | EXPORT_SYMBOL(parent_mem_cgroup); |
| 4627 | 4513 | ||
| 4628 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
| 4629 | { | ||
| 4630 | struct mem_cgroup_tree_per_node *rtpn; | ||
| 4631 | struct mem_cgroup_tree_per_zone *rtpz; | ||
| 4632 | int tmp, node, zone; | ||
| 4633 | |||
| 4634 | for_each_node(node) { | ||
| 4635 | tmp = node; | ||
| 4636 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
| 4637 | tmp = -1; | ||
| 4638 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
| 4639 | BUG_ON(!rtpn); | ||
| 4640 | |||
| 4641 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
| 4642 | |||
| 4643 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 4644 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
| 4645 | rtpz->rb_root = RB_ROOT; | ||
| 4646 | spin_lock_init(&rtpz->lock); | ||
| 4647 | } | ||
| 4648 | } | ||
| 4649 | } | ||
| 4650 | |||
| 4651 | static struct cgroup_subsys_state * __ref | 4514 | static struct cgroup_subsys_state * __ref |
| 4652 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 4515 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
| 4653 | { | 4516 | { |
| @@ -4667,6 +4530,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 4667 | if (parent_css == NULL) { | 4530 | if (parent_css == NULL) { |
| 4668 | root_mem_cgroup = memcg; | 4531 | root_mem_cgroup = memcg; |
| 4669 | page_counter_init(&memcg->memory, NULL); | 4532 | page_counter_init(&memcg->memory, NULL); |
| 4533 | memcg->high = PAGE_COUNTER_MAX; | ||
| 4670 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4534 | memcg->soft_limit = PAGE_COUNTER_MAX; |
| 4671 | page_counter_init(&memcg->memsw, NULL); | 4535 | page_counter_init(&memcg->memsw, NULL); |
| 4672 | page_counter_init(&memcg->kmem, NULL); | 4536 | page_counter_init(&memcg->kmem, NULL); |
| @@ -4682,7 +4546,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 4682 | spin_lock_init(&memcg->event_list_lock); | 4546 | spin_lock_init(&memcg->event_list_lock); |
| 4683 | #ifdef CONFIG_MEMCG_KMEM | 4547 | #ifdef CONFIG_MEMCG_KMEM |
| 4684 | memcg->kmemcg_id = -1; | 4548 | memcg->kmemcg_id = -1; |
| 4685 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
| 4686 | #endif | 4549 | #endif |
| 4687 | 4550 | ||
| 4688 | return &memcg->css; | 4551 | return &memcg->css; |
| @@ -4713,6 +4576,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
| 4713 | 4576 | ||
| 4714 | if (parent->use_hierarchy) { | 4577 | if (parent->use_hierarchy) { |
| 4715 | page_counter_init(&memcg->memory, &parent->memory); | 4578 | page_counter_init(&memcg->memory, &parent->memory); |
| 4579 | memcg->high = PAGE_COUNTER_MAX; | ||
| 4716 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4580 | memcg->soft_limit = PAGE_COUNTER_MAX; |
| 4717 | page_counter_init(&memcg->memsw, &parent->memsw); | 4581 | page_counter_init(&memcg->memsw, &parent->memsw); |
| 4718 | page_counter_init(&memcg->kmem, &parent->kmem); | 4582 | page_counter_init(&memcg->kmem, &parent->kmem); |
| @@ -4723,6 +4587,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
| 4723 | */ | 4587 | */ |
| 4724 | } else { | 4588 | } else { |
| 4725 | page_counter_init(&memcg->memory, NULL); | 4589 | page_counter_init(&memcg->memory, NULL); |
| 4590 | memcg->high = PAGE_COUNTER_MAX; | ||
| 4726 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4591 | memcg->soft_limit = PAGE_COUNTER_MAX; |
| 4727 | page_counter_init(&memcg->memsw, NULL); | 4592 | page_counter_init(&memcg->memsw, NULL); |
| 4728 | page_counter_init(&memcg->kmem, NULL); | 4593 | page_counter_init(&memcg->kmem, NULL); |
| @@ -4768,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
| 4768 | spin_unlock(&memcg->event_list_lock); | 4633 | spin_unlock(&memcg->event_list_lock); |
| 4769 | 4634 | ||
| 4770 | vmpressure_cleanup(&memcg->vmpressure); | 4635 | vmpressure_cleanup(&memcg->vmpressure); |
| 4636 | |||
| 4637 | memcg_deactivate_kmem(memcg); | ||
| 4771 | } | 4638 | } |
| 4772 | 4639 | ||
| 4773 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) | 4640 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) |
| @@ -4798,6 +4665,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) | |||
| 4798 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); | 4665 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); |
| 4799 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); | 4666 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); |
| 4800 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); | 4667 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); |
| 4668 | memcg->low = 0; | ||
| 4669 | memcg->high = PAGE_COUNTER_MAX; | ||
| 4801 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4670 | memcg->soft_limit = PAGE_COUNTER_MAX; |
| 4802 | } | 4671 | } |
| 4803 | 4672 | ||
| @@ -4874,12 +4743,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
| 4874 | if (!page || !page_mapped(page)) | 4743 | if (!page || !page_mapped(page)) |
| 4875 | return NULL; | 4744 | return NULL; |
| 4876 | if (PageAnon(page)) { | 4745 | if (PageAnon(page)) { |
| 4877 | /* we don't move shared anon */ | 4746 | if (!(mc.flags & MOVE_ANON)) |
| 4878 | if (!move_anon()) | ||
| 4879 | return NULL; | 4747 | return NULL; |
| 4880 | } else if (!move_file()) | 4748 | } else { |
| 4881 | /* we ignore mapcount for file pages */ | 4749 | if (!(mc.flags & MOVE_FILE)) |
| 4882 | return NULL; | 4750 | return NULL; |
| 4751 | } | ||
| 4883 | if (!get_page_unless_zero(page)) | 4752 | if (!get_page_unless_zero(page)) |
| 4884 | return NULL; | 4753 | return NULL; |
| 4885 | 4754 | ||
| @@ -4893,7 +4762,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |||
| 4893 | struct page *page = NULL; | 4762 | struct page *page = NULL; |
| 4894 | swp_entry_t ent = pte_to_swp_entry(ptent); | 4763 | swp_entry_t ent = pte_to_swp_entry(ptent); |
| 4895 | 4764 | ||
| 4896 | if (!move_anon() || non_swap_entry(ent)) | 4765 | if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) |
| 4897 | return NULL; | 4766 | return NULL; |
| 4898 | /* | 4767 | /* |
| 4899 | * Because lookup_swap_cache() updates some statistics counter, | 4768 | * Because lookup_swap_cache() updates some statistics counter, |
| @@ -4922,14 +4791,11 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
| 4922 | 4791 | ||
| 4923 | if (!vma->vm_file) /* anonymous vma */ | 4792 | if (!vma->vm_file) /* anonymous vma */ |
| 4924 | return NULL; | 4793 | return NULL; |
| 4925 | if (!move_file()) | 4794 | if (!(mc.flags & MOVE_FILE)) |
| 4926 | return NULL; | 4795 | return NULL; |
| 4927 | 4796 | ||
| 4928 | mapping = vma->vm_file->f_mapping; | 4797 | mapping = vma->vm_file->f_mapping; |
| 4929 | if (pte_none(ptent)) | 4798 | pgoff = linear_page_index(vma, addr); |
| 4930 | pgoff = linear_page_index(vma, addr); | ||
| 4931 | else /* pte_file(ptent) is true */ | ||
| 4932 | pgoff = pte_to_pgoff(ptent); | ||
| 4933 | 4799 | ||
| 4934 | /* page is moved even if it's not RSS of this task(page-faulted). */ | 4800 | /* page is moved even if it's not RSS of this task(page-faulted). */ |
| 4935 | #ifdef CONFIG_SWAP | 4801 | #ifdef CONFIG_SWAP |
| @@ -4961,7 +4827,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
| 4961 | page = mc_handle_present_pte(vma, addr, ptent); | 4827 | page = mc_handle_present_pte(vma, addr, ptent); |
| 4962 | else if (is_swap_pte(ptent)) | 4828 | else if (is_swap_pte(ptent)) |
| 4963 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); | 4829 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
| 4964 | else if (pte_none(ptent) || pte_file(ptent)) | 4830 | else if (pte_none(ptent)) |
| 4965 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 4831 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
| 4966 | 4832 | ||
| 4967 | if (!page && !ent.val) | 4833 | if (!page && !ent.val) |
| @@ -5004,7 +4870,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |||
| 5004 | 4870 | ||
| 5005 | page = pmd_page(pmd); | 4871 | page = pmd_page(pmd); |
| 5006 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); | 4872 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); |
| 5007 | if (!move_anon()) | 4873 | if (!(mc.flags & MOVE_ANON)) |
| 5008 | return ret; | 4874 | return ret; |
| 5009 | if (page->mem_cgroup == mc.from) { | 4875 | if (page->mem_cgroup == mc.from) { |
| 5010 | ret = MC_TARGET_PAGE; | 4876 | ret = MC_TARGET_PAGE; |
| @@ -5027,7 +4893,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
| 5027 | unsigned long addr, unsigned long end, | 4893 | unsigned long addr, unsigned long end, |
| 5028 | struct mm_walk *walk) | 4894 | struct mm_walk *walk) |
| 5029 | { | 4895 | { |
| 5030 | struct vm_area_struct *vma = walk->private; | 4896 | struct vm_area_struct *vma = walk->vma; |
| 5031 | pte_t *pte; | 4897 | pte_t *pte; |
| 5032 | spinlock_t *ptl; | 4898 | spinlock_t *ptl; |
| 5033 | 4899 | ||
| @@ -5053,20 +4919,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
| 5053 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | 4919 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) |
| 5054 | { | 4920 | { |
| 5055 | unsigned long precharge; | 4921 | unsigned long precharge; |
| 5056 | struct vm_area_struct *vma; | ||
| 5057 | 4922 | ||
| 4923 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
| 4924 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
| 4925 | .mm = mm, | ||
| 4926 | }; | ||
| 5058 | down_read(&mm->mmap_sem); | 4927 | down_read(&mm->mmap_sem); |
| 5059 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4928 | walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); |
| 5060 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
| 5061 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
| 5062 | .mm = mm, | ||
| 5063 | .private = vma, | ||
| 5064 | }; | ||
| 5065 | if (is_vm_hugetlb_page(vma)) | ||
| 5066 | continue; | ||
| 5067 | walk_page_range(vma->vm_start, vma->vm_end, | ||
| 5068 | &mem_cgroup_count_precharge_walk); | ||
| 5069 | } | ||
| 5070 | up_read(&mm->mmap_sem); | 4929 | up_read(&mm->mmap_sem); |
| 5071 | 4930 | ||
| 5072 | precharge = mc.precharge; | 4931 | precharge = mc.precharge; |
| @@ -5146,15 +5005,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
| 5146 | struct task_struct *p = cgroup_taskset_first(tset); | 5005 | struct task_struct *p = cgroup_taskset_first(tset); |
| 5147 | int ret = 0; | 5006 | int ret = 0; |
| 5148 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5007 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 5149 | unsigned long move_charge_at_immigrate; | 5008 | unsigned long move_flags; |
| 5150 | 5009 | ||
| 5151 | /* | 5010 | /* |
| 5152 | * We are now committed to this value whatever it is. Changes in this | 5011 | * We are now committed to this value whatever it is. Changes in this |
| 5153 | * tunable will only affect upcoming migrations, not the current one. | 5012 | * tunable will only affect upcoming migrations, not the current one. |
| 5154 | * So we need to save it, and keep it going. | 5013 | * So we need to save it, and keep it going. |
| 5155 | */ | 5014 | */ |
| 5156 | move_charge_at_immigrate = memcg->move_charge_at_immigrate; | 5015 | move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); |
| 5157 | if (move_charge_at_immigrate) { | 5016 | if (move_flags) { |
| 5158 | struct mm_struct *mm; | 5017 | struct mm_struct *mm; |
| 5159 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 5018 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
| 5160 | 5019 | ||
| @@ -5174,7 +5033,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
| 5174 | spin_lock(&mc.lock); | 5033 | spin_lock(&mc.lock); |
| 5175 | mc.from = from; | 5034 | mc.from = from; |
| 5176 | mc.to = memcg; | 5035 | mc.to = memcg; |
| 5177 | mc.immigrate_flags = move_charge_at_immigrate; | 5036 | mc.flags = move_flags; |
| 5178 | spin_unlock(&mc.lock); | 5037 | spin_unlock(&mc.lock); |
| 5179 | /* We set mc.moving_task later */ | 5038 | /* We set mc.moving_task later */ |
| 5180 | 5039 | ||
| @@ -5199,7 +5058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
| 5199 | struct mm_walk *walk) | 5058 | struct mm_walk *walk) |
| 5200 | { | 5059 | { |
| 5201 | int ret = 0; | 5060 | int ret = 0; |
| 5202 | struct vm_area_struct *vma = walk->private; | 5061 | struct vm_area_struct *vma = walk->vma; |
| 5203 | pte_t *pte; | 5062 | pte_t *pte; |
| 5204 | spinlock_t *ptl; | 5063 | spinlock_t *ptl; |
| 5205 | enum mc_target_type target_type; | 5064 | enum mc_target_type target_type; |
| @@ -5295,7 +5154,10 @@ put: /* get_mctgt_type() gets the page */ | |||
| 5295 | 5154 | ||
| 5296 | static void mem_cgroup_move_charge(struct mm_struct *mm) | 5155 | static void mem_cgroup_move_charge(struct mm_struct *mm) |
| 5297 | { | 5156 | { |
| 5298 | struct vm_area_struct *vma; | 5157 | struct mm_walk mem_cgroup_move_charge_walk = { |
| 5158 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
| 5159 | .mm = mm, | ||
| 5160 | }; | ||
| 5299 | 5161 | ||
| 5300 | lru_add_drain_all(); | 5162 | lru_add_drain_all(); |
| 5301 | /* | 5163 | /* |
| @@ -5318,24 +5180,11 @@ retry: | |||
| 5318 | cond_resched(); | 5180 | cond_resched(); |
| 5319 | goto retry; | 5181 | goto retry; |
| 5320 | } | 5182 | } |
| 5321 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 5183 | /* |
| 5322 | int ret; | 5184 | * When we have consumed all precharges and failed in doing |
| 5323 | struct mm_walk mem_cgroup_move_charge_walk = { | 5185 | * additional charge, the page walk just aborts. |
| 5324 | .pmd_entry = mem_cgroup_move_charge_pte_range, | 5186 | */ |
| 5325 | .mm = mm, | 5187 | walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); |
| 5326 | .private = vma, | ||
| 5327 | }; | ||
| 5328 | if (is_vm_hugetlb_page(vma)) | ||
| 5329 | continue; | ||
| 5330 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
| 5331 | &mem_cgroup_move_charge_walk); | ||
| 5332 | if (ret) | ||
| 5333 | /* | ||
| 5334 | * means we have consumed all precharges and failed in | ||
| 5335 | * doing additional charge. Just abandon here. | ||
| 5336 | */ | ||
| 5337 | break; | ||
| 5338 | } | ||
| 5339 | up_read(&mm->mmap_sem); | 5188 | up_read(&mm->mmap_sem); |
| 5340 | atomic_dec(&mc.from->moving_account); | 5189 | atomic_dec(&mc.from->moving_account); |
| 5341 | } | 5190 | } |
| @@ -5386,118 +5235,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) | |||
| 5386 | mem_cgroup_from_css(root_css)->use_hierarchy = true; | 5235 | mem_cgroup_from_css(root_css)->use_hierarchy = true; |
| 5387 | } | 5236 | } |
| 5388 | 5237 | ||
| 5389 | struct cgroup_subsys memory_cgrp_subsys = { | 5238 | static u64 memory_current_read(struct cgroup_subsys_state *css, |
| 5390 | .css_alloc = mem_cgroup_css_alloc, | 5239 | struct cftype *cft) |
| 5391 | .css_online = mem_cgroup_css_online, | 5240 | { |
| 5392 | .css_offline = mem_cgroup_css_offline, | 5241 | return mem_cgroup_usage(mem_cgroup_from_css(css), false); |
| 5393 | .css_free = mem_cgroup_css_free, | 5242 | } |
| 5394 | .css_reset = mem_cgroup_css_reset, | ||
| 5395 | .can_attach = mem_cgroup_can_attach, | ||
| 5396 | .cancel_attach = mem_cgroup_cancel_attach, | ||
| 5397 | .attach = mem_cgroup_move_task, | ||
| 5398 | .bind = mem_cgroup_bind, | ||
| 5399 | .legacy_cftypes = mem_cgroup_files, | ||
| 5400 | .early_init = 0, | ||
| 5401 | }; | ||
| 5402 | 5243 | ||
| 5403 | #ifdef CONFIG_MEMCG_SWAP | 5244 | static int memory_low_show(struct seq_file *m, void *v) |
| 5404 | static int __init enable_swap_account(char *s) | ||
| 5405 | { | 5245 | { |
| 5406 | if (!strcmp(s, "1")) | 5246 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5407 | really_do_swap_account = 1; | 5247 | unsigned long low = ACCESS_ONCE(memcg->low); |
| 5408 | else if (!strcmp(s, "0")) | 5248 | |
| 5409 | really_do_swap_account = 0; | 5249 | if (low == PAGE_COUNTER_MAX) |
| 5410 | return 1; | 5250 | seq_puts(m, "max\n"); |
| 5251 | else | ||
| 5252 | seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); | ||
| 5253 | |||
| 5254 | return 0; | ||
| 5411 | } | 5255 | } |
| 5412 | __setup("swapaccount=", enable_swap_account); | ||
| 5413 | 5256 | ||
| 5414 | static void __init memsw_file_init(void) | 5257 | static ssize_t memory_low_write(struct kernfs_open_file *of, |
| 5258 | char *buf, size_t nbytes, loff_t off) | ||
| 5415 | { | 5259 | { |
| 5416 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | 5260 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
| 5417 | memsw_cgroup_files)); | 5261 | unsigned long low; |
| 5262 | int err; | ||
| 5263 | |||
| 5264 | buf = strstrip(buf); | ||
| 5265 | err = page_counter_memparse(buf, "max", &low); | ||
| 5266 | if (err) | ||
| 5267 | return err; | ||
| 5268 | |||
| 5269 | memcg->low = low; | ||
| 5270 | |||
| 5271 | return nbytes; | ||
| 5418 | } | 5272 | } |
| 5419 | 5273 | ||
| 5420 | static void __init enable_swap_cgroup(void) | 5274 | static int memory_high_show(struct seq_file *m, void *v) |
| 5421 | { | 5275 | { |
| 5422 | if (!mem_cgroup_disabled() && really_do_swap_account) { | 5276 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5423 | do_swap_account = 1; | 5277 | unsigned long high = ACCESS_ONCE(memcg->high); |
| 5424 | memsw_file_init(); | 5278 | |
| 5425 | } | 5279 | if (high == PAGE_COUNTER_MAX) |
| 5280 | seq_puts(m, "max\n"); | ||
| 5281 | else | ||
| 5282 | seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); | ||
| 5283 | |||
| 5284 | return 0; | ||
| 5426 | } | 5285 | } |
| 5427 | 5286 | ||
| 5428 | #else | 5287 | static ssize_t memory_high_write(struct kernfs_open_file *of, |
| 5429 | static void __init enable_swap_cgroup(void) | 5288 | char *buf, size_t nbytes, loff_t off) |
| 5430 | { | 5289 | { |
| 5290 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | ||
| 5291 | unsigned long high; | ||
| 5292 | int err; | ||
| 5293 | |||
| 5294 | buf = strstrip(buf); | ||
| 5295 | err = page_counter_memparse(buf, "max", &high); | ||
| 5296 | if (err) | ||
| 5297 | return err; | ||
| 5298 | |||
| 5299 | memcg->high = high; | ||
| 5300 | |||
| 5301 | return nbytes; | ||
| 5431 | } | 5302 | } |
| 5432 | #endif | ||
| 5433 | 5303 | ||
| 5434 | #ifdef CONFIG_MEMCG_SWAP | 5304 | static int memory_max_show(struct seq_file *m, void *v) |
| 5435 | /** | ||
| 5436 | * mem_cgroup_swapout - transfer a memsw charge to swap | ||
| 5437 | * @page: page whose memsw charge to transfer | ||
| 5438 | * @entry: swap entry to move the charge to | ||
| 5439 | * | ||
| 5440 | * Transfer the memsw charge of @page to @entry. | ||
| 5441 | */ | ||
| 5442 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | ||
| 5443 | { | 5305 | { |
| 5444 | struct mem_cgroup *memcg; | 5306 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5445 | unsigned short oldid; | 5307 | unsigned long max = ACCESS_ONCE(memcg->memory.limit); |
| 5446 | 5308 | ||
| 5447 | VM_BUG_ON_PAGE(PageLRU(page), page); | 5309 | if (max == PAGE_COUNTER_MAX) |
| 5448 | VM_BUG_ON_PAGE(page_count(page), page); | 5310 | seq_puts(m, "max\n"); |
| 5311 | else | ||
| 5312 | seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); | ||
| 5449 | 5313 | ||
| 5450 | if (!do_swap_account) | 5314 | return 0; |
| 5451 | return; | 5315 | } |
| 5452 | 5316 | ||
| 5453 | memcg = page->mem_cgroup; | 5317 | static ssize_t memory_max_write(struct kernfs_open_file *of, |
| 5318 | char *buf, size_t nbytes, loff_t off) | ||
| 5319 | { | ||
| 5320 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | ||
| 5321 | unsigned long max; | ||
| 5322 | int err; | ||
| 5454 | 5323 | ||
| 5455 | /* Readahead page, never charged */ | 5324 | buf = strstrip(buf); |
| 5456 | if (!memcg) | 5325 | err = page_counter_memparse(buf, "max", &max); |
| 5457 | return; | 5326 | if (err) |
| 5327 | return err; | ||
| 5458 | 5328 | ||
| 5459 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); | 5329 | err = mem_cgroup_resize_limit(memcg, max); |
| 5460 | VM_BUG_ON_PAGE(oldid, page); | 5330 | if (err) |
| 5461 | mem_cgroup_swap_statistics(memcg, true); | 5331 | return err; |
| 5462 | 5332 | ||
| 5463 | page->mem_cgroup = NULL; | 5333 | return nbytes; |
| 5334 | } | ||
| 5464 | 5335 | ||
| 5465 | if (!mem_cgroup_is_root(memcg)) | 5336 | static int memory_events_show(struct seq_file *m, void *v) |
| 5466 | page_counter_uncharge(&memcg->memory, 1); | 5337 | { |
| 5338 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | ||
| 5467 | 5339 | ||
| 5468 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | 5340 | seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); |
| 5469 | VM_BUG_ON(!irqs_disabled()); | 5341 | seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); |
| 5342 | seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); | ||
| 5343 | seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); | ||
| 5470 | 5344 | ||
| 5471 | mem_cgroup_charge_statistics(memcg, page, -1); | 5345 | return 0; |
| 5472 | memcg_check_events(memcg, page); | ||
| 5473 | } | 5346 | } |
| 5474 | 5347 | ||
| 5348 | static struct cftype memory_files[] = { | ||
| 5349 | { | ||
| 5350 | .name = "current", | ||
| 5351 | .read_u64 = memory_current_read, | ||
| 5352 | }, | ||
| 5353 | { | ||
| 5354 | .name = "low", | ||
| 5355 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 5356 | .seq_show = memory_low_show, | ||
| 5357 | .write = memory_low_write, | ||
| 5358 | }, | ||
| 5359 | { | ||
| 5360 | .name = "high", | ||
| 5361 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 5362 | .seq_show = memory_high_show, | ||
| 5363 | .write = memory_high_write, | ||
| 5364 | }, | ||
| 5365 | { | ||
| 5366 | .name = "max", | ||
| 5367 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 5368 | .seq_show = memory_max_show, | ||
| 5369 | .write = memory_max_write, | ||
| 5370 | }, | ||
| 5371 | { | ||
| 5372 | .name = "events", | ||
| 5373 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 5374 | .seq_show = memory_events_show, | ||
| 5375 | }, | ||
| 5376 | { } /* terminate */ | ||
| 5377 | }; | ||
| 5378 | |||
| 5379 | struct cgroup_subsys memory_cgrp_subsys = { | ||
| 5380 | .css_alloc = mem_cgroup_css_alloc, | ||
| 5381 | .css_online = mem_cgroup_css_online, | ||
| 5382 | .css_offline = mem_cgroup_css_offline, | ||
| 5383 | .css_free = mem_cgroup_css_free, | ||
| 5384 | .css_reset = mem_cgroup_css_reset, | ||
| 5385 | .can_attach = mem_cgroup_can_attach, | ||
| 5386 | .cancel_attach = mem_cgroup_cancel_attach, | ||
| 5387 | .attach = mem_cgroup_move_task, | ||
| 5388 | .bind = mem_cgroup_bind, | ||
| 5389 | .dfl_cftypes = memory_files, | ||
| 5390 | .legacy_cftypes = mem_cgroup_legacy_files, | ||
| 5391 | .early_init = 0, | ||
| 5392 | }; | ||
| 5393 | |||
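
The memory_files[] array and the new .dfl_cftypes hook make up the unified-hierarchy (cgroup v2) side of the controller: memory.current, memory.low, memory.high, memory.max and memory.events, which take byte values (with K/M/G suffixes) or the literal "max". A small userspace sketch of driving these knobs; the mount point and group name are examples, and a mounted v2 hierarchy plus sufficient privileges are assumed:

    #include <stdio.h>

    /* Example path only; adjust to your cgroup2 mount point and group. */
    static const char *grp = "/sys/fs/cgroup/test";

    static int write_knob(const char *file, const char *val)
    {
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "%s/%s", grp, file);
        f = fopen(path, "w");
        if (!f) {
            perror(path);
            return -1;
        }
        fputs(val, f);
        return fclose(f);
    }

    int main(void)
    {
        char path[256], line[128];
        FILE *f;

        write_knob("memory.high", "64M");   /* start reclaim pressure above 64 MiB */
        write_knob("memory.max", "max");    /* no hard limit */

        snprintf(path, sizeof(path), "%s/memory.events", grp);
        f = fopen(path, "r");
        if (!f) {
            perror(path);
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);            /* "low", "high", "max", "oom" counters */
        fclose(f);
        return 0;
    }
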
| 5475 | /** | 5394 | /** |
| 5476 | * mem_cgroup_uncharge_swap - uncharge a swap entry | 5395 | * mem_cgroup_events - count memory events against a cgroup |
| 5477 | * @entry: swap entry to uncharge | 5396 | * @memcg: the memory cgroup |
| 5397 | * @idx: the event index | ||
| 5398 | * @nr: the number of events to account for | ||
| 5399 | */ | ||
| 5400 | void mem_cgroup_events(struct mem_cgroup *memcg, | ||
| 5401 | enum mem_cgroup_events_index idx, | ||
| 5402 | unsigned int nr) | ||
| 5403 | { | ||
| 5404 | this_cpu_add(memcg->stat->events[idx], nr); | ||
| 5405 | } | ||
| 5406 | |||
| 5407 | /** | ||
| 5408 | * mem_cgroup_low - check if memory consumption is below the normal range | ||
| 5409 | * @root: the highest ancestor to consider | ||
| 5410 | * @memcg: the memory cgroup to check | ||
| 5478 | * | 5411 | * |
| 5479 | * Drop the memsw charge associated with @entry. | 5412 | * Returns %true if memory consumption of @memcg, and that of all |
| 5413 | * configurable ancestors up to @root, is below the normal range. | ||
| 5480 | */ | 5414 | */ |
| 5481 | void mem_cgroup_uncharge_swap(swp_entry_t entry) | 5415 | bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) |
| 5482 | { | 5416 | { |
| 5483 | struct mem_cgroup *memcg; | 5417 | if (mem_cgroup_disabled()) |
| 5484 | unsigned short id; | 5418 | return false; |
| 5485 | 5419 | ||
| 5486 | if (!do_swap_account) | 5420 | /* |
| 5487 | return; | 5421 | * The toplevel group doesn't have a configurable range, so |
| 5422 | * it's never low when looked at directly, and it is not | ||
| 5423 | * considered an ancestor when assessing the hierarchy. | ||
| 5424 | */ | ||
| 5488 | 5425 | ||
| 5489 | id = swap_cgroup_record(entry, 0); | 5426 | if (memcg == root_mem_cgroup) |
| 5490 | rcu_read_lock(); | 5427 | return false; |
| 5491 | memcg = mem_cgroup_lookup(id); | 5428 | |
| 5492 | if (memcg) { | 5429 | if (page_counter_read(&memcg->memory) >= memcg->low) |
| 5493 | if (!mem_cgroup_is_root(memcg)) | 5430 | return false; |
| 5494 | page_counter_uncharge(&memcg->memsw, 1); | 5431 | |
| 5495 | mem_cgroup_swap_statistics(memcg, false); | 5432 | while (memcg != root) { |
| 5496 | css_put(&memcg->css); | 5433 | memcg = parent_mem_cgroup(memcg); |
| 5434 | |||
| 5435 | if (memcg == root_mem_cgroup) | ||
| 5436 | break; | ||
| 5437 | |||
| 5438 | if (page_counter_read(&memcg->memory) >= memcg->low) | ||
| 5439 | return false; | ||
| 5497 | } | 5440 | } |
| 5498 | rcu_read_unlock(); | 5441 | return true; |
| 5499 | } | 5442 | } |
| 5500 | #endif | ||
| 5501 | 5443 | ||
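
mem_cgroup_low() only reports a group as protected when the group itself and every configurable ancestor between it and the reclaim root are under their memory.low; the top-level group has no configurable range and is never considered low. A compact userspace model of that walk (toy structs, illustration only):

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_memcg {
        const char *name;
        unsigned long usage;        /* pages currently charged */
        unsigned long low;          /* configured memory.low, in pages */
        struct toy_memcg *parent;   /* NULL for the top-level group */
    };

    /* True only if @memcg and all ancestors below @root are under their low. */
    static bool toy_mem_low(struct toy_memcg *root, struct toy_memcg *memcg)
    {
        if (!memcg->parent)         /* the top-level group is never "low" */
            return false;
        for (; memcg != root && memcg->parent; memcg = memcg->parent)
            if (memcg->usage >= memcg->low)
                return false;
        return true;
    }

    int main(void)
    {
        struct toy_memcg root = { "root", 1000, 0,   NULL };
        struct toy_memcg a    = { "a",     300, 500, &root };
        struct toy_memcg b    = { "a/b",   100, 200, &a };

        printf("a/b protected: %s\n", toy_mem_low(&root, &b) ? "yes" : "no"); /* yes */
        a.usage = 600;  /* parent exceeds its own low -> child loses protection */
        printf("a/b protected: %s\n", toy_mem_low(&root, &b) ? "yes" : "no"); /* no */
        return 0;
    }
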
| 5502 | /** | 5444 | /** |
| 5503 | * mem_cgroup_try_charge - try charging a page | 5445 | * mem_cgroup_try_charge - try charging a page |
| @@ -5831,10 +5773,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | |||
| 5831 | */ | 5773 | */ |
| 5832 | static int __init mem_cgroup_init(void) | 5774 | static int __init mem_cgroup_init(void) |
| 5833 | { | 5775 | { |
| 5776 | int cpu, node; | ||
| 5777 | |||
| 5834 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 5778 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
| 5835 | enable_swap_cgroup(); | 5779 | |
| 5836 | mem_cgroup_soft_limit_tree_init(); | 5780 | for_each_possible_cpu(cpu) |
| 5837 | memcg_stock_init(); | 5781 | INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, |
| 5782 | drain_local_stock); | ||
| 5783 | |||
| 5784 | for_each_node(node) { | ||
| 5785 | struct mem_cgroup_tree_per_node *rtpn; | ||
| 5786 | int zone; | ||
| 5787 | |||
| 5788 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, | ||
| 5789 | node_online(node) ? node : NUMA_NO_NODE); | ||
| 5790 | |||
| 5791 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 5792 | struct mem_cgroup_tree_per_zone *rtpz; | ||
| 5793 | |||
| 5794 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
| 5795 | rtpz->rb_root = RB_ROOT; | ||
| 5796 | spin_lock_init(&rtpz->lock); | ||
| 5797 | } | ||
| 5798 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
| 5799 | } | ||
| 5800 | |||
| 5838 | return 0; | 5801 | return 0; |
| 5839 | } | 5802 | } |
| 5840 | subsys_initcall(mem_cgroup_init); | 5803 | subsys_initcall(mem_cgroup_init); |
| 5804 | |||
| 5805 | #ifdef CONFIG_MEMCG_SWAP | ||
| 5806 | /** | ||
| 5807 | * mem_cgroup_swapout - transfer a memsw charge to swap | ||
| 5808 | * @page: page whose memsw charge to transfer | ||
| 5809 | * @entry: swap entry to move the charge to | ||
| 5810 | * | ||
| 5811 | * Transfer the memsw charge of @page to @entry. | ||
| 5812 | */ | ||
| 5813 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | ||
| 5814 | { | ||
| 5815 | struct mem_cgroup *memcg; | ||
| 5816 | unsigned short oldid; | ||
| 5817 | |||
| 5818 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
| 5819 | VM_BUG_ON_PAGE(page_count(page), page); | ||
| 5820 | |||
| 5821 | if (!do_swap_account) | ||
| 5822 | return; | ||
| 5823 | |||
| 5824 | memcg = page->mem_cgroup; | ||
| 5825 | |||
| 5826 | /* Readahead page, never charged */ | ||
| 5827 | if (!memcg) | ||
| 5828 | return; | ||
| 5829 | |||
| 5830 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); | ||
| 5831 | VM_BUG_ON_PAGE(oldid, page); | ||
| 5832 | mem_cgroup_swap_statistics(memcg, true); | ||
| 5833 | |||
| 5834 | page->mem_cgroup = NULL; | ||
| 5835 | |||
| 5836 | if (!mem_cgroup_is_root(memcg)) | ||
| 5837 | page_counter_uncharge(&memcg->memory, 1); | ||
| 5838 | |||
| 5839 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | ||
| 5840 | VM_BUG_ON(!irqs_disabled()); | ||
| 5841 | |||
| 5842 | mem_cgroup_charge_statistics(memcg, page, -1); | ||
| 5843 | memcg_check_events(memcg, page); | ||
| 5844 | } | ||
| 5845 | |||
| 5846 | /** | ||
| 5847 | * mem_cgroup_uncharge_swap - uncharge a swap entry | ||
| 5848 | * @entry: swap entry to uncharge | ||
| 5849 | * | ||
| 5850 | * Drop the memsw charge associated with @entry. | ||
| 5851 | */ | ||
| 5852 | void mem_cgroup_uncharge_swap(swp_entry_t entry) | ||
| 5853 | { | ||
| 5854 | struct mem_cgroup *memcg; | ||
| 5855 | unsigned short id; | ||
| 5856 | |||
| 5857 | if (!do_swap_account) | ||
| 5858 | return; | ||
| 5859 | |||
| 5860 | id = swap_cgroup_record(entry, 0); | ||
| 5861 | rcu_read_lock(); | ||
| 5862 | memcg = mem_cgroup_lookup(id); | ||
| 5863 | if (memcg) { | ||
| 5864 | if (!mem_cgroup_is_root(memcg)) | ||
| 5865 | page_counter_uncharge(&memcg->memsw, 1); | ||
| 5866 | mem_cgroup_swap_statistics(memcg, false); | ||
| 5867 | css_put(&memcg->css); | ||
| 5868 | } | ||
| 5869 | rcu_read_unlock(); | ||
| 5870 | } | ||
| 5871 | |||
| 5872 | /* for remembering the boot option */ | ||
| 5873 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | ||
| 5874 | static int really_do_swap_account __initdata = 1; | ||
| 5875 | #else | ||
| 5876 | static int really_do_swap_account __initdata; | ||
| 5877 | #endif | ||
| 5878 | |||
| 5879 | static int __init enable_swap_account(char *s) | ||
| 5880 | { | ||
| 5881 | if (!strcmp(s, "1")) | ||
| 5882 | really_do_swap_account = 1; | ||
| 5883 | else if (!strcmp(s, "0")) | ||
| 5884 | really_do_swap_account = 0; | ||
| 5885 | return 1; | ||
| 5886 | } | ||
| 5887 | __setup("swapaccount=", enable_swap_account); | ||
| 5888 | |||
| 5889 | static struct cftype memsw_cgroup_files[] = { | ||
| 5890 | { | ||
| 5891 | .name = "memsw.usage_in_bytes", | ||
| 5892 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
| 5893 | .read_u64 = mem_cgroup_read_u64, | ||
| 5894 | }, | ||
| 5895 | { | ||
| 5896 | .name = "memsw.max_usage_in_bytes", | ||
| 5897 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
| 5898 | .write = mem_cgroup_reset, | ||
| 5899 | .read_u64 = mem_cgroup_read_u64, | ||
| 5900 | }, | ||
| 5901 | { | ||
| 5902 | .name = "memsw.limit_in_bytes", | ||
| 5903 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
| 5904 | .write = mem_cgroup_write, | ||
| 5905 | .read_u64 = mem_cgroup_read_u64, | ||
| 5906 | }, | ||
| 5907 | { | ||
| 5908 | .name = "memsw.failcnt", | ||
| 5909 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
| 5910 | .write = mem_cgroup_reset, | ||
| 5911 | .read_u64 = mem_cgroup_read_u64, | ||
| 5912 | }, | ||
| 5913 | { }, /* terminate */ | ||
| 5914 | }; | ||
| 5915 | |||
| 5916 | static int __init mem_cgroup_swap_init(void) | ||
| 5917 | { | ||
| 5918 | if (!mem_cgroup_disabled() && really_do_swap_account) { | ||
| 5919 | do_swap_account = 1; | ||
| 5920 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | ||
| 5921 | memsw_cgroup_files)); | ||
| 5922 | } | ||
| 5923 | return 0; | ||
| 5924 | } | ||
| 5925 | subsys_initcall(mem_cgroup_swap_init); | ||
| 5926 | |||
| 5927 | #endif /* CONFIG_MEMCG_SWAP */ | ||
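
The legacy memsw.* files are now registered from mem_cgroup_swap_init() and only appear when swap accounting is active, i.e. CONFIG_MEMCG_SWAP_ENABLED or swapaccount=1 on the kernel command line (swapaccount=0 turns it off). A quick userspace probe for that state; the v1 mount point below is the usual one but is an assumption:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* Typical cgroup v1 memory controller mount point; adjust if different. */
        const char *probe = "/sys/fs/cgroup/memory/memory.memsw.usage_in_bytes";

        if (access(probe, R_OK) == 0)
            puts("memsw accounting is enabled (built-in default or swapaccount=1)");
        else
            puts("memsw accounting is disabled (boot with swapaccount=1 to enable)");
        return 0;
    }
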
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index feb803bf3443..d487f8dc6d39 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -242,15 +242,8 @@ void shake_page(struct page *p, int access) | |||
| 242 | * Only call shrink_node_slabs here (which would also shrink | 242 | * Only call shrink_node_slabs here (which would also shrink |
| 243 | * other caches) if access is not potentially fatal. | 243 | * other caches) if access is not potentially fatal. |
| 244 | */ | 244 | */ |
| 245 | if (access) { | 245 | if (access) |
| 246 | int nr; | 246 | drop_slab_node(page_to_nid(p)); |
| 247 | int nid = page_to_nid(p); | ||
| 248 | do { | ||
| 249 | nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); | ||
| 250 | if (page_count(p) == 1) | ||
| 251 | break; | ||
| 252 | } while (nr > 10); | ||
| 253 | } | ||
| 254 | } | 247 | } |
| 255 | EXPORT_SYMBOL_GPL(shake_page); | 248 | EXPORT_SYMBOL_GPL(shake_page); |
| 256 | 249 | ||
| @@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags) | |||
| 1654 | * setting PG_hwpoison. | 1647 | * setting PG_hwpoison. |
| 1655 | */ | 1648 | */ |
| 1656 | if (!is_free_buddy_page(page)) | 1649 | if (!is_free_buddy_page(page)) |
| 1657 | lru_add_drain_all(); | ||
| 1658 | if (!is_free_buddy_page(page)) | ||
| 1659 | drain_all_pages(page_zone(page)); | 1650 | drain_all_pages(page_zone(page)); |
| 1660 | SetPageHWPoison(page); | 1651 | SetPageHWPoison(page); |
| 1661 | if (!is_free_buddy_page(page)) | 1652 | if (!is_free_buddy_page(page)) |
diff --git a/mm/memory.c b/mm/memory.c index 2c3536cc6c63..8068893697bb 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
| 428 | pmd = pmd_offset(pud, start); | 428 | pmd = pmd_offset(pud, start); |
| 429 | pud_clear(pud); | 429 | pud_clear(pud); |
| 430 | pmd_free_tlb(tlb, pmd, start); | 430 | pmd_free_tlb(tlb, pmd, start); |
| 431 | mm_dec_nr_pmds(tlb->mm); | ||
| 431 | } | 432 | } |
| 432 | 433 | ||
| 433 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 434 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, |
| @@ -754,6 +755,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 754 | if (HAVE_PTE_SPECIAL) { | 755 | if (HAVE_PTE_SPECIAL) { |
| 755 | if (likely(!pte_special(pte))) | 756 | if (likely(!pte_special(pte))) |
| 756 | goto check_pfn; | 757 | goto check_pfn; |
| 758 | if (vma->vm_ops && vma->vm_ops->find_special_page) | ||
| 759 | return vma->vm_ops->find_special_page(vma, addr); | ||
| 757 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) | 760 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) |
| 758 | return NULL; | 761 | return NULL; |
| 759 | if (!is_zero_pfn(pfn)) | 762 | if (!is_zero_pfn(pfn)) |
| @@ -811,42 +814,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 811 | 814 | ||
| 812 | /* pte contains position in swap or file, so copy. */ | 815 | /* pte contains position in swap or file, so copy. */ |
| 813 | if (unlikely(!pte_present(pte))) { | 816 | if (unlikely(!pte_present(pte))) { |
| 814 | if (!pte_file(pte)) { | 817 | swp_entry_t entry = pte_to_swp_entry(pte); |
| 815 | swp_entry_t entry = pte_to_swp_entry(pte); | 818 | |
| 816 | 819 | if (likely(!non_swap_entry(entry))) { | |
| 817 | if (likely(!non_swap_entry(entry))) { | 820 | if (swap_duplicate(entry) < 0) |
| 818 | if (swap_duplicate(entry) < 0) | 821 | return entry.val; |
| 819 | return entry.val; | 822 | |
| 820 | 823 | /* make sure dst_mm is on swapoff's mmlist. */ | |
| 821 | /* make sure dst_mm is on swapoff's mmlist. */ | 824 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
| 822 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 825 | spin_lock(&mmlist_lock); |
| 823 | spin_lock(&mmlist_lock); | 826 | if (list_empty(&dst_mm->mmlist)) |
| 824 | if (list_empty(&dst_mm->mmlist)) | 827 | list_add(&dst_mm->mmlist, |
| 825 | list_add(&dst_mm->mmlist, | 828 | &src_mm->mmlist); |
| 826 | &src_mm->mmlist); | 829 | spin_unlock(&mmlist_lock); |
| 827 | spin_unlock(&mmlist_lock); | 830 | } |
| 828 | } | 831 | rss[MM_SWAPENTS]++; |
| 829 | rss[MM_SWAPENTS]++; | 832 | } else if (is_migration_entry(entry)) { |
| 830 | } else if (is_migration_entry(entry)) { | 833 | page = migration_entry_to_page(entry); |
| 831 | page = migration_entry_to_page(entry); | 834 | |
| 832 | 835 | if (PageAnon(page)) | |
| 833 | if (PageAnon(page)) | 836 | rss[MM_ANONPAGES]++; |
| 834 | rss[MM_ANONPAGES]++; | 837 | else |
| 835 | else | 838 | rss[MM_FILEPAGES]++; |
| 836 | rss[MM_FILEPAGES]++; | 839 | |
| 837 | 840 | if (is_write_migration_entry(entry) && | |
| 838 | if (is_write_migration_entry(entry) && | 841 | is_cow_mapping(vm_flags)) { |
| 839 | is_cow_mapping(vm_flags)) { | 842 | /* |
| 840 | /* | 843 | * COW mappings require pages in both |
| 841 | * COW mappings require pages in both | 844 | * parent and child to be set to read. |
| 842 | * parent and child to be set to read. | 845 | */ |
| 843 | */ | 846 | make_migration_entry_read(&entry); |
| 844 | make_migration_entry_read(&entry); | 847 | pte = swp_entry_to_pte(entry); |
| 845 | pte = swp_entry_to_pte(entry); | 848 | if (pte_swp_soft_dirty(*src_pte)) |
| 846 | if (pte_swp_soft_dirty(*src_pte)) | 849 | pte = pte_swp_mksoft_dirty(pte); |
| 847 | pte = pte_swp_mksoft_dirty(pte); | 850 | set_pte_at(src_mm, addr, src_pte, pte); |
| 848 | set_pte_at(src_mm, addr, src_pte, pte); | ||
| 849 | } | ||
| 850 | } | 851 | } |
| 851 | } | 852 | } |
| 852 | goto out_set_pte; | 853 | goto out_set_pte; |
| @@ -1020,11 +1021,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 1020 | * readonly mappings. The tradeoff is that copy_page_range is more | 1021 | * readonly mappings. The tradeoff is that copy_page_range is more |
| 1021 | * efficient than faulting. | 1022 | * efficient than faulting. |
| 1022 | */ | 1023 | */ |
| 1023 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | | 1024 | if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && |
| 1024 | VM_PFNMAP | VM_MIXEDMAP))) { | 1025 | !vma->anon_vma) |
| 1025 | if (!vma->anon_vma) | 1026 | return 0; |
| 1026 | return 0; | ||
| 1027 | } | ||
| 1028 | 1027 | ||
| 1029 | if (is_vm_hugetlb_page(vma)) | 1028 | if (is_vm_hugetlb_page(vma)) |
| 1030 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 1029 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
| @@ -1082,6 +1081,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
| 1082 | spinlock_t *ptl; | 1081 | spinlock_t *ptl; |
| 1083 | pte_t *start_pte; | 1082 | pte_t *start_pte; |
| 1084 | pte_t *pte; | 1083 | pte_t *pte; |
| 1084 | swp_entry_t entry; | ||
| 1085 | 1085 | ||
| 1086 | again: | 1086 | again: |
| 1087 | init_rss_vec(rss); | 1087 | init_rss_vec(rss); |
| @@ -1107,28 +1107,12 @@ again: | |||
| 1107 | if (details->check_mapping && | 1107 | if (details->check_mapping && |
| 1108 | details->check_mapping != page->mapping) | 1108 | details->check_mapping != page->mapping) |
| 1109 | continue; | 1109 | continue; |
| 1110 | /* | ||
| 1111 | * Each page->index must be checked when | ||
| 1112 | * invalidating or truncating nonlinear. | ||
| 1113 | */ | ||
| 1114 | if (details->nonlinear_vma && | ||
| 1115 | (page->index < details->first_index || | ||
| 1116 | page->index > details->last_index)) | ||
| 1117 | continue; | ||
| 1118 | } | 1110 | } |
| 1119 | ptent = ptep_get_and_clear_full(mm, addr, pte, | 1111 | ptent = ptep_get_and_clear_full(mm, addr, pte, |
| 1120 | tlb->fullmm); | 1112 | tlb->fullmm); |
| 1121 | tlb_remove_tlb_entry(tlb, pte, addr); | 1113 | tlb_remove_tlb_entry(tlb, pte, addr); |
| 1122 | if (unlikely(!page)) | 1114 | if (unlikely(!page)) |
| 1123 | continue; | 1115 | continue; |
| 1124 | if (unlikely(details) && details->nonlinear_vma | ||
| 1125 | && linear_page_index(details->nonlinear_vma, | ||
| 1126 | addr) != page->index) { | ||
| 1127 | pte_t ptfile = pgoff_to_pte(page->index); | ||
| 1128 | if (pte_soft_dirty(ptent)) | ||
| 1129 | ptfile = pte_file_mksoft_dirty(ptfile); | ||
| 1130 | set_pte_at(mm, addr, pte, ptfile); | ||
| 1131 | } | ||
| 1132 | if (PageAnon(page)) | 1116 | if (PageAnon(page)) |
| 1133 | rss[MM_ANONPAGES]--; | 1117 | rss[MM_ANONPAGES]--; |
| 1134 | else { | 1118 | else { |
| @@ -1151,33 +1135,25 @@ again: | |||
| 1151 | } | 1135 | } |
| 1152 | continue; | 1136 | continue; |
| 1153 | } | 1137 | } |
| 1154 | /* | 1138 | /* If details->check_mapping, we leave swap entries. */ |
| 1155 | * If details->check_mapping, we leave swap entries; | ||
| 1156 | * if details->nonlinear_vma, we leave file entries. | ||
| 1157 | */ | ||
| 1158 | if (unlikely(details)) | 1139 | if (unlikely(details)) |
| 1159 | continue; | 1140 | continue; |
| 1160 | if (pte_file(ptent)) { | ||
| 1161 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | ||
| 1162 | print_bad_pte(vma, addr, ptent, NULL); | ||
| 1163 | } else { | ||
| 1164 | swp_entry_t entry = pte_to_swp_entry(ptent); | ||
| 1165 | 1141 | ||
| 1166 | if (!non_swap_entry(entry)) | 1142 | entry = pte_to_swp_entry(ptent); |
| 1167 | rss[MM_SWAPENTS]--; | 1143 | if (!non_swap_entry(entry)) |
| 1168 | else if (is_migration_entry(entry)) { | 1144 | rss[MM_SWAPENTS]--; |
| 1169 | struct page *page; | 1145 | else if (is_migration_entry(entry)) { |
| 1146 | struct page *page; | ||
| 1170 | 1147 | ||
| 1171 | page = migration_entry_to_page(entry); | 1148 | page = migration_entry_to_page(entry); |
| 1172 | 1149 | ||
| 1173 | if (PageAnon(page)) | 1150 | if (PageAnon(page)) |
| 1174 | rss[MM_ANONPAGES]--; | 1151 | rss[MM_ANONPAGES]--; |
| 1175 | else | 1152 | else |
| 1176 | rss[MM_FILEPAGES]--; | 1153 | rss[MM_FILEPAGES]--; |
| 1177 | } | ||
| 1178 | if (unlikely(!free_swap_and_cache(entry))) | ||
| 1179 | print_bad_pte(vma, addr, ptent, NULL); | ||
| 1180 | } | 1154 | } |
| 1155 | if (unlikely(!free_swap_and_cache(entry))) | ||
| 1156 | print_bad_pte(vma, addr, ptent, NULL); | ||
| 1181 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 1157 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
| 1182 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1158 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 1183 | 1159 | ||
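
The zap_pte_range() and copy_one_pte() hunks above are part of retiring the nonlinear-mapping machinery (pte_file(), VM_NONLINEAR and the pgoff-in-pte encoding) that used to back remap_file_pages(); file pages at arbitrary offsets are now expressed with ordinary mappings. A runnable userspace sketch of that replacement pattern, placing two file pages out of order inside one reserved range:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long psz = sysconf(_SC_PAGESIZE);
        char tmpl[] = "/tmp/nonlinear-demo-XXXXXX";
        int fd = mkstemp(tmpl);
        char *base;

        if (fd < 0) { perror("mkstemp"); return 1; }
        unlink(tmpl);                       /* keep the fd, drop the name */
        if (ftruncate(fd, 2 * psz) ||
            pwrite(fd, "first", 6, 0) < 0 ||
            pwrite(fd, "second", 7, psz) < 0) { perror("setup"); return 1; }

        /* Reserve an address range, then place the file pages out of order. */
        base = mmap(NULL, 2 * psz, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED) { perror("mmap"); return 1; }
        if (mmap(base, psz, PROT_READ, MAP_SHARED | MAP_FIXED, fd, psz) == MAP_FAILED ||
            mmap(base + psz, psz, PROT_READ, MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED) {
            perror("mmap fixed"); return 1;
        }

        printf("%s %s\n", base, base + psz);    /* prints "second first" */
        return 0;
    }
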
| @@ -1277,7 +1253,7 @@ static void unmap_page_range(struct mmu_gather *tlb, | |||
| 1277 | pgd_t *pgd; | 1253 | pgd_t *pgd; |
| 1278 | unsigned long next; | 1254 | unsigned long next; |
| 1279 | 1255 | ||
| 1280 | if (details && !details->check_mapping && !details->nonlinear_vma) | 1256 | if (details && !details->check_mapping) |
| 1281 | details = NULL; | 1257 | details = NULL; |
| 1282 | 1258 | ||
| 1283 | BUG_ON(addr >= end); | 1259 | BUG_ON(addr >= end); |
| @@ -1371,7 +1347,7 @@ void unmap_vmas(struct mmu_gather *tlb, | |||
| 1371 | * @vma: vm_area_struct holding the applicable pages | 1347 | * @vma: vm_area_struct holding the applicable pages |
| 1372 | * @start: starting address of pages to zap | 1348 | * @start: starting address of pages to zap |
| 1373 | * @size: number of bytes to zap | 1349 | * @size: number of bytes to zap |
| 1374 | * @details: details of nonlinear truncation or shared cache invalidation | 1350 | * @details: details of shared cache invalidation |
| 1375 | * | 1351 | * |
| 1376 | * Caller must protect the VMA list | 1352 | * Caller must protect the VMA list |
| 1377 | */ | 1353 | */ |
| @@ -1397,7 +1373,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, | |||
| 1397 | * @vma: vm_area_struct holding the applicable pages | 1373 | * @vma: vm_area_struct holding the applicable pages |
| 1398 | * @address: starting address of pages to zap | 1374 | * @address: starting address of pages to zap |
| 1399 | * @size: number of bytes to zap | 1375 | * @size: number of bytes to zap |
| 1400 | * @details: details of nonlinear truncation or shared cache invalidation | 1376 | * @details: details of shared cache invalidation |
| 1401 | * | 1377 | * |
| 1402 | * The range must fit into one VMA. | 1378 | * The range must fit into one VMA. |
| 1403 | */ | 1379 | */ |
| @@ -1922,12 +1898,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
| 1922 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1898 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
| 1923 | 1899 | ||
| 1924 | /* | 1900 | /* |
| 1925 | * handle_pte_fault chooses page fault handler according to an entry | 1901 | * handle_pte_fault chooses page fault handler according to an entry which was |
| 1926 | * which was read non-atomically. Before making any commitment, on | 1902 | * read non-atomically. Before making any commitment, on those architectures |
| 1927 | * those architectures or configurations (e.g. i386 with PAE) which | 1903 | * or configurations (e.g. i386 with PAE) which might give a mix of unmatched |
| 1928 | * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault | 1904 | * parts, do_swap_page must check under lock before unmapping the pte and |
| 1929 | * must check under lock before unmapping the pte and proceeding | 1905 | * proceeding (but do_wp_page is only called after already making such a check; |
| 1930 | * (but do_wp_page is only called after already making such a check; | ||
| 1931 | * and do_anonymous_page can safely check later on). | 1906 | * and do_anonymous_page can safely check later on). |
| 1932 | */ | 1907 | */ |
| 1933 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | 1908 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, |
| @@ -1990,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
| 1990 | vmf.pgoff = page->index; | 1965 | vmf.pgoff = page->index; |
| 1991 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; | 1966 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; |
| 1992 | vmf.page = page; | 1967 | vmf.page = page; |
| 1968 | vmf.cow_page = NULL; | ||
| 1993 | 1969 | ||
| 1994 | ret = vma->vm_ops->page_mkwrite(vma, &vmf); | 1970 | ret = vma->vm_ops->page_mkwrite(vma, &vmf); |
| 1995 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 1971 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
| @@ -2033,7 +2009,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2033 | pte_t entry; | 2009 | pte_t entry; |
| 2034 | int ret = 0; | 2010 | int ret = 0; |
| 2035 | int page_mkwrite = 0; | 2011 | int page_mkwrite = 0; |
| 2036 | struct page *dirty_page = NULL; | 2012 | bool dirty_shared = false; |
| 2037 | unsigned long mmun_start = 0; /* For mmu_notifiers */ | 2013 | unsigned long mmun_start = 0; /* For mmu_notifiers */ |
| 2038 | unsigned long mmun_end = 0; /* For mmu_notifiers */ | 2014 | unsigned long mmun_end = 0; /* For mmu_notifiers */ |
| 2039 | struct mem_cgroup *memcg; | 2015 | struct mem_cgroup *memcg; |
| @@ -2084,6 +2060,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2084 | unlock_page(old_page); | 2060 | unlock_page(old_page); |
| 2085 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2061 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
| 2086 | (VM_WRITE|VM_SHARED))) { | 2062 | (VM_WRITE|VM_SHARED))) { |
| 2063 | page_cache_get(old_page); | ||
| 2087 | /* | 2064 | /* |
| 2088 | * Only catch write-faults on shared writable pages, | 2065 | * Only catch write-faults on shared writable pages, |
| 2089 | * read-only shared pages can get COWed by | 2066 | * read-only shared pages can get COWed by |
| @@ -2091,7 +2068,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2091 | */ | 2068 | */ |
| 2092 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2069 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
| 2093 | int tmp; | 2070 | int tmp; |
| 2094 | page_cache_get(old_page); | 2071 | |
| 2095 | pte_unmap_unlock(page_table, ptl); | 2072 | pte_unmap_unlock(page_table, ptl); |
| 2096 | tmp = do_page_mkwrite(vma, old_page, address); | 2073 | tmp = do_page_mkwrite(vma, old_page, address); |
| 2097 | if (unlikely(!tmp || (tmp & | 2074 | if (unlikely(!tmp || (tmp & |
| @@ -2111,11 +2088,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2111 | unlock_page(old_page); | 2088 | unlock_page(old_page); |
| 2112 | goto unlock; | 2089 | goto unlock; |
| 2113 | } | 2090 | } |
| 2114 | |||
| 2115 | page_mkwrite = 1; | 2091 | page_mkwrite = 1; |
| 2116 | } | 2092 | } |
| 2117 | dirty_page = old_page; | 2093 | |
| 2118 | get_page(dirty_page); | 2094 | dirty_shared = true; |
| 2119 | 2095 | ||
| 2120 | reuse: | 2096 | reuse: |
| 2121 | /* | 2097 | /* |
| @@ -2134,20 +2110,20 @@ reuse: | |||
| 2134 | pte_unmap_unlock(page_table, ptl); | 2110 | pte_unmap_unlock(page_table, ptl); |
| 2135 | ret |= VM_FAULT_WRITE; | 2111 | ret |= VM_FAULT_WRITE; |
| 2136 | 2112 | ||
| 2137 | if (!dirty_page) | 2113 | if (dirty_shared) { |
| 2138 | return ret; | ||
| 2139 | |||
| 2140 | if (!page_mkwrite) { | ||
| 2141 | struct address_space *mapping; | 2114 | struct address_space *mapping; |
| 2142 | int dirtied; | 2115 | int dirtied; |
| 2143 | 2116 | ||
| 2144 | lock_page(dirty_page); | 2117 | if (!page_mkwrite) |
| 2145 | dirtied = set_page_dirty(dirty_page); | 2118 | lock_page(old_page); |
| 2146 | VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page); | 2119 | |
| 2147 | mapping = dirty_page->mapping; | 2120 | dirtied = set_page_dirty(old_page); |
| 2148 | unlock_page(dirty_page); | 2121 | VM_BUG_ON_PAGE(PageAnon(old_page), old_page); |
| 2122 | mapping = old_page->mapping; | ||
| 2123 | unlock_page(old_page); | ||
| 2124 | page_cache_release(old_page); | ||
| 2149 | 2125 | ||
| 2150 | if (dirtied && mapping) { | 2126 | if ((dirtied || page_mkwrite) && mapping) { |
| 2151 | /* | 2127 | /* |
| 2152 | * Some device drivers do not set page.mapping | 2128 | * Some device drivers do not set page.mapping |
| 2153 | * but still dirty their pages | 2129 | * but still dirty their pages |
| @@ -2155,25 +2131,9 @@ reuse: | |||
| 2155 | balance_dirty_pages_ratelimited(mapping); | 2131 | balance_dirty_pages_ratelimited(mapping); |
| 2156 | } | 2132 | } |
| 2157 | 2133 | ||
| 2158 | /* file_update_time outside page_lock */ | 2134 | if (!page_mkwrite) |
| 2159 | if (vma->vm_file) | ||
| 2160 | file_update_time(vma->vm_file); | 2135 | file_update_time(vma->vm_file); |
| 2161 | } | 2136 | } |
| 2162 | put_page(dirty_page); | ||
| 2163 | if (page_mkwrite) { | ||
| 2164 | struct address_space *mapping = dirty_page->mapping; | ||
| 2165 | |||
| 2166 | set_page_dirty(dirty_page); | ||
| 2167 | unlock_page(dirty_page); | ||
| 2168 | page_cache_release(dirty_page); | ||
| 2169 | if (mapping) { | ||
| 2170 | /* | ||
| 2171 | * Some device drivers do not set page.mapping | ||
| 2172 | * but still dirty their pages | ||
| 2173 | */ | ||
| 2174 | balance_dirty_pages_ratelimited(mapping); | ||
| 2175 | } | ||
| 2176 | } | ||
| 2177 | 2137 | ||
| 2178 | return ret; | 2138 | return ret; |
| 2179 | } | 2139 | } |
| @@ -2331,25 +2291,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, | |||
| 2331 | } | 2291 | } |
| 2332 | } | 2292 | } |
| 2333 | 2293 | ||
| 2334 | static inline void unmap_mapping_range_list(struct list_head *head, | ||
| 2335 | struct zap_details *details) | ||
| 2336 | { | ||
| 2337 | struct vm_area_struct *vma; | ||
| 2338 | |||
| 2339 | /* | ||
| 2340 | * In nonlinear VMAs there is no correspondence between virtual address | ||
| 2341 | * offset and file offset. So we must perform an exhaustive search | ||
| 2342 | * across *all* the pages in each nonlinear VMA, not just the pages | ||
| 2343 | * whose virtual address lies outside the file truncation point. | ||
| 2344 | */ | ||
| 2345 | list_for_each_entry(vma, head, shared.nonlinear) { | ||
| 2346 | details->nonlinear_vma = vma; | ||
| 2347 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); | ||
| 2348 | } | ||
| 2349 | } | ||
| 2350 | |||
| 2351 | /** | 2294 | /** |
| 2352 | * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. | 2295 | * unmap_mapping_range - unmap the portion of all mmaps in the specified |
| 2296 | * address_space corresponding to the specified page range in the underlying | ||
| 2297 | * file. | ||
| 2298 | * | ||
| 2353 | * @mapping: the address space containing mmaps to be unmapped. | 2299 | * @mapping: the address space containing mmaps to be unmapped. |
| 2354 | * @holebegin: byte in first page to unmap, relative to the start of | 2300 | * @holebegin: byte in first page to unmap, relative to the start of |
| 2355 | * the underlying file. This will be rounded down to a PAGE_SIZE | 2301 | * the underlying file. This will be rounded down to a PAGE_SIZE |
| @@ -2378,18 +2324,16 @@ void unmap_mapping_range(struct address_space *mapping, | |||
| 2378 | } | 2324 | } |
| 2379 | 2325 | ||
| 2380 | details.check_mapping = even_cows? NULL: mapping; | 2326 | details.check_mapping = even_cows? NULL: mapping; |
| 2381 | details.nonlinear_vma = NULL; | ||
| 2382 | details.first_index = hba; | 2327 | details.first_index = hba; |
| 2383 | details.last_index = hba + hlen - 1; | 2328 | details.last_index = hba + hlen - 1; |
| 2384 | if (details.last_index < details.first_index) | 2329 | if (details.last_index < details.first_index) |
| 2385 | details.last_index = ULONG_MAX; | 2330 | details.last_index = ULONG_MAX; |
| 2386 | 2331 | ||
| 2387 | 2332 | ||
| 2333 | /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ | ||
| 2388 | i_mmap_lock_write(mapping); | 2334 | i_mmap_lock_write(mapping); |
| 2389 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) | 2335 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
| 2390 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2336 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
| 2391 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | ||
| 2392 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | ||
| 2393 | i_mmap_unlock_write(mapping); | 2337 | i_mmap_unlock_write(mapping); |
| 2394 | } | 2338 | } |
| 2395 | EXPORT_SYMBOL(unmap_mapping_range); | 2339 | EXPORT_SYMBOL(unmap_mapping_range); |
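With the i_mmap_nonlinear list gone, unmap_mapping_range() only walks the i_mmap interval tree; its calling convention is unchanged. A minimal sketch of the usual truncate pattern, modelled on truncate_pagecache() (example_truncate() itself is an invented wrapper, shown only to illustrate the holelen and even_cows arguments):

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>

static void example_truncate(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	/*
	 * holelen == 0 means "unmap to EOF"; even_cows == 1 also zaps
	 * private COW copies of the truncated range.
	 */
	unmap_mapping_range(mapping, holebegin, 0, 1);
	truncate_inode_pages(mapping, newsize);
	/* repeat once the page cache is gone, to catch a racing fault */
	unmap_mapping_range(mapping, holebegin, 0, 1);
}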
| @@ -2696,7 +2640,8 @@ oom: | |||
| 2696 | * See filemap_fault() and __lock_page_retry(). | 2640 | * See filemap_fault() and __lock_page_retry(). |
| 2697 | */ | 2641 | */ |
| 2698 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, | 2642 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, |
| 2699 | pgoff_t pgoff, unsigned int flags, struct page **page) | 2643 | pgoff_t pgoff, unsigned int flags, |
| 2644 | struct page *cow_page, struct page **page) | ||
| 2700 | { | 2645 | { |
| 2701 | struct vm_fault vmf; | 2646 | struct vm_fault vmf; |
| 2702 | int ret; | 2647 | int ret; |
| @@ -2705,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, | |||
| 2705 | vmf.pgoff = pgoff; | 2650 | vmf.pgoff = pgoff; |
| 2706 | vmf.flags = flags; | 2651 | vmf.flags = flags; |
| 2707 | vmf.page = NULL; | 2652 | vmf.page = NULL; |
| 2653 | vmf.cow_page = cow_page; | ||
| 2708 | 2654 | ||
| 2709 | ret = vma->vm_ops->fault(vma, &vmf); | 2655 | ret = vma->vm_ops->fault(vma, &vmf); |
| 2710 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2656 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 2711 | return ret; | 2657 | return ret; |
| 2658 | if (!vmf.page) | ||
| 2659 | goto out; | ||
| 2712 | 2660 | ||
| 2713 | if (unlikely(PageHWPoison(vmf.page))) { | 2661 | if (unlikely(PageHWPoison(vmf.page))) { |
| 2714 | if (ret & VM_FAULT_LOCKED) | 2662 | if (ret & VM_FAULT_LOCKED) |
| @@ -2722,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, | |||
| 2722 | else | 2670 | else |
| 2723 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); | 2671 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); |
| 2724 | 2672 | ||
| 2673 | out: | ||
| 2725 | *page = vmf.page; | 2674 | *page = vmf.page; |
| 2726 | return ret; | 2675 | return ret; |
| 2727 | } | 2676 | } |
| @@ -2750,8 +2699,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
| 2750 | entry = mk_pte(page, vma->vm_page_prot); | 2699 | entry = mk_pte(page, vma->vm_page_prot); |
| 2751 | if (write) | 2700 | if (write) |
| 2752 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2701 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2753 | else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) | ||
| 2754 | entry = pte_mksoft_dirty(entry); | ||
| 2755 | if (anon) { | 2702 | if (anon) { |
| 2756 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2703 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
| 2757 | page_add_new_anon_rmap(page, vma, address); | 2704 | page_add_new_anon_rmap(page, vma, address); |
| @@ -2886,8 +2833,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2886 | * if page by the offset is not ready to be mapped (cold cache or | 2833 | * if page by the offset is not ready to be mapped (cold cache or |
| 2887 | * something). | 2834 | * something). |
| 2888 | */ | 2835 | */ |
| 2889 | if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && | 2836 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
| 2890 | fault_around_bytes >> PAGE_SHIFT > 1) { | ||
| 2891 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2837 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 2892 | do_fault_around(vma, address, pte, pgoff, flags); | 2838 | do_fault_around(vma, address, pte, pgoff, flags); |
| 2893 | if (!pte_same(*pte, orig_pte)) | 2839 | if (!pte_same(*pte, orig_pte)) |
| @@ -2895,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2895 | pte_unmap_unlock(pte, ptl); | 2841 | pte_unmap_unlock(pte, ptl); |
| 2896 | } | 2842 | } |
| 2897 | 2843 | ||
| 2898 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 2844 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); |
| 2899 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2845 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 2900 | return ret; | 2846 | return ret; |
| 2901 | 2847 | ||
| @@ -2935,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2935 | return VM_FAULT_OOM; | 2881 | return VM_FAULT_OOM; |
| 2936 | } | 2882 | } |
| 2937 | 2883 | ||
| 2938 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 2884 | ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); |
| 2939 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2885 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 2940 | goto uncharge_out; | 2886 | goto uncharge_out; |
| 2941 | 2887 | ||
| 2942 | copy_user_highpage(new_page, fault_page, address, vma); | 2888 | if (fault_page) |
| 2889 | copy_user_highpage(new_page, fault_page, address, vma); | ||
| 2943 | __SetPageUptodate(new_page); | 2890 | __SetPageUptodate(new_page); |
| 2944 | 2891 | ||
| 2945 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2892 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 2946 | if (unlikely(!pte_same(*pte, orig_pte))) { | 2893 | if (unlikely(!pte_same(*pte, orig_pte))) { |
| 2947 | pte_unmap_unlock(pte, ptl); | 2894 | pte_unmap_unlock(pte, ptl); |
| 2948 | unlock_page(fault_page); | 2895 | if (fault_page) { |
| 2949 | page_cache_release(fault_page); | 2896 | unlock_page(fault_page); |
| 2897 | page_cache_release(fault_page); | ||
| 2898 | } else { | ||
| 2899 | /* | ||
| 2900 | * The fault handler has no page to lock, so it holds | ||
| 2901 | * i_mmap_lock for read to protect against truncate. | ||
| 2902 | */ | ||
| 2903 | i_mmap_unlock_read(vma->vm_file->f_mapping); | ||
| 2904 | } | ||
| 2950 | goto uncharge_out; | 2905 | goto uncharge_out; |
| 2951 | } | 2906 | } |
| 2952 | do_set_pte(vma, address, new_page, pte, true, true); | 2907 | do_set_pte(vma, address, new_page, pte, true, true); |
| 2953 | mem_cgroup_commit_charge(new_page, memcg, false); | 2908 | mem_cgroup_commit_charge(new_page, memcg, false); |
| 2954 | lru_cache_add_active_or_unevictable(new_page, vma); | 2909 | lru_cache_add_active_or_unevictable(new_page, vma); |
| 2955 | pte_unmap_unlock(pte, ptl); | 2910 | pte_unmap_unlock(pte, ptl); |
| 2956 | unlock_page(fault_page); | 2911 | if (fault_page) { |
| 2957 | page_cache_release(fault_page); | 2912 | unlock_page(fault_page); |
| 2913 | page_cache_release(fault_page); | ||
| 2914 | } else { | ||
| 2915 | /* | ||
| 2916 | * The fault handler has no page to lock, so it holds | ||
| 2917 | * i_mmap_lock for read to protect against truncate. | ||
| 2918 | */ | ||
| 2919 | i_mmap_unlock_read(vma->vm_file->f_mapping); | ||
| 2920 | } | ||
| 2958 | return ret; | 2921 | return ret; |
| 2959 | uncharge_out: | 2922 | uncharge_out: |
| 2960 | mem_cgroup_cancel_charge(new_page, memcg); | 2923 | mem_cgroup_cancel_charge(new_page, memcg); |
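The new cow_page argument lets a ->fault implementation with no page cache page (the DAX case the i_mmap_lock comment above refers to) fill the preallocated COW page itself and return with vmf->page left NULL; it then holds i_mmap_lock for read so the fault cannot race with truncate, and do_cow_fault() drops that lock once the pte is installed. A rough, illustrative handler following that protocol (example_fault() is invented; only the vm_fault fields and the locking calls are the real interface):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/highmem.h>

static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (vmf->cow_page) {
		/*
		 * Private write fault: fill the preallocated COW page
		 * directly (here we just treat the block as a hole and
		 * zero it), leave vmf->page NULL, and take i_mmap_lock
		 * for read; do_cow_fault() unlocks it after installing
		 * the pte.
		 */
		clear_user_highpage(vmf->cow_page,
				    (unsigned long)vmf->virtual_address);
		vmf->page = NULL;
		i_mmap_lock_read(mapping);
		return 0;
	}
	/* read and shared faults would map the backing store here */
	return VM_FAULT_SIGBUS;
}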
| @@ -2973,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2973 | int dirtied = 0; | 2936 | int dirtied = 0; |
| 2974 | int ret, tmp; | 2937 | int ret, tmp; |
| 2975 | 2938 | ||
| 2976 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 2939 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); |
| 2977 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2940 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 2978 | return ret; | 2941 | return ret; |
| 2979 | 2942 | ||
| @@ -3019,8 +2982,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3019 | balance_dirty_pages_ratelimited(mapping); | 2982 | balance_dirty_pages_ratelimited(mapping); |
| 3020 | } | 2983 | } |
| 3021 | 2984 | ||
| 3022 | /* file_update_time outside page_lock */ | 2985 | if (!vma->vm_ops->page_mkwrite) |
| 3023 | if (vma->vm_file && !vma->vm_ops->page_mkwrite) | ||
| 3024 | file_update_time(vma->vm_file); | 2986 | file_update_time(vma->vm_file); |
| 3025 | 2987 | ||
| 3026 | return ret; | 2988 | return ret; |
| @@ -3032,7 +2994,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3032 | * The mmap_sem may have been released depending on flags and our | 2994 | * The mmap_sem may have been released depending on flags and our |
| 3033 | * return value. See filemap_fault() and __lock_page_or_retry(). | 2995 | * return value. See filemap_fault() and __lock_page_or_retry(). |
| 3034 | */ | 2996 | */ |
| 3035 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2997 | static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
| 3036 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2998 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 3037 | unsigned int flags, pte_t orig_pte) | 2999 | unsigned int flags, pte_t orig_pte) |
| 3038 | { | 3000 | { |
| @@ -3049,46 +3011,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3049 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3011 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
| 3050 | } | 3012 | } |
| 3051 | 3013 | ||
| 3052 | /* | ||
| 3053 | * Fault of a previously existing named mapping. Repopulate the pte | ||
| 3054 | * from the encoded file_pte if possible. This enables swappable | ||
| 3055 | * nonlinear vmas. | ||
| 3056 | * | ||
| 3057 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 3058 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
| 3059 | * We return with pte unmapped and unlocked. | ||
| 3060 | * The mmap_sem may have been released depending on flags and our | ||
| 3061 | * return value. See filemap_fault() and __lock_page_or_retry(). | ||
| 3062 | */ | ||
| 3063 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 3064 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
| 3065 | unsigned int flags, pte_t orig_pte) | ||
| 3066 | { | ||
| 3067 | pgoff_t pgoff; | ||
| 3068 | |||
| 3069 | flags |= FAULT_FLAG_NONLINEAR; | ||
| 3070 | |||
| 3071 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | ||
| 3072 | return 0; | ||
| 3073 | |||
| 3074 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { | ||
| 3075 | /* | ||
| 3076 | * Page table corrupted: show pte and kill process. | ||
| 3077 | */ | ||
| 3078 | print_bad_pte(vma, address, orig_pte, NULL); | ||
| 3079 | return VM_FAULT_SIGBUS; | ||
| 3080 | } | ||
| 3081 | |||
| 3082 | pgoff = pte_to_pgoff(orig_pte); | ||
| 3083 | if (!(flags & FAULT_FLAG_WRITE)) | ||
| 3084 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, | ||
| 3085 | orig_pte); | ||
| 3086 | if (!(vma->vm_flags & VM_SHARED)) | ||
| 3087 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, | ||
| 3088 | orig_pte); | ||
| 3089 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | ||
| 3090 | } | ||
| 3091 | |||
| 3092 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3014 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
| 3093 | unsigned long addr, int page_nid, | 3015 | unsigned long addr, int page_nid, |
| 3094 | int *flags) | 3016 | int *flags) |
| @@ -3115,14 +3037,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3115 | bool migrated = false; | 3037 | bool migrated = false; |
| 3116 | int flags = 0; | 3038 | int flags = 0; |
| 3117 | 3039 | ||
| 3040 | /* A PROT_NONE fault should not end up here */ | ||
| 3041 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); | ||
| 3042 | |||
| 3118 | /* | 3043 | /* |
| 3119 | * The "pte" at this point cannot be used safely without | 3044 | * The "pte" at this point cannot be used safely without |
| 3120 | * validation through pte_unmap_same(). It's of NUMA type but | 3045 | * validation through pte_unmap_same(). It's of NUMA type but |
| 3121 | * the pfn may be screwed if the read is non atomic. | 3046 | * the pfn may be screwed if the read is non atomic. |
| 3122 | * | 3047 | * |
| 3123 | * ptep_modify_prot_start is not called as this is clearing | 3048 | * We can safely just do a "set_pte_at()", because the old |
| 3124 | * the _PAGE_NUMA bit and it is not really expected that there | 3049 | * page table entry is not accessible, so there would be no |
| 3125 | * would be concurrent hardware modifications to the PTE. | 3050 | * concurrent hardware modifications to the PTE. |
| 3126 | */ | 3051 | */ |
| 3127 | ptl = pte_lockptr(mm, pmd); | 3052 | ptl = pte_lockptr(mm, pmd); |
| 3128 | spin_lock(ptl); | 3053 | spin_lock(ptl); |
| @@ -3131,7 +3056,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3131 | goto out; | 3056 | goto out; |
| 3132 | } | 3057 | } |
| 3133 | 3058 | ||
| 3134 | pte = pte_mknonnuma(pte); | 3059 | /* Make it present again */ |
| 3060 | pte = pte_modify(pte, vma->vm_page_prot); | ||
| 3061 | pte = pte_mkyoung(pte); | ||
| 3135 | set_pte_at(mm, addr, ptep, pte); | 3062 | set_pte_at(mm, addr, ptep, pte); |
| 3136 | update_mmu_cache(vma, addr, ptep); | 3063 | update_mmu_cache(vma, addr, ptep); |
| 3137 | 3064 | ||
| @@ -3140,7 +3067,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3140 | pte_unmap_unlock(ptep, ptl); | 3067 | pte_unmap_unlock(ptep, ptl); |
| 3141 | return 0; | 3068 | return 0; |
| 3142 | } | 3069 | } |
| 3143 | BUG_ON(is_zero_pfn(page_to_pfn(page))); | ||
| 3144 | 3070 | ||
| 3145 | /* | 3071 | /* |
| 3146 | * Avoid grouping on DSO/COW pages in specific and RO pages | 3072 | * Avoid grouping on DSO/COW pages in specific and RO pages |
| @@ -3216,20 +3142,17 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
| 3216 | if (pte_none(entry)) { | 3142 | if (pte_none(entry)) { |
| 3217 | if (vma->vm_ops) { | 3143 | if (vma->vm_ops) { |
| 3218 | if (likely(vma->vm_ops->fault)) | 3144 | if (likely(vma->vm_ops->fault)) |
| 3219 | return do_linear_fault(mm, vma, address, | 3145 | return do_fault(mm, vma, address, pte, |
| 3220 | pte, pmd, flags, entry); | 3146 | pmd, flags, entry); |
| 3221 | } | 3147 | } |
| 3222 | return do_anonymous_page(mm, vma, address, | 3148 | return do_anonymous_page(mm, vma, address, |
| 3223 | pte, pmd, flags); | 3149 | pte, pmd, flags); |
| 3224 | } | 3150 | } |
| 3225 | if (pte_file(entry)) | ||
| 3226 | return do_nonlinear_fault(mm, vma, address, | ||
| 3227 | pte, pmd, flags, entry); | ||
| 3228 | return do_swap_page(mm, vma, address, | 3151 | return do_swap_page(mm, vma, address, |
| 3229 | pte, pmd, flags, entry); | 3152 | pte, pmd, flags, entry); |
| 3230 | } | 3153 | } |
| 3231 | 3154 | ||
| 3232 | if (pte_numa(entry)) | 3155 | if (pte_protnone(entry)) |
| 3233 | return do_numa_page(mm, vma, address, entry, pte, pmd); | 3156 | return do_numa_page(mm, vma, address, entry, pte, pmd); |
| 3234 | 3157 | ||
| 3235 | ptl = pte_lockptr(mm, pmd); | 3158 | ptl = pte_lockptr(mm, pmd); |
| @@ -3307,7 +3230,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3307 | if (pmd_trans_splitting(orig_pmd)) | 3230 | if (pmd_trans_splitting(orig_pmd)) |
| 3308 | return 0; | 3231 | return 0; |
| 3309 | 3232 | ||
| 3310 | if (pmd_numa(orig_pmd)) | 3233 | if (pmd_protnone(orig_pmd)) |
| 3311 | return do_huge_pmd_numa_page(mm, vma, address, | 3234 | return do_huge_pmd_numa_page(mm, vma, address, |
| 3312 | orig_pmd, pmd); | 3235 | orig_pmd, pmd); |
| 3313 | 3236 | ||
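The pte_numa()/pmd_numa() tests are replaced by pte_protnone()/pmd_protnone(): a NUMA hinting fault is now simply an entry whose user protections have been stripped (PROT_NONE) while the page itself stays where it is. Roughly what the x86 helper added elsewhere in this series looks like, shown here only for context; architectures without NUMA balancing just return 0:

static inline int pte_protnone(pte_t pte)
{
	/* present as far as the kernel is concerned, but no user access */
	return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
		== _PAGE_PROTNONE;
}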
| @@ -3428,15 +3351,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
| 3428 | 3351 | ||
| 3429 | spin_lock(&mm->page_table_lock); | 3352 | spin_lock(&mm->page_table_lock); |
| 3430 | #ifndef __ARCH_HAS_4LEVEL_HACK | 3353 | #ifndef __ARCH_HAS_4LEVEL_HACK |
| 3431 | if (pud_present(*pud)) /* Another has populated it */ | 3354 | if (!pud_present(*pud)) { |
| 3432 | pmd_free(mm, new); | 3355 | mm_inc_nr_pmds(mm); |
| 3433 | else | ||
| 3434 | pud_populate(mm, pud, new); | 3356 | pud_populate(mm, pud, new); |
| 3435 | #else | 3357 | } else /* Another has populated it */ |
| 3436 | if (pgd_present(*pud)) /* Another has populated it */ | ||
| 3437 | pmd_free(mm, new); | 3358 | pmd_free(mm, new); |
| 3438 | else | 3359 | #else |
| 3360 | if (!pgd_present(*pud)) { | ||
| 3361 | mm_inc_nr_pmds(mm); | ||
| 3439 | pgd_populate(mm, pud, new); | 3362 | pgd_populate(mm, pud, new); |
| 3363 | } else /* Another has populated it */ | ||
| 3364 | pmd_free(mm, new); | ||
| 3440 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | 3365 | #endif /* __ARCH_HAS_4LEVEL_HACK */ |
| 3441 | spin_unlock(&mm->page_table_lock); | 3366 | spin_unlock(&mm->page_table_lock); |
| 3442 | return 0; | 3367 | return 0; |
| @@ -3561,7 +3486,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | |||
| 3561 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) | 3486 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) |
| 3562 | return -EINVAL; | 3487 | return -EINVAL; |
| 3563 | 3488 | ||
| 3564 | maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); | 3489 | maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); |
| 3565 | if (write) | 3490 | if (write) |
| 3566 | memcpy_toio(maddr + offset, buf, len); | 3491 | memcpy_toio(maddr + offset, buf, len); |
| 3567 | else | 3492 | else |
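The generic_access_phys() change also fixes the ioremap length: the access starts offset bytes into the first page, so a single-page mapping is too small whenever offset + len crosses a page boundary. For example, with 4 KiB pages, offset = 0xff0 and len = 0x40, the copy touches 0x1030 bytes; the old ioremap_prot(phys_addr, PAGE_SIZE, prot) mapped only 0x1000 of them, while PAGE_ALIGN(len + offset) = 0x2000 covers the whole access.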
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e0961b8c39c..4721046a134a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { | |||
| 471 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 471 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
| 472 | unsigned long flags); | 472 | unsigned long flags); |
| 473 | 473 | ||
| 474 | struct queue_pages { | ||
| 475 | struct list_head *pagelist; | ||
| 476 | unsigned long flags; | ||
| 477 | nodemask_t *nmask; | ||
| 478 | struct vm_area_struct *prev; | ||
| 479 | }; | ||
| 480 | |||
| 474 | /* | 481 | /* |
| 475 | * Scan through pages checking if pages follow certain conditions, | 482 | * Scan through pages checking if pages follow certain conditions, |
| 476 | * and move them to the pagelist if they do. | 483 | * and move them to the pagelist if they do. |
| 477 | */ | 484 | */ |
| 478 | static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 485 | static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, |
| 479 | unsigned long addr, unsigned long end, | 486 | unsigned long end, struct mm_walk *walk) |
| 480 | const nodemask_t *nodes, unsigned long flags, | ||
| 481 | void *private) | ||
| 482 | { | 487 | { |
| 483 | pte_t *orig_pte; | 488 | struct vm_area_struct *vma = walk->vma; |
| 489 | struct page *page; | ||
| 490 | struct queue_pages *qp = walk->private; | ||
| 491 | unsigned long flags = qp->flags; | ||
| 492 | int nid; | ||
| 484 | pte_t *pte; | 493 | pte_t *pte; |
| 485 | spinlock_t *ptl; | 494 | spinlock_t *ptl; |
| 486 | 495 | ||
| 487 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 496 | split_huge_page_pmd(vma, addr, pmd); |
| 488 | do { | 497 | if (pmd_trans_unstable(pmd)) |
| 489 | struct page *page; | 498 | return 0; |
| 490 | int nid; | ||
| 491 | 499 | ||
| 500 | pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | ||
| 501 | for (; addr != end; pte++, addr += PAGE_SIZE) { | ||
| 492 | if (!pte_present(*pte)) | 502 | if (!pte_present(*pte)) |
| 493 | continue; | 503 | continue; |
| 494 | page = vm_normal_page(vma, addr, *pte); | 504 | page = vm_normal_page(vma, addr, *pte); |
| @@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 501 | if (PageReserved(page)) | 511 | if (PageReserved(page)) |
| 502 | continue; | 512 | continue; |
| 503 | nid = page_to_nid(page); | 513 | nid = page_to_nid(page); |
| 504 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 514 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
| 505 | continue; | 515 | continue; |
| 506 | 516 | ||
| 507 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | 517 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
| 508 | migrate_page_add(page, private, flags); | 518 | migrate_page_add(page, qp->pagelist, flags); |
| 509 | else | 519 | } |
| 510 | break; | 520 | pte_unmap_unlock(pte - 1, ptl); |
| 511 | } while (pte++, addr += PAGE_SIZE, addr != end); | 521 | cond_resched(); |
| 512 | pte_unmap_unlock(orig_pte, ptl); | 522 | return 0; |
| 513 | return addr != end; | ||
| 514 | } | 523 | } |
| 515 | 524 | ||
| 516 | static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, | 525 | static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, |
| 517 | pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, | 526 | unsigned long addr, unsigned long end, |
| 518 | void *private) | 527 | struct mm_walk *walk) |
| 519 | { | 528 | { |
| 520 | #ifdef CONFIG_HUGETLB_PAGE | 529 | #ifdef CONFIG_HUGETLB_PAGE |
| 530 | struct queue_pages *qp = walk->private; | ||
| 531 | unsigned long flags = qp->flags; | ||
| 521 | int nid; | 532 | int nid; |
| 522 | struct page *page; | 533 | struct page *page; |
| 523 | spinlock_t *ptl; | 534 | spinlock_t *ptl; |
| 524 | pte_t entry; | 535 | pte_t entry; |
| 525 | 536 | ||
| 526 | ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); | 537 | ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); |
| 527 | entry = huge_ptep_get((pte_t *)pmd); | 538 | entry = huge_ptep_get(pte); |
| 528 | if (!pte_present(entry)) | 539 | if (!pte_present(entry)) |
| 529 | goto unlock; | 540 | goto unlock; |
| 530 | page = pte_page(entry); | 541 | page = pte_page(entry); |
| 531 | nid = page_to_nid(page); | 542 | nid = page_to_nid(page); |
| 532 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 543 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
| 533 | goto unlock; | 544 | goto unlock; |
| 534 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ | 545 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ |
| 535 | if (flags & (MPOL_MF_MOVE_ALL) || | 546 | if (flags & (MPOL_MF_MOVE_ALL) || |
| 536 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) | 547 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) |
| 537 | isolate_huge_page(page, private); | 548 | isolate_huge_page(page, qp->pagelist); |
| 538 | unlock: | 549 | unlock: |
| 539 | spin_unlock(ptl); | 550 | spin_unlock(ptl); |
| 540 | #else | 551 | #else |
| 541 | BUG(); | 552 | BUG(); |
| 542 | #endif | 553 | #endif |
| 543 | } | ||
| 544 | |||
| 545 | static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
| 546 | unsigned long addr, unsigned long end, | ||
| 547 | const nodemask_t *nodes, unsigned long flags, | ||
| 548 | void *private) | ||
| 549 | { | ||
| 550 | pmd_t *pmd; | ||
| 551 | unsigned long next; | ||
| 552 | |||
| 553 | pmd = pmd_offset(pud, addr); | ||
| 554 | do { | ||
| 555 | next = pmd_addr_end(addr, end); | ||
| 556 | if (!pmd_present(*pmd)) | ||
| 557 | continue; | ||
| 558 | if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { | ||
| 559 | queue_pages_hugetlb_pmd_range(vma, pmd, nodes, | ||
| 560 | flags, private); | ||
| 561 | continue; | ||
| 562 | } | ||
| 563 | split_huge_page_pmd(vma, addr, pmd); | ||
| 564 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
| 565 | continue; | ||
| 566 | if (queue_pages_pte_range(vma, pmd, addr, next, nodes, | ||
| 567 | flags, private)) | ||
| 568 | return -EIO; | ||
| 569 | } while (pmd++, addr = next, addr != end); | ||
| 570 | return 0; | ||
| 571 | } | ||
| 572 | |||
| 573 | static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
| 574 | unsigned long addr, unsigned long end, | ||
| 575 | const nodemask_t *nodes, unsigned long flags, | ||
| 576 | void *private) | ||
| 577 | { | ||
| 578 | pud_t *pud; | ||
| 579 | unsigned long next; | ||
| 580 | |||
| 581 | pud = pud_offset(pgd, addr); | ||
| 582 | do { | ||
| 583 | next = pud_addr_end(addr, end); | ||
| 584 | if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) | ||
| 585 | continue; | ||
| 586 | if (pud_none_or_clear_bad(pud)) | ||
| 587 | continue; | ||
| 588 | if (queue_pages_pmd_range(vma, pud, addr, next, nodes, | ||
| 589 | flags, private)) | ||
| 590 | return -EIO; | ||
| 591 | } while (pud++, addr = next, addr != end); | ||
| 592 | return 0; | ||
| 593 | } | ||
| 594 | |||
| 595 | static inline int queue_pages_pgd_range(struct vm_area_struct *vma, | ||
| 596 | unsigned long addr, unsigned long end, | ||
| 597 | const nodemask_t *nodes, unsigned long flags, | ||
| 598 | void *private) | ||
| 599 | { | ||
| 600 | pgd_t *pgd; | ||
| 601 | unsigned long next; | ||
| 602 | |||
| 603 | pgd = pgd_offset(vma->vm_mm, addr); | ||
| 604 | do { | ||
| 605 | next = pgd_addr_end(addr, end); | ||
| 606 | if (pgd_none_or_clear_bad(pgd)) | ||
| 607 | continue; | ||
| 608 | if (queue_pages_pud_range(vma, pgd, addr, next, nodes, | ||
| 609 | flags, private)) | ||
| 610 | return -EIO; | ||
| 611 | } while (pgd++, addr = next, addr != end); | ||
| 612 | return 0; | 554 | return 0; |
| 613 | } | 555 | } |
| 614 | 556 | ||
| @@ -627,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
| 627 | { | 569 | { |
| 628 | int nr_updated; | 570 | int nr_updated; |
| 629 | 571 | ||
| 630 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | 572 | nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); |
| 631 | if (nr_updated) | 573 | if (nr_updated) |
| 632 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | 574 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); |
| 633 | 575 | ||
| @@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
| 641 | } | 583 | } |
| 642 | #endif /* CONFIG_NUMA_BALANCING */ | 584 | #endif /* CONFIG_NUMA_BALANCING */ |
| 643 | 585 | ||
| 586 | static int queue_pages_test_walk(unsigned long start, unsigned long end, | ||
| 587 | struct mm_walk *walk) | ||
| 588 | { | ||
| 589 | struct vm_area_struct *vma = walk->vma; | ||
| 590 | struct queue_pages *qp = walk->private; | ||
| 591 | unsigned long endvma = vma->vm_end; | ||
| 592 | unsigned long flags = qp->flags; | ||
| 593 | |||
| 594 | if (vma->vm_flags & VM_PFNMAP) | ||
| 595 | return 1; | ||
| 596 | |||
| 597 | if (endvma > end) | ||
| 598 | endvma = end; | ||
| 599 | if (vma->vm_start > start) | ||
| 600 | start = vma->vm_start; | ||
| 601 | |||
| 602 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | ||
| 603 | if (!vma->vm_next && vma->vm_end < end) | ||
| 604 | return -EFAULT; | ||
| 605 | if (qp->prev && qp->prev->vm_end < vma->vm_start) | ||
| 606 | return -EFAULT; | ||
| 607 | } | ||
| 608 | |||
| 609 | qp->prev = vma; | ||
| 610 | |||
| 611 | if (vma->vm_flags & VM_PFNMAP) | ||
| 612 | return 1; | ||
| 613 | |||
| 614 | if (flags & MPOL_MF_LAZY) { | ||
| 615 | /* Similar to task_numa_work, skip inaccessible VMAs */ | ||
| 616 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
| 617 | change_prot_numa(vma, start, endvma); | ||
| 618 | return 1; | ||
| 619 | } | ||
| 620 | |||
| 621 | if ((flags & MPOL_MF_STRICT) || | ||
| 622 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
| 623 | vma_migratable(vma))) | ||
| 624 | /* queue pages from current vma */ | ||
| 625 | return 0; | ||
| 626 | return 1; | ||
| 627 | } | ||
| 628 | |||
| 644 | /* | 629 | /* |
| 645 | * Walk through page tables and collect pages to be migrated. | 630 | * Walk through page tables and collect pages to be migrated. |
| 646 | * | 631 | * |
| @@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
| 650 | */ | 635 | */ |
| 651 | static int | 636 | static int |
| 652 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 637 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
| 653 | const nodemask_t *nodes, unsigned long flags, void *private) | 638 | nodemask_t *nodes, unsigned long flags, |
| 654 | { | 639 | struct list_head *pagelist) |
| 655 | int err = 0; | 640 | { |
| 656 | struct vm_area_struct *vma, *prev; | 641 | struct queue_pages qp = { |
| 657 | 642 | .pagelist = pagelist, | |
| 658 | vma = find_vma(mm, start); | 643 | .flags = flags, |
| 659 | if (!vma) | 644 | .nmask = nodes, |
| 660 | return -EFAULT; | 645 | .prev = NULL, |
| 661 | prev = NULL; | 646 | }; |
| 662 | for (; vma && vma->vm_start < end; vma = vma->vm_next) { | 647 | struct mm_walk queue_pages_walk = { |
| 663 | unsigned long endvma = vma->vm_end; | 648 | .hugetlb_entry = queue_pages_hugetlb, |
| 664 | 649 | .pmd_entry = queue_pages_pte_range, | |
| 665 | if (endvma > end) | 650 | .test_walk = queue_pages_test_walk, |
| 666 | endvma = end; | 651 | .mm = mm, |
| 667 | if (vma->vm_start > start) | 652 | .private = &qp, |
| 668 | start = vma->vm_start; | 653 | }; |
| 669 | 654 | ||
| 670 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 655 | return walk_page_range(start, end, &queue_pages_walk); |
| 671 | if (!vma->vm_next && vma->vm_end < end) | ||
| 672 | return -EFAULT; | ||
| 673 | if (prev && prev->vm_end < vma->vm_start) | ||
| 674 | return -EFAULT; | ||
| 675 | } | ||
| 676 | |||
| 677 | if (flags & MPOL_MF_LAZY) { | ||
| 678 | /* Similar to task_numa_work, skip inaccessible VMAs */ | ||
| 679 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
| 680 | change_prot_numa(vma, start, endvma); | ||
| 681 | goto next; | ||
| 682 | } | ||
| 683 | |||
| 684 | if ((flags & MPOL_MF_STRICT) || | ||
| 685 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
| 686 | vma_migratable(vma))) { | ||
| 687 | |||
| 688 | err = queue_pages_pgd_range(vma, start, endvma, nodes, | ||
| 689 | flags, private); | ||
| 690 | if (err) | ||
| 691 | break; | ||
| 692 | } | ||
| 693 | next: | ||
| 694 | prev = vma; | ||
| 695 | } | ||
| 696 | return err; | ||
| 697 | } | 656 | } |
| 698 | 657 | ||
| 699 | /* | 658 | /* |
| @@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
| 1988 | * @order:Order of the GFP allocation. | 1947 | * @order:Order of the GFP allocation. |
| 1989 | * @vma: Pointer to VMA or NULL if not available. | 1948 | * @vma: Pointer to VMA or NULL if not available. |
| 1990 | * @addr: Virtual Address of the allocation. Must be inside the VMA. | 1949 | * @addr: Virtual Address of the allocation. Must be inside the VMA. |
| 1950 | * @node: Which node to prefer for allocation (modulo policy). | ||
| 1951 | * @hugepage: for hugepages try only the preferred node if possible | ||
| 1991 | * | 1952 | * |
| 1992 | * This function allocates a page from the kernel page pool and applies | 1953 | * This function allocates a page from the kernel page pool and applies |
| 1993 | * a NUMA policy associated with the VMA or the current process. | 1954 | * a NUMA policy associated with the VMA or the current process. |
| 1994 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the | 1955 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the |
| 1995 | * mm_struct of the VMA to prevent it from going away. Should be used for | 1956 | * mm_struct of the VMA to prevent it from going away. Should be used for |
| 1996 | * all allocations for pages that will be mapped into | 1957 | * all allocations for pages that will be mapped into user space. Returns |
| 1997 | * user space. Returns NULL when no page can be allocated. | 1958 | * NULL when no page can be allocated. |
| 1998 | * | ||
| 1999 | * Should be called with the mm_sem of the vma hold. | ||
| 2000 | */ | 1959 | */ |
| 2001 | struct page * | 1960 | struct page * |
| 2002 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | 1961 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
| 2003 | unsigned long addr, int node) | 1962 | unsigned long addr, int node, bool hugepage) |
| 2004 | { | 1963 | { |
| 2005 | struct mempolicy *pol; | 1964 | struct mempolicy *pol; |
| 2006 | struct page *page; | 1965 | struct page *page; |
| 2007 | unsigned int cpuset_mems_cookie; | 1966 | unsigned int cpuset_mems_cookie; |
| 1967 | struct zonelist *zl; | ||
| 1968 | nodemask_t *nmask; | ||
| 2008 | 1969 | ||
| 2009 | retry_cpuset: | 1970 | retry_cpuset: |
| 2010 | pol = get_vma_policy(vma, addr); | 1971 | pol = get_vma_policy(vma, addr); |
| 2011 | cpuset_mems_cookie = read_mems_allowed_begin(); | 1972 | cpuset_mems_cookie = read_mems_allowed_begin(); |
| 2012 | 1973 | ||
| 2013 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1974 | if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && |
| 1975 | pol->mode != MPOL_INTERLEAVE)) { | ||
| 1976 | /* | ||
| 1977 | * For hugepage allocation and non-interleave policy which | ||
| 1978 | * allows the current node, we only try to allocate from the | ||
| 1979 | * current node and don't fall back to other nodes, as the | ||
| 1980 | * cost of remote accesses would likely offset THP benefits. | ||
| 1981 | * | ||
| 1982 | * If the policy is interleave, or does not allow the current | ||
| 1983 | * node in its nodemask, we allocate the standard way. | ||
| 1984 | */ | ||
| 1985 | nmask = policy_nodemask(gfp, pol); | ||
| 1986 | if (!nmask || node_isset(node, *nmask)) { | ||
| 1987 | mpol_cond_put(pol); | ||
| 1988 | page = alloc_pages_exact_node(node, gfp, order); | ||
| 1989 | goto out; | ||
| 1990 | } | ||
| 1991 | } | ||
| 1992 | |||
| 1993 | if (pol->mode == MPOL_INTERLEAVE) { | ||
| 2014 | unsigned nid; | 1994 | unsigned nid; |
| 2015 | 1995 | ||
| 2016 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 1996 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
| 2017 | mpol_cond_put(pol); | 1997 | mpol_cond_put(pol); |
| 2018 | page = alloc_page_interleave(gfp, order, nid); | 1998 | page = alloc_page_interleave(gfp, order, nid); |
| 2019 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 1999 | goto out; |
| 2020 | goto retry_cpuset; | ||
| 2021 | |||
| 2022 | return page; | ||
| 2023 | } | 2000 | } |
| 2024 | page = __alloc_pages_nodemask(gfp, order, | 2001 | |
| 2025 | policy_zonelist(gfp, pol, node), | 2002 | nmask = policy_nodemask(gfp, pol); |
| 2026 | policy_nodemask(gfp, pol)); | 2003 | zl = policy_zonelist(gfp, pol, node); |
| 2027 | mpol_cond_put(pol); | 2004 | mpol_cond_put(pol); |
| 2005 | page = __alloc_pages_nodemask(gfp, order, zl, nmask); | ||
| 2006 | out: | ||
| 2028 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2007 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
| 2029 | goto retry_cpuset; | 2008 | goto retry_cpuset; |
| 2030 | return page; | 2009 | return page; |
| @@ -2838,8 +2817,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | |||
| 2838 | p += snprintf(p, buffer + maxlen - p, "relative"); | 2817 | p += snprintf(p, buffer + maxlen - p, "relative"); |
| 2839 | } | 2818 | } |
| 2840 | 2819 | ||
| 2841 | if (!nodes_empty(nodes)) { | 2820 | if (!nodes_empty(nodes)) |
| 2842 | p += snprintf(p, buffer + maxlen - p, ":"); | 2821 | p += scnprintf(p, buffer + maxlen - p, ":%*pbl", |
| 2843 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | 2822 | nodemask_pr_args(&nodes)); |
| 2844 | } | ||
| 2845 | } | 2823 | } |
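The mempolicy-private pgd/pud/pmd walkers are gone; queue_pages_range() now drives the generic walk_page_range() machinery, keeping its state in walk->private. The same pattern applies to any new walker. A minimal, self-contained sketch (the example_* names are invented; struct mm_walk and walk_page_range() are the real interface, the caller is assumed to hold mmap_sem for read, and transparent huge pages are simply skipped here):

#include <linux/mm.h>
#include <linux/sched.h>

struct example_state {
	unsigned long nr_present;
};

static int example_pte_range(pmd_t *pmd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	struct example_state *st = walk->private;
	spinlock_t *ptl;
	pte_t *pte;

	/* skip huge, unstable or empty pmds; see the diff above for THP */
	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (pte_present(*pte))
			st->nr_present++;
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static unsigned long example_count_present(struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	struct example_state st = { 0 };
	struct mm_walk walk = {
		.pmd_entry	= example_pte_range,
		.mm		= mm,
		.private	= &st,
	};

	walk_page_range(start, end, &walk);
	return st.nr_present;
}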
diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc8..85e042686031 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -179,37 +179,6 @@ out: | |||
| 179 | } | 179 | } |
| 180 | 180 | ||
| 181 | /* | 181 | /* |
| 182 | * Congratulations to trinity for discovering this bug. | ||
| 183 | * mm/fremap.c's remap_file_pages() accepts any range within a single vma to | ||
| 184 | * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then | ||
| 185 | * replace the specified range by file ptes throughout (maybe populated after). | ||
| 186 | * If page migration finds a page within that range, while it's still located | ||
| 187 | * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: | ||
| 188 | * zap_pte() clears the temporary migration entry before mmap_sem is dropped. | ||
| 189 | * But if the migrating page is in a part of the vma outside the range to be | ||
| 190 | * remapped, then it will not be cleared, and remove_migration_ptes() needs to | ||
| 191 | * deal with it. Fortunately, this part of the vma is of course still linear, | ||
| 192 | * so we just need to use linear location on the nonlinear list. | ||
| 193 | */ | ||
| 194 | static int remove_linear_migration_ptes_from_nonlinear(struct page *page, | ||
| 195 | struct address_space *mapping, void *arg) | ||
| 196 | { | ||
| 197 | struct vm_area_struct *vma; | ||
| 198 | /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ | ||
| 199 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 200 | unsigned long addr; | ||
| 201 | |||
| 202 | list_for_each_entry(vma, | ||
| 203 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
| 204 | |||
| 205 | addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
| 206 | if (addr >= vma->vm_start && addr < vma->vm_end) | ||
| 207 | remove_migration_pte(page, vma, addr, arg); | ||
| 208 | } | ||
| 209 | return SWAP_AGAIN; | ||
| 210 | } | ||
| 211 | |||
| 212 | /* | ||
| 213 | * Get rid of all migration entries and replace them by | 182 | * Get rid of all migration entries and replace them by |
| 214 | * references to the indicated page. | 183 | * references to the indicated page. |
| 215 | */ | 184 | */ |
| @@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
| 218 | struct rmap_walk_control rwc = { | 187 | struct rmap_walk_control rwc = { |
| 219 | .rmap_one = remove_migration_pte, | 188 | .rmap_one = remove_migration_pte, |
| 220 | .arg = old, | 189 | .arg = old, |
| 221 | .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, | ||
| 222 | }; | 190 | }; |
| 223 | 191 | ||
| 224 | rmap_walk(new, &rwc); | 192 | rmap_walk(new, &rwc); |
| @@ -229,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
| 229 | * get to the page and wait until migration is finished. | 197 | * get to the page and wait until migration is finished. |
| 230 | * When we return from this function the fault will be retried. | 198 | * When we return from this function the fault will be retried. |
| 231 | */ | 199 | */ |
| 232 | static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | 200 | void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, |
| 233 | spinlock_t *ptl) | 201 | spinlock_t *ptl) |
| 234 | { | 202 | { |
| 235 | pte_t pte; | 203 | pte_t pte; |
| @@ -1268,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
| 1268 | goto put_and_set; | 1236 | goto put_and_set; |
| 1269 | 1237 | ||
| 1270 | if (PageHuge(page)) { | 1238 | if (PageHuge(page)) { |
| 1271 | isolate_huge_page(page, &pagelist); | 1239 | if (PageHead(page)) |
| 1240 | isolate_huge_page(page, &pagelist); | ||
| 1272 | goto put_and_set; | 1241 | goto put_and_set; |
| 1273 | } | 1242 | } |
| 1274 | 1243 | ||
| @@ -1685,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd) | |||
| 1685 | return PageLocked(page); | 1654 | return PageLocked(page); |
| 1686 | } | 1655 | } |
| 1687 | 1656 | ||
| 1688 | void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) | ||
| 1689 | { | ||
| 1690 | struct page *page = pmd_page(*pmd); | ||
| 1691 | wait_on_page_locked(page); | ||
| 1692 | } | ||
| 1693 | |||
| 1694 | /* | 1657 | /* |
| 1695 | * Attempt to migrate a misplaced page to the specified destination | 1658 | * Attempt to migrate a misplaced page to the specified destination |
| 1696 | * node. Caller is expected to have an elevated reference count on | 1659 | * node. Caller is expected to have an elevated reference count on |
| @@ -1884,7 +1847,7 @@ out_fail: | |||
| 1884 | out_dropref: | 1847 | out_dropref: |
| 1885 | ptl = pmd_lock(mm, pmd); | 1848 | ptl = pmd_lock(mm, pmd); |
| 1886 | if (pmd_same(*pmd, entry)) { | 1849 | if (pmd_same(*pmd, entry)) { |
| 1887 | entry = pmd_mknonnuma(entry); | 1850 | entry = pmd_modify(entry, vma->vm_page_prot); |
| 1888 | set_pmd_at(mm, mmun_start, pmd, entry); | 1851 | set_pmd_at(mm, mmun_start, pmd, entry); |
| 1889 | update_mmu_cache_pmd(vma, address, &entry); | 1852 | update_mmu_cache_pmd(vma, address, &entry); |
| 1890 | } | 1853 | } |
diff --git a/mm/mincore.c b/mm/mincore.c index c8c528b36641..be25efde64a4 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
| @@ -19,38 +19,25 @@ | |||
| 19 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
| 20 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
| 21 | 21 | ||
| 22 | static void mincore_hugetlb_page_range(struct vm_area_struct *vma, | 22 | static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, |
| 23 | unsigned long addr, unsigned long end, | 23 | unsigned long end, struct mm_walk *walk) |
| 24 | unsigned char *vec) | ||
| 25 | { | 24 | { |
| 26 | #ifdef CONFIG_HUGETLB_PAGE | 25 | #ifdef CONFIG_HUGETLB_PAGE |
| 27 | struct hstate *h; | 26 | unsigned char present; |
| 27 | unsigned char *vec = walk->private; | ||
| 28 | 28 | ||
| 29 | h = hstate_vma(vma); | 29 | /* |
| 30 | while (1) { | 30 | * Hugepages under user process are always in RAM and never |
| 31 | unsigned char present; | 31 | * swapped out, but theoretically it needs to be checked. |
| 32 | pte_t *ptep; | 32 | */ |
| 33 | /* | 33 | present = pte && !huge_pte_none(huge_ptep_get(pte)); |
| 34 | * Huge pages are always in RAM for now, but | 34 | for (; addr != end; vec++, addr += PAGE_SIZE) |
| 35 | * theoretically it needs to be checked. | 35 | *vec = present; |
| 36 | */ | 36 | walk->private = vec; |
| 37 | ptep = huge_pte_offset(current->mm, | ||
| 38 | addr & huge_page_mask(h)); | ||
| 39 | present = ptep && !huge_pte_none(huge_ptep_get(ptep)); | ||
| 40 | while (1) { | ||
| 41 | *vec = present; | ||
| 42 | vec++; | ||
| 43 | addr += PAGE_SIZE; | ||
| 44 | if (addr == end) | ||
| 45 | return; | ||
| 46 | /* check hugepage border */ | ||
| 47 | if (!(addr & ~huge_page_mask(h))) | ||
| 48 | break; | ||
| 49 | } | ||
| 50 | } | ||
| 51 | #else | 37 | #else |
| 52 | BUG(); | 38 | BUG(); |
| 53 | #endif | 39 | #endif |
| 40 | return 0; | ||
| 54 | } | 41 | } |
| 55 | 42 | ||
| 56 | /* | 43 | /* |
| @@ -94,9 +81,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
| 94 | return present; | 81 | return present; |
| 95 | } | 82 | } |
| 96 | 83 | ||
| 97 | static void mincore_unmapped_range(struct vm_area_struct *vma, | 84 | static int __mincore_unmapped_range(unsigned long addr, unsigned long end, |
| 98 | unsigned long addr, unsigned long end, | 85 | struct vm_area_struct *vma, unsigned char *vec) |
| 99 | unsigned char *vec) | ||
| 100 | { | 86 | { |
| 101 | unsigned long nr = (end - addr) >> PAGE_SHIFT; | 87 | unsigned long nr = (end - addr) >> PAGE_SHIFT; |
| 102 | int i; | 88 | int i; |
| @@ -111,30 +97,47 @@ static void mincore_unmapped_range(struct vm_area_struct *vma, | |||
| 111 | for (i = 0; i < nr; i++) | 97 | for (i = 0; i < nr; i++) |
| 112 | vec[i] = 0; | 98 | vec[i] = 0; |
| 113 | } | 99 | } |
| 100 | return nr; | ||
| 101 | } | ||
| 102 | |||
| 103 | static int mincore_unmapped_range(unsigned long addr, unsigned long end, | ||
| 104 | struct mm_walk *walk) | ||
| 105 | { | ||
| 106 | walk->private += __mincore_unmapped_range(addr, end, | ||
| 107 | walk->vma, walk->private); | ||
| 108 | return 0; | ||
| 114 | } | 109 | } |
| 115 | 110 | ||
| 116 | static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 111 | static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
| 117 | unsigned long addr, unsigned long end, | 112 | struct mm_walk *walk) |
| 118 | unsigned char *vec) | ||
| 119 | { | 113 | { |
| 120 | unsigned long next; | ||
| 121 | spinlock_t *ptl; | 114 | spinlock_t *ptl; |
| 115 | struct vm_area_struct *vma = walk->vma; | ||
| 122 | pte_t *ptep; | 116 | pte_t *ptep; |
| 117 | unsigned char *vec = walk->private; | ||
| 118 | int nr = (end - addr) >> PAGE_SHIFT; | ||
| 119 | |||
| 120 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
| 121 | memset(vec, 1, nr); | ||
| 122 | spin_unlock(ptl); | ||
| 123 | goto out; | ||
| 124 | } | ||
| 125 | |||
| 126 | if (pmd_trans_unstable(pmd)) { | ||
| 127 | __mincore_unmapped_range(addr, end, vma, vec); | ||
| 128 | goto out; | ||
| 129 | } | ||
| 123 | 130 | ||
| 124 | ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 131 | ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
| 125 | do { | 132 | for (; addr != end; ptep++, addr += PAGE_SIZE) { |
| 126 | pte_t pte = *ptep; | 133 | pte_t pte = *ptep; |
| 127 | pgoff_t pgoff; | ||
| 128 | 134 | ||
| 129 | next = addr + PAGE_SIZE; | ||
| 130 | if (pte_none(pte)) | 135 | if (pte_none(pte)) |
| 131 | mincore_unmapped_range(vma, addr, next, vec); | 136 | __mincore_unmapped_range(addr, addr + PAGE_SIZE, |
| 137 | vma, vec); | ||
| 132 | else if (pte_present(pte)) | 138 | else if (pte_present(pte)) |
| 133 | *vec = 1; | 139 | *vec = 1; |
| 134 | else if (pte_file(pte)) { | 140 | else { /* pte is a swap entry */ |
| 135 | pgoff = pte_to_pgoff(pte); | ||
| 136 | *vec = mincore_page(vma->vm_file->f_mapping, pgoff); | ||
| 137 | } else { /* pte is a swap entry */ | ||
| 138 | swp_entry_t entry = pte_to_swp_entry(pte); | 141 | swp_entry_t entry = pte_to_swp_entry(pte); |
| 139 | 142 | ||
| 140 | if (non_swap_entry(entry)) { | 143 | if (non_swap_entry(entry)) { |
| @@ -145,9 +148,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 145 | *vec = 1; | 148 | *vec = 1; |
| 146 | } else { | 149 | } else { |
| 147 | #ifdef CONFIG_SWAP | 150 | #ifdef CONFIG_SWAP |
| 148 | pgoff = entry.val; | ||
| 149 | *vec = mincore_page(swap_address_space(entry), | 151 | *vec = mincore_page(swap_address_space(entry), |
| 150 | pgoff); | 152 | entry.val); |
| 151 | #else | 153 | #else |
| 152 | WARN_ON(1); | 154 | WARN_ON(1); |
| 153 | *vec = 1; | 155 | *vec = 1; |
| @@ -155,69 +157,12 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 155 | } | 157 | } |
| 156 | } | 158 | } |
| 157 | vec++; | 159 | vec++; |
| 158 | } while (ptep++, addr = next, addr != end); | 160 | } |
| 159 | pte_unmap_unlock(ptep - 1, ptl); | 161 | pte_unmap_unlock(ptep - 1, ptl); |
| 160 | } | 162 | out: |
| 161 | 163 | walk->private += nr; | |
| 162 | static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 164 | cond_resched(); |
| 163 | unsigned long addr, unsigned long end, | 165 | return 0; |
| 164 | unsigned char *vec) | ||
| 165 | { | ||
| 166 | unsigned long next; | ||
| 167 | pmd_t *pmd; | ||
| 168 | |||
| 169 | pmd = pmd_offset(pud, addr); | ||
| 170 | do { | ||
| 171 | next = pmd_addr_end(addr, end); | ||
| 172 | if (pmd_trans_huge(*pmd)) { | ||
| 173 | if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { | ||
| 174 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 175 | continue; | ||
| 176 | } | ||
| 177 | /* fall through */ | ||
| 178 | } | ||
| 179 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
| 180 | mincore_unmapped_range(vma, addr, next, vec); | ||
| 181 | else | ||
| 182 | mincore_pte_range(vma, pmd, addr, next, vec); | ||
| 183 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 184 | } while (pmd++, addr = next, addr != end); | ||
| 185 | } | ||
| 186 | |||
| 187 | static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
| 188 | unsigned long addr, unsigned long end, | ||
| 189 | unsigned char *vec) | ||
| 190 | { | ||
| 191 | unsigned long next; | ||
| 192 | pud_t *pud; | ||
| 193 | |||
| 194 | pud = pud_offset(pgd, addr); | ||
| 195 | do { | ||
| 196 | next = pud_addr_end(addr, end); | ||
| 197 | if (pud_none_or_clear_bad(pud)) | ||
| 198 | mincore_unmapped_range(vma, addr, next, vec); | ||
| 199 | else | ||
| 200 | mincore_pmd_range(vma, pud, addr, next, vec); | ||
| 201 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 202 | } while (pud++, addr = next, addr != end); | ||
| 203 | } | ||
| 204 | |||
| 205 | static void mincore_page_range(struct vm_area_struct *vma, | ||
| 206 | unsigned long addr, unsigned long end, | ||
| 207 | unsigned char *vec) | ||
| 208 | { | ||
| 209 | unsigned long next; | ||
| 210 | pgd_t *pgd; | ||
| 211 | |||
| 212 | pgd = pgd_offset(vma->vm_mm, addr); | ||
| 213 | do { | ||
| 214 | next = pgd_addr_end(addr, end); | ||
| 215 | if (pgd_none_or_clear_bad(pgd)) | ||
| 216 | mincore_unmapped_range(vma, addr, next, vec); | ||
| 217 | else | ||
| 218 | mincore_pud_range(vma, pgd, addr, next, vec); | ||
| 219 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 220 | } while (pgd++, addr = next, addr != end); | ||
| 221 | } | 166 | } |
| 222 | 167 | ||
| 223 | /* | 168 | /* |
| @@ -229,18 +174,22 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v | |||
| 229 | { | 174 | { |
| 230 | struct vm_area_struct *vma; | 175 | struct vm_area_struct *vma; |
| 231 | unsigned long end; | 176 | unsigned long end; |
| 177 | int err; | ||
| 178 | struct mm_walk mincore_walk = { | ||
| 179 | .pmd_entry = mincore_pte_range, | ||
| 180 | .pte_hole = mincore_unmapped_range, | ||
| 181 | .hugetlb_entry = mincore_hugetlb, | ||
| 182 | .private = vec, | ||
| 183 | }; | ||
| 232 | 184 | ||
| 233 | vma = find_vma(current->mm, addr); | 185 | vma = find_vma(current->mm, addr); |
| 234 | if (!vma || addr < vma->vm_start) | 186 | if (!vma || addr < vma->vm_start) |
| 235 | return -ENOMEM; | 187 | return -ENOMEM; |
| 236 | 188 | mincore_walk.mm = vma->vm_mm; | |
| 237 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); | 189 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); |
| 238 | 190 | err = walk_page_range(addr, end, &mincore_walk); | |
| 239 | if (is_vm_hugetlb_page(vma)) | 191 | if (err < 0) |
| 240 | mincore_hugetlb_page_range(vma, addr, end, vec); | 192 | return err; |
| 241 | else | ||
| 242 | mincore_page_range(vma, addr, end, vec); | ||
| 243 | |||
| 244 | return (end - addr) >> PAGE_SHIFT; | 193 | return (end - addr) >> PAGE_SHIFT; |
| 245 | } | 194 | } |
| 246 | 195 | ||
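For orientation, the syscall whose page-table walk is converted to walk_page_range() above can be exercised from userspace as follows. This is a hedged, minimal sketch (mapping size and the touched range are arbitrary), not part of the patch:

```c
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 8 * (size_t)page;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned char *vec = malloc(len / page);	/* one status byte per page */

	if (buf == MAP_FAILED || !vec)
		return 1;

	memset(buf, 0, 2 * page);	/* fault in only the first two pages */

	if (mincore(buf, len, vec) == 0) {
		for (size_t i = 0; i < len / page; i++)
			printf("page %zu: %s\n", i,
			       (vec[i] & 1) ? "resident" : "not resident");
	}

	free(vec);
	munmap(buf, len);
	return 0;
}
```

Only the pages touched by memset() should be reported as resident; the rest of the anonymous range stays unmapped, which is exactly the case the new pte_hole/__mincore_unmapped_range path handles.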
diff --git a/mm/mm_init.c b/mm/mm_init.c index 4074caf9936b..5f420f7fafa1 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
| @@ -14,14 +14,14 @@ | |||
| 14 | #include "internal.h" | 14 | #include "internal.h" |
| 15 | 15 | ||
| 16 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 16 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
| 17 | int mminit_loglevel; | 17 | int __meminitdata mminit_loglevel; |
| 18 | 18 | ||
| 19 | #ifndef SECTIONS_SHIFT | 19 | #ifndef SECTIONS_SHIFT |
| 20 | #define SECTIONS_SHIFT 0 | 20 | #define SECTIONS_SHIFT 0 |
| 21 | #endif | 21 | #endif |
| 22 | 22 | ||
| 23 | /* The zonelists are simply reported, validation is manual. */ | 23 | /* The zonelists are simply reported, validation is manual. */ |
| 24 | void mminit_verify_zonelist(void) | 24 | void __init mminit_verify_zonelist(void) |
| 25 | { | 25 | { |
| 26 | int nid; | 26 | int nid; |
| 27 | 27 | ||
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); | |||
| 152 | */ | 152 | */ |
| 153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
| 154 | { | 154 | { |
| 155 | unsigned long free, allowed, reserve; | 155 | long free, allowed, reserve; |
| 156 | 156 | ||
| 157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < | 157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < |
| 158 | -(s64)vm_committed_as_batch * num_online_cpus(), | 158 | -(s64)vm_committed_as_batch * num_online_cpus(), |
| @@ -220,7 +220,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 220 | */ | 220 | */ |
| 221 | if (mm) { | 221 | if (mm) { |
| 222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
| 223 | allowed -= min(mm->total_vm / 32, reserve); | 223 | allowed -= min_t(long, mm->total_vm / 32, reserve); |
| 224 | } | 224 | } |
| 225 | 225 | ||
| 226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
| @@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
| 243 | mapping_unmap_writable(mapping); | 243 | mapping_unmap_writable(mapping); |
| 244 | 244 | ||
| 245 | flush_dcache_mmap_lock(mapping); | 245 | flush_dcache_mmap_lock(mapping); |
| 246 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 246 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
| 247 | list_del_init(&vma->shared.nonlinear); | ||
| 248 | else | ||
| 249 | vma_interval_tree_remove(vma, &mapping->i_mmap); | ||
| 250 | flush_dcache_mmap_unlock(mapping); | 247 | flush_dcache_mmap_unlock(mapping); |
| 251 | } | 248 | } |
| 252 | 249 | ||
| @@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) | |||
| 649 | atomic_inc(&mapping->i_mmap_writable); | 646 | atomic_inc(&mapping->i_mmap_writable); |
| 650 | 647 | ||
| 651 | flush_dcache_mmap_lock(mapping); | 648 | flush_dcache_mmap_lock(mapping); |
| 652 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 649 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
| 653 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
| 654 | else | ||
| 655 | vma_interval_tree_insert(vma, &mapping->i_mmap); | ||
| 656 | flush_dcache_mmap_unlock(mapping); | 650 | flush_dcache_mmap_unlock(mapping); |
| 657 | } | 651 | } |
| 658 | } | 652 | } |
| @@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 789 | 783 | ||
| 790 | if (file) { | 784 | if (file) { |
| 791 | mapping = file->f_mapping; | 785 | mapping = file->f_mapping; |
| 792 | if (!(vma->vm_flags & VM_NONLINEAR)) { | 786 | root = &mapping->i_mmap; |
| 793 | root = &mapping->i_mmap; | 787 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); |
| 794 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); | ||
| 795 | 788 | ||
| 796 | if (adjust_next) | 789 | if (adjust_next) |
| 797 | uprobe_munmap(next, next->vm_start, | 790 | uprobe_munmap(next, next->vm_start, next->vm_end); |
| 798 | next->vm_end); | ||
| 799 | } | ||
| 800 | 791 | ||
| 801 | i_mmap_lock_write(mapping); | 792 | i_mmap_lock_write(mapping); |
| 802 | if (insert) { | 793 | if (insert) { |
| @@ -2634,6 +2625,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | |||
| 2634 | return vm_munmap(addr, len); | 2625 | return vm_munmap(addr, len); |
| 2635 | } | 2626 | } |
| 2636 | 2627 | ||
| 2628 | |||
| 2629 | /* | ||
| 2630 | * Emulation of deprecated remap_file_pages() syscall. | ||
| 2631 | */ | ||
| 2632 | SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | ||
| 2633 | unsigned long, prot, unsigned long, pgoff, unsigned long, flags) | ||
| 2634 | { | ||
| 2635 | |||
| 2636 | struct mm_struct *mm = current->mm; | ||
| 2637 | struct vm_area_struct *vma; | ||
| 2638 | unsigned long populate = 0; | ||
| 2639 | unsigned long ret = -EINVAL; | ||
| 2640 | struct file *file; | ||
| 2641 | |||
| 2642 | pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " | ||
| 2643 | "See Documentation/vm/remap_file_pages.txt.\n", | ||
| 2644 | current->comm, current->pid); | ||
| 2645 | |||
| 2646 | if (prot) | ||
| 2647 | return ret; | ||
| 2648 | start = start & PAGE_MASK; | ||
| 2649 | size = size & PAGE_MASK; | ||
| 2650 | |||
| 2651 | if (start + size <= start) | ||
| 2652 | return ret; | ||
| 2653 | |||
| 2654 | /* Does pgoff wrap? */ | ||
| 2655 | if (pgoff + (size >> PAGE_SHIFT) < pgoff) | ||
| 2656 | return ret; | ||
| 2657 | |||
| 2658 | down_write(&mm->mmap_sem); | ||
| 2659 | vma = find_vma(mm, start); | ||
| 2660 | |||
| 2661 | if (!vma || !(vma->vm_flags & VM_SHARED)) | ||
| 2662 | goto out; | ||
| 2663 | |||
| 2664 | if (start < vma->vm_start || start + size > vma->vm_end) | ||
| 2665 | goto out; | ||
| 2666 | |||
| 2667 | if (pgoff == linear_page_index(vma, start)) { | ||
| 2668 | ret = 0; | ||
| 2669 | goto out; | ||
| 2670 | } | ||
| 2671 | |||
| 2672 | prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; | ||
| 2673 | prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; | ||
| 2674 | prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; | ||
| 2675 | |||
| 2676 | flags &= MAP_NONBLOCK; | ||
| 2677 | flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; | ||
| 2678 | if (vma->vm_flags & VM_LOCKED) { | ||
| 2679 | flags |= MAP_LOCKED; | ||
| 2680 | /* drop PG_Mlocked flag for over-mapped range */ | ||
| 2681 | munlock_vma_pages_range(vma, start, start + size); | ||
| 2682 | } | ||
| 2683 | |||
| 2684 | file = get_file(vma->vm_file); | ||
| 2685 | ret = do_mmap_pgoff(vma->vm_file, start, size, | ||
| 2686 | prot, flags, pgoff, &populate); | ||
| 2687 | fput(file); | ||
| 2688 | out: | ||
| 2689 | up_write(&mm->mmap_sem); | ||
| 2690 | if (populate) | ||
| 2691 | mm_populate(ret, populate); | ||
| 2692 | if (!IS_ERR_VALUE(ret)) | ||
| 2693 | ret = 0; | ||
| 2694 | return ret; | ||
| 2695 | } | ||
| 2696 | |||
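To illustrate what the emulation above preserves, here is a hedged userspace sketch of the deprecated call and the plain mmap() that now happens under the hood; the file name, size and page offsets are invented for the example:

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("data.bin", O_RDWR);	/* hypothetical file of at least 4 pages */
	char *base;

	if (fd < 0)
		return 1;
	base = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return 1;

	/* Deprecated interface: rebind the first page of the window to file page 3. */
	remap_file_pages(base, page, 0, 3, 0);

	/* Equivalent effect of the emulation: a MAP_FIXED remap of the same
	 * file at the requested page offset over that sub-range. */
	mmap(base, page, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
	     fd, 3 * page);

	munmap(base, 4 * page);
	close(fd);
	return 0;
}
```

Both calls leave the first page of the window backed by file page 3; the emulation simply routes the old request through do_mmap_pgoff() as a fresh MAP_FIXED mapping instead of rewriting PTEs in place.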
| 2637 | static inline void verify_mm_writelocked(struct mm_struct *mm) | 2697 | static inline void verify_mm_writelocked(struct mm_struct *mm) |
| 2638 | { | 2698 | { |
| 2639 | #ifdef CONFIG_DEBUG_VM | 2699 | #ifdef CONFIG_DEBUG_VM |
| @@ -2791,9 +2851,6 @@ void exit_mmap(struct mm_struct *mm) | |||
| 2791 | vma = remove_vma(vma); | 2851 | vma = remove_vma(vma); |
| 2792 | } | 2852 | } |
| 2793 | vm_unacct_memory(nr_accounted); | 2853 | vm_unacct_memory(nr_accounted); |
| 2794 | |||
| 2795 | WARN_ON(atomic_long_read(&mm->nr_ptes) > | ||
| 2796 | (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | ||
| 2797 | } | 2854 | } |
| 2798 | 2855 | ||
| 2799 | /* Insert vm structure into process list sorted by address | 2856 | /* Insert vm structure into process list sorted by address |
| @@ -3108,8 +3165,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
| 3108 | * | 3165 | * |
| 3109 | * mmap_sem in write mode is required in order to block all operations | 3166 | * mmap_sem in write mode is required in order to block all operations |
| 3110 | * that could modify pagetables and free pages without need of | 3167 | * that could modify pagetables and free pages without need of |
| 3111 | * altering the vma layout (for example populate_range() with | 3168 | * altering the vma layout. It's also needed in write mode to avoid new |
| 3112 | * nonlinear vmas). It's also needed in write mode to avoid new | ||
| 3113 | * anon_vmas to be associated with existing vmas. | 3169 | * anon_vmas to be associated with existing vmas. |
| 3114 | * | 3170 | * |
| 3115 | * A single task can't take more than one mm_take_all_locks() in a row | 3171 | * A single task can't take more than one mm_take_all_locks() in a row |
diff --git a/mm/mmzone.c b/mm/mmzone.c index bf34fb8556db..7d87ebb0d632 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
| @@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) | |||
| 54 | /* Returns the next zone at or below highest_zoneidx in a zonelist */ | 54 | /* Returns the next zone at or below highest_zoneidx in a zonelist */ |
| 55 | struct zoneref *next_zones_zonelist(struct zoneref *z, | 55 | struct zoneref *next_zones_zonelist(struct zoneref *z, |
| 56 | enum zone_type highest_zoneidx, | 56 | enum zone_type highest_zoneidx, |
| 57 | nodemask_t *nodes, | 57 | nodemask_t *nodes) |
| 58 | struct zone **zone) | ||
| 59 | { | 58 | { |
| 60 | /* | 59 | /* |
| 61 | * Find the next suitable zone to use for the allocation. | 60 | * Find the next suitable zone to use for the allocation. |
| @@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, | |||
| 69 | (z->zone && !zref_in_nodemask(z, nodes))) | 68 | (z->zone && !zref_in_nodemask(z, nodes))) |
| 70 | z++; | 69 | z++; |
| 71 | 70 | ||
| 72 | *zone = zonelist_zone(z); | ||
| 73 | return z; | 71 | return z; |
| 74 | } | 72 | } |
| 75 | 73 | ||
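With the struct zone ** out-parameter dropped, callers obtain the zone from the returned zoneref instead. A hedged caller-side sketch (the helper name is hypothetical; it mirrors what first_zones_zonelist() becomes and is only meaningful inside kernel context):

```c
#include <linux/mmzone.h>
#include <linux/nodemask.h>

/* Hypothetical caller after the API change: the zone is read from the
 * returned zoneref via zonelist_zone() instead of being written through a
 * struct zone ** argument. */
static struct zone *first_usable_zone(struct zonelist *zonelist,
				      enum zone_type highest_zoneidx,
				      nodemask_t *nodes)
{
	struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
						highest_zoneidx, nodes);

	return zonelist_zone(z);	/* NULL when no suitable zone exists */
}
```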
diff --git a/mm/mprotect.c b/mm/mprotect.c index ace93454ce8e..44727811bf4c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -75,37 +75,35 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 75 | oldpte = *pte; | 75 | oldpte = *pte; |
| 76 | if (pte_present(oldpte)) { | 76 | if (pte_present(oldpte)) { |
| 77 | pte_t ptent; | 77 | pte_t ptent; |
| 78 | bool updated = false; | ||
| 79 | 78 | ||
| 80 | if (!prot_numa) { | 79 | /* |
| 81 | ptent = ptep_modify_prot_start(mm, addr, pte); | 80 | * Avoid trapping faults against the zero or KSM |
| 82 | if (pte_numa(ptent)) | 81 | * pages. See similar comment in change_huge_pmd. |
| 83 | ptent = pte_mknonnuma(ptent); | 82 | */ |
| 84 | ptent = pte_modify(ptent, newprot); | 83 | if (prot_numa) { |
| 85 | /* | ||
| 86 | * Avoid taking write faults for pages we | ||
| 87 | * know to be dirty. | ||
| 88 | */ | ||
| 89 | if (dirty_accountable && pte_dirty(ptent) && | ||
| 90 | (pte_soft_dirty(ptent) || | ||
| 91 | !(vma->vm_flags & VM_SOFTDIRTY))) | ||
| 92 | ptent = pte_mkwrite(ptent); | ||
| 93 | ptep_modify_prot_commit(mm, addr, pte, ptent); | ||
| 94 | updated = true; | ||
| 95 | } else { | ||
| 96 | struct page *page; | 84 | struct page *page; |
| 97 | 85 | ||
| 98 | page = vm_normal_page(vma, addr, oldpte); | 86 | page = vm_normal_page(vma, addr, oldpte); |
| 99 | if (page && !PageKsm(page)) { | 87 | if (!page || PageKsm(page)) |
| 100 | if (!pte_numa(oldpte)) { | 88 | continue; |
| 101 | ptep_set_numa(mm, addr, pte); | 89 | |
| 102 | updated = true; | 90 | /* Avoid TLB flush if possible */ |
| 103 | } | 91 | if (pte_protnone(oldpte)) |
| 104 | } | 92 | continue; |
| 105 | } | 93 | } |
| 106 | if (updated) | 94 | |
| 107 | pages++; | 95 | ptent = ptep_modify_prot_start(mm, addr, pte); |
| 108 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 96 | ptent = pte_modify(ptent, newprot); |
| 97 | |||
| 98 | /* Avoid taking write faults for known dirty pages */ | ||
| 99 | if (dirty_accountable && pte_dirty(ptent) && | ||
| 100 | (pte_soft_dirty(ptent) || | ||
| 101 | !(vma->vm_flags & VM_SOFTDIRTY))) { | ||
| 102 | ptent = pte_mkwrite(ptent); | ||
| 103 | } | ||
| 104 | ptep_modify_prot_commit(mm, addr, pte, ptent); | ||
| 105 | pages++; | ||
| 106 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { | ||
| 109 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 107 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
| 110 | 108 | ||
| 111 | if (is_write_migration_entry(entry)) { | 109 | if (is_write_migration_entry(entry)) { |
diff --git a/mm/mremap.c b/mm/mremap.c index 17fa018f5f39..57dadc025c64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) | |||
| 81 | pte = pte_mksoft_dirty(pte); | 81 | pte = pte_mksoft_dirty(pte); |
| 82 | else if (is_swap_pte(pte)) | 82 | else if (is_swap_pte(pte)) |
| 83 | pte = pte_swp_mksoft_dirty(pte); | 83 | pte = pte_swp_mksoft_dirty(pte); |
| 84 | else if (pte_file(pte)) | ||
| 85 | pte = pte_file_mksoft_dirty(pte); | ||
| 86 | #endif | 84 | #endif |
| 87 | return pte; | 85 | return pte; |
| 88 | } | 86 | } |
diff --git a/mm/msync.c b/mm/msync.c index 992a1673d488..bb04d53ae852 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
| @@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
| 86 | (vma->vm_flags & VM_SHARED)) { | 86 | (vma->vm_flags & VM_SHARED)) { |
| 87 | get_file(file); | 87 | get_file(file); |
| 88 | up_read(&mm->mmap_sem); | 88 | up_read(&mm->mmap_sem); |
| 89 | if (vma->vm_flags & VM_NONLINEAR) | 89 | error = vfs_fsync_range(file, fstart, fend, 1); |
| 90 | error = vfs_fsync(file, 1); | ||
| 91 | else | ||
| 92 | error = vfs_fsync_range(file, fstart, fend, 1); | ||
| 93 | fput(file); | 90 | fput(file); |
| 94 | if (error || start >= end) | 91 | if (error || start >= end) |
| 95 | goto out; | 92 | goto out; |
diff --git a/mm/nommu.c b/mm/nommu.c index 28bd8c4dff6f..3e67e7538ecf 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -214,6 +214,39 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 214 | } | 214 | } |
| 215 | EXPORT_SYMBOL(get_user_pages); | 215 | EXPORT_SYMBOL(get_user_pages); |
| 216 | 216 | ||
| 217 | long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 218 | unsigned long start, unsigned long nr_pages, | ||
| 219 | int write, int force, struct page **pages, | ||
| 220 | int *locked) | ||
| 221 | { | ||
| 222 | return get_user_pages(tsk, mm, start, nr_pages, write, force, | ||
| 223 | pages, NULL); | ||
| 224 | } | ||
| 225 | EXPORT_SYMBOL(get_user_pages_locked); | ||
| 226 | |||
| 227 | long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 228 | unsigned long start, unsigned long nr_pages, | ||
| 229 | int write, int force, struct page **pages, | ||
| 230 | unsigned int gup_flags) | ||
| 231 | { | ||
| 232 | long ret; | ||
| 233 | down_read(&mm->mmap_sem); | ||
| 234 | ret = get_user_pages(tsk, mm, start, nr_pages, write, force, | ||
| 235 | pages, NULL); | ||
| 236 | up_read(&mm->mmap_sem); | ||
| 237 | return ret; | ||
| 238 | } | ||
| 239 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
| 240 | |||
| 241 | long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 242 | unsigned long start, unsigned long nr_pages, | ||
| 243 | int write, int force, struct page **pages) | ||
| 244 | { | ||
| 245 | return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
| 246 | force, pages, 0); | ||
| 247 | } | ||
| 248 | EXPORT_SYMBOL(get_user_pages_unlocked); | ||
| 249 | |||
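A hedged sketch of how an in-kernel caller might use the new unlocked variant added above; the helper name and buffer size are assumptions, while the signature is taken from the stubs themselves:

```c
#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical caller of the new helper: get_user_pages_unlocked() takes and
 * drops mmap_sem itself, so the caller must not already hold it.  The buffer
 * size and the write/force arguments are arbitrary for the example. */
static long pin_user_buffer(unsigned long user_addr, struct page **pages)
{
	/* On success the caller later releases each pinned page with put_page(). */
	return get_user_pages_unlocked(current, current->mm, user_addr,
				       8, 1 /* write */, 0 /* force */, pages);
}
```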
| 217 | /** | 250 | /** |
| 218 | * follow_pfn - look up PFN at a user virtual address | 251 | * follow_pfn - look up PFN at a user virtual address |
| 219 | * @vma: memory mapping | 252 | * @vma: memory mapping |
| @@ -947,9 +980,6 @@ static int validate_mmap_request(struct file *file, | |||
| 947 | return -EOVERFLOW; | 980 | return -EOVERFLOW; |
| 948 | 981 | ||
| 949 | if (file) { | 982 | if (file) { |
| 950 | /* validate file mapping requests */ | ||
| 951 | struct address_space *mapping; | ||
| 952 | |||
| 953 | /* files must support mmap */ | 983 | /* files must support mmap */ |
| 954 | if (!file->f_op->mmap) | 984 | if (!file->f_op->mmap) |
| 955 | return -ENODEV; | 985 | return -ENODEV; |
| @@ -958,28 +988,22 @@ static int validate_mmap_request(struct file *file, | |||
| 958 | * - we support chardevs that provide their own "memory" | 988 | * - we support chardevs that provide their own "memory" |
| 959 | * - we support files/blockdevs that are memory backed | 989 | * - we support files/blockdevs that are memory backed |
| 960 | */ | 990 | */ |
| 961 | mapping = file->f_mapping; | 991 | if (file->f_op->mmap_capabilities) { |
| 962 | if (!mapping) | 992 | capabilities = file->f_op->mmap_capabilities(file); |
| 963 | mapping = file_inode(file)->i_mapping; | 993 | } else { |
| 964 | |||
| 965 | capabilities = 0; | ||
| 966 | if (mapping && mapping->backing_dev_info) | ||
| 967 | capabilities = mapping->backing_dev_info->capabilities; | ||
| 968 | |||
| 969 | if (!capabilities) { | ||
| 970 | /* no explicit capabilities set, so assume some | 994 | /* no explicit capabilities set, so assume some |
| 971 | * defaults */ | 995 | * defaults */ |
| 972 | switch (file_inode(file)->i_mode & S_IFMT) { | 996 | switch (file_inode(file)->i_mode & S_IFMT) { |
| 973 | case S_IFREG: | 997 | case S_IFREG: |
| 974 | case S_IFBLK: | 998 | case S_IFBLK: |
| 975 | capabilities = BDI_CAP_MAP_COPY; | 999 | capabilities = NOMMU_MAP_COPY; |
| 976 | break; | 1000 | break; |
| 977 | 1001 | ||
| 978 | case S_IFCHR: | 1002 | case S_IFCHR: |
| 979 | capabilities = | 1003 | capabilities = |
| 980 | BDI_CAP_MAP_DIRECT | | 1004 | NOMMU_MAP_DIRECT | |
| 981 | BDI_CAP_READ_MAP | | 1005 | NOMMU_MAP_READ | |
| 982 | BDI_CAP_WRITE_MAP; | 1006 | NOMMU_MAP_WRITE; |
| 983 | break; | 1007 | break; |
| 984 | 1008 | ||
| 985 | default: | 1009 | default: |
| @@ -990,9 +1014,9 @@ static int validate_mmap_request(struct file *file, | |||
| 990 | /* eliminate any capabilities that we can't support on this | 1014 | /* eliminate any capabilities that we can't support on this |
| 991 | * device */ | 1015 | * device */ |
| 992 | if (!file->f_op->get_unmapped_area) | 1016 | if (!file->f_op->get_unmapped_area) |
| 993 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1017 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 994 | if (!file->f_op->read) | 1018 | if (!file->f_op->read) |
| 995 | capabilities &= ~BDI_CAP_MAP_COPY; | 1019 | capabilities &= ~NOMMU_MAP_COPY; |
| 996 | 1020 | ||
| 997 | /* The file shall have been opened with read permission. */ | 1021 | /* The file shall have been opened with read permission. */ |
| 998 | if (!(file->f_mode & FMODE_READ)) | 1022 | if (!(file->f_mode & FMODE_READ)) |
| @@ -1011,29 +1035,29 @@ static int validate_mmap_request(struct file *file, | |||
| 1011 | if (locks_verify_locked(file)) | 1035 | if (locks_verify_locked(file)) |
| 1012 | return -EAGAIN; | 1036 | return -EAGAIN; |
| 1013 | 1037 | ||
| 1014 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 1038 | if (!(capabilities & NOMMU_MAP_DIRECT)) |
| 1015 | return -ENODEV; | 1039 | return -ENODEV; |
| 1016 | 1040 | ||
| 1017 | /* we mustn't privatise shared mappings */ | 1041 | /* we mustn't privatise shared mappings */ |
| 1018 | capabilities &= ~BDI_CAP_MAP_COPY; | 1042 | capabilities &= ~NOMMU_MAP_COPY; |
| 1019 | } else { | 1043 | } else { |
| 1020 | /* we're going to read the file into private memory we | 1044 | /* we're going to read the file into private memory we |
| 1021 | * allocate */ | 1045 | * allocate */ |
| 1022 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1046 | if (!(capabilities & NOMMU_MAP_COPY)) |
| 1023 | return -ENODEV; | 1047 | return -ENODEV; |
| 1024 | 1048 | ||
| 1025 | /* we don't permit a private writable mapping to be | 1049 | /* we don't permit a private writable mapping to be |
| 1026 | * shared with the backing device */ | 1050 | * shared with the backing device */ |
| 1027 | if (prot & PROT_WRITE) | 1051 | if (prot & PROT_WRITE) |
| 1028 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1052 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 1029 | } | 1053 | } |
| 1030 | 1054 | ||
| 1031 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1055 | if (capabilities & NOMMU_MAP_DIRECT) { |
| 1032 | if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || | 1056 | if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || |
| 1033 | ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || | 1057 | ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || |
| 1034 | ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) | 1058 | ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC)) |
| 1035 | ) { | 1059 | ) { |
| 1036 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1060 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 1037 | if (flags & MAP_SHARED) { | 1061 | if (flags & MAP_SHARED) { |
| 1038 | printk(KERN_WARNING | 1062 | printk(KERN_WARNING |
| 1039 | "MAP_SHARED not completely supported on !MMU\n"); | 1063 | "MAP_SHARED not completely supported on !MMU\n"); |
| @@ -1050,21 +1074,21 @@ static int validate_mmap_request(struct file *file, | |||
| 1050 | } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { | 1074 | } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { |
| 1051 | /* handle implication of PROT_EXEC by PROT_READ */ | 1075 | /* handle implication of PROT_EXEC by PROT_READ */ |
| 1052 | if (current->personality & READ_IMPLIES_EXEC) { | 1076 | if (current->personality & READ_IMPLIES_EXEC) { |
| 1053 | if (capabilities & BDI_CAP_EXEC_MAP) | 1077 | if (capabilities & NOMMU_MAP_EXEC) |
| 1054 | prot |= PROT_EXEC; | 1078 | prot |= PROT_EXEC; |
| 1055 | } | 1079 | } |
| 1056 | } else if ((prot & PROT_READ) && | 1080 | } else if ((prot & PROT_READ) && |
| 1057 | (prot & PROT_EXEC) && | 1081 | (prot & PROT_EXEC) && |
| 1058 | !(capabilities & BDI_CAP_EXEC_MAP) | 1082 | !(capabilities & NOMMU_MAP_EXEC) |
| 1059 | ) { | 1083 | ) { |
| 1060 | /* backing file is not executable, try to copy */ | 1084 | /* backing file is not executable, try to copy */ |
| 1061 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1085 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 1062 | } | 1086 | } |
| 1063 | } else { | 1087 | } else { |
| 1064 | /* anonymous mappings are always memory backed and can be | 1088 | /* anonymous mappings are always memory backed and can be |
| 1065 | * privately mapped | 1089 | * privately mapped |
| 1066 | */ | 1090 | */ |
| 1067 | capabilities = BDI_CAP_MAP_COPY; | 1091 | capabilities = NOMMU_MAP_COPY; |
| 1068 | 1092 | ||
| 1069 | /* handle PROT_EXEC implication by PROT_READ */ | 1093 | /* handle PROT_EXEC implication by PROT_READ */ |
| 1070 | if ((prot & PROT_READ) && | 1094 | if ((prot & PROT_READ) && |
| @@ -1096,7 +1120,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
| 1096 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); | 1120 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); |
| 1097 | /* vm_flags |= mm->def_flags; */ | 1121 | /* vm_flags |= mm->def_flags; */ |
| 1098 | 1122 | ||
| 1099 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) { | 1123 | if (!(capabilities & NOMMU_MAP_DIRECT)) { |
| 1100 | /* attempt to share read-only copies of mapped file chunks */ | 1124 | /* attempt to share read-only copies of mapped file chunks */ |
| 1101 | vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | 1125 | vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; |
| 1102 | if (file && !(prot & PROT_WRITE)) | 1126 | if (file && !(prot & PROT_WRITE)) |
| @@ -1105,7 +1129,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
| 1105 | /* overlay a shareable mapping on the backing device or inode | 1129 | /* overlay a shareable mapping on the backing device or inode |
| 1106 | * if possible - used for chardevs, ramfs/tmpfs/shmfs and | 1130 | * if possible - used for chardevs, ramfs/tmpfs/shmfs and |
| 1107 | * romfs/cramfs */ | 1131 | * romfs/cramfs */ |
| 1108 | vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); | 1132 | vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); |
| 1109 | if (flags & MAP_SHARED) | 1133 | if (flags & MAP_SHARED) |
| 1110 | vm_flags |= VM_SHARED; | 1134 | vm_flags |= VM_SHARED; |
| 1111 | } | 1135 | } |
| @@ -1158,7 +1182,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1158 | * shared mappings on devices or memory | 1182 | * shared mappings on devices or memory |
| 1159 | * - VM_MAYSHARE will be set if it may attempt to share | 1183 | * - VM_MAYSHARE will be set if it may attempt to share |
| 1160 | */ | 1184 | */ |
| 1161 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1185 | if (capabilities & NOMMU_MAP_DIRECT) { |
| 1162 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1186 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
| 1163 | if (ret == 0) { | 1187 | if (ret == 0) { |
| 1164 | /* shouldn't return success if we're not sharing */ | 1188 | /* shouldn't return success if we're not sharing */ |
| @@ -1189,11 +1213,9 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1189 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { | 1213 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { |
| 1190 | total = point; | 1214 | total = point; |
| 1191 | kdebug("try to alloc exact %lu pages", total); | 1215 | kdebug("try to alloc exact %lu pages", total); |
| 1192 | base = alloc_pages_exact(len, GFP_KERNEL); | ||
| 1193 | } else { | ||
| 1194 | base = (void *)__get_free_pages(GFP_KERNEL, order); | ||
| 1195 | } | 1216 | } |
| 1196 | 1217 | ||
| 1218 | base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); | ||
| 1197 | if (!base) | 1219 | if (!base) |
| 1198 | goto enomem; | 1220 | goto enomem; |
| 1199 | 1221 | ||
| @@ -1347,7 +1369,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1347 | if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && | 1369 | if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && |
| 1348 | !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { | 1370 | !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { |
| 1349 | /* new mapping is not a subset of the region */ | 1371 | /* new mapping is not a subset of the region */ |
| 1350 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 1372 | if (!(capabilities & NOMMU_MAP_DIRECT)) |
| 1351 | goto sharing_violation; | 1373 | goto sharing_violation; |
| 1352 | continue; | 1374 | continue; |
| 1353 | } | 1375 | } |
| @@ -1386,7 +1408,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1386 | * - this is the hook for quasi-memory character devices to | 1408 | * - this is the hook for quasi-memory character devices to |
| 1387 | * tell us the location of a shared mapping | 1409 | * tell us the location of a shared mapping |
| 1388 | */ | 1410 | */ |
| 1389 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1411 | if (capabilities & NOMMU_MAP_DIRECT) { |
| 1390 | addr = file->f_op->get_unmapped_area(file, addr, len, | 1412 | addr = file->f_op->get_unmapped_area(file, addr, len, |
| 1391 | pgoff, flags); | 1413 | pgoff, flags); |
| 1392 | if (IS_ERR_VALUE(addr)) { | 1414 | if (IS_ERR_VALUE(addr)) { |
| @@ -1398,10 +1420,10 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1398 | * the mapping so we'll have to attempt to copy | 1420 | * the mapping so we'll have to attempt to copy |
| 1399 | * it */ | 1421 | * it */ |
| 1400 | ret = -ENODEV; | 1422 | ret = -ENODEV; |
| 1401 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1423 | if (!(capabilities & NOMMU_MAP_COPY)) |
| 1402 | goto error_just_free; | 1424 | goto error_just_free; |
| 1403 | 1425 | ||
| 1404 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1426 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 1405 | } else { | 1427 | } else { |
| 1406 | vma->vm_start = region->vm_start = addr; | 1428 | vma->vm_start = region->vm_start = addr; |
| 1407 | vma->vm_end = region->vm_end = addr + len; | 1429 | vma->vm_end = region->vm_end = addr + len; |
| @@ -1412,7 +1434,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1412 | vma->vm_region = region; | 1434 | vma->vm_region = region; |
| 1413 | 1435 | ||
| 1414 | /* set up the mapping | 1436 | /* set up the mapping |
| 1415 | * - the region is filled in if BDI_CAP_MAP_DIRECT is still set | 1437 | * - the region is filled in if NOMMU_MAP_DIRECT is still set |
| 1416 | */ | 1438 | */ |
| 1417 | if (file && vma->vm_flags & VM_SHARED) | 1439 | if (file && vma->vm_flags & VM_SHARED) |
| 1418 | ret = do_mmap_shared_file(vma); | 1440 | ret = do_mmap_shared_file(vma); |
| @@ -1895,7 +1917,7 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
| 1895 | */ | 1917 | */ |
| 1896 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 1918 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
| 1897 | { | 1919 | { |
| 1898 | unsigned long free, allowed, reserve; | 1920 | long free, allowed, reserve; |
| 1899 | 1921 | ||
| 1900 | vm_acct_memory(pages); | 1922 | vm_acct_memory(pages); |
| 1901 | 1923 | ||
| @@ -1959,7 +1981,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 1959 | */ | 1981 | */ |
| 1960 | if (mm) { | 1982 | if (mm) { |
| 1961 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 1983 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
| 1962 | allowed -= min(mm->total_vm / 32, reserve); | 1984 | allowed -= min_t(long, mm->total_vm / 32, reserve); |
| 1963 | } | 1985 | } |
| 1964 | 1986 | ||
| 1965 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 1987 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
| @@ -1984,14 +2006,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1984 | } | 2006 | } |
| 1985 | EXPORT_SYMBOL(filemap_map_pages); | 2007 | EXPORT_SYMBOL(filemap_map_pages); |
| 1986 | 2008 | ||
| 1987 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
| 1988 | unsigned long size, pgoff_t pgoff) | ||
| 1989 | { | ||
| 1990 | BUG(); | ||
| 1991 | return 0; | ||
| 1992 | } | ||
| 1993 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
| 1994 | |||
| 1995 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | 2009 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
| 1996 | unsigned long addr, void *buf, int len, int write) | 2010 | unsigned long addr, void *buf, int len, int write) |
| 1997 | { | 2011 | { |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d503e9ce1c7b..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
| 169 | * The baseline for the badness score is the proportion of RAM that each | 169 | * The baseline for the badness score is the proportion of RAM that each |
| 170 | * task's rss, pagetable and swap space use. | 170 | * task's rss, pagetable and swap space use. |
| 171 | */ | 171 | */ |
| 172 | points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + | 172 | points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + |
| 173 | get_mm_counter(p->mm, MM_SWAPENTS); | 173 | atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); |
| 174 | task_unlock(p); | 174 | task_unlock(p); |
| 175 | 175 | ||
| 176 | /* | 176 | /* |
| @@ -266,8 +266,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
| 266 | * Don't allow any other task to have access to the reserves. | 266 | * Don't allow any other task to have access to the reserves. |
| 267 | */ | 267 | */ |
| 268 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | 268 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { |
| 269 | if (unlikely(frozen(task))) | ||
| 270 | __thaw_task(task); | ||
| 271 | if (!force_kill) | 269 | if (!force_kill) |
| 272 | return OOM_SCAN_ABORT; | 270 | return OOM_SCAN_ABORT; |
| 273 | } | 271 | } |
| @@ -353,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
| 353 | struct task_struct *p; | 351 | struct task_struct *p; |
| 354 | struct task_struct *task; | 352 | struct task_struct *task; |
| 355 | 353 | ||
| 356 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); | 354 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); |
| 357 | rcu_read_lock(); | 355 | rcu_read_lock(); |
| 358 | for_each_process(p) { | 356 | for_each_process(p) { |
| 359 | if (oom_unkillable_task(p, memcg, nodemask)) | 357 | if (oom_unkillable_task(p, memcg, nodemask)) |
| @@ -369,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
| 369 | continue; | 367 | continue; |
| 370 | } | 368 | } |
| 371 | 369 | ||
| 372 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", | 370 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", |
| 373 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 371 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
| 374 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 372 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
| 375 | atomic_long_read(&task->mm->nr_ptes), | 373 | atomic_long_read(&task->mm->nr_ptes), |
| 374 | mm_nr_pmds(task->mm), | ||
| 376 | get_mm_counter(task->mm, MM_SWAPENTS), | 375 | get_mm_counter(task->mm, MM_SWAPENTS), |
| 377 | task->signal->oom_score_adj, task->comm); | 376 | task->signal->oom_score_adj, task->comm); |
| 378 | task_unlock(task); | 377 | task_unlock(task); |
| @@ -400,20 +399,98 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 400 | } | 399 | } |
| 401 | 400 | ||
| 402 | /* | 401 | /* |
| 403 | * Number of OOM killer invocations (including memcg OOM killer). | 402 | * Number of OOM victims in flight |
| 404 | * Primarily used by PM freezer to check for potential races with | ||
| 405 | * OOM killed frozen task. | ||
| 406 | */ | 403 | */ |
| 407 | static atomic_t oom_kills = ATOMIC_INIT(0); | 404 | static atomic_t oom_victims = ATOMIC_INIT(0); |
| 405 | static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); | ||
| 408 | 406 | ||
| 409 | int oom_kills_count(void) | 407 | bool oom_killer_disabled __read_mostly; |
| 408 | static DECLARE_RWSEM(oom_sem); | ||
| 409 | |||
| 410 | /** | ||
| 411 | * mark_tsk_oom_victim - marks the given task as OOM victim. | ||
| 412 | * @tsk: task to mark | ||
| 413 | * | ||
| 414 | * Has to be called with oom_sem taken for read, and never after the | ||
| 415 | * OOM killer has already been disabled. | ||
| 416 | */ | ||
| 417 | void mark_tsk_oom_victim(struct task_struct *tsk) | ||
| 410 | { | 418 | { |
| 411 | return atomic_read(&oom_kills); | 419 | WARN_ON(oom_killer_disabled); |
| 420 | /* OOM killer might race with memcg OOM */ | ||
| 421 | if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) | ||
| 422 | return; | ||
| 423 | /* | ||
| 424 | * Make sure that the task is woken up from uninterruptible sleep | ||
| 425 | * if it is frozen, because otherwise the OOM killer would not be able to | ||
| 426 | * free any memory and would livelock. freezing_slow_path will tell the freezer | ||
| 427 | * that TIF_MEMDIE tasks should be ignored. | ||
| 428 | */ | ||
| 429 | __thaw_task(tsk); | ||
| 430 | atomic_inc(&oom_victims); | ||
| 431 | } | ||
| 432 | |||
| 433 | /** | ||
| 434 | * unmark_oom_victim - unmarks the current task as OOM victim. | ||
| 435 | * | ||
| 436 | * Wakes up all waiters in oom_killer_disable() | ||
| 437 | */ | ||
| 438 | void unmark_oom_victim(void) | ||
| 439 | { | ||
| 440 | if (!test_and_clear_thread_flag(TIF_MEMDIE)) | ||
| 441 | return; | ||
| 442 | |||
| 443 | down_read(&oom_sem); | ||
| 444 | /* | ||
| 445 | * There is no need to signal the last oom_victim if there | ||
| 446 | * is nobody who cares. | ||
| 447 | */ | ||
| 448 | if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) | ||
| 449 | wake_up_all(&oom_victims_wait); | ||
| 450 | up_read(&oom_sem); | ||
| 451 | } | ||
| 452 | |||
| 453 | /** | ||
| 454 | * oom_killer_disable - disable OOM killer | ||
| 455 | * | ||
| 456 | * Forces all page allocations to fail rather than trigger OOM killer. | ||
| 457 | * Will block and wait until all OOM victims are killed. | ||
| 458 | * | ||
| 459 | * The function cannot be called when there are runnable user tasks because | ||
| 460 | * userspace would see unexpected allocation failures as a result. Any | ||
| 461 | * new use of this function should be discussed with the MM maintainers. | ||
| 462 | * | ||
| 463 | * Returns true if successful and false if the OOM killer cannot be | ||
| 464 | * disabled. | ||
| 465 | */ | ||
| 466 | bool oom_killer_disable(void) | ||
| 467 | { | ||
| 468 | /* | ||
| 469 | * Make sure to not race with an ongoing OOM killer | ||
| 470 | * and that the current is not the victim. | ||
| 471 | */ | ||
| 472 | down_write(&oom_sem); | ||
| 473 | if (test_thread_flag(TIF_MEMDIE)) { | ||
| 474 | up_write(&oom_sem); | ||
| 475 | return false; | ||
| 476 | } | ||
| 477 | |||
| 478 | oom_killer_disabled = true; | ||
| 479 | up_write(&oom_sem); | ||
| 480 | |||
| 481 | wait_event(oom_victims_wait, !atomic_read(&oom_victims)); | ||
| 482 | |||
| 483 | return true; | ||
| 412 | } | 484 | } |
| 413 | 485 | ||
| 414 | void note_oom_kill(void) | 486 | /** |
| 487 | * oom_killer_enable - enable OOM killer | ||
| 488 | */ | ||
| 489 | void oom_killer_enable(void) | ||
| 415 | { | 490 | { |
| 416 | atomic_inc(&oom_kills); | 491 | down_write(&oom_sem); |
| 492 | oom_killer_disabled = false; | ||
| 493 | up_write(&oom_sem); | ||
| 417 | } | 494 | } |
| 418 | 495 | ||
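The disable/enable pair above is meant for paths such as suspend/hibernate, where no user task may run while the killer is off. A hedged sketch of the intended calling pattern (the function name and error handling are hypothetical):

```c
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/oom.h>

/* Hypothetical suspend-side helper: freeze userspace first (so no runnable
 * user tasks remain), then turn the OOM killer off for the duration. */
static int freeze_and_disable_oom(void)
{
	int error = freeze_processes();

	if (error)
		return error;

	if (!oom_killer_disable()) {
		/* current is itself a racing OOM victim; back out */
		thaw_processes();
		return -EBUSY;
	}

	/* ... suspend work runs here; allocations fail instead of OOM killing ... */

	oom_killer_enable();
	thaw_processes();
	return 0;
}
```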
| 419 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 496 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
| @@ -438,11 +515,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 438 | * If the task is already exiting, don't alarm the sysadmin or kill | 515 | * If the task is already exiting, don't alarm the sysadmin or kill |
| 439 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 516 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
| 440 | */ | 517 | */ |
| 441 | if (task_will_free_mem(p)) { | 518 | task_lock(p); |
| 442 | set_tsk_thread_flag(p, TIF_MEMDIE); | 519 | if (p->mm && task_will_free_mem(p)) { |
| 520 | mark_tsk_oom_victim(p); | ||
| 521 | task_unlock(p); | ||
| 443 | put_task_struct(p); | 522 | put_task_struct(p); |
| 444 | return; | 523 | return; |
| 445 | } | 524 | } |
| 525 | task_unlock(p); | ||
| 446 | 526 | ||
| 447 | if (__ratelimit(&oom_rs)) | 527 | if (__ratelimit(&oom_rs)) |
| 448 | dump_header(p, gfp_mask, order, memcg, nodemask); | 528 | dump_header(p, gfp_mask, order, memcg, nodemask); |
| @@ -492,6 +572,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 492 | 572 | ||
| 493 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 573 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
| 494 | mm = victim->mm; | 574 | mm = victim->mm; |
| 575 | mark_tsk_oom_victim(victim); | ||
| 495 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 576 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
| 496 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | 577 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), |
| 497 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | 578 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), |
| @@ -522,7 +603,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 522 | } | 603 | } |
| 523 | rcu_read_unlock(); | 604 | rcu_read_unlock(); |
| 524 | 605 | ||
| 525 | set_tsk_thread_flag(victim, TIF_MEMDIE); | ||
| 526 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 606 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
| 527 | put_task_struct(victim); | 607 | put_task_struct(victim); |
| 528 | } | 608 | } |
| @@ -611,7 +691,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
| 611 | } | 691 | } |
| 612 | 692 | ||
| 613 | /** | 693 | /** |
| 614 | * out_of_memory - kill the "best" process when we run out of memory | 694 | * __out_of_memory - kill the "best" process when we run out of memory |
| 615 | * @zonelist: zonelist pointer | 695 | * @zonelist: zonelist pointer |
| 616 | * @gfp_mask: memory allocation flags | 696 | * @gfp_mask: memory allocation flags |
| 617 | * @order: amount of memory being requested as a power of 2 | 697 | * @order: amount of memory being requested as a power of 2 |
| @@ -623,7 +703,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
| 623 | * OR try to be smart about which process to kill. Note that we | 703 | * OR try to be smart about which process to kill. Note that we |
| 624 | * don't have to be perfect here, we just have to be good. | 704 | * don't have to be perfect here, we just have to be good. |
| 625 | */ | 705 | */ |
| 626 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 706 | static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
| 627 | int order, nodemask_t *nodemask, bool force_kill) | 707 | int order, nodemask_t *nodemask, bool force_kill) |
| 628 | { | 708 | { |
| 629 | const nodemask_t *mpol_mask; | 709 | const nodemask_t *mpol_mask; |
| @@ -643,9 +723,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
| 643 | * If current has a pending SIGKILL or is exiting, then automatically | 723 | * If current has a pending SIGKILL or is exiting, then automatically |
| 644 | * select it. The goal is to allow it to allocate so that it may | 724 | * select it. The goal is to allow it to allocate so that it may |
| 645 | * quickly exit and free its memory. | 725 | * quickly exit and free its memory. |
| 726 | * | ||
| 727 | * But don't select if current has already released its mm and cleared | ||
| 728 | * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur. | ||
| 646 | */ | 729 | */ |
| 647 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 730 | if (current->mm && |
| 648 | set_thread_flag(TIF_MEMDIE); | 731 | (fatal_signal_pending(current) || task_will_free_mem(current))) { |
| 732 | mark_tsk_oom_victim(current); | ||
| 649 | return; | 733 | return; |
| 650 | } | 734 | } |
| 651 | 735 | ||
| @@ -688,6 +772,32 @@ out: | |||
| 688 | schedule_timeout_killable(1); | 772 | schedule_timeout_killable(1); |
| 689 | } | 773 | } |
| 690 | 774 | ||
| 775 | /** | ||
| 776 | * out_of_memory - tries to invoke OOM killer. | ||
| 777 | * @zonelist: zonelist pointer | ||
| 778 | * @gfp_mask: memory allocation flags | ||
| 779 | * @order: amount of memory being requested as a power of 2 | ||
| 780 | * @nodemask: nodemask passed to page allocator | ||
| 781 | * @force_kill: true if a task must be killed, even if others are exiting | ||
| 782 | * | ||
| 783 | * Invokes __out_of_memory() and returns true unless the OOM killer has been | ||
| 784 | * disabled by oom_killer_disable(), in which case it returns false. | ||
| 785 | */ | ||
| 786 | bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | ||
| 787 | int order, nodemask_t *nodemask, bool force_kill) | ||
| 788 | { | ||
| 789 | bool ret = false; | ||
| 790 | |||
| 791 | down_read(&oom_sem); | ||
| 792 | if (!oom_killer_disabled) { | ||
| 793 | __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); | ||
| 794 | ret = true; | ||
| 795 | } | ||
| 796 | up_read(&oom_sem); | ||
| 797 | |||
| 798 | return ret; | ||
| 799 | } | ||
| 800 | |||
| 691 | /* | 801 | /* |
| 692 | * The pagefault handler calls here because it is out of memory, so kill a | 802 | * The pagefault handler calls here because it is out of memory, so kill a |
| 693 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a | 803 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a |
| @@ -697,12 +807,25 @@ void pagefault_out_of_memory(void) | |||
| 697 | { | 807 | { |
| 698 | struct zonelist *zonelist; | 808 | struct zonelist *zonelist; |
| 699 | 809 | ||
| 810 | down_read(&oom_sem); | ||
| 700 | if (mem_cgroup_oom_synchronize(true)) | 811 | if (mem_cgroup_oom_synchronize(true)) |
| 701 | return; | 812 | goto unlock; |
| 702 | 813 | ||
| 703 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); | 814 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); |
| 704 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { | 815 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { |
| 705 | out_of_memory(NULL, 0, 0, NULL, false); | 816 | if (!oom_killer_disabled) |
| 817 | __out_of_memory(NULL, 0, 0, NULL, false); | ||
| 818 | else | ||
| 819 | /* | ||
| 820 | * There shouldn't be any user tasks runable while the | ||
| 821 | * OOM killer is disabled so the current task has to | ||
| 822 | * be a racing OOM victim for which oom_killer_disable() | ||
| 823 | * is waiting for. | ||
| 824 | */ | ||
| 825 | WARN_ON(test_thread_flag(TIF_MEMDIE)); | ||
| 826 | |||
| 706 | oom_zonelist_unlock(zonelist, GFP_KERNEL); | 827 | oom_zonelist_unlock(zonelist, GFP_KERNEL); |
| 707 | } | 828 | } |
| 829 | unlock: | ||
| 830 | up_read(&oom_sem); | ||
| 708 | } | 831 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..45e187b2d971 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
| 1351 | unsigned long task_ratelimit; | 1351 | unsigned long task_ratelimit; |
| 1352 | unsigned long dirty_ratelimit; | 1352 | unsigned long dirty_ratelimit; |
| 1353 | unsigned long pos_ratio; | 1353 | unsigned long pos_ratio; |
| 1354 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1354 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
| 1355 | bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; | 1355 | bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; |
| 1356 | unsigned long start_time = jiffies; | 1356 | unsigned long start_time = jiffies; |
| 1357 | 1357 | ||
| @@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | |||
| 1574 | */ | 1574 | */ |
| 1575 | void balance_dirty_pages_ratelimited(struct address_space *mapping) | 1575 | void balance_dirty_pages_ratelimited(struct address_space *mapping) |
| 1576 | { | 1576 | { |
| 1577 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1577 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
| 1578 | int ratelimit; | 1578 | int ratelimit; |
| 1579 | int *p; | 1579 | int *p; |
| 1580 | 1580 | ||
| @@ -1929,7 +1929,7 @@ continue_unlock: | |||
| 1929 | if (!clear_page_dirty_for_io(page)) | 1929 | if (!clear_page_dirty_for_io(page)) |
| 1930 | goto continue_unlock; | 1930 | goto continue_unlock; |
| 1931 | 1931 | ||
| 1932 | trace_wbc_writepage(wbc, mapping->backing_dev_info); | 1932 | trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); |
| 1933 | ret = (*writepage)(page, wbc, data); | 1933 | ret = (*writepage)(page, wbc, data); |
| 1934 | if (unlikely(ret)) { | 1934 | if (unlikely(ret)) { |
| 1935 | if (ret == AOP_WRITEPAGE_ACTIVATE) { | 1935 | if (ret == AOP_WRITEPAGE_ACTIVATE) { |
| @@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
| 2094 | trace_writeback_dirty_page(page, mapping); | 2094 | trace_writeback_dirty_page(page, mapping); |
| 2095 | 2095 | ||
| 2096 | if (mapping_cap_account_dirty(mapping)) { | 2096 | if (mapping_cap_account_dirty(mapping)) { |
| 2097 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); | ||
| 2098 | |||
| 2097 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 2099 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
| 2098 | __inc_zone_page_state(page, NR_DIRTIED); | 2100 | __inc_zone_page_state(page, NR_DIRTIED); |
| 2099 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 2101 | __inc_bdi_stat(bdi, BDI_RECLAIMABLE); |
| 2100 | __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | 2102 | __inc_bdi_stat(bdi, BDI_DIRTIED); |
| 2101 | task_io_account_write(PAGE_CACHE_SIZE); | 2103 | task_io_account_write(PAGE_CACHE_SIZE); |
| 2102 | current->nr_dirtied++; | 2104 | current->nr_dirtied++; |
| 2103 | this_cpu_inc(bdp_ratelimits); | 2105 | this_cpu_inc(bdp_ratelimits); |
| @@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page) | |||
| 2156 | if (mapping && mapping_cap_account_dirty(mapping)) { | 2158 | if (mapping && mapping_cap_account_dirty(mapping)) { |
| 2157 | current->nr_dirtied--; | 2159 | current->nr_dirtied--; |
| 2158 | dec_zone_page_state(page, NR_DIRTIED); | 2160 | dec_zone_page_state(page, NR_DIRTIED); |
| 2159 | dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | 2161 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); |
| 2160 | } | 2162 | } |
| 2161 | } | 2163 | } |
| 2162 | EXPORT_SYMBOL(account_page_redirty); | 2164 | EXPORT_SYMBOL(account_page_redirty); |
| @@ -2168,9 +2170,12 @@ EXPORT_SYMBOL(account_page_redirty); | |||
| 2168 | */ | 2170 | */ |
| 2169 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) | 2171 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) |
| 2170 | { | 2172 | { |
| 2173 | int ret; | ||
| 2174 | |||
| 2171 | wbc->pages_skipped++; | 2175 | wbc->pages_skipped++; |
| 2176 | ret = __set_page_dirty_nobuffers(page); | ||
| 2172 | account_page_redirty(page); | 2177 | account_page_redirty(page); |
| 2173 | return __set_page_dirty_nobuffers(page); | 2178 | return ret; |
| 2174 | } | 2179 | } |
| 2175 | EXPORT_SYMBOL(redirty_page_for_writepage); | 2180 | EXPORT_SYMBOL(redirty_page_for_writepage); |
| 2176 | 2181 | ||
| @@ -2295,7 +2300,7 @@ int clear_page_dirty_for_io(struct page *page) | |||
| 2295 | */ | 2300 | */ |
| 2296 | if (TestClearPageDirty(page)) { | 2301 | if (TestClearPageDirty(page)) { |
| 2297 | dec_zone_page_state(page, NR_FILE_DIRTY); | 2302 | dec_zone_page_state(page, NR_FILE_DIRTY); |
| 2298 | dec_bdi_stat(mapping->backing_dev_info, | 2303 | dec_bdi_stat(inode_to_bdi(mapping->host), |
| 2299 | BDI_RECLAIMABLE); | 2304 | BDI_RECLAIMABLE); |
| 2300 | return 1; | 2305 | return 1; |
| 2301 | } | 2306 | } |
| @@ -2308,14 +2313,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); | |||
| 2308 | int test_clear_page_writeback(struct page *page) | 2313 | int test_clear_page_writeback(struct page *page) |
| 2309 | { | 2314 | { |
| 2310 | struct address_space *mapping = page_mapping(page); | 2315 | struct address_space *mapping = page_mapping(page); |
| 2311 | unsigned long memcg_flags; | ||
| 2312 | struct mem_cgroup *memcg; | 2316 | struct mem_cgroup *memcg; |
| 2313 | bool locked; | ||
| 2314 | int ret; | 2317 | int ret; |
| 2315 | 2318 | ||
| 2316 | memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); | 2319 | memcg = mem_cgroup_begin_page_stat(page); |
| 2317 | if (mapping) { | 2320 | if (mapping) { |
| 2318 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2321 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
| 2319 | unsigned long flags; | 2322 | unsigned long flags; |
| 2320 | 2323 | ||
| 2321 | spin_lock_irqsave(&mapping->tree_lock, flags); | 2324 | spin_lock_irqsave(&mapping->tree_lock, flags); |
| @@ -2338,21 +2341,19 @@ int test_clear_page_writeback(struct page *page) | |||
| 2338 | dec_zone_page_state(page, NR_WRITEBACK); | 2341 | dec_zone_page_state(page, NR_WRITEBACK); |
| 2339 | inc_zone_page_state(page, NR_WRITTEN); | 2342 | inc_zone_page_state(page, NR_WRITTEN); |
| 2340 | } | 2343 | } |
| 2341 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); | 2344 | mem_cgroup_end_page_stat(memcg); |
| 2342 | return ret; | 2345 | return ret; |
| 2343 | } | 2346 | } |
| 2344 | 2347 | ||
| 2345 | int __test_set_page_writeback(struct page *page, bool keep_write) | 2348 | int __test_set_page_writeback(struct page *page, bool keep_write) |
| 2346 | { | 2349 | { |
| 2347 | struct address_space *mapping = page_mapping(page); | 2350 | struct address_space *mapping = page_mapping(page); |
| 2348 | unsigned long memcg_flags; | ||
| 2349 | struct mem_cgroup *memcg; | 2351 | struct mem_cgroup *memcg; |
| 2350 | bool locked; | ||
| 2351 | int ret; | 2352 | int ret; |
| 2352 | 2353 | ||
| 2353 | memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); | 2354 | memcg = mem_cgroup_begin_page_stat(page); |
| 2354 | if (mapping) { | 2355 | if (mapping) { |
| 2355 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2356 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
| 2356 | unsigned long flags; | 2357 | unsigned long flags; |
| 2357 | 2358 | ||
| 2358 | spin_lock_irqsave(&mapping->tree_lock, flags); | 2359 | spin_lock_irqsave(&mapping->tree_lock, flags); |
| @@ -2380,7 +2381,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
| 2380 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); | 2381 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); |
| 2381 | inc_zone_page_state(page, NR_WRITEBACK); | 2382 | inc_zone_page_state(page, NR_WRITEBACK); |
| 2382 | } | 2383 | } |
| 2383 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); | 2384 | mem_cgroup_end_page_stat(memcg); |
| 2384 | return ret; | 2385 | return ret; |
| 2385 | 2386 | ||
| 2386 | } | 2387 | } |
| @@ -2406,12 +2407,7 @@ EXPORT_SYMBOL(mapping_tagged); | |||
| 2406 | */ | 2407 | */ |
| 2407 | void wait_for_stable_page(struct page *page) | 2408 | void wait_for_stable_page(struct page *page) |
| 2408 | { | 2409 | { |
| 2409 | struct address_space *mapping = page_mapping(page); | 2410 | if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) |
| 2410 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2411 | wait_on_page_writeback(page); |
| 2411 | |||
| 2412 | if (!bdi_cap_stable_pages_required(bdi)) | ||
| 2413 | return; | ||
| 2414 | |||
| 2415 | wait_on_page_writeback(page); | ||
| 2416 | } | 2412 | } |
| 2417 | EXPORT_SYMBOL_GPL(wait_for_stable_page); | 2413 | EXPORT_SYMBOL_GPL(wait_for_stable_page); |
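The conversions in this file replace mapping->backing_dev_info with inode_to_bdi(mapping->host). As rough orientation only, such a helper resolves the bdi from the inode's superblock; the sketch below is an assumption-laden approximation, not the kernel's exact implementation:

```c
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

/* Approximation of the helper used above; the real inode_to_bdi() also
 * handles corner cases (e.g. a NULL inode) not shown here. */
static inline struct backing_dev_info *inode_to_bdi_sketch(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (sb_is_blkdev_sb(sb))	/* block devices carry their own bdi */
		return blk_get_backing_dev_info(I_BDEV(inode));
	return sb->s_bdi;
}
```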
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e20f9c2fa5a..7abfa70cdc1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
| 26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
| 27 | #include <linux/kmemcheck.h> | 27 | #include <linux/kmemcheck.h> |
| 28 | #include <linux/kasan.h> | ||
| 28 | #include <linux/module.h> | 29 | #include <linux/module.h> |
| 29 | #include <linux/suspend.h> | 30 | #include <linux/suspend.h> |
| 30 | #include <linux/pagevec.h> | 31 | #include <linux/pagevec.h> |
| @@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order); | |||
| 172 | * 1G machine -> (16M dma, 784M normal, 224M high) | 173 | * 1G machine -> (16M dma, 784M normal, 224M high) |
| 173 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 174 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
| 174 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 175 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
| 175 | * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA | 176 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
| 176 | * | 177 | * |
| 177 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 178 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
| 178 | * don't need any ZONE_NORMAL reservation | 179 | * don't need any ZONE_NORMAL reservation |
| @@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) | |||
| 244 | PB_migrate, PB_migrate_end); | 245 | PB_migrate, PB_migrate_end); |
| 245 | } | 246 | } |
| 246 | 247 | ||
| 247 | bool oom_killer_disabled __read_mostly; | ||
| 248 | |||
| 249 | #ifdef CONFIG_DEBUG_VM | 248 | #ifdef CONFIG_DEBUG_VM |
| 250 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 249 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| 251 | { | 250 | { |
| @@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
| 381 | } | 380 | } |
| 382 | } | 381 | } |
| 383 | 382 | ||
| 384 | /* update __split_huge_page_refcount if you change this function */ | ||
| 385 | static int destroy_compound_page(struct page *page, unsigned long order) | ||
| 386 | { | ||
| 387 | int i; | ||
| 388 | int nr_pages = 1 << order; | ||
| 389 | int bad = 0; | ||
| 390 | |||
| 391 | if (unlikely(compound_order(page) != order)) { | ||
| 392 | bad_page(page, "wrong compound order", 0); | ||
| 393 | bad++; | ||
| 394 | } | ||
| 395 | |||
| 396 | __ClearPageHead(page); | ||
| 397 | |||
| 398 | for (i = 1; i < nr_pages; i++) { | ||
| 399 | struct page *p = page + i; | ||
| 400 | |||
| 401 | if (unlikely(!PageTail(p))) { | ||
| 402 | bad_page(page, "PageTail not set", 0); | ||
| 403 | bad++; | ||
| 404 | } else if (unlikely(p->first_page != page)) { | ||
| 405 | bad_page(page, "first_page not consistent", 0); | ||
| 406 | bad++; | ||
| 407 | } | ||
| 408 | __ClearPageTail(p); | ||
| 409 | } | ||
| 410 | |||
| 411 | return bad; | ||
| 412 | } | ||
| 413 | |||
| 414 | static inline void prep_zero_page(struct page *page, unsigned int order, | 383 | static inline void prep_zero_page(struct page *page, unsigned int order, |
| 415 | gfp_t gfp_flags) | 384 | gfp_t gfp_flags) |
| 416 | { | 385 | { |
| @@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 552 | return 0; | 521 | return 0; |
| 553 | 522 | ||
| 554 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 523 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
| 555 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
| 556 | |||
| 557 | if (page_zone_id(page) != page_zone_id(buddy)) | 524 | if (page_zone_id(page) != page_zone_id(buddy)) |
| 558 | return 0; | 525 | return 0; |
| 559 | 526 | ||
| 527 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
| 528 | |||
| 560 | return 1; | 529 | return 1; |
| 561 | } | 530 | } |
| 562 | 531 | ||
| 563 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 532 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
| 564 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
| 565 | |||
| 566 | /* | 533 | /* |
| 567 | * zone check is done late to avoid uselessly | 534 | * zone check is done late to avoid uselessly |
| 568 | * calculating zone/node ids for pages that could | 535 | * calculating zone/node ids for pages that could |
| @@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 571 | if (page_zone_id(page) != page_zone_id(buddy)) | 538 | if (page_zone_id(page) != page_zone_id(buddy)) |
| 572 | return 0; | 539 | return 0; |
| 573 | 540 | ||
| 541 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
| 542 | |||
| 574 | return 1; | 543 | return 1; |
| 575 | } | 544 | } |
| 576 | return 0; | 545 | return 0; |
| @@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page, | |||
| 613 | int max_order = MAX_ORDER; | 582 | int max_order = MAX_ORDER; |
| 614 | 583 | ||
| 615 | VM_BUG_ON(!zone_is_initialized(zone)); | 584 | VM_BUG_ON(!zone_is_initialized(zone)); |
| 616 | 585 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | |
| 617 | if (unlikely(PageCompound(page))) | ||
| 618 | if (unlikely(destroy_compound_page(page, order))) | ||
| 619 | return; | ||
| 620 | 586 | ||
| 621 | VM_BUG_ON(migratetype == -1); | 587 | VM_BUG_ON(migratetype == -1); |
| 622 | if (is_migrate_isolate(migratetype)) { | 588 | if (is_migrate_isolate(migratetype)) { |
| @@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone, | |||
| 797 | spin_unlock(&zone->lock); | 763 | spin_unlock(&zone->lock); |
| 798 | } | 764 | } |
| 799 | 765 | ||
| 766 | static int free_tail_pages_check(struct page *head_page, struct page *page) | ||
| 767 | { | ||
| 768 | if (!IS_ENABLED(CONFIG_DEBUG_VM)) | ||
| 769 | return 0; | ||
| 770 | if (unlikely(!PageTail(page))) { | ||
| 771 | bad_page(page, "PageTail not set", 0); | ||
| 772 | return 1; | ||
| 773 | } | ||
| 774 | if (unlikely(page->first_page != head_page)) { | ||
| 775 | bad_page(page, "first_page not consistent", 0); | ||
| 776 | return 1; | ||
| 777 | } | ||
| 778 | return 0; | ||
| 779 | } | ||
| 780 | |||
| 800 | static bool free_pages_prepare(struct page *page, unsigned int order) | 781 | static bool free_pages_prepare(struct page *page, unsigned int order) |
| 801 | { | 782 | { |
| 802 | int i; | 783 | bool compound = PageCompound(page); |
| 803 | int bad = 0; | 784 | int i, bad = 0; |
| 804 | 785 | ||
| 805 | VM_BUG_ON_PAGE(PageTail(page), page); | 786 | VM_BUG_ON_PAGE(PageTail(page), page); |
| 806 | VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); | 787 | VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
| 807 | 788 | ||
| 808 | trace_mm_page_free(page, order); | 789 | trace_mm_page_free(page, order); |
| 809 | kmemcheck_free_shadow(page, order); | 790 | kmemcheck_free_shadow(page, order); |
| 791 | kasan_free_pages(page, order); | ||
| 810 | 792 | ||
| 811 | if (PageAnon(page)) | 793 | if (PageAnon(page)) |
| 812 | page->mapping = NULL; | 794 | page->mapping = NULL; |
| 813 | for (i = 0; i < (1 << order); i++) | 795 | bad += free_pages_check(page); |
| 796 | for (i = 1; i < (1 << order); i++) { | ||
| 797 | if (compound) | ||
| 798 | bad += free_tail_pages_check(page, page + i); | ||
| 814 | bad += free_pages_check(page + i); | 799 | bad += free_pages_check(page + i); |
| 800 | } | ||
| 815 | if (bad) | 801 | if (bad) |
| 816 | return false; | 802 | return false; |
| 817 | 803 | ||
| @@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page) | |||
| 970 | return 0; | 956 | return 0; |
| 971 | } | 957 | } |
| 972 | 958 | ||
| 973 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | 959 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
| 960 | int alloc_flags) | ||
| 974 | { | 961 | { |
| 975 | int i; | 962 | int i; |
| 976 | 963 | ||
| @@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
| 985 | 972 | ||
| 986 | arch_alloc_page(page, order); | 973 | arch_alloc_page(page, order); |
| 987 | kernel_map_pages(page, 1 << order, 1); | 974 | kernel_map_pages(page, 1 << order, 1); |
| 975 | kasan_alloc_pages(page, order); | ||
| 988 | 976 | ||
| 989 | if (gfp_flags & __GFP_ZERO) | 977 | if (gfp_flags & __GFP_ZERO) |
| 990 | prep_zero_page(page, order, gfp_flags); | 978 | prep_zero_page(page, order, gfp_flags); |
| @@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
| 994 | 982 | ||
| 995 | set_page_owner(page, order, gfp_flags); | 983 | set_page_owner(page, order, gfp_flags); |
| 996 | 984 | ||
| 985 | /* | ||
| 986 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to | ||
| 987 | * allocate the page. The expectation is that the caller is taking | ||
| 988 | * steps that will free more memory. The caller should avoid the page | ||
| 989 | * being used for !PFMEMALLOC purposes. | ||
| 990 | */ | ||
| 991 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
| 992 | |||
| 997 | return 0; | 993 | return 0; |
| 998 | } | 994 | } |
| 999 | 995 | ||
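The prep_new_page() hunks above also move the pfmemalloc bookkeeping out of the zonelist walk: the flag is now set while the page is being prepared, based on the alloc_flags that were threaded in. A reassembled fragment, with the surrounding page checks and compound/zero preparation elided:

    /* inside prep_new_page(page, order, gfp_flags, alloc_flags) */
    arch_alloc_page(page, order);
    kernel_map_pages(page, 1 << order, 1);
    kasan_alloc_pages(page, order);

    /*
     * Record that ALLOC_NO_WATERMARKS was needed; the caller is expected to
     * be freeing memory and must not use the page for !PFMEMALLOC purposes.
     */
    page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);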
| @@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page, | |||
| 1130 | } | 1126 | } |
| 1131 | 1127 | ||
| 1132 | /* | 1128 | /* |
| 1133 | * If breaking a large block of pages, move all free pages to the preferred | 1129 | * When we are falling back to another migratetype during allocation, try to |
| 1134 | * allocation list. If falling back for a reclaimable kernel allocation, be | 1130 | * steal extra free pages from the same pageblocks to satisfy further |
| 1135 | * more aggressive about taking ownership of free pages. | 1131 | * allocations, instead of polluting multiple pageblocks. |
| 1136 | * | 1132 | * |
| 1137 | * On the other hand, never change migration type of MIGRATE_CMA pageblocks | 1133 | * If we are stealing a relatively large buddy page, it is likely there will |
| 1138 | * nor move CMA pages to different free lists. We don't want unmovable pages | 1134 | * be more free pages in the pageblock, so try to steal them all. For |
| 1139 | * to be allocated from MIGRATE_CMA areas. | 1135 | * reclaimable and unmovable allocations, we steal regardless of page size, |
| 1136 | * as fragmentation caused by those allocations polluting movable pageblocks | ||
| 1137 | * is worse than movable allocations stealing from unmovable and reclaimable | ||
| 1138 | * pageblocks. | ||
| 1140 | * | 1139 | * |
| 1141 | * Returns the new migratetype of the pageblock (or the same old migratetype | 1140 | * If we claim more than half of the pageblock, change pageblock's migratetype |
| 1142 | * if it was unchanged). | 1141 | * as well. |
| 1143 | */ | 1142 | */ |
| 1144 | static int try_to_steal_freepages(struct zone *zone, struct page *page, | 1143 | static void try_to_steal_freepages(struct zone *zone, struct page *page, |
| 1145 | int start_type, int fallback_type) | 1144 | int start_type, int fallback_type) |
| 1146 | { | 1145 | { |
| 1147 | int current_order = page_order(page); | 1146 | int current_order = page_order(page); |
| 1148 | 1147 | ||
| 1149 | /* | ||
| 1150 | * When borrowing from MIGRATE_CMA, we need to release the excess | ||
| 1151 | * buddy pages to CMA itself. We also ensure the freepage_migratetype | ||
| 1152 | * is set to CMA so it is returned to the correct freelist in case | ||
| 1153 | * the page ends up being not actually allocated from the pcp lists. | ||
| 1154 | */ | ||
| 1155 | if (is_migrate_cma(fallback_type)) | ||
| 1156 | return fallback_type; | ||
| 1157 | |||
| 1158 | /* Take ownership for orders >= pageblock_order */ | 1148 | /* Take ownership for orders >= pageblock_order */ |
| 1159 | if (current_order >= pageblock_order) { | 1149 | if (current_order >= pageblock_order) { |
| 1160 | change_pageblock_range(page, current_order, start_type); | 1150 | change_pageblock_range(page, current_order, start_type); |
| 1161 | return start_type; | 1151 | return; |
| 1162 | } | 1152 | } |
| 1163 | 1153 | ||
| 1164 | if (current_order >= pageblock_order / 2 || | 1154 | if (current_order >= pageblock_order / 2 || |
| 1165 | start_type == MIGRATE_RECLAIMABLE || | 1155 | start_type == MIGRATE_RECLAIMABLE || |
| 1156 | start_type == MIGRATE_UNMOVABLE || | ||
| 1166 | page_group_by_mobility_disabled) { | 1157 | page_group_by_mobility_disabled) { |
| 1167 | int pages; | 1158 | int pages; |
| 1168 | 1159 | ||
| @@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
| 1170 | 1161 | ||
| 1171 | /* Claim the whole block if over half of it is free */ | 1162 | /* Claim the whole block if over half of it is free */ |
| 1172 | if (pages >= (1 << (pageblock_order-1)) || | 1163 | if (pages >= (1 << (pageblock_order-1)) || |
| 1173 | page_group_by_mobility_disabled) { | 1164 | page_group_by_mobility_disabled) |
| 1174 | |||
| 1175 | set_pageblock_migratetype(page, start_type); | 1165 | set_pageblock_migratetype(page, start_type); |
| 1176 | return start_type; | ||
| 1177 | } | ||
| 1178 | |||
| 1179 | } | 1166 | } |
| 1180 | |||
| 1181 | return fallback_type; | ||
| 1182 | } | 1167 | } |
| 1183 | 1168 | ||
| 1184 | /* Remove an element from the buddy allocator from the fallback list */ | 1169 | /* Remove an element from the buddy allocator from the fallback list */ |
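For readability, the rewritten try_to_steal_freepages() reassembled from the hunks above. The move_freepages_block() call sits in unchanged context that the diff does not show, and the CMA special case moves to the caller in the next hunk:

    static void try_to_steal_freepages(struct zone *zone, struct page *page,
                                       int start_type, int fallback_type)
    {
            int current_order = page_order(page);

            /* Take ownership for orders >= pageblock_order */
            if (current_order >= pageblock_order) {
                    change_pageblock_range(page, current_order, start_type);
                    return;
            }

            if (current_order >= pageblock_order / 2 ||
                start_type == MIGRATE_RECLAIMABLE ||
                start_type == MIGRATE_UNMOVABLE ||
                page_group_by_mobility_disabled) {
                    int pages;

                    /* unchanged context, not shown in the hunk */
                    pages = move_freepages_block(zone, page, start_type);

                    /* Claim the whole block if over half of it is free */
                    if (pages >= (1 << (pageblock_order - 1)) ||
                        page_group_by_mobility_disabled)
                            set_pageblock_migratetype(page, start_type);
            }
    }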
| @@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
| 1188 | struct free_area *area; | 1173 | struct free_area *area; |
| 1189 | unsigned int current_order; | 1174 | unsigned int current_order; |
| 1190 | struct page *page; | 1175 | struct page *page; |
| 1191 | int migratetype, new_type, i; | ||
| 1192 | 1176 | ||
| 1193 | /* Find the largest possible block of pages in the other list */ | 1177 | /* Find the largest possible block of pages in the other list */ |
| 1194 | for (current_order = MAX_ORDER-1; | 1178 | for (current_order = MAX_ORDER-1; |
| 1195 | current_order >= order && current_order <= MAX_ORDER-1; | 1179 | current_order >= order && current_order <= MAX_ORDER-1; |
| 1196 | --current_order) { | 1180 | --current_order) { |
| 1181 | int i; | ||
| 1197 | for (i = 0;; i++) { | 1182 | for (i = 0;; i++) { |
| 1198 | migratetype = fallbacks[start_migratetype][i]; | 1183 | int migratetype = fallbacks[start_migratetype][i]; |
| 1184 | int buddy_type = start_migratetype; | ||
| 1199 | 1185 | ||
| 1200 | /* MIGRATE_RESERVE handled later if necessary */ | 1186 | /* MIGRATE_RESERVE handled later if necessary */ |
| 1201 | if (migratetype == MIGRATE_RESERVE) | 1187 | if (migratetype == MIGRATE_RESERVE) |
| @@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
| 1209 | struct page, lru); | 1195 | struct page, lru); |
| 1210 | area->nr_free--; | 1196 | area->nr_free--; |
| 1211 | 1197 | ||
| 1212 | new_type = try_to_steal_freepages(zone, page, | 1198 | if (!is_migrate_cma(migratetype)) { |
| 1213 | start_migratetype, | 1199 | try_to_steal_freepages(zone, page, |
| 1214 | migratetype); | 1200 | start_migratetype, |
| 1201 | migratetype); | ||
| 1202 | } else { | ||
| 1203 | /* | ||
| 1204 | * When borrowing from MIGRATE_CMA, we need to | ||
| 1205 | * release the excess buddy pages to CMA | ||
| 1206 | * itself, and we do not try to steal extra | ||
| 1207 | * free pages. | ||
| 1208 | */ | ||
| 1209 | buddy_type = migratetype; | ||
| 1210 | } | ||
| 1215 | 1211 | ||
| 1216 | /* Remove the page from the freelists */ | 1212 | /* Remove the page from the freelists */ |
| 1217 | list_del(&page->lru); | 1213 | list_del(&page->lru); |
| 1218 | rmv_page_order(page); | 1214 | rmv_page_order(page); |
| 1219 | 1215 | ||
| 1220 | expand(zone, page, order, current_order, area, | 1216 | expand(zone, page, order, current_order, area, |
| 1221 | new_type); | 1217 | buddy_type); |
| 1222 | /* The freepage_migratetype may differ from pageblock's | 1218 | |
| 1219 | /* | ||
| 1220 | * The freepage_migratetype may differ from pageblock's | ||
| 1223 | * migratetype depending on the decisions in | 1221 | * migratetype depending on the decisions in |
| 1224 | * try_to_steal_freepages. This is OK as long as it does | 1222 | * try_to_steal_freepages(). This is OK as long as it |
| 1225 | * not differ for MIGRATE_CMA type. | 1223 | * does not differ for MIGRATE_CMA pageblocks. For CMA |
| 1224 | * we need to make sure unallocated pages flushed from | ||
| 1225 | * pcp lists are returned to the correct freelist. | ||
| 1226 | */ | 1226 | */ |
| 1227 | set_freepage_migratetype(page, new_type); | 1227 | set_freepage_migratetype(page, buddy_type); |
| 1228 | 1228 | ||
| 1229 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1229 | trace_mm_page_alloc_extfrag(page, order, current_order, |
| 1230 | start_migratetype, migratetype, new_type); | 1230 | start_migratetype, migratetype); |
| 1231 | 1231 | ||
| 1232 | return page; | 1232 | return page; |
| 1233 | } | 1233 | } |
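Net effect of the __rmqueue_fallback() hunk above: the freepage type is decided up front instead of being returned by try_to_steal_freepages(). A condensed sketch of that decision as it now reads:

    int buddy_type = start_migratetype;

    if (!is_migrate_cma(migratetype)) {
            /* Ordinary fallback: possibly steal the rest of the pageblock. */
            try_to_steal_freepages(zone, page, start_migratetype, migratetype);
    } else {
            /*
             * Borrowing from MIGRATE_CMA: never steal extra pages, and keep
             * the freepage type CMA so pages flushed from the pcp lists go
             * back to the CMA freelist.
             */
            buddy_type = migratetype;
    }

    /* ... list_del() and rmv_page_order() elided ... */
    expand(zone, page, order, current_order, area, buddy_type);
    set_freepage_migratetype(page, buddy_type);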
| @@ -1642,9 +1642,7 @@ int split_free_page(struct page *page) | |||
| 1642 | } | 1642 | } |
| 1643 | 1643 | ||
| 1644 | /* | 1644 | /* |
| 1645 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1645 | * Allocate a page from the given zone. Use pcplists for order-0 allocations. |
| 1646 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | ||
| 1647 | * or two. | ||
| 1648 | */ | 1646 | */ |
| 1649 | static inline | 1647 | static inline |
| 1650 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1648 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
| @@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
| 1655 | struct page *page; | 1653 | struct page *page; |
| 1656 | bool cold = ((gfp_flags & __GFP_COLD) != 0); | 1654 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
| 1657 | 1655 | ||
| 1658 | again: | ||
| 1659 | if (likely(order == 0)) { | 1656 | if (likely(order == 0)) { |
| 1660 | struct per_cpu_pages *pcp; | 1657 | struct per_cpu_pages *pcp; |
| 1661 | struct list_head *list; | 1658 | struct list_head *list; |
| @@ -1711,8 +1708,6 @@ again: | |||
| 1711 | local_irq_restore(flags); | 1708 | local_irq_restore(flags); |
| 1712 | 1709 | ||
| 1713 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 1710 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
| 1714 | if (prep_new_page(page, order, gfp_flags)) | ||
| 1715 | goto again; | ||
| 1716 | return page; | 1711 | return page; |
| 1717 | 1712 | ||
| 1718 | failed: | 1713 | failed: |
| @@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone) | |||
| 2033 | * a page. | 2028 | * a page. |
| 2034 | */ | 2029 | */ |
| 2035 | static struct page * | 2030 | static struct page * |
| 2036 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 2031 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
| 2037 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 2032 | const struct alloc_context *ac) |
| 2038 | struct zone *preferred_zone, int classzone_idx, int migratetype) | ||
| 2039 | { | 2033 | { |
| 2034 | struct zonelist *zonelist = ac->zonelist; | ||
| 2040 | struct zoneref *z; | 2035 | struct zoneref *z; |
| 2041 | struct page *page = NULL; | 2036 | struct page *page = NULL; |
| 2042 | struct zone *zone; | 2037 | struct zone *zone; |
| @@ -2055,8 +2050,8 @@ zonelist_scan: | |||
| 2055 | * Scan zonelist, looking for a zone with enough free. | 2050 | * Scan zonelist, looking for a zone with enough free. |
| 2056 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. | 2051 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
| 2057 | */ | 2052 | */ |
| 2058 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2053 | for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, |
| 2059 | high_zoneidx, nodemask) { | 2054 | ac->nodemask) { |
| 2060 | unsigned long mark; | 2055 | unsigned long mark; |
| 2061 | 2056 | ||
| 2062 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 2057 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
| @@ -2073,7 +2068,7 @@ zonelist_scan: | |||
| 2073 | * time the page has in memory before being reclaimed. | 2068 | * time the page has in memory before being reclaimed. |
| 2074 | */ | 2069 | */ |
| 2075 | if (alloc_flags & ALLOC_FAIR) { | 2070 | if (alloc_flags & ALLOC_FAIR) { |
| 2076 | if (!zone_local(preferred_zone, zone)) | 2071 | if (!zone_local(ac->preferred_zone, zone)) |
| 2077 | break; | 2072 | break; |
| 2078 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { | 2073 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { |
| 2079 | nr_fair_skipped++; | 2074 | nr_fair_skipped++; |
| @@ -2111,7 +2106,7 @@ zonelist_scan: | |||
| 2111 | 2106 | ||
| 2112 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 2107 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
| 2113 | if (!zone_watermark_ok(zone, order, mark, | 2108 | if (!zone_watermark_ok(zone, order, mark, |
| 2114 | classzone_idx, alloc_flags)) { | 2109 | ac->classzone_idx, alloc_flags)) { |
| 2115 | int ret; | 2110 | int ret; |
| 2116 | 2111 | ||
| 2117 | /* Checked here to keep the fast path fast */ | 2112 | /* Checked here to keep the fast path fast */ |
| @@ -2132,7 +2127,7 @@ zonelist_scan: | |||
| 2132 | } | 2127 | } |
| 2133 | 2128 | ||
| 2134 | if (zone_reclaim_mode == 0 || | 2129 | if (zone_reclaim_mode == 0 || |
| 2135 | !zone_allows_reclaim(preferred_zone, zone)) | 2130 | !zone_allows_reclaim(ac->preferred_zone, zone)) |
| 2136 | goto this_zone_full; | 2131 | goto this_zone_full; |
| 2137 | 2132 | ||
| 2138 | /* | 2133 | /* |
| @@ -2154,7 +2149,7 @@ zonelist_scan: | |||
| 2154 | default: | 2149 | default: |
| 2155 | /* did we reclaim enough */ | 2150 | /* did we reclaim enough */ |
| 2156 | if (zone_watermark_ok(zone, order, mark, | 2151 | if (zone_watermark_ok(zone, order, mark, |
| 2157 | classzone_idx, alloc_flags)) | 2152 | ac->classzone_idx, alloc_flags)) |
| 2158 | goto try_this_zone; | 2153 | goto try_this_zone; |
| 2159 | 2154 | ||
| 2160 | /* | 2155 | /* |
| @@ -2175,27 +2170,18 @@ zonelist_scan: | |||
| 2175 | } | 2170 | } |
| 2176 | 2171 | ||
| 2177 | try_this_zone: | 2172 | try_this_zone: |
| 2178 | page = buffered_rmqueue(preferred_zone, zone, order, | 2173 | page = buffered_rmqueue(ac->preferred_zone, zone, order, |
| 2179 | gfp_mask, migratetype); | 2174 | gfp_mask, ac->migratetype); |
| 2180 | if (page) | 2175 | if (page) { |
| 2181 | break; | 2176 | if (prep_new_page(page, order, gfp_mask, alloc_flags)) |
| 2177 | goto try_this_zone; | ||
| 2178 | return page; | ||
| 2179 | } | ||
| 2182 | this_zone_full: | 2180 | this_zone_full: |
| 2183 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) | 2181 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
| 2184 | zlc_mark_zone_full(zonelist, z); | 2182 | zlc_mark_zone_full(zonelist, z); |
| 2185 | } | 2183 | } |
| 2186 | 2184 | ||
| 2187 | if (page) { | ||
| 2188 | /* | ||
| 2189 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
| 2190 | * necessary to allocate the page. The expectation is | ||
| 2191 | * that the caller is taking steps that will free more | ||
| 2192 | * memory. The caller should avoid the page being used | ||
| 2193 | * for !PFMEMALLOC purposes. | ||
| 2194 | */ | ||
| 2195 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
| 2196 | return page; | ||
| 2197 | } | ||
| 2198 | |||
| 2199 | /* | 2185 | /* |
| 2200 | * The first pass makes sure allocations are spread fairly within the | 2186 | * The first pass makes sure allocations are spread fairly within the |
| 2201 | * local node. However, the local node might have free pages left | 2187 | * local node. However, the local node might have free pages left |
| @@ -2208,7 +2194,7 @@ this_zone_full: | |||
| 2208 | alloc_flags &= ~ALLOC_FAIR; | 2194 | alloc_flags &= ~ALLOC_FAIR; |
| 2209 | if (nr_fair_skipped) { | 2195 | if (nr_fair_skipped) { |
| 2210 | zonelist_rescan = true; | 2196 | zonelist_rescan = true; |
| 2211 | reset_alloc_batches(preferred_zone); | 2197 | reset_alloc_batches(ac->preferred_zone); |
| 2212 | } | 2198 | } |
| 2213 | if (nr_online_nodes > 1) | 2199 | if (nr_online_nodes > 1) |
| 2214 | zonelist_rescan = true; | 2200 | zonelist_rescan = true; |
| @@ -2330,44 +2316,29 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, | |||
| 2330 | 2316 | ||
| 2331 | static inline struct page * | 2317 | static inline struct page * |
| 2332 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2318 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
| 2333 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2319 | const struct alloc_context *ac, unsigned long *did_some_progress) |
| 2334 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 2335 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
| 2336 | { | 2320 | { |
| 2337 | struct page *page; | 2321 | struct page *page; |
| 2338 | 2322 | ||
| 2339 | *did_some_progress = 0; | 2323 | *did_some_progress = 0; |
| 2340 | 2324 | ||
| 2341 | if (oom_killer_disabled) | ||
| 2342 | return NULL; | ||
| 2343 | |||
| 2344 | /* | 2325 | /* |
| 2345 | * Acquire the per-zone oom lock for each zone. If that | 2326 | * Acquire the per-zone oom lock for each zone. If that |
| 2346 | * fails, somebody else is making progress for us. | 2327 | * fails, somebody else is making progress for us. |
| 2347 | */ | 2328 | */ |
| 2348 | if (!oom_zonelist_trylock(zonelist, gfp_mask)) { | 2329 | if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { |
| 2349 | *did_some_progress = 1; | 2330 | *did_some_progress = 1; |
| 2350 | schedule_timeout_uninterruptible(1); | 2331 | schedule_timeout_uninterruptible(1); |
| 2351 | return NULL; | 2332 | return NULL; |
| 2352 | } | 2333 | } |
| 2353 | 2334 | ||
| 2354 | /* | 2335 | /* |
| 2355 | * PM-freezer should be notified that there might be an OOM killer on | ||
| 2356 | * its way to kill and wake somebody up. This is too early and we might | ||
| 2357 | * end up not killing anything but false positives are acceptable. | ||
| 2358 | * See freeze_processes. | ||
| 2359 | */ | ||
| 2360 | note_oom_kill(); | ||
| 2361 | |||
| 2362 | /* | ||
| 2363 | * Go through the zonelist yet one more time, keep very high watermark | 2336 | * Go through the zonelist yet one more time, keep very high watermark |
| 2364 | * here, this is only to catch a parallel oom killing, we must fail if | 2337 | * here, this is only to catch a parallel oom killing, we must fail if |
| 2365 | * we're still under heavy pressure. | 2338 | * we're still under heavy pressure. |
| 2366 | */ | 2339 | */ |
| 2367 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2340 | page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, |
| 2368 | order, zonelist, high_zoneidx, | 2341 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); |
| 2369 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
| 2370 | preferred_zone, classzone_idx, migratetype); | ||
| 2371 | if (page) | 2342 | if (page) |
| 2372 | goto out; | 2343 | goto out; |
| 2373 | 2344 | ||
| @@ -2379,11 +2350,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
| 2379 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2350 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
| 2380 | goto out; | 2351 | goto out; |
| 2381 | /* The OOM killer does not needlessly kill tasks for lowmem */ | 2352 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
| 2382 | if (high_zoneidx < ZONE_NORMAL) | 2353 | if (ac->high_zoneidx < ZONE_NORMAL) |
| 2383 | goto out; | 2354 | goto out; |
| 2384 | /* The OOM killer does not compensate for light reclaim */ | 2355 | /* The OOM killer does not compensate for light reclaim */ |
| 2385 | if (!(gfp_mask & __GFP_FS)) | 2356 | if (!(gfp_mask & __GFP_FS)) { |
| 2357 | /* | ||
| 2358 | * XXX: Page reclaim didn't yield anything, | ||
| 2359 | * and the OOM killer can't be invoked, but | ||
| 2360 | * keep looping as per should_alloc_retry(). | ||
| 2361 | */ | ||
| 2362 | *did_some_progress = 1; | ||
| 2386 | goto out; | 2363 | goto out; |
| 2364 | } | ||
| 2387 | /* | 2365 | /* |
| 2388 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | 2366 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. |
| 2389 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | 2367 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. |
| @@ -2395,10 +2373,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
| 2395 | goto out; | 2373 | goto out; |
| 2396 | } | 2374 | } |
| 2397 | /* Exhausted what can be done so it's blamo time */ | 2375 | /* Exhausted what can be done so it's blamo time */ |
| 2398 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); | 2376 | if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) |
| 2399 | *did_some_progress = 1; | 2377 | *did_some_progress = 1; |
| 2400 | out: | 2378 | out: |
| 2401 | oom_zonelist_unlock(zonelist, gfp_mask); | 2379 | oom_zonelist_unlock(ac->zonelist, gfp_mask); |
| 2402 | return page; | 2380 | return page; |
| 2403 | } | 2381 | } |
| 2404 | 2382 | ||
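Behavioural note on the __alloc_pages_may_oom() hunks: the oom_killer_disabled bail-out and the PM-freezer notification (note_oom_kill()) leave the allocator, and progress reporting changes in two places; the !__GFP_FS case now reports progress so the caller keeps retrying, while out_of_memory() only counts as progress when it actually acts. The relevant fragments, condensed:

    if (!(gfp_mask & __GFP_FS)) {
            /*
             * Reclaim gave us nothing and the OOM killer cannot be invoked,
             * but report progress so should_alloc_retry() keeps looping.
             */
            *did_some_progress = 1;
            goto out;
    }

    /* Only a successful out_of_memory() call counts as progress now. */
    if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
            *did_some_progress = 1;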
| @@ -2406,10 +2384,9 @@ out: | |||
| 2406 | /* Try memory compaction for high-order allocations before reclaim */ | 2384 | /* Try memory compaction for high-order allocations before reclaim */ |
| 2407 | static struct page * | 2385 | static struct page * |
| 2408 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2386 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
| 2409 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2387 | int alloc_flags, const struct alloc_context *ac, |
| 2410 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2388 | enum migrate_mode mode, int *contended_compaction, |
| 2411 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2389 | bool *deferred_compaction) |
| 2412 | int *contended_compaction, bool *deferred_compaction) | ||
| 2413 | { | 2390 | { |
| 2414 | unsigned long compact_result; | 2391 | unsigned long compact_result; |
| 2415 | struct page *page; | 2392 | struct page *page; |
| @@ -2418,10 +2395,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
| 2418 | return NULL; | 2395 | return NULL; |
| 2419 | 2396 | ||
| 2420 | current->flags |= PF_MEMALLOC; | 2397 | current->flags |= PF_MEMALLOC; |
| 2421 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, | 2398 | compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
| 2422 | nodemask, mode, | 2399 | mode, contended_compaction); |
| 2423 | contended_compaction, | ||
| 2424 | alloc_flags, classzone_idx); | ||
| 2425 | current->flags &= ~PF_MEMALLOC; | 2400 | current->flags &= ~PF_MEMALLOC; |
| 2426 | 2401 | ||
| 2427 | switch (compact_result) { | 2402 | switch (compact_result) { |
| @@ -2440,10 +2415,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
| 2440 | */ | 2415 | */ |
| 2441 | count_vm_event(COMPACTSTALL); | 2416 | count_vm_event(COMPACTSTALL); |
| 2442 | 2417 | ||
| 2443 | page = get_page_from_freelist(gfp_mask, nodemask, | 2418 | page = get_page_from_freelist(gfp_mask, order, |
| 2444 | order, zonelist, high_zoneidx, | 2419 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
| 2445 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
| 2446 | preferred_zone, classzone_idx, migratetype); | ||
| 2447 | 2420 | ||
| 2448 | if (page) { | 2421 | if (page) { |
| 2449 | struct zone *zone = page_zone(page); | 2422 | struct zone *zone = page_zone(page); |
| @@ -2467,10 +2440,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
| 2467 | #else | 2440 | #else |
| 2468 | static inline struct page * | 2441 | static inline struct page * |
| 2469 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2442 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
| 2470 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2443 | int alloc_flags, const struct alloc_context *ac, |
| 2471 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2444 | enum migrate_mode mode, int *contended_compaction, |
| 2472 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2445 | bool *deferred_compaction) |
| 2473 | int *contended_compaction, bool *deferred_compaction) | ||
| 2474 | { | 2446 | { |
| 2475 | return NULL; | 2447 | return NULL; |
| 2476 | } | 2448 | } |
| @@ -2478,8 +2450,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
| 2478 | 2450 | ||
| 2479 | /* Perform direct synchronous page reclaim */ | 2451 | /* Perform direct synchronous page reclaim */ |
| 2480 | static int | 2452 | static int |
| 2481 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | 2453 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, |
| 2482 | nodemask_t *nodemask) | 2454 | const struct alloc_context *ac) |
| 2483 | { | 2455 | { |
| 2484 | struct reclaim_state reclaim_state; | 2456 | struct reclaim_state reclaim_state; |
| 2485 | int progress; | 2457 | int progress; |
| @@ -2493,7 +2465,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
| 2493 | reclaim_state.reclaimed_slab = 0; | 2465 | reclaim_state.reclaimed_slab = 0; |
| 2494 | current->reclaim_state = &reclaim_state; | 2466 | current->reclaim_state = &reclaim_state; |
| 2495 | 2467 | ||
| 2496 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2468 | progress = try_to_free_pages(ac->zonelist, order, gfp_mask, |
| 2469 | ac->nodemask); | ||
| 2497 | 2470 | ||
| 2498 | current->reclaim_state = NULL; | 2471 | current->reclaim_state = NULL; |
| 2499 | lockdep_clear_current_reclaim_state(); | 2472 | lockdep_clear_current_reclaim_state(); |
| @@ -2507,28 +2480,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
| 2507 | /* The really slow allocator path where we enter direct reclaim */ | 2480 | /* The really slow allocator path where we enter direct reclaim */ |
| 2508 | static inline struct page * | 2481 | static inline struct page * |
| 2509 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2482 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
| 2510 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2483 | int alloc_flags, const struct alloc_context *ac, |
| 2511 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2484 | unsigned long *did_some_progress) |
| 2512 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
| 2513 | { | 2485 | { |
| 2514 | struct page *page = NULL; | 2486 | struct page *page = NULL; |
| 2515 | bool drained = false; | 2487 | bool drained = false; |
| 2516 | 2488 | ||
| 2517 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 2489 | *did_some_progress = __perform_reclaim(gfp_mask, order, ac); |
| 2518 | nodemask); | ||
| 2519 | if (unlikely(!(*did_some_progress))) | 2490 | if (unlikely(!(*did_some_progress))) |
| 2520 | return NULL; | 2491 | return NULL; |
| 2521 | 2492 | ||
| 2522 | /* After successful reclaim, reconsider all zones for allocation */ | 2493 | /* After successful reclaim, reconsider all zones for allocation */ |
| 2523 | if (IS_ENABLED(CONFIG_NUMA)) | 2494 | if (IS_ENABLED(CONFIG_NUMA)) |
| 2524 | zlc_clear_zones_full(zonelist); | 2495 | zlc_clear_zones_full(ac->zonelist); |
| 2525 | 2496 | ||
| 2526 | retry: | 2497 | retry: |
| 2527 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2498 | page = get_page_from_freelist(gfp_mask, order, |
| 2528 | zonelist, high_zoneidx, | 2499 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
| 2529 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
| 2530 | preferred_zone, classzone_idx, | ||
| 2531 | migratetype); | ||
| 2532 | 2500 | ||
| 2533 | /* | 2501 | /* |
| 2534 | * If an allocation failed after direct reclaim, it could be because | 2502 | * If an allocation failed after direct reclaim, it could be because |
| @@ -2549,36 +2517,30 @@ retry: | |||
| 2549 | */ | 2517 | */ |
| 2550 | static inline struct page * | 2518 | static inline struct page * |
| 2551 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2519 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
| 2552 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2520 | const struct alloc_context *ac) |
| 2553 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 2554 | int classzone_idx, int migratetype) | ||
| 2555 | { | 2521 | { |
| 2556 | struct page *page; | 2522 | struct page *page; |
| 2557 | 2523 | ||
| 2558 | do { | 2524 | do { |
| 2559 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2525 | page = get_page_from_freelist(gfp_mask, order, |
| 2560 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2526 | ALLOC_NO_WATERMARKS, ac); |
| 2561 | preferred_zone, classzone_idx, migratetype); | ||
| 2562 | 2527 | ||
| 2563 | if (!page && gfp_mask & __GFP_NOFAIL) | 2528 | if (!page && gfp_mask & __GFP_NOFAIL) |
| 2564 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2529 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, |
| 2530 | HZ/50); | ||
| 2565 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 2531 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
| 2566 | 2532 | ||
| 2567 | return page; | 2533 | return page; |
| 2568 | } | 2534 | } |
| 2569 | 2535 | ||
| 2570 | static void wake_all_kswapds(unsigned int order, | 2536 | static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) |
| 2571 | struct zonelist *zonelist, | ||
| 2572 | enum zone_type high_zoneidx, | ||
| 2573 | struct zone *preferred_zone, | ||
| 2574 | nodemask_t *nodemask) | ||
| 2575 | { | 2537 | { |
| 2576 | struct zoneref *z; | 2538 | struct zoneref *z; |
| 2577 | struct zone *zone; | 2539 | struct zone *zone; |
| 2578 | 2540 | ||
| 2579 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2541 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
| 2580 | high_zoneidx, nodemask) | 2542 | ac->high_zoneidx, ac->nodemask) |
| 2581 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | 2543 | wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); |
| 2582 | } | 2544 | } |
| 2583 | 2545 | ||
| 2584 | static inline int | 2546 | static inline int |
| @@ -2637,9 +2599,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | |||
| 2637 | 2599 | ||
| 2638 | static inline struct page * | 2600 | static inline struct page * |
| 2639 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2601 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
| 2640 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2602 | struct alloc_context *ac) |
| 2641 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 2642 | int classzone_idx, int migratetype) | ||
| 2643 | { | 2603 | { |
| 2644 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2604 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
| 2645 | struct page *page = NULL; | 2605 | struct page *page = NULL; |
| @@ -2675,8 +2635,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
| 2675 | 2635 | ||
| 2676 | retry: | 2636 | retry: |
| 2677 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2637 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
| 2678 | wake_all_kswapds(order, zonelist, high_zoneidx, | 2638 | wake_all_kswapds(order, ac); |
| 2679 | preferred_zone, nodemask); | ||
| 2680 | 2639 | ||
| 2681 | /* | 2640 | /* |
| 2682 | * OK, we're below the kswapd watermark and have kicked background | 2641 | * OK, we're below the kswapd watermark and have kicked background |
| @@ -2689,17 +2648,16 @@ retry: | |||
| 2689 | * Find the true preferred zone if the allocation is unconstrained by | 2648 | * Find the true preferred zone if the allocation is unconstrained by |
| 2690 | * cpusets. | 2649 | * cpusets. |
| 2691 | */ | 2650 | */ |
| 2692 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { | 2651 | if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { |
| 2693 | struct zoneref *preferred_zoneref; | 2652 | struct zoneref *preferred_zoneref; |
| 2694 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2653 | preferred_zoneref = first_zones_zonelist(ac->zonelist, |
| 2695 | NULL, &preferred_zone); | 2654 | ac->high_zoneidx, NULL, &ac->preferred_zone); |
| 2696 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2655 | ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); |
| 2697 | } | 2656 | } |
| 2698 | 2657 | ||
| 2699 | /* This is the last chance, in general, before the goto nopage. */ | 2658 | /* This is the last chance, in general, before the goto nopage. */ |
| 2700 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2659 | page = get_page_from_freelist(gfp_mask, order, |
| 2701 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2660 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
| 2702 | preferred_zone, classzone_idx, migratetype); | ||
| 2703 | if (page) | 2661 | if (page) |
| 2704 | goto got_pg; | 2662 | goto got_pg; |
| 2705 | 2663 | ||
| @@ -2710,11 +2668,10 @@ retry: | |||
| 2710 | * the allocation is high priority and these type of | 2668 | * the allocation is high priority and these type of |
| 2711 | * allocations are system rather than user orientated | 2669 | * allocations are system rather than user orientated |
| 2712 | */ | 2670 | */ |
| 2713 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | 2671 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); |
| 2672 | |||
| 2673 | page = __alloc_pages_high_priority(gfp_mask, order, ac); | ||
| 2714 | 2674 | ||
| 2715 | page = __alloc_pages_high_priority(gfp_mask, order, | ||
| 2716 | zonelist, high_zoneidx, nodemask, | ||
| 2717 | preferred_zone, classzone_idx, migratetype); | ||
| 2718 | if (page) { | 2675 | if (page) { |
| 2719 | goto got_pg; | 2676 | goto got_pg; |
| 2720 | } | 2677 | } |
| @@ -2743,11 +2700,9 @@ retry: | |||
| 2743 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2700 | * Try direct compaction. The first pass is asynchronous. Subsequent |
| 2744 | * attempts after direct reclaim are synchronous | 2701 | * attempts after direct reclaim are synchronous |
| 2745 | */ | 2702 | */ |
| 2746 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2703 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, |
| 2747 | high_zoneidx, nodemask, alloc_flags, | 2704 | migration_mode, |
| 2748 | preferred_zone, | 2705 | &contended_compaction, |
| 2749 | classzone_idx, migratetype, | ||
| 2750 | migration_mode, &contended_compaction, | ||
| 2751 | &deferred_compaction); | 2706 | &deferred_compaction); |
| 2752 | if (page) | 2707 | if (page) |
| 2753 | goto got_pg; | 2708 | goto got_pg; |
| @@ -2793,12 +2748,8 @@ retry: | |||
| 2793 | migration_mode = MIGRATE_SYNC_LIGHT; | 2748 | migration_mode = MIGRATE_SYNC_LIGHT; |
| 2794 | 2749 | ||
| 2795 | /* Try direct reclaim and then allocating */ | 2750 | /* Try direct reclaim and then allocating */ |
| 2796 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2751 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
| 2797 | zonelist, high_zoneidx, | 2752 | &did_some_progress); |
| 2798 | nodemask, | ||
| 2799 | alloc_flags, preferred_zone, | ||
| 2800 | classzone_idx, migratetype, | ||
| 2801 | &did_some_progress); | ||
| 2802 | if (page) | 2753 | if (page) |
| 2803 | goto got_pg; | 2754 | goto got_pg; |
| 2804 | 2755 | ||
| @@ -2812,17 +2763,15 @@ retry: | |||
| 2812 | * start OOM killing tasks. | 2763 | * start OOM killing tasks. |
| 2813 | */ | 2764 | */ |
| 2814 | if (!did_some_progress) { | 2765 | if (!did_some_progress) { |
| 2815 | page = __alloc_pages_may_oom(gfp_mask, order, zonelist, | 2766 | page = __alloc_pages_may_oom(gfp_mask, order, ac, |
| 2816 | high_zoneidx, nodemask, | 2767 | &did_some_progress); |
| 2817 | preferred_zone, classzone_idx, | ||
| 2818 | migratetype,&did_some_progress); | ||
| 2819 | if (page) | 2768 | if (page) |
| 2820 | goto got_pg; | 2769 | goto got_pg; |
| 2821 | if (!did_some_progress) | 2770 | if (!did_some_progress) |
| 2822 | goto nopage; | 2771 | goto nopage; |
| 2823 | } | 2772 | } |
| 2824 | /* Wait for some write requests to complete then retry */ | 2773 | /* Wait for some write requests to complete then retry */ |
| 2825 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2774 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); |
| 2826 | goto retry; | 2775 | goto retry; |
| 2827 | } else { | 2776 | } else { |
| 2828 | /* | 2777 | /* |
| @@ -2830,11 +2779,9 @@ retry: | |||
| 2830 | * direct reclaim and reclaim/compaction depends on compaction | 2779 | * direct reclaim and reclaim/compaction depends on compaction |
| 2831 | * being called after reclaim so call directly if necessary | 2780 | * being called after reclaim so call directly if necessary |
| 2832 | */ | 2781 | */ |
| 2833 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2782 | page = __alloc_pages_direct_compact(gfp_mask, order, |
| 2834 | high_zoneidx, nodemask, alloc_flags, | 2783 | alloc_flags, ac, migration_mode, |
| 2835 | preferred_zone, | 2784 | &contended_compaction, |
| 2836 | classzone_idx, migratetype, | ||
| 2837 | migration_mode, &contended_compaction, | ||
| 2838 | &deferred_compaction); | 2785 | &deferred_compaction); |
| 2839 | if (page) | 2786 | if (page) |
| 2840 | goto got_pg; | 2787 | goto got_pg; |
| @@ -2842,11 +2789,7 @@ retry: | |||
| 2842 | 2789 | ||
| 2843 | nopage: | 2790 | nopage: |
| 2844 | warn_alloc_failed(gfp_mask, order, NULL); | 2791 | warn_alloc_failed(gfp_mask, order, NULL); |
| 2845 | return page; | ||
| 2846 | got_pg: | 2792 | got_pg: |
| 2847 | if (kmemcheck_enabled) | ||
| 2848 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
| 2849 | |||
| 2850 | return page; | 2793 | return page; |
| 2851 | } | 2794 | } |
| 2852 | 2795 | ||
| @@ -2857,14 +2800,16 @@ struct page * | |||
| 2857 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | 2800 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
| 2858 | struct zonelist *zonelist, nodemask_t *nodemask) | 2801 | struct zonelist *zonelist, nodemask_t *nodemask) |
| 2859 | { | 2802 | { |
| 2860 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
| 2861 | struct zone *preferred_zone; | ||
| 2862 | struct zoneref *preferred_zoneref; | 2803 | struct zoneref *preferred_zoneref; |
| 2863 | struct page *page = NULL; | 2804 | struct page *page = NULL; |
| 2864 | int migratetype = gfpflags_to_migratetype(gfp_mask); | ||
| 2865 | unsigned int cpuset_mems_cookie; | 2805 | unsigned int cpuset_mems_cookie; |
| 2866 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2806 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
| 2867 | int classzone_idx; | 2807 | gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ |
| 2808 | struct alloc_context ac = { | ||
| 2809 | .high_zoneidx = gfp_zone(gfp_mask), | ||
| 2810 | .nodemask = nodemask, | ||
| 2811 | .migratetype = gfpflags_to_migratetype(gfp_mask), | ||
| 2812 | }; | ||
| 2868 | 2813 | ||
| 2869 | gfp_mask &= gfp_allowed_mask; | 2814 | gfp_mask &= gfp_allowed_mask; |
| 2870 | 2815 | ||
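The page_alloc.c refactoring threads a single struct alloc_context through the slow path in place of the six zonelist/nodemask/zone/idx/migratetype parameters. Its definition lives in mm/internal.h and is not part of this diff; inferred from the fields used in the hunks above, it is roughly:

    /* Sketch inferred from usage; see mm/internal.h for the real definition. */
    struct alloc_context {
            struct zonelist *zonelist;      /* may be rewritten by the slow path */
            nodemask_t *nodemask;
            struct zone *preferred_zone;    /* used for stats and fairness */
            int classzone_idx;
            int migratetype;
            enum zone_type high_zoneidx;
    };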
| @@ -2883,37 +2828,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
| 2883 | if (unlikely(!zonelist->_zonerefs->zone)) | 2828 | if (unlikely(!zonelist->_zonerefs->zone)) |
| 2884 | return NULL; | 2829 | return NULL; |
| 2885 | 2830 | ||
| 2886 | if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) | 2831 | if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) |
| 2887 | alloc_flags |= ALLOC_CMA; | 2832 | alloc_flags |= ALLOC_CMA; |
| 2888 | 2833 | ||
| 2889 | retry_cpuset: | 2834 | retry_cpuset: |
| 2890 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2835 | cpuset_mems_cookie = read_mems_allowed_begin(); |
| 2891 | 2836 | ||
| 2837 | /* We set it here, as __alloc_pages_slowpath might have changed it */ | ||
| 2838 | ac.zonelist = zonelist; | ||
| 2892 | /* The preferred zone is used for statistics later */ | 2839 | /* The preferred zone is used for statistics later */ |
| 2893 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2840 | preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, |
| 2894 | nodemask ? : &cpuset_current_mems_allowed, | 2841 | ac.nodemask ? : &cpuset_current_mems_allowed, |
| 2895 | &preferred_zone); | 2842 | &ac.preferred_zone); |
| 2896 | if (!preferred_zone) | 2843 | if (!ac.preferred_zone) |
| 2897 | goto out; | 2844 | goto out; |
| 2898 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2845 | ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); |
| 2899 | 2846 | ||
| 2900 | /* First allocation attempt */ | 2847 | /* First allocation attempt */ |
| 2901 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2848 | alloc_mask = gfp_mask|__GFP_HARDWALL; |
| 2902 | zonelist, high_zoneidx, alloc_flags, | 2849 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); |
| 2903 | preferred_zone, classzone_idx, migratetype); | ||
| 2904 | if (unlikely(!page)) { | 2850 | if (unlikely(!page)) { |
| 2905 | /* | 2851 | /* |
| 2906 | * Runtime PM, block IO and its error handling path | 2852 | * Runtime PM, block IO and its error handling path |
| 2907 | * can deadlock because I/O on the device might not | 2853 | * can deadlock because I/O on the device might not |
| 2908 | * complete. | 2854 | * complete. |
| 2909 | */ | 2855 | */ |
| 2910 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2856 | alloc_mask = memalloc_noio_flags(gfp_mask); |
| 2911 | page = __alloc_pages_slowpath(gfp_mask, order, | 2857 | |
| 2912 | zonelist, high_zoneidx, nodemask, | 2858 | page = __alloc_pages_slowpath(alloc_mask, order, &ac); |
| 2913 | preferred_zone, classzone_idx, migratetype); | ||
| 2914 | } | 2859 | } |
| 2915 | 2860 | ||
| 2916 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2861 | if (kmemcheck_enabled && page) |
| 2862 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
| 2863 | |||
| 2864 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); | ||
| 2917 | 2865 | ||
| 2918 | out: | 2866 | out: |
| 2919 | /* | 2867 | /* |
| @@ -3933,18 +3881,29 @@ static int __build_all_zonelists(void *data) | |||
| 3933 | return 0; | 3881 | return 0; |
| 3934 | } | 3882 | } |
| 3935 | 3883 | ||
| 3884 | static noinline void __init | ||
| 3885 | build_all_zonelists_init(void) | ||
| 3886 | { | ||
| 3887 | __build_all_zonelists(NULL); | ||
| 3888 | mminit_verify_zonelist(); | ||
| 3889 | cpuset_init_current_mems_allowed(); | ||
| 3890 | } | ||
| 3891 | |||
| 3936 | /* | 3892 | /* |
| 3937 | * Called with zonelists_mutex held always | 3893 | * Called with zonelists_mutex held always |
| 3938 | * unless system_state == SYSTEM_BOOTING. | 3894 | * unless system_state == SYSTEM_BOOTING. |
| 3895 | * | ||
| 3896 | * __ref due to (1) call of __meminit annotated setup_zone_pageset | ||
| 3897 | * [we're only called with non-NULL zone through __meminit paths] and | ||
| 3898 | * (2) call of __init annotated helper build_all_zonelists_init | ||
| 3899 | * [protected by SYSTEM_BOOTING]. | ||
| 3939 | */ | 3900 | */ |
| 3940 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | 3901 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
| 3941 | { | 3902 | { |
| 3942 | set_zonelist_order(); | 3903 | set_zonelist_order(); |
| 3943 | 3904 | ||
| 3944 | if (system_state == SYSTEM_BOOTING) { | 3905 | if (system_state == SYSTEM_BOOTING) { |
| 3945 | __build_all_zonelists(NULL); | 3906 | build_all_zonelists_init(); |
| 3946 | mminit_verify_zonelist(); | ||
| 3947 | cpuset_init_current_mems_allowed(); | ||
| 3948 | } else { | 3907 | } else { |
| 3949 | #ifdef CONFIG_MEMORY_HOTPLUG | 3908 | #ifdef CONFIG_MEMORY_HOTPLUG |
| 3950 | if (zone) | 3909 | if (zone) |
| @@ -5047,8 +5006,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
| 5047 | pgdat->node_start_pfn = node_start_pfn; | 5006 | pgdat->node_start_pfn = node_start_pfn; |
| 5048 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5007 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
| 5049 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 5008 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
| 5050 | printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, | 5009 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
| 5051 | (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); | 5010 | (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); |
| 5052 | #endif | 5011 | #endif |
| 5053 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 5012 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
| 5054 | zones_size, zholes_size); | 5013 | zones_size, zholes_size); |
| @@ -5420,9 +5379,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 5420 | arch_zone_highest_possible_pfn[i]) | 5379 | arch_zone_highest_possible_pfn[i]) |
| 5421 | pr_cont("empty\n"); | 5380 | pr_cont("empty\n"); |
| 5422 | else | 5381 | else |
| 5423 | pr_cont("[mem %0#10lx-%0#10lx]\n", | 5382 | pr_cont("[mem %#018Lx-%#018Lx]\n", |
| 5424 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 5383 | (u64)arch_zone_lowest_possible_pfn[i] |
| 5425 | (arch_zone_highest_possible_pfn[i] | 5384 | << PAGE_SHIFT, |
| 5385 | ((u64)arch_zone_highest_possible_pfn[i] | ||
| 5426 | << PAGE_SHIFT) - 1); | 5386 | << PAGE_SHIFT) - 1); |
| 5427 | } | 5387 | } |
| 5428 | 5388 | ||
| @@ -5430,15 +5390,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 5430 | pr_info("Movable zone start for each node\n"); | 5390 | pr_info("Movable zone start for each node\n"); |
| 5431 | for (i = 0; i < MAX_NUMNODES; i++) { | 5391 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 5432 | if (zone_movable_pfn[i]) | 5392 | if (zone_movable_pfn[i]) |
| 5433 | pr_info(" Node %d: %#010lx\n", i, | 5393 | pr_info(" Node %d: %#018Lx\n", i, |
| 5434 | zone_movable_pfn[i] << PAGE_SHIFT); | 5394 | (u64)zone_movable_pfn[i] << PAGE_SHIFT); |
| 5435 | } | 5395 | } |
| 5436 | 5396 | ||
| 5437 | /* Print out the early node map */ | 5397 | /* Print out the early node map */ |
| 5438 | pr_info("Early memory node ranges\n"); | 5398 | pr_info("Early memory node ranges\n"); |
| 5439 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 5399 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
| 5440 | pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 5400 | pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, |
| 5441 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 5401 | (u64)start_pfn << PAGE_SHIFT, |
| 5402 | ((u64)end_pfn << PAGE_SHIFT) - 1); | ||
| 5442 | 5403 | ||
| 5443 | /* Initialise every node */ | 5404 | /* Initialise every node */ |
| 5444 | mminit_verify_pageflags_layout(); | 5405 | mminit_verify_pageflags_layout(); |
diff --git a/mm/page_counter.c b/mm/page_counter.c index a009574fbba9..11b4beda14ba 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c | |||
| @@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) | |||
| 166 | /** | 166 | /** |
| 167 | * page_counter_memparse - memparse() for page counter limits | 167 | * page_counter_memparse - memparse() for page counter limits |
| 168 | * @buf: string to parse | 168 | * @buf: string to parse |
| 169 | * @max: string meaning maximum possible value | ||
| 169 | * @nr_pages: returns the result in number of pages | 170 | * @nr_pages: returns the result in number of pages |
| 170 | * | 171 | * |
| 171 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be | 172 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be |
| 172 | * limited to %PAGE_COUNTER_MAX. | 173 | * limited to %PAGE_COUNTER_MAX. |
| 173 | */ | 174 | */ |
| 174 | int page_counter_memparse(const char *buf, unsigned long *nr_pages) | 175 | int page_counter_memparse(const char *buf, const char *max, |
| 176 | unsigned long *nr_pages) | ||
| 175 | { | 177 | { |
| 176 | char unlimited[] = "-1"; | ||
| 177 | char *end; | 178 | char *end; |
| 178 | u64 bytes; | 179 | u64 bytes; |
| 179 | 180 | ||
| 180 | if (!strncmp(buf, unlimited, sizeof(unlimited))) { | 181 | if (!strcmp(buf, max)) { |
| 181 | *nr_pages = PAGE_COUNTER_MAX; | 182 | *nr_pages = PAGE_COUNTER_MAX; |
| 182 | return 0; | 183 | return 0; |
| 183 | } | 184 | } |
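With the new max argument, each caller of page_counter_memparse() chooses its own sentinel string for "no limit" (for example "-1" to preserve the old spelling, or "max"). A hedged usage sketch for a hypothetical limit-write handler (example_set_limit is not a real kernel function; only the two page_counter calls are taken from this API):

    static int example_set_limit(struct page_counter *counter, const char *buf)
    {
            unsigned long nr_pages;
            int ret;

            /* "-1" means unlimited and yields PAGE_COUNTER_MAX pages. */
            ret = page_counter_memparse(buf, "-1", &nr_pages);
            if (ret)
                    return ret;             /* -EINVAL on malformed input */

            return page_counter_limit(counter, nr_pages);
    }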
diff --git a/mm/page_io.c b/mm/page_io.c index 955db8b0d497..e6045804c8d8 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -269,14 +269,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
| 269 | .bv_len = PAGE_SIZE, | 269 | .bv_len = PAGE_SIZE, |
| 270 | .bv_offset = 0 | 270 | .bv_offset = 0 |
| 271 | }; | 271 | }; |
| 272 | struct iov_iter from = { | 272 | struct iov_iter from; |
| 273 | .type = ITER_BVEC | WRITE, | ||
| 274 | .count = PAGE_SIZE, | ||
| 275 | .iov_offset = 0, | ||
| 276 | .nr_segs = 1, | ||
| 277 | }; | ||
| 278 | from.bvec = &bv; /* older gcc versions are broken */ | ||
| 279 | 273 | ||
| 274 | iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); | ||
| 280 | init_sync_kiocb(&kiocb, swap_file); | 275 | init_sync_kiocb(&kiocb, swap_file); |
| 281 | kiocb.ki_pos = page_file_offset(page); | 276 | kiocb.ki_pos = page_file_offset(page); |
| 282 | kiocb.ki_nbytes = PAGE_SIZE; | 277 | kiocb.ki_nbytes = PAGE_SIZE; |
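A sketch of the iterator setup the hunk switches to, assuming the surrounding swap-out context; the helper name and page parameter are illustrative. iov_iter_bvec() takes the direction flags, the bio_vec array, the segment count and the total byte count, so the open-coded field assignments (and the old-gcc workaround) are no longer needed.

static void swap_iter_for_page(struct iov_iter *from, struct bio_vec *bv,
                               struct page *page)
{
        bv->bv_page   = page;
        bv->bv_len    = PAGE_SIZE;
        bv->bv_offset = 0;

        /* direction, segment array, nr_segs, byte count -- all in one call */
        iov_iter_bvec(from, ITER_BVEC | WRITE, bv, 1, PAGE_SIZE);
}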
diff --git a/mm/page_owner.c b/mm/page_owner.c index 9ab4a9b5bc09..0993f5f36b01 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
| @@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order) | |||
| 59 | 59 | ||
| 60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) | 60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) |
| 61 | { | 61 | { |
| 62 | struct page_ext *page_ext; | 62 | struct page_ext *page_ext = lookup_page_ext(page); |
| 63 | struct stack_trace *trace; | 63 | struct stack_trace trace = { |
| 64 | 64 | .nr_entries = 0, | |
| 65 | page_ext = lookup_page_ext(page); | 65 | .max_entries = ARRAY_SIZE(page_ext->trace_entries), |
| 66 | .entries = &page_ext->trace_entries[0], | ||
| 67 | .skip = 3, | ||
| 68 | }; | ||
| 66 | 69 | ||
| 67 | trace = &page_ext->trace; | 70 | save_stack_trace(&trace); |
| 68 | trace->nr_entries = 0; | ||
| 69 | trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); | ||
| 70 | trace->entries = &page_ext->trace_entries[0]; | ||
| 71 | trace->skip = 3; | ||
| 72 | save_stack_trace(&page_ext->trace); | ||
| 73 | 71 | ||
| 74 | page_ext->order = order; | 72 | page_ext->order = order; |
| 75 | page_ext->gfp_mask = gfp_mask; | 73 | page_ext->gfp_mask = gfp_mask; |
| 74 | page_ext->nr_entries = trace.nr_entries; | ||
| 76 | 75 | ||
| 77 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | 76 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); |
| 78 | } | 77 | } |
| @@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
| 84 | int ret; | 83 | int ret; |
| 85 | int pageblock_mt, page_mt; | 84 | int pageblock_mt, page_mt; |
| 86 | char *kbuf; | 85 | char *kbuf; |
| 86 | struct stack_trace trace = { | ||
| 87 | .nr_entries = page_ext->nr_entries, | ||
| 88 | .entries = &page_ext->trace_entries[0], | ||
| 89 | }; | ||
| 87 | 90 | ||
| 88 | kbuf = kmalloc(count, GFP_KERNEL); | 91 | kbuf = kmalloc(count, GFP_KERNEL); |
| 89 | if (!kbuf) | 92 | if (!kbuf) |
| @@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
| 121 | if (ret >= count) | 124 | if (ret >= count) |
| 122 | goto err; | 125 | goto err; |
| 123 | 126 | ||
| 124 | ret += snprint_stack_trace(kbuf + ret, count - ret, | 127 | ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); |
| 125 | &page_ext->trace, 0); | ||
| 126 | if (ret >= count) | 128 | if (ret >= count) |
| 127 | goto err; | 129 | goto err; |
| 128 | 130 | ||
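The pattern above keeps only the raw entries and their count in struct page_ext and rebuilds a struct stack_trace on the stack whenever a trace is captured or printed. A minimal capture-and-dump sketch of that pattern, assuming the usual stacktrace includes (the function name and array size are illustrative):

static void capture_and_dump_trace(void)
{
        unsigned long entries[8];
        struct stack_trace trace = {
                .max_entries = ARRAY_SIZE(entries),
                .entries     = entries,
                .skip        = 2,       /* skip the capture helpers themselves */
        };

        save_stack_trace(&trace);       /* fills entries[], sets nr_entries */
        print_stack_trace(&trace, 0);   /* dump the captured frames to the log */
}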
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b264bda46e1b..75c1f2878519 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
| @@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
| 35 | do { | 35 | do { |
| 36 | again: | 36 | again: |
| 37 | next = pmd_addr_end(addr, end); | 37 | next = pmd_addr_end(addr, end); |
| 38 | if (pmd_none(*pmd)) { | 38 | if (pmd_none(*pmd) || !walk->vma) { |
| 39 | if (walk->pte_hole) | 39 | if (walk->pte_hole) |
| 40 | err = walk->pte_hole(addr, next, walk); | 40 | err = walk->pte_hole(addr, next, walk); |
| 41 | if (err) | 41 | if (err) |
| @@ -59,7 +59,7 @@ again: | |||
| 59 | continue; | 59 | continue; |
| 60 | 60 | ||
| 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); | 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); |
| 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 62 | if (pmd_trans_unstable(pmd)) |
| 63 | goto again; | 63 | goto again; |
| 64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
| 65 | if (err) | 65 | if (err) |
| @@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
| 86 | break; | 86 | break; |
| 87 | continue; | 87 | continue; |
| 88 | } | 88 | } |
| 89 | if (walk->pud_entry) | 89 | if (walk->pmd_entry || walk->pte_entry) |
| 90 | err = walk->pud_entry(pud, addr, next, walk); | ||
| 91 | if (!err && (walk->pmd_entry || walk->pte_entry)) | ||
| 92 | err = walk_pmd_range(pud, addr, next, walk); | 90 | err = walk_pmd_range(pud, addr, next, walk); |
| 93 | if (err) | 91 | if (err) |
| 94 | break; | 92 | break; |
| @@ -97,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
| 97 | return err; | 95 | return err; |
| 98 | } | 96 | } |
| 99 | 97 | ||
| 98 | static int walk_pgd_range(unsigned long addr, unsigned long end, | ||
| 99 | struct mm_walk *walk) | ||
| 100 | { | ||
| 101 | pgd_t *pgd; | ||
| 102 | unsigned long next; | ||
| 103 | int err = 0; | ||
| 104 | |||
| 105 | pgd = pgd_offset(walk->mm, addr); | ||
| 106 | do { | ||
| 107 | next = pgd_addr_end(addr, end); | ||
| 108 | if (pgd_none_or_clear_bad(pgd)) { | ||
| 109 | if (walk->pte_hole) | ||
| 110 | err = walk->pte_hole(addr, next, walk); | ||
| 111 | if (err) | ||
| 112 | break; | ||
| 113 | continue; | ||
| 114 | } | ||
| 115 | if (walk->pmd_entry || walk->pte_entry) | ||
| 116 | err = walk_pud_range(pgd, addr, next, walk); | ||
| 117 | if (err) | ||
| 118 | break; | ||
| 119 | } while (pgd++, addr = next, addr != end); | ||
| 120 | |||
| 121 | return err; | ||
| 122 | } | ||
| 123 | |||
| 100 | #ifdef CONFIG_HUGETLB_PAGE | 124 | #ifdef CONFIG_HUGETLB_PAGE |
| 101 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | 125 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, |
| 102 | unsigned long end) | 126 | unsigned long end) |
| @@ -105,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | |||
| 105 | return boundary < end ? boundary : end; | 129 | return boundary < end ? boundary : end; |
| 106 | } | 130 | } |
| 107 | 131 | ||
| 108 | static int walk_hugetlb_range(struct vm_area_struct *vma, | 132 | static int walk_hugetlb_range(unsigned long addr, unsigned long end, |
| 109 | unsigned long addr, unsigned long end, | ||
| 110 | struct mm_walk *walk) | 133 | struct mm_walk *walk) |
| 111 | { | 134 | { |
| 135 | struct vm_area_struct *vma = walk->vma; | ||
| 112 | struct hstate *h = hstate_vma(vma); | 136 | struct hstate *h = hstate_vma(vma); |
| 113 | unsigned long next; | 137 | unsigned long next; |
| 114 | unsigned long hmask = huge_page_mask(h); | 138 | unsigned long hmask = huge_page_mask(h); |
| @@ -121,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
| 121 | if (pte && walk->hugetlb_entry) | 145 | if (pte && walk->hugetlb_entry) |
| 122 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); | 146 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); |
| 123 | if (err) | 147 | if (err) |
| 124 | return err; | 148 | break; |
| 125 | } while (addr = next, addr != end); | 149 | } while (addr = next, addr != end); |
| 126 | 150 | ||
| 127 | return 0; | 151 | return err; |
| 128 | } | 152 | } |
| 129 | 153 | ||
| 130 | #else /* CONFIG_HUGETLB_PAGE */ | 154 | #else /* CONFIG_HUGETLB_PAGE */ |
| 131 | static int walk_hugetlb_range(struct vm_area_struct *vma, | 155 | static int walk_hugetlb_range(unsigned long addr, unsigned long end, |
| 132 | unsigned long addr, unsigned long end, | ||
| 133 | struct mm_walk *walk) | 156 | struct mm_walk *walk) |
| 134 | { | 157 | { |
| 135 | return 0; | 158 | return 0; |
| @@ -137,115 +160,138 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
| 137 | 160 | ||
| 138 | #endif /* CONFIG_HUGETLB_PAGE */ | 161 | #endif /* CONFIG_HUGETLB_PAGE */ |
| 139 | 162 | ||
| 163 | /* | ||
| 164 | * Decide whether we really walk over the current vma on [@start, @end) | ||
| 165 | * or skip it via the returned value. Return 0 if we do walk over the | ||
| 166 | * current vma, and return 1 if we skip the vma. A negative value means | ||
| 167 | * an error, in which case we abort the current walk. | ||
| 168 | */ | ||
| 169 | static int walk_page_test(unsigned long start, unsigned long end, | ||
| 170 | struct mm_walk *walk) | ||
| 171 | { | ||
| 172 | struct vm_area_struct *vma = walk->vma; | ||
| 173 | |||
| 174 | if (walk->test_walk) | ||
| 175 | return walk->test_walk(start, end, walk); | ||
| 176 | |||
| 177 | /* | ||
| 178 | * A vma with VM_PFNMAP has no valid struct pages behind its range, so | ||
| 179 | * we don't walk over it as we do for normal vmas. However, some callers | ||
| 180 | * are interested in handling hole ranges and don't want any single | ||
| 181 | * address range to be silently ignored. Such callers define their | ||
| 182 | * ->pte_hole() callbacks, so let's delegate vma(VM_PFNMAP) handling | ||
| 183 | * to them. | ||
| 184 | */ | ||
| 185 | if (vma->vm_flags & VM_PFNMAP) { | ||
| 186 | int err = 1; | ||
| 187 | if (walk->pte_hole) | ||
| 188 | err = walk->pte_hole(start, end, walk); | ||
| 189 | return err ? err : 1; | ||
| 190 | } | ||
| 191 | return 0; | ||
| 192 | } | ||
| 193 | |||
| 194 | static int __walk_page_range(unsigned long start, unsigned long end, | ||
| 195 | struct mm_walk *walk) | ||
| 196 | { | ||
| 197 | int err = 0; | ||
| 198 | struct vm_area_struct *vma = walk->vma; | ||
| 199 | |||
| 200 | if (vma && is_vm_hugetlb_page(vma)) { | ||
| 201 | if (walk->hugetlb_entry) | ||
| 202 | err = walk_hugetlb_range(start, end, walk); | ||
| 203 | } else | ||
| 204 | err = walk_pgd_range(start, end, walk); | ||
| 140 | 205 | ||
| 206 | return err; | ||
| 207 | } | ||
| 141 | 208 | ||
| 142 | /** | 209 | /** |
| 143 | * walk_page_range - walk a memory map's page tables with a callback | 210 | * walk_page_range - walk the page table with caller-specific callbacks |
| 144 | * @addr: starting address | ||
| 145 | * @end: ending address | ||
| 146 | * @walk: set of callbacks to invoke for each level of the tree | ||
| 147 | * | 211 | * |
| 148 | * Recursively walk the page table for the memory area in a VMA, | 212 | * Recursively walk the page table tree of the process represented by @walk->mm |
| 149 | * calling supplied callbacks. Callbacks are called in-order (first | 213 | * within the virtual address range [@start, @end). During walking, we can do |
| 150 | * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, | 214 | * some caller-specific work for each entry, by setting up pmd_entry(), |
| 151 | * etc.). If lower-level callbacks are omitted, walking depth is reduced. | 215 | * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these |
| 216 | * callbacks, the associated entries/pages are just ignored. | ||
| 217 | * The return values of these callbacks are commonly defined like below: | ||
| 218 | * - 0 : the current entry was handled successfully; if the end address | ||
| 219 | * has not been reached yet, continue the walk. | ||
| 220 | * - >0 : the current entry was handled successfully; return to the caller | ||
| 221 | * with this caller-specific value. | ||
| 222 | * - <0 : handling the current entry failed; return to the caller with | ||
| 223 | * this error code. | ||
| 152 | * | 224 | * |
| 153 | * Each callback receives an entry pointer and the start and end of the | 225 | * Before starting to walk the page table, some callers want to check whether |
| 154 | * associated range, and a copy of the original mm_walk for access to | 226 | * they really want to walk over the current vma, typically by checking |
| 155 | * the ->private or ->mm fields. | 227 | * its vm_flags. walk_page_test() and @walk->test_walk() are used for this |
| 228 | * purpose. | ||
| 156 | * | 229 | * |
| 157 | * Usually no locks are taken, but splitting transparent huge page may | 230 | * struct mm_walk keeps current values of some common data like vma and pmd, |
| 158 | * take page table lock. And the bottom level iterator will map PTE | 231 | * which are useful for access from callbacks. If you want to pass some |
| 159 | * directories from highmem if necessary. | 232 | * caller-specific data to callbacks, @walk->private should be helpful. |
| 160 | * | 233 | * |
| 161 | * If any callback returns a non-zero value, the walk is aborted and | 234 | * Locking: |
| 162 | * the return value is propagated back to the caller. Otherwise 0 is returned. | 235 | * Callers of walk_page_range() and walk_page_vma() should hold |
| 163 | * | 236 | * @walk->mm->mmap_sem, because these functions traverse the vma list and/or |
| 164 | * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry | 237 | * access vma data. |
| 165 | * is !NULL. | ||
| 166 | */ | 238 | */ |
| 167 | int walk_page_range(unsigned long addr, unsigned long end, | 239 | int walk_page_range(unsigned long start, unsigned long end, |
| 168 | struct mm_walk *walk) | 240 | struct mm_walk *walk) |
| 169 | { | 241 | { |
| 170 | pgd_t *pgd; | ||
| 171 | unsigned long next; | ||
| 172 | int err = 0; | 242 | int err = 0; |
| 243 | unsigned long next; | ||
| 244 | struct vm_area_struct *vma; | ||
| 173 | 245 | ||
| 174 | if (addr >= end) | 246 | if (start >= end) |
| 175 | return err; | 247 | return -EINVAL; |
| 176 | 248 | ||
| 177 | if (!walk->mm) | 249 | if (!walk->mm) |
| 178 | return -EINVAL; | 250 | return -EINVAL; |
| 179 | 251 | ||
| 180 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); | 252 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); |
| 181 | 253 | ||
| 182 | pgd = pgd_offset(walk->mm, addr); | 254 | vma = find_vma(walk->mm, start); |
| 183 | do { | 255 | do { |
| 184 | struct vm_area_struct *vma = NULL; | 256 | if (!vma) { /* after the last vma */ |
| 185 | 257 | walk->vma = NULL; | |
| 186 | next = pgd_addr_end(addr, end); | 258 | next = end; |
| 259 | } else if (start < vma->vm_start) { /* outside vma */ | ||
| 260 | walk->vma = NULL; | ||
| 261 | next = min(end, vma->vm_start); | ||
| 262 | } else { /* inside vma */ | ||
| 263 | walk->vma = vma; | ||
| 264 | next = min(end, vma->vm_end); | ||
| 265 | vma = vma->vm_next; | ||
| 187 | 266 | ||
| 188 | /* | 267 | err = walk_page_test(start, next, walk); |
| 189 | * This function was not intended to be vma based. | 268 | if (err > 0) |
| 190 | * But there are vma special cases to be handled: | ||
| 191 | * - hugetlb vma's | ||
| 192 | * - VM_PFNMAP vma's | ||
| 193 | */ | ||
| 194 | vma = find_vma(walk->mm, addr); | ||
| 195 | if (vma) { | ||
| 196 | /* | ||
| 197 | * There are no page structures backing a VM_PFNMAP | ||
| 198 | * range, so do not allow split_huge_page_pmd(). | ||
| 199 | */ | ||
| 200 | if ((vma->vm_start <= addr) && | ||
| 201 | (vma->vm_flags & VM_PFNMAP)) { | ||
| 202 | if (walk->pte_hole) | ||
| 203 | err = walk->pte_hole(addr, next, walk); | ||
| 204 | if (err) | ||
| 205 | break; | ||
| 206 | pgd = pgd_offset(walk->mm, next); | ||
| 207 | continue; | ||
| 208 | } | ||
| 209 | /* | ||
| 210 | * Handle hugetlb vma individually because pagetable | ||
| 211 | * walk for the hugetlb page is dependent on the | ||
| 212 | * architecture and we can't handled it in the same | ||
| 213 | * manner as non-huge pages. | ||
| 214 | */ | ||
| 215 | if (walk->hugetlb_entry && (vma->vm_start <= addr) && | ||
| 216 | is_vm_hugetlb_page(vma)) { | ||
| 217 | if (vma->vm_end < next) | ||
| 218 | next = vma->vm_end; | ||
| 219 | /* | ||
| 220 | * Hugepage is very tightly coupled with vma, | ||
| 221 | * so walk through hugetlb entries within a | ||
| 222 | * given vma. | ||
| 223 | */ | ||
| 224 | err = walk_hugetlb_range(vma, addr, next, walk); | ||
| 225 | if (err) | ||
| 226 | break; | ||
| 227 | pgd = pgd_offset(walk->mm, next); | ||
| 228 | continue; | 269 | continue; |
| 229 | } | 270 | if (err < 0) |
| 230 | } | ||
| 231 | |||
| 232 | if (pgd_none_or_clear_bad(pgd)) { | ||
| 233 | if (walk->pte_hole) | ||
| 234 | err = walk->pte_hole(addr, next, walk); | ||
| 235 | if (err) | ||
| 236 | break; | 271 | break; |
| 237 | pgd++; | ||
| 238 | continue; | ||
| 239 | } | 272 | } |
| 240 | if (walk->pgd_entry) | 273 | if (walk->vma || walk->pte_hole) |
| 241 | err = walk->pgd_entry(pgd, addr, next, walk); | 274 | err = __walk_page_range(start, next, walk); |
| 242 | if (!err && | ||
| 243 | (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) | ||
| 244 | err = walk_pud_range(pgd, addr, next, walk); | ||
| 245 | if (err) | 275 | if (err) |
| 246 | break; | 276 | break; |
| 247 | pgd++; | 277 | } while (start = next, start < end); |
| 248 | } while (addr = next, addr < end); | ||
| 249 | |||
| 250 | return err; | 278 | return err; |
| 251 | } | 279 | } |
| 280 | |||
| 281 | int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) | ||
| 282 | { | ||
| 283 | int err; | ||
| 284 | |||
| 285 | if (!walk->mm) | ||
| 286 | return -EINVAL; | ||
| 287 | |||
| 288 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | ||
| 289 | VM_BUG_ON(!vma); | ||
| 290 | walk->vma = vma; | ||
| 291 | err = walk_page_test(vma->vm_start, vma->vm_end, walk); | ||
| 292 | if (err > 0) | ||
| 293 | return 0; | ||
| 294 | if (err < 0) | ||
| 295 | return err; | ||
| 296 | return __walk_page_range(vma->vm_start, vma->vm_end, walk); | ||
| 297 | } | ||
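A caller-side sketch of the reworked interface described in the comment above: only the callbacks that are set are invoked, walk->private carries caller data, and walk_page_vma() covers a single vma. The callback and counter names are illustrative; the caller must hold mmap_sem for read.

static int count_present_pte(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        unsigned long *count = walk->private;

        if (pte_present(*pte))
                (*count)++;
        return 0;                       /* 0: keep walking */
}

static unsigned long count_present_pages(struct vm_area_struct *vma)
{
        unsigned long count = 0;
        struct mm_walk walk = {
                .pte_entry = count_present_pte,
                .mm        = vma->vm_mm,
                .private   = &count,
        };

        walk_page_vma(vma, &walk);      /* mmap_sem held for read by caller */
        return count;
}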
diff --git a/mm/percpu.c b/mm/percpu.c index d39e2f4e335c..73c97a5f4495 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
| @@ -1528,7 +1528,6 @@ static void pcpu_dump_alloc_info(const char *lvl, | |||
| 1528 | int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | 1528 | int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, |
| 1529 | void *base_addr) | 1529 | void *base_addr) |
| 1530 | { | 1530 | { |
| 1531 | static char cpus_buf[4096] __initdata; | ||
| 1532 | static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; | 1531 | static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; |
| 1533 | static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; | 1532 | static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; |
| 1534 | size_t dyn_size = ai->dyn_size; | 1533 | size_t dyn_size = ai->dyn_size; |
| @@ -1541,12 +1540,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
| 1541 | int *unit_map; | 1540 | int *unit_map; |
| 1542 | int group, unit, i; | 1541 | int group, unit, i; |
| 1543 | 1542 | ||
| 1544 | cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); | ||
| 1545 | |||
| 1546 | #define PCPU_SETUP_BUG_ON(cond) do { \ | 1543 | #define PCPU_SETUP_BUG_ON(cond) do { \ |
| 1547 | if (unlikely(cond)) { \ | 1544 | if (unlikely(cond)) { \ |
| 1548 | pr_emerg("PERCPU: failed to initialize, %s", #cond); \ | 1545 | pr_emerg("PERCPU: failed to initialize, %s", #cond); \ |
| 1549 | pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ | 1546 | pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \ |
| 1547 | cpumask_pr_args(cpu_possible_mask)); \ | ||
| 1550 | pcpu_dump_alloc_info(KERN_EMERG, ai); \ | 1548 | pcpu_dump_alloc_info(KERN_EMERG, ai); \ |
| 1551 | BUG(); \ | 1549 | BUG(); \ |
| 1552 | } \ | 1550 | } \ |
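The %*pb specifier used above consumes a (number-of-bits, bitmap-pointer) pair, which is exactly what cpumask_pr_args() expands to, so no static scratch buffer is needed. A one-function sketch (the function name is illustrative):

static void report_possible_cpus(void)
{
        pr_info("possible cpus: %*pb\n", cpumask_pr_args(cpu_possible_mask));
        /* %*pbl prints the same mask as a range list, e.g. "0-3,8" */
        pr_info("as a range list: %*pbl\n", cpumask_pr_args(cpu_possible_mask));
}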
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index dfb79e028ecb..c25f94b33811 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
| @@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | |||
| 193 | pmd_t *pmdp) | 193 | pmd_t *pmdp) |
| 194 | { | 194 | { |
| 195 | pmd_t entry = *pmdp; | 195 | pmd_t entry = *pmdp; |
| 196 | if (pmd_numa(entry)) | ||
| 197 | entry = pmd_mknonnuma(entry); | ||
| 198 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); | 196 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); |
| 199 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | 197 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
| 200 | } | 198 | } |
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 5077afcd9e11..b1597690530c 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
| @@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr, | |||
| 99 | size_t bytes; | 99 | size_t bytes; |
| 100 | 100 | ||
| 101 | /* Get the pages we're interested in */ | 101 | /* Get the pages we're interested in */ |
| 102 | down_read(&mm->mmap_sem); | 102 | pages = get_user_pages_unlocked(task, mm, pa, pages, |
| 103 | pages = get_user_pages(task, mm, pa, pages, | 103 | vm_write, 0, process_pages); |
| 104 | vm_write, 0, process_pages, NULL); | ||
| 105 | up_read(&mm->mmap_sem); | ||
| 106 | |||
| 107 | if (pages <= 0) | 104 | if (pages <= 0) |
| 108 | return -EFAULT; | 105 | return -EFAULT; |
| 109 | 106 | ||
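A sketch of the calling convention the hunk moves to: get_user_pages_unlocked() takes and drops mmap_sem internally, so the caller no longer brackets the call with down_read()/up_read(). The wrapper below is illustrative and pins for read only.

static long pin_remote_pages(struct task_struct *tsk, struct mm_struct *mm,
                             unsigned long addr, struct page **pages,
                             unsigned long nr_pages)
{
        /* no down_read(&mm->mmap_sem) here: the helper handles the locking */
        return get_user_pages_unlocked(tsk, mm, addr, nr_pages,
                                       0 /* !write */, 0 /* !force */, pages);
}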
diff --git a/mm/readahead.c b/mm/readahead.c index 17b9172ec37f..935675844b2e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -27,7 +27,7 @@ | |||
| 27 | void | 27 | void |
| 28 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) | 28 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) |
| 29 | { | 29 | { |
| 30 | ra->ra_pages = mapping->backing_dev_info->ra_pages; | 30 | ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; |
| 31 | ra->prev_pos = -1; | 31 | ra->prev_pos = -1; |
| 32 | } | 32 | } |
| 33 | EXPORT_SYMBOL_GPL(file_ra_state_init); | 33 | EXPORT_SYMBOL_GPL(file_ra_state_init); |
| @@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, | |||
| 541 | /* | 541 | /* |
| 542 | * Defer asynchronous read-ahead on IO congestion. | 542 | * Defer asynchronous read-ahead on IO congestion. |
| 543 | */ | 543 | */ |
| 544 | if (bdi_read_congested(mapping->backing_dev_info)) | 544 | if (bdi_read_congested(inode_to_bdi(mapping->host))) |
| 545 | return; | 545 | return; |
| 546 | 546 | ||
| 547 | /* do read-ahead */ | 547 | /* do read-ahead */ |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
| 590 | if (!vma->anon_vma || !page__anon_vma || | 590 | if (!vma->anon_vma || !page__anon_vma || |
| 591 | vma->anon_vma->root != page__anon_vma->root) | 591 | vma->anon_vma->root != page__anon_vma->root) |
| 592 | return -EFAULT; | 592 | return -EFAULT; |
| 593 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 593 | } else if (page->mapping) { |
| 594 | if (!vma->vm_file || | 594 | if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) |
| 595 | vma->vm_file->f_mapping != page->mapping) | ||
| 596 | return -EFAULT; | 595 | return -EFAULT; |
| 597 | } else | 596 | } else |
| 598 | return -EFAULT; | 597 | return -EFAULT; |
| @@ -1086,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page, | |||
| 1086 | void page_add_file_rmap(struct page *page) | 1085 | void page_add_file_rmap(struct page *page) |
| 1087 | { | 1086 | { |
| 1088 | struct mem_cgroup *memcg; | 1087 | struct mem_cgroup *memcg; |
| 1089 | unsigned long flags; | ||
| 1090 | bool locked; | ||
| 1091 | 1088 | ||
| 1092 | memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1089 | memcg = mem_cgroup_begin_page_stat(page); |
| 1093 | if (atomic_inc_and_test(&page->_mapcount)) { | 1090 | if (atomic_inc_and_test(&page->_mapcount)) { |
| 1094 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1091 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
| 1095 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | 1092 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); |
| 1096 | } | 1093 | } |
| 1097 | mem_cgroup_end_page_stat(memcg, &locked, &flags); | 1094 | mem_cgroup_end_page_stat(memcg); |
| 1098 | } | 1095 | } |
| 1099 | 1096 | ||
| 1100 | static void page_remove_file_rmap(struct page *page) | 1097 | static void page_remove_file_rmap(struct page *page) |
| 1101 | { | 1098 | { |
| 1102 | struct mem_cgroup *memcg; | 1099 | struct mem_cgroup *memcg; |
| 1103 | unsigned long flags; | ||
| 1104 | bool locked; | ||
| 1105 | 1100 | ||
| 1106 | memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1101 | memcg = mem_cgroup_begin_page_stat(page); |
| 1107 | 1102 | ||
| 1108 | /* page still mapped by someone else? */ | 1103 | /* page still mapped by someone else? */ |
| 1109 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1104 | if (!atomic_add_negative(-1, &page->_mapcount)) |
| @@ -1124,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page) | |||
| 1124 | if (unlikely(PageMlocked(page))) | 1119 | if (unlikely(PageMlocked(page))) |
| 1125 | clear_page_mlock(page); | 1120 | clear_page_mlock(page); |
| 1126 | out: | 1121 | out: |
| 1127 | mem_cgroup_end_page_stat(memcg, &locked, &flags); | 1122 | mem_cgroup_end_page_stat(memcg); |
| 1128 | } | 1123 | } |
| 1129 | 1124 | ||
| 1130 | /** | 1125 | /** |
| @@ -1274,7 +1269,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 1274 | if (pte_soft_dirty(pteval)) | 1269 | if (pte_soft_dirty(pteval)) |
| 1275 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | 1270 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
| 1276 | set_pte_at(mm, address, pte, swp_pte); | 1271 | set_pte_at(mm, address, pte, swp_pte); |
| 1277 | BUG_ON(pte_file(*pte)); | ||
| 1278 | } else if (IS_ENABLED(CONFIG_MIGRATION) && | 1272 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
| 1279 | (flags & TTU_MIGRATION)) { | 1273 | (flags & TTU_MIGRATION)) { |
| 1280 | /* Establish migration entry for a file page */ | 1274 | /* Establish migration entry for a file page */ |
| @@ -1316,211 +1310,6 @@ out_mlock: | |||
| 1316 | return ret; | 1310 | return ret; |
| 1317 | } | 1311 | } |
| 1318 | 1312 | ||
| 1319 | /* | ||
| 1320 | * objrmap doesn't work for nonlinear VMAs because the assumption that | ||
| 1321 | * offset-into-file correlates with offset-into-virtual-addresses does not hold. | ||
| 1322 | * Consequently, given a particular page and its ->index, we cannot locate the | ||
| 1323 | * ptes which are mapping that page without an exhaustive linear search. | ||
| 1324 | * | ||
| 1325 | * So what this code does is a mini "virtual scan" of each nonlinear VMA which | ||
| 1326 | * maps the file to which the target page belongs. The ->vm_private_data field | ||
| 1327 | * holds the current cursor into that scan. Successive searches will circulate | ||
| 1328 | * around the vma's virtual address space. | ||
| 1329 | * | ||
| 1330 | * So as more replacement pressure is applied to the pages in a nonlinear VMA, | ||
| 1331 | * more scanning pressure is placed against them as well. Eventually pages | ||
| 1332 | * will become fully unmapped and are eligible for eviction. | ||
| 1333 | * | ||
| 1334 | * For very sparsely populated VMAs this is a little inefficient - chances are | ||
| 1335 | * there there won't be many ptes located within the scan cluster. In this case | ||
| 1336 | * maybe we could scan further - to the end of the pte page, perhaps. | ||
| 1337 | * | ||
| 1338 | * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can | ||
| 1339 | * acquire it without blocking. If vma locked, mlock the pages in the cluster, | ||
| 1340 | * rather than unmapping them. If we encounter the "check_page" that vmscan is | ||
| 1341 | * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. | ||
| 1342 | */ | ||
| 1343 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) | ||
| 1344 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) | ||
| 1345 | |||
| 1346 | static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | ||
| 1347 | struct vm_area_struct *vma, struct page *check_page) | ||
| 1348 | { | ||
| 1349 | struct mm_struct *mm = vma->vm_mm; | ||
| 1350 | pmd_t *pmd; | ||
| 1351 | pte_t *pte; | ||
| 1352 | pte_t pteval; | ||
| 1353 | spinlock_t *ptl; | ||
| 1354 | struct page *page; | ||
| 1355 | unsigned long address; | ||
| 1356 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
| 1357 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
| 1358 | unsigned long end; | ||
| 1359 | int ret = SWAP_AGAIN; | ||
| 1360 | int locked_vma = 0; | ||
| 1361 | |||
| 1362 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | ||
| 1363 | end = address + CLUSTER_SIZE; | ||
| 1364 | if (address < vma->vm_start) | ||
| 1365 | address = vma->vm_start; | ||
| 1366 | if (end > vma->vm_end) | ||
| 1367 | end = vma->vm_end; | ||
| 1368 | |||
| 1369 | pmd = mm_find_pmd(mm, address); | ||
| 1370 | if (!pmd) | ||
| 1371 | return ret; | ||
| 1372 | |||
| 1373 | mmun_start = address; | ||
| 1374 | mmun_end = end; | ||
| 1375 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
| 1376 | |||
| 1377 | /* | ||
| 1378 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
| 1379 | * keep the sem while scanning the cluster for mlocking pages. | ||
| 1380 | */ | ||
| 1381 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
| 1382 | locked_vma = (vma->vm_flags & VM_LOCKED); | ||
| 1383 | if (!locked_vma) | ||
| 1384 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | ||
| 1385 | } | ||
| 1386 | |||
| 1387 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 1388 | |||
| 1389 | /* Update high watermark before we lower rss */ | ||
| 1390 | update_hiwater_rss(mm); | ||
| 1391 | |||
| 1392 | for (; address < end; pte++, address += PAGE_SIZE) { | ||
| 1393 | if (!pte_present(*pte)) | ||
| 1394 | continue; | ||
| 1395 | page = vm_normal_page(vma, address, *pte); | ||
| 1396 | BUG_ON(!page || PageAnon(page)); | ||
| 1397 | |||
| 1398 | if (locked_vma) { | ||
| 1399 | if (page == check_page) { | ||
| 1400 | /* we know we have check_page locked */ | ||
| 1401 | mlock_vma_page(page); | ||
| 1402 | ret = SWAP_MLOCK; | ||
| 1403 | } else if (trylock_page(page)) { | ||
| 1404 | /* | ||
| 1405 | * If we can lock the page, perform mlock. | ||
| 1406 | * Otherwise leave the page alone, it will be | ||
| 1407 | * eventually encountered again later. | ||
| 1408 | */ | ||
| 1409 | mlock_vma_page(page); | ||
| 1410 | unlock_page(page); | ||
| 1411 | } | ||
| 1412 | continue; /* don't unmap */ | ||
| 1413 | } | ||
| 1414 | |||
| 1415 | /* | ||
| 1416 | * No need for _notify because we're within an | ||
| 1417 | * mmu_notifier_invalidate_range_ {start|end} scope. | ||
| 1418 | */ | ||
| 1419 | if (ptep_clear_flush_young(vma, address, pte)) | ||
| 1420 | continue; | ||
| 1421 | |||
| 1422 | /* Nuke the page table entry. */ | ||
| 1423 | flush_cache_page(vma, address, pte_pfn(*pte)); | ||
| 1424 | pteval = ptep_clear_flush_notify(vma, address, pte); | ||
| 1425 | |||
| 1426 | /* If nonlinear, store the file page offset in the pte. */ | ||
| 1427 | if (page->index != linear_page_index(vma, address)) { | ||
| 1428 | pte_t ptfile = pgoff_to_pte(page->index); | ||
| 1429 | if (pte_soft_dirty(pteval)) | ||
| 1430 | ptfile = pte_file_mksoft_dirty(ptfile); | ||
| 1431 | set_pte_at(mm, address, pte, ptfile); | ||
| 1432 | } | ||
| 1433 | |||
| 1434 | /* Move the dirty bit to the physical page now the pte is gone. */ | ||
| 1435 | if (pte_dirty(pteval)) | ||
| 1436 | set_page_dirty(page); | ||
| 1437 | |||
| 1438 | page_remove_rmap(page); | ||
| 1439 | page_cache_release(page); | ||
| 1440 | dec_mm_counter(mm, MM_FILEPAGES); | ||
| 1441 | (*mapcount)--; | ||
| 1442 | } | ||
| 1443 | pte_unmap_unlock(pte - 1, ptl); | ||
| 1444 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 1445 | if (locked_vma) | ||
| 1446 | up_read(&vma->vm_mm->mmap_sem); | ||
| 1447 | return ret; | ||
| 1448 | } | ||
| 1449 | |||
| 1450 | static int try_to_unmap_nonlinear(struct page *page, | ||
| 1451 | struct address_space *mapping, void *arg) | ||
| 1452 | { | ||
| 1453 | struct vm_area_struct *vma; | ||
| 1454 | int ret = SWAP_AGAIN; | ||
| 1455 | unsigned long cursor; | ||
| 1456 | unsigned long max_nl_cursor = 0; | ||
| 1457 | unsigned long max_nl_size = 0; | ||
| 1458 | unsigned int mapcount; | ||
| 1459 | |||
| 1460 | list_for_each_entry(vma, | ||
| 1461 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
| 1462 | |||
| 1463 | cursor = (unsigned long) vma->vm_private_data; | ||
| 1464 | if (cursor > max_nl_cursor) | ||
| 1465 | max_nl_cursor = cursor; | ||
| 1466 | cursor = vma->vm_end - vma->vm_start; | ||
| 1467 | if (cursor > max_nl_size) | ||
| 1468 | max_nl_size = cursor; | ||
| 1469 | } | ||
| 1470 | |||
| 1471 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ | ||
| 1472 | return SWAP_FAIL; | ||
| 1473 | } | ||
| 1474 | |||
| 1475 | /* | ||
| 1476 | * We don't try to search for this page in the nonlinear vmas, | ||
| 1477 | * and page_referenced wouldn't have found it anyway. Instead | ||
| 1478 | * just walk the nonlinear vmas trying to age and unmap some. | ||
| 1479 | * The mapcount of the page we came in with is irrelevant, | ||
| 1480 | * but even so use it as a guide to how hard we should try? | ||
| 1481 | */ | ||
| 1482 | mapcount = page_mapcount(page); | ||
| 1483 | if (!mapcount) | ||
| 1484 | return ret; | ||
| 1485 | |||
| 1486 | cond_resched(); | ||
| 1487 | |||
| 1488 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | ||
| 1489 | if (max_nl_cursor == 0) | ||
| 1490 | max_nl_cursor = CLUSTER_SIZE; | ||
| 1491 | |||
| 1492 | do { | ||
| 1493 | list_for_each_entry(vma, | ||
| 1494 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
| 1495 | |||
| 1496 | cursor = (unsigned long) vma->vm_private_data; | ||
| 1497 | while (cursor < max_nl_cursor && | ||
| 1498 | cursor < vma->vm_end - vma->vm_start) { | ||
| 1499 | if (try_to_unmap_cluster(cursor, &mapcount, | ||
| 1500 | vma, page) == SWAP_MLOCK) | ||
| 1501 | ret = SWAP_MLOCK; | ||
| 1502 | cursor += CLUSTER_SIZE; | ||
| 1503 | vma->vm_private_data = (void *) cursor; | ||
| 1504 | if ((int)mapcount <= 0) | ||
| 1505 | return ret; | ||
| 1506 | } | ||
| 1507 | vma->vm_private_data = (void *) max_nl_cursor; | ||
| 1508 | } | ||
| 1509 | cond_resched(); | ||
| 1510 | max_nl_cursor += CLUSTER_SIZE; | ||
| 1511 | } while (max_nl_cursor <= max_nl_size); | ||
| 1512 | |||
| 1513 | /* | ||
| 1514 | * Don't loop forever (perhaps all the remaining pages are | ||
| 1515 | * in locked vmas). Reset cursor on all unreserved nonlinear | ||
| 1516 | * vmas, now forgetting on which ones it had fallen behind. | ||
| 1517 | */ | ||
| 1518 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) | ||
| 1519 | vma->vm_private_data = NULL; | ||
| 1520 | |||
| 1521 | return ret; | ||
| 1522 | } | ||
| 1523 | |||
| 1524 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1313 | bool is_vma_temporary_stack(struct vm_area_struct *vma) |
| 1525 | { | 1314 | { |
| 1526 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | 1315 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
| @@ -1566,7 +1355,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
| 1566 | .rmap_one = try_to_unmap_one, | 1355 | .rmap_one = try_to_unmap_one, |
| 1567 | .arg = (void *)flags, | 1356 | .arg = (void *)flags, |
| 1568 | .done = page_not_mapped, | 1357 | .done = page_not_mapped, |
| 1569 | .file_nonlinear = try_to_unmap_nonlinear, | ||
| 1570 | .anon_lock = page_lock_anon_vma_read, | 1358 | .anon_lock = page_lock_anon_vma_read, |
| 1571 | }; | 1359 | }; |
| 1572 | 1360 | ||
| @@ -1612,12 +1400,6 @@ int try_to_munlock(struct page *page) | |||
| 1612 | .rmap_one = try_to_unmap_one, | 1400 | .rmap_one = try_to_unmap_one, |
| 1613 | .arg = (void *)TTU_MUNLOCK, | 1401 | .arg = (void *)TTU_MUNLOCK, |
| 1614 | .done = page_not_mapped, | 1402 | .done = page_not_mapped, |
| 1615 | /* | ||
| 1616 | * We don't bother to try to find the munlocked page in | ||
| 1617 | * nonlinears. It's costly. Instead, later, page reclaim logic | ||
| 1618 | * may call try_to_unmap() and recover PG_mlocked lazily. | ||
| 1619 | */ | ||
| 1620 | .file_nonlinear = NULL, | ||
| 1621 | .anon_lock = page_lock_anon_vma_read, | 1403 | .anon_lock = page_lock_anon_vma_read, |
| 1622 | 1404 | ||
| 1623 | }; | 1405 | }; |
| @@ -1748,13 +1530,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
| 1748 | goto done; | 1530 | goto done; |
| 1749 | } | 1531 | } |
| 1750 | 1532 | ||
| 1751 | if (!rwc->file_nonlinear) | ||
| 1752 | goto done; | ||
| 1753 | |||
| 1754 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
| 1755 | goto done; | ||
| 1756 | |||
| 1757 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); | ||
| 1758 | done: | 1533 | done: |
| 1759 | i_mmap_unlock_read(mapping); | 1534 | i_mmap_unlock_read(mapping); |
| 1760 | return ret; | 1535 | return ret; |
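With the nonlinear path gone, an rmap walk is configured with just the hooks that remain; there is no .file_nonlinear member left to set. A hedged sketch of a walk that merely counts mappings (count_one() and its counter are illustrative, the lock callback is the one this file already provides, and the usual rule that @page is locked still applies):

static int count_one(struct page *page, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        int *mappings = arg;

        (*mappings)++;
        return SWAP_AGAIN;              /* keep walking all mappings */
}

static int count_mappings(struct page *page)
{
        int mappings = 0;
        struct rmap_walk_control rwc = {
                .rmap_one  = count_one,
                .arg       = &mappings,
                .anon_lock = page_lock_anon_vma_read,
        };

        rmap_walk(page, &rwc);
        return mappings;
}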
diff --git a/mm/shmem.c b/mm/shmem.c index 993e6ba689cc..cf2d0ca010bc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations; | |||
| 191 | static const struct inode_operations shmem_special_inode_operations; | 191 | static const struct inode_operations shmem_special_inode_operations; |
| 192 | static const struct vm_operations_struct shmem_vm_ops; | 192 | static const struct vm_operations_struct shmem_vm_ops; |
| 193 | 193 | ||
| 194 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | ||
| 195 | .ra_pages = 0, /* No readahead */ | ||
| 196 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | ||
| 197 | }; | ||
| 198 | |||
| 199 | static LIST_HEAD(shmem_swaplist); | 194 | static LIST_HEAD(shmem_swaplist); |
| 200 | static DEFINE_MUTEX(shmem_swaplist_mutex); | 195 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
| 201 | 196 | ||
| @@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
| 765 | goto redirty; | 760 | goto redirty; |
| 766 | 761 | ||
| 767 | /* | 762 | /* |
| 768 | * shmem_backing_dev_info's capabilities prevent regular writeback or | 763 | * Our capabilities prevent regular writeback or sync from ever calling |
| 769 | * sync from ever calling shmem_writepage; but a stacking filesystem | 764 | * shmem_writepage; but a stacking filesystem might use ->writepage of |
| 770 | * might use ->writepage of its underlying filesystem, in which case | 765 | * its underlying filesystem, in which case tmpfs should write out to |
| 771 | * tmpfs should write out to swap only in response to memory pressure, | 766 | * swap only in response to memory pressure, and not for the writeback |
| 772 | * and not for the writeback threads or sync. | 767 | * threads or sync. |
| 773 | */ | 768 | */ |
| 774 | if (!wbc->for_reclaim) { | 769 | if (!wbc->for_reclaim) { |
| 775 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ | 770 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
| @@ -1131,7 +1126,7 @@ repeat: | |||
| 1131 | * truncated or holepunched since swap was confirmed. | 1126 | * truncated or holepunched since swap was confirmed. |
| 1132 | * shmem_undo_range() will have done some of the | 1127 | * shmem_undo_range() will have done some of the |
| 1133 | * unaccounting, now delete_from_swap_cache() will do | 1128 | * unaccounting, now delete_from_swap_cache() will do |
| 1134 | * the rest (including mem_cgroup_uncharge_swapcache). | 1129 | * the rest. |
| 1135 | * Reset swap.val? No, leave it so "failed" goes back to | 1130 | * Reset swap.val? No, leave it so "failed" goes back to |
| 1136 | * "repeat": reading a hole and writing should succeed. | 1131 | * "repeat": reading a hole and writing should succeed. |
| 1137 | */ | 1132 | */ |
| @@ -1415,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
| 1415 | inode->i_ino = get_next_ino(); | 1410 | inode->i_ino = get_next_ino(); |
| 1416 | inode_init_owner(inode, dir, mode); | 1411 | inode_init_owner(inode, dir, mode); |
| 1417 | inode->i_blocks = 0; | 1412 | inode->i_blocks = 0; |
| 1418 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | ||
| 1419 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1413 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
| 1420 | inode->i_generation = get_seconds(); | 1414 | inode->i_generation = get_seconds(); |
| 1421 | info = SHMEM_I(inode); | 1415 | info = SHMEM_I(inode); |
| @@ -1461,7 +1455,10 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
| 1461 | 1455 | ||
| 1462 | bool shmem_mapping(struct address_space *mapping) | 1456 | bool shmem_mapping(struct address_space *mapping) |
| 1463 | { | 1457 | { |
| 1464 | return mapping->backing_dev_info == &shmem_backing_dev_info; | 1458 | if (!mapping->host) |
| 1459 | return false; | ||
| 1460 | |||
| 1461 | return mapping->host->i_sb->s_op == &shmem_ops; | ||
| 1465 | } | 1462 | } |
| 1466 | 1463 | ||
| 1467 | #ifdef CONFIG_TMPFS | 1464 | #ifdef CONFIG_TMPFS |
| @@ -2325,8 +2322,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 2325 | 2322 | ||
| 2326 | static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) | 2323 | static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) |
| 2327 | { | 2324 | { |
| 2328 | bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode); | 2325 | bool old_is_dir = d_is_dir(old_dentry); |
| 2329 | bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode); | 2326 | bool new_is_dir = d_is_dir(new_dentry); |
| 2330 | 2327 | ||
| 2331 | if (old_dir != new_dir && old_is_dir != new_is_dir) { | 2328 | if (old_dir != new_dir && old_is_dir != new_is_dir) { |
| 2332 | if (old_is_dir) { | 2329 | if (old_is_dir) { |
| @@ -3201,7 +3198,6 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
| 3201 | .set_policy = shmem_set_policy, | 3198 | .set_policy = shmem_set_policy, |
| 3202 | .get_policy = shmem_get_policy, | 3199 | .get_policy = shmem_get_policy, |
| 3203 | #endif | 3200 | #endif |
| 3204 | .remap_pages = generic_file_remap_pages, | ||
| 3205 | }; | 3201 | }; |
| 3206 | 3202 | ||
| 3207 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 3203 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
| @@ -3226,10 +3222,6 @@ int __init shmem_init(void) | |||
| 3226 | if (shmem_inode_cachep) | 3222 | if (shmem_inode_cachep) |
| 3227 | return 0; | 3223 | return 0; |
| 3228 | 3224 | ||
| 3229 | error = bdi_init(&shmem_backing_dev_info); | ||
| 3230 | if (error) | ||
| 3231 | goto out4; | ||
| 3232 | |||
| 3233 | error = shmem_init_inodecache(); | 3225 | error = shmem_init_inodecache(); |
| 3234 | if (error) | 3226 | if (error) |
| 3235 | goto out3; | 3227 | goto out3; |
| @@ -3253,8 +3245,6 @@ out1: | |||
| 3253 | out2: | 3245 | out2: |
| 3254 | shmem_destroy_inodecache(); | 3246 | shmem_destroy_inodecache(); |
| 3255 | out3: | 3247 | out3: |
| 3256 | bdi_destroy(&shmem_backing_dev_info); | ||
| 3257 | out4: | ||
| 3258 | shm_mnt = ERR_PTR(error); | 3248 | shm_mnt = ERR_PTR(error); |
| 3259 | return error; | 3249 | return error; |
| 3260 | } | 3250 | } |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -2382,7 +2382,7 @@ out: | |||
| 2382 | return nr_freed; | 2382 | return nr_freed; |
| 2383 | } | 2383 | } |
| 2384 | 2384 | ||
| 2385 | int __kmem_cache_shrink(struct kmem_cache *cachep) | 2385 | int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) |
| 2386 | { | 2386 | { |
| 2387 | int ret = 0; | 2387 | int ret = 0; |
| 2388 | int node; | 2388 | int node; |
| @@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) | |||
| 2404 | { | 2404 | { |
| 2405 | int i; | 2405 | int i; |
| 2406 | struct kmem_cache_node *n; | 2406 | struct kmem_cache_node *n; |
| 2407 | int rc = __kmem_cache_shrink(cachep); | 2407 | int rc = __kmem_cache_shrink(cachep, false); |
| 2408 | 2408 | ||
| 2409 | if (rc) | 2409 | if (rc) |
| 2410 | return rc; | 2410 | return rc; |
| @@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
| 3708 | int batchcount, int shared, gfp_t gfp) | 3708 | int batchcount, int shared, gfp_t gfp) |
| 3709 | { | 3709 | { |
| 3710 | int ret; | 3710 | int ret; |
| 3711 | struct kmem_cache *c = NULL; | 3711 | struct kmem_cache *c; |
| 3712 | int i = 0; | ||
| 3713 | 3712 | ||
| 3714 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | 3713 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); |
| 3715 | 3714 | ||
| @@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
| 3719 | if ((ret < 0) || !is_root_cache(cachep)) | 3718 | if ((ret < 0) || !is_root_cache(cachep)) |
| 3720 | return ret; | 3719 | return ret; |
| 3721 | 3720 | ||
| 3722 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); | 3721 | lockdep_assert_held(&slab_mutex); |
| 3723 | for_each_memcg_cache_index(i) { | 3722 | for_each_memcg_cache(c, cachep) { |
| 3724 | c = cache_from_memcg_idx(cachep, i); | 3723 | /* return value determined by the root cache only */ |
| 3725 | if (c) | 3724 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); |
| 3726 | /* return value determined by the parent cache only */ | ||
| 3727 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); | ||
| 3728 | } | 3725 | } |
| 3729 | 3726 | ||
| 3730 | return ret; | 3727 | return ret; |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
| @@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, | |||
| 86 | extern void create_boot_cache(struct kmem_cache *, const char *name, | 86 | extern void create_boot_cache(struct kmem_cache *, const char *name, |
| 87 | size_t size, unsigned long flags); | 87 | size_t size, unsigned long flags); |
| 88 | 88 | ||
| 89 | struct mem_cgroup; | ||
| 90 | |||
| 91 | int slab_unmergeable(struct kmem_cache *s); | 89 | int slab_unmergeable(struct kmem_cache *s); |
| 92 | struct kmem_cache *find_mergeable(size_t size, size_t align, | 90 | struct kmem_cache *find_mergeable(size_t size, size_t align, |
| 93 | unsigned long flags, const char *name, void (*ctor)(void *)); | 91 | unsigned long flags, const char *name, void (*ctor)(void *)); |
| @@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, | |||
| 140 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) | 138 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) |
| 141 | 139 | ||
| 142 | int __kmem_cache_shutdown(struct kmem_cache *); | 140 | int __kmem_cache_shutdown(struct kmem_cache *); |
| 143 | int __kmem_cache_shrink(struct kmem_cache *); | 141 | int __kmem_cache_shrink(struct kmem_cache *, bool); |
| 144 | void slab_kmem_cache_release(struct kmem_cache *); | 142 | void slab_kmem_cache_release(struct kmem_cache *); |
| 145 | 143 | ||
| 146 | struct seq_file; | 144 | struct seq_file; |
| @@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
| 165 | size_t count, loff_t *ppos); | 163 | size_t count, loff_t *ppos); |
| 166 | 164 | ||
| 167 | #ifdef CONFIG_MEMCG_KMEM | 165 | #ifdef CONFIG_MEMCG_KMEM |
| 166 | /* | ||
| 167 | * Iterate over all memcg caches of the given root cache. The caller must hold | ||
| 168 | * slab_mutex. | ||
| 169 | */ | ||
| 170 | #define for_each_memcg_cache(iter, root) \ | ||
| 171 | list_for_each_entry(iter, &(root)->memcg_params.list, \ | ||
| 172 | memcg_params.list) | ||
| 173 | |||
| 174 | #define for_each_memcg_cache_safe(iter, tmp, root) \ | ||
| 175 | list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ | ||
| 176 | memcg_params.list) | ||
| 177 | |||
| 168 | static inline bool is_root_cache(struct kmem_cache *s) | 178 | static inline bool is_root_cache(struct kmem_cache *s) |
| 169 | { | 179 | { |
| 170 | return !s->memcg_params || s->memcg_params->is_root_cache; | 180 | return s->memcg_params.is_root_cache; |
| 171 | } | 181 | } |
| 172 | 182 | ||
| 173 | static inline bool slab_equal_or_root(struct kmem_cache *s, | 183 | static inline bool slab_equal_or_root(struct kmem_cache *s, |
| 174 | struct kmem_cache *p) | 184 | struct kmem_cache *p) |
| 175 | { | 185 | { |
| 176 | return (p == s) || | 186 | return p == s || p == s->memcg_params.root_cache; |
| 177 | (s->memcg_params && (p == s->memcg_params->root_cache)); | ||
| 178 | } | 187 | } |
| 179 | 188 | ||
| 180 | /* | 189 | /* |
| @@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, | |||
| 185 | static inline const char *cache_name(struct kmem_cache *s) | 194 | static inline const char *cache_name(struct kmem_cache *s) |
| 186 | { | 195 | { |
| 187 | if (!is_root_cache(s)) | 196 | if (!is_root_cache(s)) |
| 188 | return s->memcg_params->root_cache->name; | 197 | s = s->memcg_params.root_cache; |
| 189 | return s->name; | 198 | return s->name; |
| 190 | } | 199 | } |
| 191 | 200 | ||
| 192 | /* | 201 | /* |
| 193 | * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. | 202 | * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. |
| 194 | * That said the caller must assure the memcg's cache won't go away. Since once | 203 | * That said the caller must assure the memcg's cache won't go away by either |
| 195 | * created a memcg's cache is destroyed only along with the root cache, it is | 204 | * taking a css reference to the owner cgroup, or holding the slab_mutex. |
| 196 | * true if we are going to allocate from the cache or hold a reference to the | ||
| 197 | * root cache by other means. Otherwise, we should hold either the slab_mutex | ||
| 198 | * or the memcg's slab_caches_mutex while calling this function and accessing | ||
| 199 | * the returned value. | ||
| 200 | */ | 205 | */ |
| 201 | static inline struct kmem_cache * | 206 | static inline struct kmem_cache * |
| 202 | cache_from_memcg_idx(struct kmem_cache *s, int idx) | 207 | cache_from_memcg_idx(struct kmem_cache *s, int idx) |
| 203 | { | 208 | { |
| 204 | struct kmem_cache *cachep; | 209 | struct kmem_cache *cachep; |
| 205 | struct memcg_cache_params *params; | 210 | struct memcg_cache_array *arr; |
| 206 | |||
| 207 | if (!s->memcg_params) | ||
| 208 | return NULL; | ||
| 209 | 211 | ||
| 210 | rcu_read_lock(); | 212 | rcu_read_lock(); |
| 211 | params = rcu_dereference(s->memcg_params); | 213 | arr = rcu_dereference(s->memcg_params.memcg_caches); |
| 212 | 214 | ||
| 213 | /* | 215 | /* |
| 214 | * Make sure we will access the up-to-date value. The code updating | 216 | * Make sure we will access the up-to-date value. The code updating |
| 215 | * memcg_caches issues a write barrier to match this (see | 217 | * memcg_caches issues a write barrier to match this (see |
| 216 | * memcg_register_cache()). | 218 | * memcg_create_kmem_cache()). |
| 217 | */ | 219 | */ |
| 218 | cachep = lockless_dereference(params->memcg_caches[idx]); | 220 | cachep = lockless_dereference(arr->entries[idx]); |
| 219 | rcu_read_unlock(); | 221 | rcu_read_unlock(); |
| 220 | 222 | ||
| 221 | return cachep; | 223 | return cachep; |
| @@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
| 225 | { | 227 | { |
| 226 | if (is_root_cache(s)) | 228 | if (is_root_cache(s)) |
| 227 | return s; | 229 | return s; |
| 228 | return s->memcg_params->root_cache; | 230 | return s->memcg_params.root_cache; |
| 229 | } | 231 | } |
| 230 | 232 | ||
| 231 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, | 233 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, |
| @@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, | |||
| 235 | return 0; | 237 | return 0; |
| 236 | if (is_root_cache(s)) | 238 | if (is_root_cache(s)) |
| 237 | return 0; | 239 | return 0; |
| 238 | return __memcg_charge_slab(s, gfp, order); | 240 | return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); |
| 239 | } | 241 | } |
| 240 | 242 | ||
| 241 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | 243 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
| @@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | |||
| 244 | return; | 246 | return; |
| 245 | if (is_root_cache(s)) | 247 | if (is_root_cache(s)) |
| 246 | return; | 248 | return; |
| 247 | __memcg_uncharge_slab(s, order); | 249 | memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); |
| 248 | } | 250 | } |
| 249 | #else | 251 | |
| 252 | extern void slab_init_memcg_params(struct kmem_cache *); | ||
| 253 | |||
| 254 | #else /* !CONFIG_MEMCG_KMEM */ | ||
| 255 | |||
| 256 | #define for_each_memcg_cache(iter, root) \ | ||
| 257 | for ((void)(iter), (void)(root); 0; ) | ||
| 258 | #define for_each_memcg_cache_safe(iter, tmp, root) \ | ||
| 259 | for ((void)(iter), (void)(tmp), (void)(root); 0; ) | ||
| 260 | |||
| 250 | static inline bool is_root_cache(struct kmem_cache *s) | 261 | static inline bool is_root_cache(struct kmem_cache *s) |
| 251 | { | 262 | { |
| 252 | return true; | 263 | return true; |
| @@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) | |||
| 282 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | 293 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
| 283 | { | 294 | { |
| 284 | } | 295 | } |
| 285 | #endif | 296 | |
| 297 | static inline void slab_init_memcg_params(struct kmem_cache *s) | ||
| 298 | { | ||
| 299 | } | ||
| 300 | #endif /* CONFIG_MEMCG_KMEM */ | ||
| 286 | 301 | ||
| 287 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | 302 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) |
| 288 | { | 303 | { |
diff --git a/mm/slab_common.c b/mm/slab_common.c index e03dd6f2a272..999bb3424d44 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
| @@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) | |||
| 106 | #endif | 106 | #endif |
| 107 | 107 | ||
| 108 | #ifdef CONFIG_MEMCG_KMEM | 108 | #ifdef CONFIG_MEMCG_KMEM |
| 109 | static int memcg_alloc_cache_params(struct mem_cgroup *memcg, | 109 | void slab_init_memcg_params(struct kmem_cache *s) |
| 110 | struct kmem_cache *s, struct kmem_cache *root_cache) | ||
| 111 | { | 110 | { |
| 112 | size_t size; | 111 | s->memcg_params.is_root_cache = true; |
| 112 | INIT_LIST_HEAD(&s->memcg_params.list); | ||
| 113 | RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); | ||
| 114 | } | ||
| 115 | |||
| 116 | static int init_memcg_params(struct kmem_cache *s, | ||
| 117 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | ||
| 118 | { | ||
| 119 | struct memcg_cache_array *arr; | ||
| 113 | 120 | ||
| 114 | if (!memcg_kmem_enabled()) | 121 | if (memcg) { |
| 122 | s->memcg_params.is_root_cache = false; | ||
| 123 | s->memcg_params.memcg = memcg; | ||
| 124 | s->memcg_params.root_cache = root_cache; | ||
| 115 | return 0; | 125 | return 0; |
| 126 | } | ||
| 116 | 127 | ||
| 117 | if (!memcg) { | 128 | slab_init_memcg_params(s); |
| 118 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
| 119 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
| 120 | } else | ||
| 121 | size = sizeof(struct memcg_cache_params); | ||
| 122 | 129 | ||
| 123 | s->memcg_params = kzalloc(size, GFP_KERNEL); | 130 | if (!memcg_nr_cache_ids) |
| 124 | if (!s->memcg_params) | 131 | return 0; |
| 125 | return -ENOMEM; | ||
| 126 | 132 | ||
| 127 | if (memcg) { | 133 | arr = kzalloc(sizeof(struct memcg_cache_array) + |
| 128 | s->memcg_params->memcg = memcg; | 134 | memcg_nr_cache_ids * sizeof(void *), |
| 129 | s->memcg_params->root_cache = root_cache; | 135 | GFP_KERNEL); |
| 130 | } else | 136 | if (!arr) |
| 131 | s->memcg_params->is_root_cache = true; | 137 | return -ENOMEM; |
| 132 | 138 | ||
| 139 | RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr); | ||
| 133 | return 0; | 140 | return 0; |
| 134 | } | 141 | } |
| 135 | 142 | ||
| 136 | static void memcg_free_cache_params(struct kmem_cache *s) | 143 | static void destroy_memcg_params(struct kmem_cache *s) |
| 137 | { | 144 | { |
| 138 | kfree(s->memcg_params); | 145 | if (is_root_cache(s)) |
| 146 | kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); | ||
| 139 | } | 147 | } |
| 140 | 148 | ||
| 141 | static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) | 149 | static int update_memcg_params(struct kmem_cache *s, int new_array_size) |
| 142 | { | 150 | { |
| 143 | int size; | 151 | struct memcg_cache_array *old, *new; |
| 144 | struct memcg_cache_params *new_params, *cur_params; | ||
| 145 | |||
| 146 | BUG_ON(!is_root_cache(s)); | ||
| 147 | 152 | ||
| 148 | size = offsetof(struct memcg_cache_params, memcg_caches); | 153 | if (!is_root_cache(s)) |
| 149 | size += num_memcgs * sizeof(void *); | 154 | return 0; |
| 150 | 155 | ||
| 151 | new_params = kzalloc(size, GFP_KERNEL); | 156 | new = kzalloc(sizeof(struct memcg_cache_array) + |
| 152 | if (!new_params) | 157 | new_array_size * sizeof(void *), GFP_KERNEL); |
| 158 | if (!new) | ||
| 153 | return -ENOMEM; | 159 | return -ENOMEM; |
| 154 | 160 | ||
| 155 | cur_params = s->memcg_params; | 161 | old = rcu_dereference_protected(s->memcg_params.memcg_caches, |
| 156 | memcpy(new_params->memcg_caches, cur_params->memcg_caches, | 162 | lockdep_is_held(&slab_mutex)); |
| 157 | memcg_limited_groups_array_size * sizeof(void *)); | 163 | if (old) |
| 158 | 164 | memcpy(new->entries, old->entries, | |
| 159 | new_params->is_root_cache = true; | 165 | memcg_nr_cache_ids * sizeof(void *)); |
| 160 | |||
| 161 | rcu_assign_pointer(s->memcg_params, new_params); | ||
| 162 | if (cur_params) | ||
| 163 | kfree_rcu(cur_params, rcu_head); | ||
| 164 | 166 | ||
| 167 | rcu_assign_pointer(s->memcg_params.memcg_caches, new); | ||
| 168 | if (old) | ||
| 169 | kfree_rcu(old, rcu); | ||
| 165 | return 0; | 170 | return 0; |
| 166 | } | 171 | } |
| 167 | 172 | ||
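For reference, the data layout this hunk converts to looks roughly like the sketch below: memcg_params becomes a structure embedded in struct kmem_cache, and a root cache's per-memcg pointers live in a separately allocated, RCU-replaceable array. Field names are taken from the hunk; the exact definitions live in include/linux/slab.h and are not part of this diff, so treat the union layout as an approximation.

    /* Array that update_memcg_params() swaps in under slab_mutex. */
    struct memcg_cache_array {
            struct rcu_head rcu;
            struct kmem_cache *entries[0];          /* indexed by memcg_cache_id() */
    };

    /* Embedded in struct kmem_cache instead of being kmalloc'ed on its own. */
    struct memcg_cache_params {
            bool is_root_cache;
            union {
                    struct memcg_cache_array __rcu *memcg_caches;   /* root cache */
                    struct {                                        /* per-memcg cache */
                            struct mem_cgroup *memcg;
                            struct kmem_cache *root_cache;
                    };
            };
            struct list_head list;
    };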
| @@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs) | |||
| 169 | { | 174 | { |
| 170 | struct kmem_cache *s; | 175 | struct kmem_cache *s; |
| 171 | int ret = 0; | 176 | int ret = 0; |
| 172 | mutex_lock(&slab_mutex); | ||
| 173 | 177 | ||
| 178 | mutex_lock(&slab_mutex); | ||
| 174 | list_for_each_entry(s, &slab_caches, list) { | 179 | list_for_each_entry(s, &slab_caches, list) { |
| 175 | if (!is_root_cache(s)) | 180 | ret = update_memcg_params(s, num_memcgs); |
| 176 | continue; | ||
| 177 | |||
| 178 | ret = memcg_update_cache_params(s, num_memcgs); | ||
| 179 | /* | 181 | /* |
| 180 | * Instead of freeing the memory, we'll just leave the caches | 182 | * Instead of freeing the memory, we'll just leave the caches |
| 181 | * up to this point in an updated state. | 183 | * up to this point in an updated state. |
| 182 | */ | 184 | */ |
| 183 | if (ret) | 185 | if (ret) |
| 184 | goto out; | 186 | break; |
| 185 | } | 187 | } |
| 186 | |||
| 187 | memcg_update_array_size(num_memcgs); | ||
| 188 | out: | ||
| 189 | mutex_unlock(&slab_mutex); | 188 | mutex_unlock(&slab_mutex); |
| 190 | return ret; | 189 | return ret; |
| 191 | } | 190 | } |
| 192 | #else | 191 | #else |
| 193 | static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, | 192 | static inline int init_memcg_params(struct kmem_cache *s, |
| 194 | struct kmem_cache *s, struct kmem_cache *root_cache) | 193 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) |
| 195 | { | 194 | { |
| 196 | return 0; | 195 | return 0; |
| 197 | } | 196 | } |
| 198 | 197 | ||
| 199 | static inline void memcg_free_cache_params(struct kmem_cache *s) | 198 | static inline void destroy_memcg_params(struct kmem_cache *s) |
| 200 | { | 199 | { |
| 201 | } | 200 | } |
| 202 | #endif /* CONFIG_MEMCG_KMEM */ | 201 | #endif /* CONFIG_MEMCG_KMEM */ |
| @@ -296,8 +295,8 @@ unsigned long calculate_alignment(unsigned long flags, | |||
| 296 | } | 295 | } |
| 297 | 296 | ||
| 298 | static struct kmem_cache * | 297 | static struct kmem_cache * |
| 299 | do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | 298 | do_kmem_cache_create(const char *name, size_t object_size, size_t size, |
| 300 | unsigned long flags, void (*ctor)(void *), | 299 | size_t align, unsigned long flags, void (*ctor)(void *), |
| 301 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | 300 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) |
| 302 | { | 301 | { |
| 303 | struct kmem_cache *s; | 302 | struct kmem_cache *s; |
| @@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | |||
| 314 | s->align = align; | 313 | s->align = align; |
| 315 | s->ctor = ctor; | 314 | s->ctor = ctor; |
| 316 | 315 | ||
| 317 | err = memcg_alloc_cache_params(memcg, s, root_cache); | 316 | err = init_memcg_params(s, memcg, root_cache); |
| 318 | if (err) | 317 | if (err) |
| 319 | goto out_free_cache; | 318 | goto out_free_cache; |
| 320 | 319 | ||
| @@ -330,8 +329,8 @@ out: | |||
| 330 | return s; | 329 | return s; |
| 331 | 330 | ||
| 332 | out_free_cache: | 331 | out_free_cache: |
| 333 | memcg_free_cache_params(s); | 332 | destroy_memcg_params(s); |
| 334 | kfree(s); | 333 | kmem_cache_free(kmem_cache, s); |
| 335 | goto out; | 334 | goto out; |
| 336 | } | 335 | } |
| 337 | 336 | ||
| @@ -364,11 +363,12 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
| 364 | unsigned long flags, void (*ctor)(void *)) | 363 | unsigned long flags, void (*ctor)(void *)) |
| 365 | { | 364 | { |
| 366 | struct kmem_cache *s; | 365 | struct kmem_cache *s; |
| 367 | char *cache_name; | 366 | const char *cache_name; |
| 368 | int err; | 367 | int err; |
| 369 | 368 | ||
| 370 | get_online_cpus(); | 369 | get_online_cpus(); |
| 371 | get_online_mems(); | 370 | get_online_mems(); |
| 371 | memcg_get_cache_ids(); | ||
| 372 | 372 | ||
| 373 | mutex_lock(&slab_mutex); | 373 | mutex_lock(&slab_mutex); |
| 374 | 374 | ||
| @@ -390,7 +390,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
| 390 | if (s) | 390 | if (s) |
| 391 | goto out_unlock; | 391 | goto out_unlock; |
| 392 | 392 | ||
| 393 | cache_name = kstrdup(name, GFP_KERNEL); | 393 | cache_name = kstrdup_const(name, GFP_KERNEL); |
| 394 | if (!cache_name) { | 394 | if (!cache_name) { |
| 395 | err = -ENOMEM; | 395 | err = -ENOMEM; |
| 396 | goto out_unlock; | 396 | goto out_unlock; |
| @@ -401,12 +401,13 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
| 401 | flags, ctor, NULL, NULL); | 401 | flags, ctor, NULL, NULL); |
| 402 | if (IS_ERR(s)) { | 402 | if (IS_ERR(s)) { |
| 403 | err = PTR_ERR(s); | 403 | err = PTR_ERR(s); |
| 404 | kfree(cache_name); | 404 | kfree_const(cache_name); |
| 405 | } | 405 | } |
| 406 | 406 | ||
| 407 | out_unlock: | 407 | out_unlock: |
| 408 | mutex_unlock(&slab_mutex); | 408 | mutex_unlock(&slab_mutex); |
| 409 | 409 | ||
| 410 | memcg_put_cache_ids(); | ||
| 410 | put_online_mems(); | 411 | put_online_mems(); |
| 411 | put_online_cpus(); | 412 | put_online_cpus(); |
| 412 | 413 | ||
| @@ -425,31 +426,91 @@ out_unlock: | |||
| 425 | } | 426 | } |
| 426 | EXPORT_SYMBOL(kmem_cache_create); | 427 | EXPORT_SYMBOL(kmem_cache_create); |
| 427 | 428 | ||
| 429 | static int do_kmem_cache_shutdown(struct kmem_cache *s, | ||
| 430 | struct list_head *release, bool *need_rcu_barrier) | ||
| 431 | { | ||
| 432 | if (__kmem_cache_shutdown(s) != 0) { | ||
| 433 | printk(KERN_ERR "kmem_cache_destroy %s: " | ||
| 434 | "Slab cache still has objects\n", s->name); | ||
| 435 | dump_stack(); | ||
| 436 | return -EBUSY; | ||
| 437 | } | ||
| 438 | |||
| 439 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
| 440 | *need_rcu_barrier = true; | ||
| 441 | |||
| 442 | #ifdef CONFIG_MEMCG_KMEM | ||
| 443 | if (!is_root_cache(s)) | ||
| 444 | list_del(&s->memcg_params.list); | ||
| 445 | #endif | ||
| 446 | list_move(&s->list, release); | ||
| 447 | return 0; | ||
| 448 | } | ||
| 449 | |||
| 450 | static void do_kmem_cache_release(struct list_head *release, | ||
| 451 | bool need_rcu_barrier) | ||
| 452 | { | ||
| 453 | struct kmem_cache *s, *s2; | ||
| 454 | |||
| 455 | if (need_rcu_barrier) | ||
| 456 | rcu_barrier(); | ||
| 457 | |||
| 458 | list_for_each_entry_safe(s, s2, release, list) { | ||
| 459 | #ifdef SLAB_SUPPORTS_SYSFS | ||
| 460 | sysfs_slab_remove(s); | ||
| 461 | #else | ||
| 462 | slab_kmem_cache_release(s); | ||
| 463 | #endif | ||
| 464 | } | ||
| 465 | } | ||
| 466 | |||
| 428 | #ifdef CONFIG_MEMCG_KMEM | 467 | #ifdef CONFIG_MEMCG_KMEM |
| 429 | /* | 468 | /* |
| 430 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. | 469 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. |
| 431 | * @memcg: The memory cgroup the new cache is for. | 470 | * @memcg: The memory cgroup the new cache is for. |
| 432 | * @root_cache: The parent of the new cache. | 471 | * @root_cache: The parent of the new cache. |
| 433 | * @memcg_name: The name of the memory cgroup (used for naming the new cache). | ||
| 434 | * | 472 | * |
| 435 | * This function attempts to create a kmem cache that will serve allocation | 473 | * This function attempts to create a kmem cache that will serve allocation |
| 436 | * requests going from @memcg to @root_cache. The new cache inherits properties | 474 | * requests going from @memcg to @root_cache. The new cache inherits properties |
| 437 | * from its parent. | 475 | * from its parent. |
| 438 | */ | 476 | */ |
| 439 | struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | 477 | void memcg_create_kmem_cache(struct mem_cgroup *memcg, |
| 440 | struct kmem_cache *root_cache, | 478 | struct kmem_cache *root_cache) |
| 441 | const char *memcg_name) | ||
| 442 | { | 479 | { |
| 480 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ | ||
| 481 | struct cgroup_subsys_state *css = mem_cgroup_css(memcg); | ||
| 482 | struct memcg_cache_array *arr; | ||
| 443 | struct kmem_cache *s = NULL; | 483 | struct kmem_cache *s = NULL; |
| 444 | char *cache_name; | 484 | char *cache_name; |
| 485 | int idx; | ||
| 445 | 486 | ||
| 446 | get_online_cpus(); | 487 | get_online_cpus(); |
| 447 | get_online_mems(); | 488 | get_online_mems(); |
| 448 | 489 | ||
| 449 | mutex_lock(&slab_mutex); | 490 | mutex_lock(&slab_mutex); |
| 450 | 491 | ||
| 492 | /* | ||
| 493 | * The memory cgroup could have been deactivated while the cache | ||
| 494 | * creation work was pending. | ||
| 495 | */ | ||
| 496 | if (!memcg_kmem_is_active(memcg)) | ||
| 497 | goto out_unlock; | ||
| 498 | |||
| 499 | idx = memcg_cache_id(memcg); | ||
| 500 | arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, | ||
| 501 | lockdep_is_held(&slab_mutex)); | ||
| 502 | |||
| 503 | /* | ||
| 504 | * Since per-memcg caches are created asynchronously on first | ||
| 505 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
| 506 | * create the same cache, but only one of them may succeed. | ||
| 507 | */ | ||
| 508 | if (arr->entries[idx]) | ||
| 509 | goto out_unlock; | ||
| 510 | |||
| 511 | cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); | ||
| 451 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, | 512 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, |
| 452 | memcg_cache_id(memcg), memcg_name); | 513 | css->id, memcg_name_buf); |
| 453 | if (!cache_name) | 514 | if (!cache_name) |
| 454 | goto out_unlock; | 515 | goto out_unlock; |
| 455 | 516 | ||
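The two helpers added above split cache teardown into a phase done under slab_mutex (unlink the cache onto a private list) and a phase done after the lock is dropped (rcu_barrier() plus sysfs or final release). A minimal caller sketch of that pattern; kmem_cache_destroy() further down is the real user, and should_destroy() here is a hypothetical predicate:

    static void destroy_selected_caches(void)
    {
            LIST_HEAD(release);
            bool need_rcu_barrier = false;
            struct kmem_cache *s, *s2;

            mutex_lock(&slab_mutex);
            list_for_each_entry_safe(s, s2, &slab_caches, list) {
                    if (!should_destroy(s))         /* hypothetical filter */
                            continue;
                    do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
            }
            mutex_unlock(&slab_mutex);

            /* may sleep in rcu_barrier() and sysfs teardown, hence unlocked */
            do_kmem_cache_release(&release, need_rcu_barrier);
    }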
| @@ -457,49 +518,108 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
| 457 | root_cache->size, root_cache->align, | 518 | root_cache->size, root_cache->align, |
| 458 | root_cache->flags, root_cache->ctor, | 519 | root_cache->flags, root_cache->ctor, |
| 459 | memcg, root_cache); | 520 | memcg, root_cache); |
| 521 | /* | ||
| 522 | * If we could not create a memcg cache, do not complain, because | ||
| 523 | * that's not critical at all as we can always proceed with the root | ||
| 524 | * cache. | ||
| 525 | */ | ||
| 460 | if (IS_ERR(s)) { | 526 | if (IS_ERR(s)) { |
| 461 | kfree(cache_name); | 527 | kfree(cache_name); |
| 462 | s = NULL; | 528 | goto out_unlock; |
| 463 | } | 529 | } |
| 464 | 530 | ||
| 531 | list_add(&s->memcg_params.list, &root_cache->memcg_params.list); | ||
| 532 | |||
| 533 | /* | ||
| 534 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
| 535 | * barrier here to ensure nobody will see the kmem_cache partially | ||
| 536 | * initialized. | ||
| 537 | */ | ||
| 538 | smp_wmb(); | ||
| 539 | arr->entries[idx] = s; | ||
| 540 | |||
| 465 | out_unlock: | 541 | out_unlock: |
| 466 | mutex_unlock(&slab_mutex); | 542 | mutex_unlock(&slab_mutex); |
| 467 | 543 | ||
| 468 | put_online_mems(); | 544 | put_online_mems(); |
| 469 | put_online_cpus(); | 545 | put_online_cpus(); |
| 470 | |||
| 471 | return s; | ||
| 472 | } | 546 | } |
| 473 | 547 | ||
| 474 | static int memcg_cleanup_cache_params(struct kmem_cache *s) | 548 | void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) |
| 475 | { | 549 | { |
| 476 | int rc; | 550 | int idx; |
| 551 | struct memcg_cache_array *arr; | ||
| 552 | struct kmem_cache *s, *c; | ||
| 477 | 553 | ||
| 478 | if (!s->memcg_params || | 554 | idx = memcg_cache_id(memcg); |
| 479 | !s->memcg_params->is_root_cache) | 555 | |
| 480 | return 0; | 556 | get_online_cpus(); |
| 557 | get_online_mems(); | ||
| 481 | 558 | ||
| 482 | mutex_unlock(&slab_mutex); | ||
| 483 | rc = __memcg_cleanup_cache_params(s); | ||
| 484 | mutex_lock(&slab_mutex); | 559 | mutex_lock(&slab_mutex); |
| 560 | list_for_each_entry(s, &slab_caches, list) { | ||
| 561 | if (!is_root_cache(s)) | ||
| 562 | continue; | ||
| 563 | |||
| 564 | arr = rcu_dereference_protected(s->memcg_params.memcg_caches, | ||
| 565 | lockdep_is_held(&slab_mutex)); | ||
| 566 | c = arr->entries[idx]; | ||
| 567 | if (!c) | ||
| 568 | continue; | ||
| 569 | |||
| 570 | __kmem_cache_shrink(c, true); | ||
| 571 | arr->entries[idx] = NULL; | ||
| 572 | } | ||
| 573 | mutex_unlock(&slab_mutex); | ||
| 485 | 574 | ||
| 486 | return rc; | 575 | put_online_mems(); |
| 576 | put_online_cpus(); | ||
| 487 | } | 577 | } |
| 488 | #else | 578 | |
| 489 | static int memcg_cleanup_cache_params(struct kmem_cache *s) | 579 | void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) |
| 490 | { | 580 | { |
| 491 | return 0; | 581 | LIST_HEAD(release); |
| 582 | bool need_rcu_barrier = false; | ||
| 583 | struct kmem_cache *s, *s2; | ||
| 584 | |||
| 585 | get_online_cpus(); | ||
| 586 | get_online_mems(); | ||
| 587 | |||
| 588 | mutex_lock(&slab_mutex); | ||
| 589 | list_for_each_entry_safe(s, s2, &slab_caches, list) { | ||
| 590 | if (is_root_cache(s) || s->memcg_params.memcg != memcg) | ||
| 591 | continue; | ||
| 592 | /* | ||
| 593 | * The cgroup is about to be freed and therefore has no charges | ||
| 594 | * left. Hence, all its caches must be empty by now. | ||
| 595 | */ | ||
| 596 | BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); | ||
| 597 | } | ||
| 598 | mutex_unlock(&slab_mutex); | ||
| 599 | |||
| 600 | put_online_mems(); | ||
| 601 | put_online_cpus(); | ||
| 602 | |||
| 603 | do_kmem_cache_release(&release, need_rcu_barrier); | ||
| 492 | } | 604 | } |
| 493 | #endif /* CONFIG_MEMCG_KMEM */ | 605 | #endif /* CONFIG_MEMCG_KMEM */ |
| 494 | 606 | ||
| 495 | void slab_kmem_cache_release(struct kmem_cache *s) | 607 | void slab_kmem_cache_release(struct kmem_cache *s) |
| 496 | { | 608 | { |
| 497 | kfree(s->name); | 609 | destroy_memcg_params(s); |
| 610 | kfree_const(s->name); | ||
| 498 | kmem_cache_free(kmem_cache, s); | 611 | kmem_cache_free(kmem_cache, s); |
| 499 | } | 612 | } |
| 500 | 613 | ||
| 501 | void kmem_cache_destroy(struct kmem_cache *s) | 614 | void kmem_cache_destroy(struct kmem_cache *s) |
| 502 | { | 615 | { |
| 616 | struct kmem_cache *c, *c2; | ||
| 617 | LIST_HEAD(release); | ||
| 618 | bool need_rcu_barrier = false; | ||
| 619 | bool busy = false; | ||
| 620 | |||
| 621 | BUG_ON(!is_root_cache(s)); | ||
| 622 | |||
| 503 | get_online_cpus(); | 623 | get_online_cpus(); |
| 504 | get_online_mems(); | 624 | get_online_mems(); |
| 505 | 625 | ||
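The smp_wmb() added to memcg_create_kmem_cache() above pairs with a lockless lookup on the allocation path. Roughly, the reader dereferences the RCU-protected array and then relies on dependency ordering when it loads the published entry; the real helper is cache_from_memcg_idx() in mm/slab.h, and the body below is only a sketch of it:

    static struct kmem_cache *lookup_memcg_cache(struct kmem_cache *root, int idx)
    {
            struct memcg_cache_array *arr;
            struct kmem_cache *cachep;

            rcu_read_lock();
            arr = rcu_dereference(root->memcg_params.memcg_caches);
            /*
             * Pairs with the smp_wmb() before "arr->entries[idx] = s":
             * seeing the pointer implies seeing the fully set up cache.
             */
            cachep = READ_ONCE(arr->entries[idx]);
            smp_read_barrier_depends();
            rcu_read_unlock();

            return cachep;
    }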
| @@ -509,35 +629,21 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
| 509 | if (s->refcount) | 629 | if (s->refcount) |
| 510 | goto out_unlock; | 630 | goto out_unlock; |
| 511 | 631 | ||
| 512 | if (memcg_cleanup_cache_params(s) != 0) | 632 | for_each_memcg_cache_safe(c, c2, s) { |
| 513 | goto out_unlock; | 633 | if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) |
| 514 | 634 | busy = true; | |
| 515 | if (__kmem_cache_shutdown(s) != 0) { | ||
| 516 | printk(KERN_ERR "kmem_cache_destroy %s: " | ||
| 517 | "Slab cache still has objects\n", s->name); | ||
| 518 | dump_stack(); | ||
| 519 | goto out_unlock; | ||
| 520 | } | 635 | } |
| 521 | 636 | ||
| 522 | list_del(&s->list); | 637 | if (!busy) |
| 523 | 638 | do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); | |
| 524 | mutex_unlock(&slab_mutex); | ||
| 525 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
| 526 | rcu_barrier(); | ||
| 527 | |||
| 528 | memcg_free_cache_params(s); | ||
| 529 | #ifdef SLAB_SUPPORTS_SYSFS | ||
| 530 | sysfs_slab_remove(s); | ||
| 531 | #else | ||
| 532 | slab_kmem_cache_release(s); | ||
| 533 | #endif | ||
| 534 | goto out; | ||
| 535 | 639 | ||
| 536 | out_unlock: | 640 | out_unlock: |
| 537 | mutex_unlock(&slab_mutex); | 641 | mutex_unlock(&slab_mutex); |
| 538 | out: | 642 | |
| 539 | put_online_mems(); | 643 | put_online_mems(); |
| 540 | put_online_cpus(); | 644 | put_online_cpus(); |
| 645 | |||
| 646 | do_kmem_cache_release(&release, need_rcu_barrier); | ||
| 541 | } | 647 | } |
| 542 | EXPORT_SYMBOL(kmem_cache_destroy); | 648 | EXPORT_SYMBOL(kmem_cache_destroy); |
| 543 | 649 | ||
| @@ -554,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
| 554 | 660 | ||
| 555 | get_online_cpus(); | 661 | get_online_cpus(); |
| 556 | get_online_mems(); | 662 | get_online_mems(); |
| 557 | ret = __kmem_cache_shrink(cachep); | 663 | ret = __kmem_cache_shrink(cachep, false); |
| 558 | put_online_mems(); | 664 | put_online_mems(); |
| 559 | put_online_cpus(); | 665 | put_online_cpus(); |
| 560 | return ret; | 666 | return ret; |
| @@ -576,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz | |||
| 576 | s->name = name; | 682 | s->name = name; |
| 577 | s->size = s->object_size = size; | 683 | s->size = s->object_size = size; |
| 578 | s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); | 684 | s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); |
| 685 | |||
| 686 | slab_init_memcg_params(s); | ||
| 687 | |||
| 579 | err = __kmem_cache_create(s, flags); | 688 | err = __kmem_cache_create(s, flags); |
| 580 | 689 | ||
| 581 | if (err) | 690 | if (err) |
| @@ -789,6 +898,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | |||
| 789 | page = alloc_kmem_pages(flags, order); | 898 | page = alloc_kmem_pages(flags, order); |
| 790 | ret = page ? page_address(page) : NULL; | 899 | ret = page ? page_address(page) : NULL; |
| 791 | kmemleak_alloc(ret, size, 1, flags); | 900 | kmemleak_alloc(ret, size, 1, flags); |
| 901 | kasan_kmalloc_large(ret, size); | ||
| 792 | return ret; | 902 | return ret; |
| 793 | } | 903 | } |
| 794 | EXPORT_SYMBOL(kmalloc_order); | 904 | EXPORT_SYMBOL(kmalloc_order); |
| @@ -855,16 +965,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | |||
| 855 | { | 965 | { |
| 856 | struct kmem_cache *c; | 966 | struct kmem_cache *c; |
| 857 | struct slabinfo sinfo; | 967 | struct slabinfo sinfo; |
| 858 | int i; | ||
| 859 | 968 | ||
| 860 | if (!is_root_cache(s)) | 969 | if (!is_root_cache(s)) |
| 861 | return; | 970 | return; |
| 862 | 971 | ||
| 863 | for_each_memcg_cache_index(i) { | 972 | for_each_memcg_cache(c, s) { |
| 864 | c = cache_from_memcg_idx(s, i); | ||
| 865 | if (!c) | ||
| 866 | continue; | ||
| 867 | |||
| 868 | memset(&sinfo, 0, sizeof(sinfo)); | 973 | memset(&sinfo, 0, sizeof(sinfo)); |
| 869 | get_slabinfo(c, &sinfo); | 974 | get_slabinfo(c, &sinfo); |
| 870 | 975 | ||
| @@ -916,7 +1021,7 @@ int memcg_slab_show(struct seq_file *m, void *p) | |||
| 916 | 1021 | ||
| 917 | if (p == slab_caches.next) | 1022 | if (p == slab_caches.next) |
| 918 | print_slabinfo_header(m); | 1023 | print_slabinfo_header(m); |
| 919 | if (!is_root_cache(s) && s->memcg_params->memcg == memcg) | 1024 | if (!is_root_cache(s) && s->memcg_params.memcg == memcg) |
| 920 | cache_show(s, m); | 1025 | cache_show(s, m); |
| 921 | return 0; | 1026 | return 0; |
| 922 | } | 1027 | } |
| @@ -973,8 +1078,10 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, | |||
| 973 | if (p) | 1078 | if (p) |
| 974 | ks = ksize(p); | 1079 | ks = ksize(p); |
| 975 | 1080 | ||
| 976 | if (ks >= new_size) | 1081 | if (ks >= new_size) { |
| 1082 | kasan_krealloc((void *)p, new_size); | ||
| 977 | return (void *)p; | 1083 | return (void *)p; |
| 1084 | } | ||
| 978 | 1085 | ||
| 979 | ret = kmalloc_track_caller(new_size, flags); | 1086 | ret = kmalloc_track_caller(new_size, flags); |
| 980 | if (ret && p) | 1087 | if (ret && p) |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
| @@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c) | |||
| 618 | return 0; | 618 | return 0; |
| 619 | } | 619 | } |
| 620 | 620 | ||
| 621 | int __kmem_cache_shrink(struct kmem_cache *d) | 621 | int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) |
| 622 | { | 622 | { |
| 623 | return 0; | 623 | return 0; |
| 624 | } | 624 | } |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
| 21 | #include <linux/notifier.h> | 21 | #include <linux/notifier.h> |
| 22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
| 23 | #include <linux/kasan.h> | ||
| 23 | #include <linux/kmemcheck.h> | 24 | #include <linux/kmemcheck.h> |
| 24 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
| 25 | #include <linux/cpuset.h> | 26 | #include <linux/cpuset.h> |
| @@ -468,12 +469,30 @@ static char *slub_debug_slabs; | |||
| 468 | static int disable_higher_order_debug; | 469 | static int disable_higher_order_debug; |
| 469 | 470 | ||
| 470 | /* | 471 | /* |
| 472 | * slub is about to manipulate internal object metadata. This memory lies | ||
| 473 | * outside the range of the allocated object, so accessing it would normally | ||
| 474 | * be reported by kasan as a bounds error. metadata_access_enable() is used | ||
| 475 | * to tell kasan that these accesses are OK. | ||
| 476 | */ | ||
| 477 | static inline void metadata_access_enable(void) | ||
| 478 | { | ||
| 479 | kasan_disable_current(); | ||
| 480 | } | ||
| 481 | |||
| 482 | static inline void metadata_access_disable(void) | ||
| 483 | { | ||
| 484 | kasan_enable_current(); | ||
| 485 | } | ||
| 486 | |||
| 487 | /* | ||
| 471 | * Object debugging | 488 | * Object debugging |
| 472 | */ | 489 | */ |
| 473 | static void print_section(char *text, u8 *addr, unsigned int length) | 490 | static void print_section(char *text, u8 *addr, unsigned int length) |
| 474 | { | 491 | { |
| 492 | metadata_access_enable(); | ||
| 475 | print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, | 493 | print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, |
| 476 | length, 1); | 494 | length, 1); |
| 495 | metadata_access_disable(); | ||
| 477 | } | 496 | } |
| 478 | 497 | ||
| 479 | static struct track *get_track(struct kmem_cache *s, void *object, | 498 | static struct track *get_track(struct kmem_cache *s, void *object, |
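Every debug-side peek at redzones or poison bytes now has to be bracketed by the pair above, otherwise KASAN would flag the intentional out-of-bounds access. A hypothetical helper showing the pattern (memchr_inv() and SLUB_RED_ACTIVE are existing kernel symbols; the helper itself is not in the tree):

    static bool demo_redzone_clean(struct kmem_cache *s, u8 *object, size_t len)
    {
            u8 *fault;

            metadata_access_enable();       /* tell KASAN the OOB read is deliberate */
            fault = memchr_inv(object + s->object_size, SLUB_RED_ACTIVE, len);
            metadata_access_disable();

            return fault == NULL;
    }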
| @@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object, | |||
| 503 | trace.max_entries = TRACK_ADDRS_COUNT; | 522 | trace.max_entries = TRACK_ADDRS_COUNT; |
| 504 | trace.entries = p->addrs; | 523 | trace.entries = p->addrs; |
| 505 | trace.skip = 3; | 524 | trace.skip = 3; |
| 525 | metadata_access_enable(); | ||
| 506 | save_stack_trace(&trace); | 526 | save_stack_trace(&trace); |
| 527 | metadata_access_disable(); | ||
| 507 | 528 | ||
| 508 | /* See rant in lockdep.c */ | 529 | /* See rant in lockdep.c */ |
| 509 | if (trace.nr_entries != 0 && | 530 | if (trace.nr_entries != 0 && |
| @@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
| 629 | dump_stack(); | 650 | dump_stack(); |
| 630 | } | 651 | } |
| 631 | 652 | ||
| 632 | static void object_err(struct kmem_cache *s, struct page *page, | 653 | void object_err(struct kmem_cache *s, struct page *page, |
| 633 | u8 *object, char *reason) | 654 | u8 *object, char *reason) |
| 634 | { | 655 | { |
| 635 | slab_bug(s, "%s", reason); | 656 | slab_bug(s, "%s", reason); |
| @@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
| 677 | u8 *fault; | 698 | u8 *fault; |
| 678 | u8 *end; | 699 | u8 *end; |
| 679 | 700 | ||
| 701 | metadata_access_enable(); | ||
| 680 | fault = memchr_inv(start, value, bytes); | 702 | fault = memchr_inv(start, value, bytes); |
| 703 | metadata_access_disable(); | ||
| 681 | if (!fault) | 704 | if (!fault) |
| 682 | return 1; | 705 | return 1; |
| 683 | 706 | ||
| @@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
| 770 | if (!remainder) | 793 | if (!remainder) |
| 771 | return 1; | 794 | return 1; |
| 772 | 795 | ||
| 796 | metadata_access_enable(); | ||
| 773 | fault = memchr_inv(end - remainder, POISON_INUSE, remainder); | 797 | fault = memchr_inv(end - remainder, POISON_INUSE, remainder); |
| 798 | metadata_access_disable(); | ||
| 774 | if (!fault) | 799 | if (!fault) |
| 775 | return 1; | 800 | return 1; |
| 776 | while (end > fault && end[-1] == POISON_INUSE) | 801 | while (end > fault && end[-1] == POISON_INUSE) |
| @@ -1226,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, | |||
| 1226 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) | 1251 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) |
| 1227 | { | 1252 | { |
| 1228 | kmemleak_alloc(ptr, size, 1, flags); | 1253 | kmemleak_alloc(ptr, size, 1, flags); |
| 1254 | kasan_kmalloc_large(ptr, size); | ||
| 1229 | } | 1255 | } |
| 1230 | 1256 | ||
| 1231 | static inline void kfree_hook(const void *x) | 1257 | static inline void kfree_hook(const void *x) |
| 1232 | { | 1258 | { |
| 1233 | kmemleak_free(x); | 1259 | kmemleak_free(x); |
| 1260 | kasan_kfree_large(x); | ||
| 1234 | } | 1261 | } |
| 1235 | 1262 | ||
| 1236 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, | 1263 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, |
| @@ -1253,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, | |||
| 1253 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 1280 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
| 1254 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); | 1281 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); |
| 1255 | memcg_kmem_put_cache(s); | 1282 | memcg_kmem_put_cache(s); |
| 1283 | kasan_slab_alloc(s, object); | ||
| 1256 | } | 1284 | } |
| 1257 | 1285 | ||
| 1258 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 1286 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
| @@ -1276,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
| 1276 | #endif | 1304 | #endif |
| 1277 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1305 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
| 1278 | debug_check_no_obj_freed(x, s->object_size); | 1306 | debug_check_no_obj_freed(x, s->object_size); |
| 1307 | |||
| 1308 | kasan_slab_free(s, x); | ||
| 1279 | } | 1309 | } |
| 1280 | 1310 | ||
| 1281 | /* | 1311 | /* |
| @@ -1370,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page, | |||
| 1370 | void *object) | 1400 | void *object) |
| 1371 | { | 1401 | { |
| 1372 | setup_object_debug(s, page, object); | 1402 | setup_object_debug(s, page, object); |
| 1373 | if (unlikely(s->ctor)) | 1403 | if (unlikely(s->ctor)) { |
| 1404 | kasan_unpoison_object_data(s, object); | ||
| 1374 | s->ctor(object); | 1405 | s->ctor(object); |
| 1406 | kasan_poison_object_data(s, object); | ||
| 1407 | } | ||
| 1375 | } | 1408 | } |
| 1376 | 1409 | ||
| 1377 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1410 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
| @@ -1404,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1404 | if (unlikely(s->flags & SLAB_POISON)) | 1437 | if (unlikely(s->flags & SLAB_POISON)) |
| 1405 | memset(start, POISON_INUSE, PAGE_SIZE << order); | 1438 | memset(start, POISON_INUSE, PAGE_SIZE << order); |
| 1406 | 1439 | ||
| 1440 | kasan_poison_slab(page); | ||
| 1441 | |||
| 1407 | for_each_object_idx(p, idx, s, start, page->objects) { | 1442 | for_each_object_idx(p, idx, s, start, page->objects) { |
| 1408 | setup_object(s, page, p); | 1443 | setup_object(s, page, p); |
| 1409 | if (likely(idx < page->objects)) | 1444 | if (likely(idx < page->objects)) |
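Taken together, the hooks wired up here give every slab object a poison life cycle: the whole page is poisoned in new_slab(), an object is unpoisoned around its constructor and again on allocation, and re-poisoned on free. A deliberately buggy, hypothetical snippet of the class of bug this lets KASAN report:

    struct demo { int a; };

    static void kasan_uaf_demo(void)
    {
            struct demo *p = kmalloc(sizeof(*p), GFP_KERNEL);  /* kasan_kmalloc() unpoisons */

            if (!p)
                    return;
            kfree(p);       /* kasan_slab_free() poisons the object again */
            p->a = 1;       /* use-after-free: now caught and reported by KASAN */
    }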
| @@ -2007,6 +2042,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
| 2007 | int pages; | 2042 | int pages; |
| 2008 | int pobjects; | 2043 | int pobjects; |
| 2009 | 2044 | ||
| 2045 | preempt_disable(); | ||
| 2010 | do { | 2046 | do { |
| 2011 | pages = 0; | 2047 | pages = 0; |
| 2012 | pobjects = 0; | 2048 | pobjects = 0; |
| @@ -2040,6 +2076,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
| 2040 | 2076 | ||
| 2041 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) | 2077 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) |
| 2042 | != oldpage); | 2078 | != oldpage); |
| 2079 | if (unlikely(!s->cpu_partial)) { | ||
| 2080 | unsigned long flags; | ||
| 2081 | |||
| 2082 | local_irq_save(flags); | ||
| 2083 | unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); | ||
| 2084 | local_irq_restore(flags); | ||
| 2085 | } | ||
| 2086 | preempt_enable(); | ||
| 2043 | #endif | 2087 | #endif |
| 2044 | } | 2088 | } |
| 2045 | 2089 | ||
| @@ -2398,13 +2442,24 @@ redo: | |||
| 2398 | * reading from one cpu area. That does not matter as long | 2442 | * reading from one cpu area. That does not matter as long |
| 2399 | * as we end up on the original cpu again when doing the cmpxchg. | 2443 | * as we end up on the original cpu again when doing the cmpxchg. |
| 2400 | * | 2444 | * |
| 2401 | * Preemption is disabled for the retrieval of the tid because that | 2445 | * We should guarantee that tid and kmem_cache are retrieved on |
| 2402 | * must occur from the current processor. We cannot allow rescheduling | 2446 | * the same cpu. It could be different if CONFIG_PREEMPT so we need |
| 2403 | * on a different processor between the determination of the pointer | 2447 | * to check if it is matched or not. |
| 2404 | * and the retrieval of the tid. | ||
| 2405 | */ | 2448 | */ |
| 2406 | preempt_disable(); | 2449 | do { |
| 2407 | c = this_cpu_ptr(s->cpu_slab); | 2450 | tid = this_cpu_read(s->cpu_slab->tid); |
| 2451 | c = raw_cpu_ptr(s->cpu_slab); | ||
| 2452 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | ||
| 2453 | |||
| 2454 | /* | ||
| 2455 | * Irqless object alloc/free algorithm used here depends on sequence | ||
| 2456 | * of fetching cpu_slab's data. tid should be fetched before anything | ||
| 2457 | * on c to guarantee that object and page associated with previous tid | ||
| 2458 | * won't be used with current tid. If we fetch tid first, object and | ||
| 2459 | * page could be one associated with next tid and our alloc/free | ||
| 2460 | * request will be failed. In this case, we will retry. So, no problem. | ||
| 2461 | */ | ||
| 2462 | barrier(); | ||
| 2408 | 2463 | ||
| 2409 | /* | 2464 | /* |
| 2410 | * The transaction ids are globally unique per cpu and per operation on | 2465 | * The transaction ids are globally unique per cpu and per operation on |
| @@ -2412,8 +2467,6 @@ redo: | |||
| 2412 | * occurs on the right processor and that there was no operation on the | 2467 | * occurs on the right processor and that there was no operation on the |
| 2413 | * linked list in between. | 2468 | * linked list in between. |
| 2414 | */ | 2469 | */ |
| 2415 | tid = c->tid; | ||
| 2416 | preempt_enable(); | ||
| 2417 | 2470 | ||
| 2418 | object = c->freelist; | 2471 | object = c->freelist; |
| 2419 | page = c->page; | 2472 | page = c->page; |
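The snapshot loop above can still be followed by a migration; what makes it safe is that the transaction id is re-checked by the cmpxchg that commits the allocation. Simplified from the existing slab_alloc_node() fast path (not an exact copy of the tree):

    if (unlikely(!this_cpu_cmpxchg_double(
                    s->cpu_slab->freelist, s->cpu_slab->tid,
                    object, tid,
                    get_freepointer_safe(s, object), next_tid(tid)))) {
            note_cmpxchg_failure("slab_alloc", s, tid);
            goto redo;      /* raced or migrated: take a fresh snapshot */
    }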
| @@ -2479,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) | |||
| 2479 | { | 2532 | { |
| 2480 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); | 2533 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
| 2481 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | 2534 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); |
| 2535 | kasan_kmalloc(s, ret, size); | ||
| 2482 | return ret; | 2536 | return ret; |
| 2483 | } | 2537 | } |
| 2484 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | 2538 | EXPORT_SYMBOL(kmem_cache_alloc_trace); |
| @@ -2505,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
| 2505 | 2559 | ||
| 2506 | trace_kmalloc_node(_RET_IP_, ret, | 2560 | trace_kmalloc_node(_RET_IP_, ret, |
| 2507 | size, s->size, gfpflags, node); | 2561 | size, s->size, gfpflags, node); |
| 2562 | |||
| 2563 | kasan_kmalloc(s, ret, size); | ||
| 2508 | return ret; | 2564 | return ret; |
| 2509 | } | 2565 | } |
| 2510 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | 2566 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
| @@ -2512,7 +2568,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | |||
| 2512 | #endif | 2568 | #endif |
| 2513 | 2569 | ||
| 2514 | /* | 2570 | /* |
| 2515 | * Slow patch handling. This may still be called frequently since objects | 2571 | * Slow path handling. This may still be called frequently since objects |
| 2516 | * have a longer lifetime than the cpu slabs in most processing loads. | 2572 | * have a longer lifetime than the cpu slabs in most processing loads. |
| 2517 | * | 2573 | * |
| 2518 | * So we still attempt to reduce cache line usage. Just take the slab | 2574 | * So we still attempt to reduce cache line usage. Just take the slab |
| @@ -2659,11 +2715,13 @@ redo: | |||
| 2659 | * data is retrieved via this pointer. If we are on the same cpu | 2715 | * data is retrieved via this pointer. If we are on the same cpu |
| 2660 | * during the cmpxchg then the free will succedd. | 2716 | * during the cmpxchg then the free will succedd. |
| 2661 | */ | 2717 | */ |
| 2662 | preempt_disable(); | 2718 | do { |
| 2663 | c = this_cpu_ptr(s->cpu_slab); | 2719 | tid = this_cpu_read(s->cpu_slab->tid); |
| 2720 | c = raw_cpu_ptr(s->cpu_slab); | ||
| 2721 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | ||
| 2664 | 2722 | ||
| 2665 | tid = c->tid; | 2723 | /* Same with comment on barrier() in slab_alloc_node() */ |
| 2666 | preempt_enable(); | 2724 | barrier(); |
| 2667 | 2725 | ||
| 2668 | if (likely(page == c->page)) { | 2726 | if (likely(page == c->page)) { |
| 2669 | set_freepointer(s, object, c->freelist); | 2727 | set_freepointer(s, object, c->freelist); |
| @@ -2888,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
| 2888 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2946 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
| 2889 | init_tracking(kmem_cache_node, n); | 2947 | init_tracking(kmem_cache_node, n); |
| 2890 | #endif | 2948 | #endif |
| 2949 | kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); | ||
| 2891 | init_kmem_cache_node(n); | 2950 | init_kmem_cache_node(n); |
| 2892 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2951 | inc_slabs_node(kmem_cache_node, node, page->objects); |
| 2893 | 2952 | ||
| @@ -3260,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
| 3260 | 3319 | ||
| 3261 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); | 3320 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); |
| 3262 | 3321 | ||
| 3322 | kasan_kmalloc(s, ret, size); | ||
| 3323 | |||
| 3263 | return ret; | 3324 | return ret; |
| 3264 | } | 3325 | } |
| 3265 | EXPORT_SYMBOL(__kmalloc); | 3326 | EXPORT_SYMBOL(__kmalloc); |
| @@ -3303,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
| 3303 | 3364 | ||
| 3304 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); | 3365 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); |
| 3305 | 3366 | ||
| 3367 | kasan_kmalloc(s, ret, size); | ||
| 3368 | |||
| 3306 | return ret; | 3369 | return ret; |
| 3307 | } | 3370 | } |
| 3308 | EXPORT_SYMBOL(__kmalloc_node); | 3371 | EXPORT_SYMBOL(__kmalloc_node); |
| 3309 | #endif | 3372 | #endif |
| 3310 | 3373 | ||
| 3311 | size_t ksize(const void *object) | 3374 | static size_t __ksize(const void *object) |
| 3312 | { | 3375 | { |
| 3313 | struct page *page; | 3376 | struct page *page; |
| 3314 | 3377 | ||
| @@ -3324,6 +3387,15 @@ size_t ksize(const void *object) | |||
| 3324 | 3387 | ||
| 3325 | return slab_ksize(page->slab_cache); | 3388 | return slab_ksize(page->slab_cache); |
| 3326 | } | 3389 | } |
| 3390 | |||
| 3391 | size_t ksize(const void *object) | ||
| 3392 | { | ||
| 3393 | size_t size = __ksize(object); | ||
| 3394 | /* We assume that ksize callers could use whole allocated area, | ||
| 3395 | so we need unpoison this area. */ | ||
| 3396 | kasan_krealloc(object, size); | ||
| 3397 | return size; | ||
| 3398 | } | ||
| 3327 | EXPORT_SYMBOL(ksize); | 3399 | EXPORT_SYMBOL(ksize); |
| 3328 | 3400 | ||
| 3329 | void kfree(const void *x) | 3401 | void kfree(const void *x) |
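The split above exists because kasan_kmalloc() only unpoisons the bytes that were requested, while ksize() promises the caller the whole underlying object. A hypothetical caller showing why the extra unpoison is needed:

    static void ksize_demo(void)
    {
            char *p = kmalloc(5, GFP_KERNEL);       /* only 5 bytes unpoisoned */
            size_t n;

            if (!p)
                    return;
            n = ksize(p);           /* e.g. 8: unpoisons the full allocation */
            memset(p, 0, n);        /* legal only because ksize() unpoisoned it */
            kfree(p);
    }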
| @@ -3347,69 +3419,92 @@ void kfree(const void *x) | |||
| 3347 | } | 3419 | } |
| 3348 | EXPORT_SYMBOL(kfree); | 3420 | EXPORT_SYMBOL(kfree); |
| 3349 | 3421 | ||
| 3422 | #define SHRINK_PROMOTE_MAX 32 | ||
| 3423 | |||
| 3350 | /* | 3424 | /* |
| 3351 | * kmem_cache_shrink removes empty slabs from the partial lists and sorts | 3425 | * kmem_cache_shrink discards empty slabs and promotes the slabs filled |
| 3352 | * the remaining slabs by the number of items in use. The slabs with the | 3426 | * up most to the head of the partial lists. New allocations will then |
| 3353 | * most items in use come first. New allocations will then fill those up | 3427 | * fill those up and thus they can be removed from the partial lists. |
| 3354 | * and thus they can be removed from the partial lists. | ||
| 3355 | * | 3428 | * |
| 3356 | * The slabs with the least items are placed last. This results in them | 3429 | * The slabs with the least items are placed last. This results in them |
| 3357 | * being allocated from last increasing the chance that the last objects | 3430 | * being allocated from last increasing the chance that the last objects |
| 3358 | * are freed in them. | 3431 | * are freed in them. |
| 3359 | */ | 3432 | */ |
| 3360 | int __kmem_cache_shrink(struct kmem_cache *s) | 3433 | int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) |
| 3361 | { | 3434 | { |
| 3362 | int node; | 3435 | int node; |
| 3363 | int i; | 3436 | int i; |
| 3364 | struct kmem_cache_node *n; | 3437 | struct kmem_cache_node *n; |
| 3365 | struct page *page; | 3438 | struct page *page; |
| 3366 | struct page *t; | 3439 | struct page *t; |
| 3367 | int objects = oo_objects(s->max); | 3440 | struct list_head discard; |
| 3368 | struct list_head *slabs_by_inuse = | 3441 | struct list_head promote[SHRINK_PROMOTE_MAX]; |
| 3369 | kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); | ||
| 3370 | unsigned long flags; | 3442 | unsigned long flags; |
| 3443 | int ret = 0; | ||
| 3371 | 3444 | ||
| 3372 | if (!slabs_by_inuse) | 3445 | if (deactivate) { |
| 3373 | return -ENOMEM; | 3446 | /* |
| 3447 | * Disable empty slabs caching. Used to avoid pinning offline | ||
| 3448 | * memory cgroups by kmem pages that can be freed. | ||
| 3449 | */ | ||
| 3450 | s->cpu_partial = 0; | ||
| 3451 | s->min_partial = 0; | ||
| 3452 | |||
| 3453 | /* | ||
| 3454 | * s->cpu_partial is checked locklessly (see put_cpu_partial), | ||
| 3455 | * so we have to make sure the change is visible. | ||
| 3456 | */ | ||
| 3457 | kick_all_cpus_sync(); | ||
| 3458 | } | ||
| 3374 | 3459 | ||
| 3375 | flush_all(s); | 3460 | flush_all(s); |
| 3376 | for_each_kmem_cache_node(s, node, n) { | 3461 | for_each_kmem_cache_node(s, node, n) { |
| 3377 | if (!n->nr_partial) | 3462 | INIT_LIST_HEAD(&discard); |
| 3378 | continue; | 3463 | for (i = 0; i < SHRINK_PROMOTE_MAX; i++) |
| 3379 | 3464 | INIT_LIST_HEAD(promote + i); | |
| 3380 | for (i = 0; i < objects; i++) | ||
| 3381 | INIT_LIST_HEAD(slabs_by_inuse + i); | ||
| 3382 | 3465 | ||
| 3383 | spin_lock_irqsave(&n->list_lock, flags); | 3466 | spin_lock_irqsave(&n->list_lock, flags); |
| 3384 | 3467 | ||
| 3385 | /* | 3468 | /* |
| 3386 | * Build lists indexed by the items in use in each slab. | 3469 | * Build lists of slabs to discard or promote. |
| 3387 | * | 3470 | * |
| 3388 | * Note that concurrent frees may occur while we hold the | 3471 | * Note that concurrent frees may occur while we hold the |
| 3389 | * list_lock. page->inuse here is the upper limit. | 3472 | * list_lock. page->inuse here is the upper limit. |
| 3390 | */ | 3473 | */ |
| 3391 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 3474 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
| 3392 | list_move(&page->lru, slabs_by_inuse + page->inuse); | 3475 | int free = page->objects - page->inuse; |
| 3393 | if (!page->inuse) | 3476 | |
| 3477 | /* Do not reread page->inuse */ | ||
| 3478 | barrier(); | ||
| 3479 | |||
| 3480 | /* We do not keep full slabs on the list */ | ||
| 3481 | BUG_ON(free <= 0); | ||
| 3482 | |||
| 3483 | if (free == page->objects) { | ||
| 3484 | list_move(&page->lru, &discard); | ||
| 3394 | n->nr_partial--; | 3485 | n->nr_partial--; |
| 3486 | } else if (free <= SHRINK_PROMOTE_MAX) | ||
| 3487 | list_move(&page->lru, promote + free - 1); | ||
| 3395 | } | 3488 | } |
| 3396 | 3489 | ||
| 3397 | /* | 3490 | /* |
| 3398 | * Rebuild the partial list with the slabs filled up most | 3491 | * Promote the slabs filled up most to the head of the |
| 3399 | * first and the least used slabs at the end. | 3492 | * partial list. |
| 3400 | */ | 3493 | */ |
| 3401 | for (i = objects - 1; i > 0; i--) | 3494 | for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) |
| 3402 | list_splice(slabs_by_inuse + i, n->partial.prev); | 3495 | list_splice(promote + i, &n->partial); |
| 3403 | 3496 | ||
| 3404 | spin_unlock_irqrestore(&n->list_lock, flags); | 3497 | spin_unlock_irqrestore(&n->list_lock, flags); |
| 3405 | 3498 | ||
| 3406 | /* Release empty slabs */ | 3499 | /* Release empty slabs */ |
| 3407 | list_for_each_entry_safe(page, t, slabs_by_inuse, lru) | 3500 | list_for_each_entry_safe(page, t, &discard, lru) |
| 3408 | discard_slab(s, page); | 3501 | discard_slab(s, page); |
| 3502 | |||
| 3503 | if (slabs_node(s, node)) | ||
| 3504 | ret = 1; | ||
| 3409 | } | 3505 | } |
| 3410 | 3506 | ||
| 3411 | kfree(slabs_by_inuse); | 3507 | return ret; |
| 3412 | return 0; | ||
| 3413 | } | 3508 | } |
| 3414 | 3509 | ||
| 3415 | static int slab_mem_going_offline_callback(void *arg) | 3510 | static int slab_mem_going_offline_callback(void *arg) |
| @@ -3418,7 +3513,7 @@ static int slab_mem_going_offline_callback(void *arg) | |||
| 3418 | 3513 | ||
| 3419 | mutex_lock(&slab_mutex); | 3514 | mutex_lock(&slab_mutex); |
| 3420 | list_for_each_entry(s, &slab_caches, list) | 3515 | list_for_each_entry(s, &slab_caches, list) |
| 3421 | __kmem_cache_shrink(s); | 3516 | __kmem_cache_shrink(s, false); |
| 3422 | mutex_unlock(&slab_mutex); | 3517 | mutex_unlock(&slab_mutex); |
| 3423 | 3518 | ||
| 3424 | return 0; | 3519 | return 0; |
| @@ -3566,6 +3661,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) | |||
| 3566 | p->slab_cache = s; | 3661 | p->slab_cache = s; |
| 3567 | #endif | 3662 | #endif |
| 3568 | } | 3663 | } |
| 3664 | slab_init_memcg_params(s); | ||
| 3569 | list_add(&s->list, &slab_caches); | 3665 | list_add(&s->list, &slab_caches); |
| 3570 | return s; | 3666 | return s; |
| 3571 | } | 3667 | } |
| @@ -3624,13 +3720,10 @@ struct kmem_cache * | |||
| 3624 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 3720 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
| 3625 | unsigned long flags, void (*ctor)(void *)) | 3721 | unsigned long flags, void (*ctor)(void *)) |
| 3626 | { | 3722 | { |
| 3627 | struct kmem_cache *s; | 3723 | struct kmem_cache *s, *c; |
| 3628 | 3724 | ||
| 3629 | s = find_mergeable(size, align, flags, name, ctor); | 3725 | s = find_mergeable(size, align, flags, name, ctor); |
| 3630 | if (s) { | 3726 | if (s) { |
| 3631 | int i; | ||
| 3632 | struct kmem_cache *c; | ||
| 3633 | |||
| 3634 | s->refcount++; | 3727 | s->refcount++; |
| 3635 | 3728 | ||
| 3636 | /* | 3729 | /* |
| @@ -3640,10 +3733,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
| 3640 | s->object_size = max(s->object_size, (int)size); | 3733 | s->object_size = max(s->object_size, (int)size); |
| 3641 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3734 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
| 3642 | 3735 | ||
| 3643 | for_each_memcg_cache_index(i) { | 3736 | for_each_memcg_cache(c, s) { |
| 3644 | c = cache_from_memcg_idx(s, i); | ||
| 3645 | if (!c) | ||
| 3646 | continue; | ||
| 3647 | c->object_size = s->object_size; | 3737 | c->object_size = s->object_size; |
| 3648 | c->inuse = max_t(int, c->inuse, | 3738 | c->inuse = max_t(int, c->inuse, |
| 3649 | ALIGN(size, sizeof(void *))); | 3739 | ALIGN(size, sizeof(void *))); |
| @@ -4070,20 +4160,16 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
| 4070 | 4160 | ||
| 4071 | if (num_online_cpus() > 1 && | 4161 | if (num_online_cpus() > 1 && |
| 4072 | !cpumask_empty(to_cpumask(l->cpus)) && | 4162 | !cpumask_empty(to_cpumask(l->cpus)) && |
| 4073 | len < PAGE_SIZE - 60) { | 4163 | len < PAGE_SIZE - 60) |
| 4074 | len += sprintf(buf + len, " cpus="); | 4164 | len += scnprintf(buf + len, PAGE_SIZE - len - 50, |
| 4075 | len += cpulist_scnprintf(buf + len, | 4165 | " cpus=%*pbl", |
| 4076 | PAGE_SIZE - len - 50, | 4166 | cpumask_pr_args(to_cpumask(l->cpus))); |
| 4077 | to_cpumask(l->cpus)); | ||
| 4078 | } | ||
| 4079 | 4167 | ||
| 4080 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && | 4168 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && |
| 4081 | len < PAGE_SIZE - 60) { | 4169 | len < PAGE_SIZE - 60) |
| 4082 | len += sprintf(buf + len, " nodes="); | 4170 | len += scnprintf(buf + len, PAGE_SIZE - len - 50, |
| 4083 | len += nodelist_scnprintf(buf + len, | 4171 | " nodes=%*pbl", |
| 4084 | PAGE_SIZE - len - 50, | 4172 | nodemask_pr_args(&l->nodes)); |
| 4085 | l->nodes); | ||
| 4086 | } | ||
| 4087 | 4173 | ||
| 4088 | len += sprintf(buf + len, "\n"); | 4174 | len += sprintf(buf + len, "\n"); |
| 4089 | } | 4175 | } |
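The conversion above relies on the bitmap printf extension: "%*pbl" prints a cpumask or nodemask as a compact range list, and cpumask_pr_args()/nodemask_pr_args() expand to the width/pointer pair the specifier expects. A one-line, hypothetical example:

    pr_info("demo: online cpus=%*pbl\n", cpumask_pr_args(cpu_online_mask));
    /* prints something like "demo: online cpus=0-3,6" */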
| @@ -4680,12 +4766,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf) | |||
| 4680 | static ssize_t shrink_store(struct kmem_cache *s, | 4766 | static ssize_t shrink_store(struct kmem_cache *s, |
| 4681 | const char *buf, size_t length) | 4767 | const char *buf, size_t length) |
| 4682 | { | 4768 | { |
| 4683 | if (buf[0] == '1') { | 4769 | if (buf[0] == '1') |
| 4684 | int rc = kmem_cache_shrink(s); | 4770 | kmem_cache_shrink(s); |
| 4685 | 4771 | else | |
| 4686 | if (rc) | ||
| 4687 | return rc; | ||
| 4688 | } else | ||
| 4689 | return -EINVAL; | 4772 | return -EINVAL; |
| 4690 | return length; | 4773 | return length; |
| 4691 | } | 4774 | } |
| @@ -4909,7 +4992,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
| 4909 | err = attribute->store(s, buf, len); | 4992 | err = attribute->store(s, buf, len); |
| 4910 | #ifdef CONFIG_MEMCG_KMEM | 4993 | #ifdef CONFIG_MEMCG_KMEM |
| 4911 | if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { | 4994 | if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { |
| 4912 | int i; | 4995 | struct kmem_cache *c; |
| 4913 | 4996 | ||
| 4914 | mutex_lock(&slab_mutex); | 4997 | mutex_lock(&slab_mutex); |
| 4915 | if (s->max_attr_size < len) | 4998 | if (s->max_attr_size < len) |
| @@ -4932,11 +5015,8 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
| 4932 | * directly either failed or succeeded, in which case we loop | 5015 | * directly either failed or succeeded, in which case we loop |
| 4933 | * through the descendants with best-effort propagation. | 5016 | * through the descendants with best-effort propagation. |
| 4934 | */ | 5017 | */ |
| 4935 | for_each_memcg_cache_index(i) { | 5018 | for_each_memcg_cache(c, s) |
| 4936 | struct kmem_cache *c = cache_from_memcg_idx(s, i); | 5019 | attribute->store(c, buf, len); |
| 4937 | if (c) | ||
| 4938 | attribute->store(c, buf, len); | ||
| 4939 | } | ||
| 4940 | mutex_unlock(&slab_mutex); | 5020 | mutex_unlock(&slab_mutex); |
| 4941 | } | 5021 | } |
| 4942 | #endif | 5022 | #endif |
| @@ -4953,7 +5033,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) | |||
| 4953 | if (is_root_cache(s)) | 5033 | if (is_root_cache(s)) |
| 4954 | return; | 5034 | return; |
| 4955 | 5035 | ||
| 4956 | root_cache = s->memcg_params->root_cache; | 5036 | root_cache = s->memcg_params.root_cache; |
| 4957 | 5037 | ||
| 4958 | /* | 5038 | /* |
| 4959 | * This mean this cache had no attribute written. Therefore, no point | 5039 | * This mean this cache had no attribute written. Therefore, no point |
| @@ -5033,7 +5113,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) | |||
| 5033 | { | 5113 | { |
| 5034 | #ifdef CONFIG_MEMCG_KMEM | 5114 | #ifdef CONFIG_MEMCG_KMEM |
| 5035 | if (!is_root_cache(s)) | 5115 | if (!is_root_cache(s)) |
| 5036 | return s->memcg_params->root_cache->memcg_kset; | 5116 | return s->memcg_params.root_cache->memcg_kset; |
| 5037 | #endif | 5117 | #endif |
| 5038 | return slab_kset; | 5118 | return slab_kset; |
| 5039 | } | 5119 | } |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -1138,12 +1138,8 @@ void __init swap_setup(void) | |||
| 1138 | #ifdef CONFIG_SWAP | 1138 | #ifdef CONFIG_SWAP |
| 1139 | int i; | 1139 | int i; |
| 1140 | 1140 | ||
| 1141 | if (bdi_init(swapper_spaces[0].backing_dev_info)) | 1141 | for (i = 0; i < MAX_SWAPFILES; i++) |
| 1142 | panic("Failed to init swap bdi"); | ||
| 1143 | for (i = 0; i < MAX_SWAPFILES; i++) { | ||
| 1144 | spin_lock_init(&swapper_spaces[i].tree_lock); | 1142 | spin_lock_init(&swapper_spaces[i].tree_lock); |
| 1145 | INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); | ||
| 1146 | } | ||
| 1147 | #endif | 1143 | #endif |
| 1148 | 1144 | ||
| 1149 | /* Use a smaller cluster for small-memory machines */ | 1145 | /* Use a smaller cluster for small-memory machines */ |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 9711342987a0..405923f77334 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -32,17 +32,11 @@ static const struct address_space_operations swap_aops = { | |||
| 32 | #endif | 32 | #endif |
| 33 | }; | 33 | }; |
| 34 | 34 | ||
| 35 | static struct backing_dev_info swap_backing_dev_info = { | ||
| 36 | .name = "swap", | ||
| 37 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | ||
| 38 | }; | ||
| 39 | |||
| 40 | struct address_space swapper_spaces[MAX_SWAPFILES] = { | 35 | struct address_space swapper_spaces[MAX_SWAPFILES] = { |
| 41 | [0 ... MAX_SWAPFILES - 1] = { | 36 | [0 ... MAX_SWAPFILES - 1] = { |
| 42 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 37 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
| 43 | .i_mmap_writable = ATOMIC_INIT(0), | 38 | .i_mmap_writable = ATOMIC_INIT(0), |
| 44 | .a_ops = &swap_aops, | 39 | .a_ops = &swap_aops, |
| 45 | .backing_dev_info = &swap_backing_dev_info, | ||
| 46 | } | 40 | } |
| 47 | }; | 41 | }; |
| 48 | 42 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d6052369..ddec5a5966d7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) | |||
| 112 | struct address_space *mapping = page->mapping; | 112 | struct address_space *mapping = page->mapping; |
| 113 | if (mapping && mapping_cap_account_dirty(mapping)) { | 113 | if (mapping && mapping_cap_account_dirty(mapping)) { |
| 114 | dec_zone_page_state(page, NR_FILE_DIRTY); | 114 | dec_zone_page_state(page, NR_FILE_DIRTY); |
| 115 | dec_bdi_stat(mapping->backing_dev_info, | 115 | dec_bdi_stat(inode_to_bdi(mapping->host), |
| 116 | BDI_RECLAIMABLE); | 116 | BDI_RECLAIMABLE); |
| 117 | if (account_size) | 117 | if (account_size) |
| 118 | task_io_account_cancelled_write(account_size); | 118 | task_io_account_cancelled_write(account_size); |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -12,10 +12,30 @@ | |||
| 12 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
| 13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
| 14 | 14 | ||
| 15 | #include <asm/sections.h> | ||
| 15 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
| 16 | 17 | ||
| 17 | #include "internal.h" | 18 | #include "internal.h" |
| 18 | 19 | ||
| 20 | static inline int is_kernel_rodata(unsigned long addr) | ||
| 21 | { | ||
| 22 | return addr >= (unsigned long)__start_rodata && | ||
| 23 | addr < (unsigned long)__end_rodata; | ||
| 24 | } | ||
| 25 | |||
| 26 | /** | ||
| 27 | * kfree_const - conditionally free memory | ||
| 28 | * @x: pointer to the memory | ||
| 29 | * | ||
| 30 | * Function calls kfree only if @x is not in .rodata section. | ||
| 31 | */ | ||
| 32 | void kfree_const(const void *x) | ||
| 33 | { | ||
| 34 | if (!is_kernel_rodata((unsigned long)x)) | ||
| 35 | kfree(x); | ||
| 36 | } | ||
| 37 | EXPORT_SYMBOL(kfree_const); | ||
| 38 | |||
| 19 | /** | 39 | /** |
| 20 | * kstrdup - allocate space for and copy an existing string | 40 | * kstrdup - allocate space for and copy an existing string |
| 21 | * @s: the string to duplicate | 41 | * @s: the string to duplicate |
| @@ -38,6 +58,24 @@ char *kstrdup(const char *s, gfp_t gfp) | |||
| 38 | EXPORT_SYMBOL(kstrdup); | 58 | EXPORT_SYMBOL(kstrdup); |
| 39 | 59 | ||
| 40 | /** | 60 | /** |
| 61 | * kstrdup_const - conditionally duplicate an existing const string | ||
| 62 | * @s: the string to duplicate | ||
| 63 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | ||
| 64 | * | ||
| 65 | * Function returns source string if it is in .rodata section otherwise it | ||
| 66 | * fallbacks to kstrdup. | ||
| 67 | * Strings allocated by kstrdup_const should be freed by kfree_const. | ||
| 68 | */ | ||
| 69 | const char *kstrdup_const(const char *s, gfp_t gfp) | ||
| 70 | { | ||
| 71 | if (is_kernel_rodata((unsigned long)s)) | ||
| 72 | return s; | ||
| 73 | |||
| 74 | return kstrdup(s, gfp); | ||
| 75 | } | ||
| 76 | EXPORT_SYMBOL(kstrdup_const); | ||
| 77 | |||
| 78 | /** | ||
| 41 | * kstrndup - allocate space for and copy an existing string | 79 | * kstrndup - allocate space for and copy an existing string |
| 42 | * @s: the string to duplicate | 80 | * @s: the string to duplicate |
| 43 | * @max: read at most @max chars from @s | 81 | * @max: read at most @max chars from @s |
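A hypothetical user of the pair above: when the string passed in is a literal (and therefore sits in .rodata), kstrdup_const() avoids the allocation entirely and kfree_const() later becomes a no-op; kmem_cache_create() in this series is exactly such a caller.

    static const char *demo_name;

    static int demo_init(const char *override)
    {
            /* a literal lives in .rodata, so no duplicate is allocated */
            demo_name = kstrdup_const(override ? override : "mm_demo", GFP_KERNEL);
            return demo_name ? 0 : -ENOMEM;
    }

    static void demo_exit(void)
    {
            kfree_const(demo_name); /* frees only if it was really duplicated */
    }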
| @@ -240,14 +278,8 @@ int __weak get_user_pages_fast(unsigned long start, | |||
| 240 | int nr_pages, int write, struct page **pages) | 278 | int nr_pages, int write, struct page **pages) |
| 241 | { | 279 | { |
| 242 | struct mm_struct *mm = current->mm; | 280 | struct mm_struct *mm = current->mm; |
| 243 | int ret; | 281 | return get_user_pages_unlocked(current, mm, start, nr_pages, |
| 244 | 282 | write, 0, pages); | |
| 245 | down_read(&mm->mmap_sem); | ||
| 246 | ret = get_user_pages(current, mm, start, nr_pages, | ||
| 247 | write, 0, pages, NULL); | ||
| 248 | up_read(&mm->mmap_sem); | ||
| 249 | |||
| 250 | return ret; | ||
| 251 | } | 283 | } |
| 252 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 284 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
| 253 | 285 | ||
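The weak fallback now leans on get_user_pages_unlocked(), which takes and drops mmap_sem itself. A sketch of a caller that benefits from the shorter form (pin_demo_pages() is illustrative; the argument order matches the call above):

    static long pin_demo_pages(unsigned long start, int nr_pages,
                               struct page **pages)
    {
            /* no down_read(&mm->mmap_sem)/up_read() boilerplate needed */
            return get_user_pages_unlocked(current, current->mm, start,
                                           nr_pages, 1 /* write */,
                                           0 /* force */, pages);
    }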
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 39c338896416..35b25e1340ca 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -1324,10 +1324,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
| 1324 | if (unlikely(!area)) | 1324 | if (unlikely(!area)) |
| 1325 | return NULL; | 1325 | return NULL; |
| 1326 | 1326 | ||
| 1327 | /* | 1327 | if (!(flags & VM_NO_GUARD)) |
| 1328 | * We always allocate a guard page. | 1328 | size += PAGE_SIZE; |
| 1329 | */ | ||
| 1330 | size += PAGE_SIZE; | ||
| 1331 | 1329 | ||
| 1332 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); | 1330 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); |
| 1333 | if (IS_ERR(va)) { | 1331 | if (IS_ERR(va)) { |
| @@ -1621,6 +1619,7 @@ fail: | |||
| 1621 | * @end: vm area range end | 1619 | * @end: vm area range end |
| 1622 | * @gfp_mask: flags for the page level allocator | 1620 | * @gfp_mask: flags for the page level allocator |
| 1623 | * @prot: protection mask for the allocated pages | 1621 | * @prot: protection mask for the allocated pages |
| 1622 | * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) | ||
| 1624 | * @node: node to use for allocation or NUMA_NO_NODE | 1623 | * @node: node to use for allocation or NUMA_NO_NODE |
| 1625 | * @caller: caller's return address | 1624 | * @caller: caller's return address |
| 1626 | * | 1625 | * |
| @@ -1630,7 +1629,8 @@ fail: | |||
| 1630 | */ | 1629 | */ |
| 1631 | void *__vmalloc_node_range(unsigned long size, unsigned long align, | 1630 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
| 1632 | unsigned long start, unsigned long end, gfp_t gfp_mask, | 1631 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
| 1633 | pgprot_t prot, int node, const void *caller) | 1632 | pgprot_t prot, unsigned long vm_flags, int node, |
| 1633 | const void *caller) | ||
| 1634 | { | 1634 | { |
| 1635 | struct vm_struct *area; | 1635 | struct vm_struct *area; |
| 1636 | void *addr; | 1636 | void *addr; |
| @@ -1640,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
| 1640 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1640 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
| 1641 | goto fail; | 1641 | goto fail; |
| 1642 | 1642 | ||
| 1643 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, | 1643 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | |
| 1644 | start, end, node, gfp_mask, caller); | 1644 | vm_flags, start, end, node, gfp_mask, caller); |
| 1645 | if (!area) | 1645 | if (!area) |
| 1646 | goto fail; | 1646 | goto fail; |
| 1647 | 1647 | ||
| @@ -1690,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
| 1690 | int node, const void *caller) | 1690 | int node, const void *caller) |
| 1691 | { | 1691 | { |
| 1692 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, | 1692 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, |
| 1693 | gfp_mask, prot, node, caller); | 1693 | gfp_mask, prot, 0, node, caller); |
| 1694 | } | 1694 | } |
| 1695 | 1695 | ||
| 1696 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1696 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
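A minimal caller sketch for the extended interface, assuming the usual VMALLOC_START/VMALLOC_END range and PAGE_KERNEL protections; alloc_unguarded() is an illustrative name, not part of the patch:

	/* Sketch: allocate a vmalloc area with no guard page appended. */
	static void *alloc_unguarded(unsigned long size)
	{
		return __vmalloc_node_range(size, PAGE_SIZE,
					    VMALLOC_START, VMALLOC_END,
					    GFP_KERNEL, PAGE_KERNEL,
					    VM_NO_GUARD, NUMA_NO_NODE,
					    __builtin_return_address(0));
	}

Passing 0 for @vm_flags, as __vmalloc_node() now does, keeps the historical behaviour of always adding a guard page.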
diff --git a/mm/vmscan.c b/mm/vmscan.c index dcd90c891d8e..5e8eadd71bac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -91,6 +91,9 @@ struct scan_control { | |||
| 91 | /* Can pages be swapped as part of reclaim? */ | 91 | /* Can pages be swapped as part of reclaim? */ |
| 92 | unsigned int may_swap:1; | 92 | unsigned int may_swap:1; |
| 93 | 93 | ||
| 94 | /* Can cgroups be reclaimed below their normal consumption range? */ | ||
| 95 | unsigned int may_thrash:1; | ||
| 96 | |||
| 94 | unsigned int hibernation_mode:1; | 97 | unsigned int hibernation_mode:1; |
| 95 | 98 | ||
| 96 | /* One of the zones is ready for compaction */ | 99 | /* One of the zones is ready for compaction */ |
| @@ -229,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker); | |||
| 229 | 232 | ||
| 230 | #define SHRINK_BATCH 128 | 233 | #define SHRINK_BATCH 128 |
| 231 | 234 | ||
| 232 | static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | 235 | static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, |
| 233 | struct shrinker *shrinker, | 236 | struct shrinker *shrinker, |
| 234 | unsigned long nr_scanned, | 237 | unsigned long nr_scanned, |
| 235 | unsigned long nr_eligible) | 238 | unsigned long nr_eligible) |
| 236 | { | 239 | { |
| 237 | unsigned long freed = 0; | 240 | unsigned long freed = 0; |
| 238 | unsigned long long delta; | 241 | unsigned long long delta; |
| @@ -341,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | |||
| 341 | } | 344 | } |
| 342 | 345 | ||
| 343 | /** | 346 | /** |
| 344 | * shrink_node_slabs - shrink slab caches of a given node | 347 | * shrink_slab - shrink slab caches |
| 345 | * @gfp_mask: allocation context | 348 | * @gfp_mask: allocation context |
| 346 | * @nid: node whose slab caches to target | 349 | * @nid: node whose slab caches to target |
| 350 | * @memcg: memory cgroup whose slab caches to target | ||
| 347 | * @nr_scanned: pressure numerator | 351 | * @nr_scanned: pressure numerator |
| 348 | * @nr_eligible: pressure denominator | 352 | * @nr_eligible: pressure denominator |
| 349 | * | 353 | * |
| @@ -352,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | |||
| 352 | * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, | 356 | * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, |
| 353 | * unaware shrinkers will receive a node id of 0 instead. | 357 | * unaware shrinkers will receive a node id of 0 instead. |
| 354 | * | 358 | * |
| 359 | * @memcg specifies the memory cgroup to target. If it is not NULL, | ||
| 360 | * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan | ||
| 361 | * objects from the memory cgroup specified. Otherwise all shrinkers | ||
| 362 | * are called, and memcg-aware shrinkers are expected to scan the | ||
| 363 | * global list instead. | ||
| 364 | * | ||
| 355 | * @nr_scanned and @nr_eligible form a ratio that indicates how much of | 365 | * @nr_scanned and @nr_eligible form a ratio that indicates how much of |
| 356 | * the available objects should be scanned. Page reclaim for example | 366 | * the available objects should be scanned. Page reclaim for example |
| 357 | * passes the number of pages scanned and the number of pages on the | 367 | * passes the number of pages scanned and the number of pages on the |
| @@ -362,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | |||
| 362 | * | 372 | * |
| 363 | * Returns the number of reclaimed slab objects. | 373 | * Returns the number of reclaimed slab objects. |
| 364 | */ | 374 | */ |
| 365 | unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, | 375 | static unsigned long shrink_slab(gfp_t gfp_mask, int nid, |
| 366 | unsigned long nr_scanned, | 376 | struct mem_cgroup *memcg, |
| 367 | unsigned long nr_eligible) | 377 | unsigned long nr_scanned, |
| 378 | unsigned long nr_eligible) | ||
| 368 | { | 379 | { |
| 369 | struct shrinker *shrinker; | 380 | struct shrinker *shrinker; |
| 370 | unsigned long freed = 0; | 381 | unsigned long freed = 0; |
| 371 | 382 | ||
| 383 | if (memcg && !memcg_kmem_is_active(memcg)) | ||
| 384 | return 0; | ||
| 385 | |||
| 372 | if (nr_scanned == 0) | 386 | if (nr_scanned == 0) |
| 373 | nr_scanned = SWAP_CLUSTER_MAX; | 387 | nr_scanned = SWAP_CLUSTER_MAX; |
| 374 | 388 | ||
| @@ -387,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, | |||
| 387 | struct shrink_control sc = { | 401 | struct shrink_control sc = { |
| 388 | .gfp_mask = gfp_mask, | 402 | .gfp_mask = gfp_mask, |
| 389 | .nid = nid, | 403 | .nid = nid, |
| 404 | .memcg = memcg, | ||
| 390 | }; | 405 | }; |
| 391 | 406 | ||
| 407 | if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) | ||
| 408 | continue; | ||
| 409 | |||
| 392 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) | 410 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) |
| 393 | sc.nid = 0; | 411 | sc.nid = 0; |
| 394 | 412 | ||
| 395 | freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); | 413 | freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); |
| 396 | } | 414 | } |
| 397 | 415 | ||
| 398 | up_read(&shrinker_rwsem); | 416 | up_read(&shrinker_rwsem); |
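For the new @memcg argument to have any effect, a cache must register its shrinker as memcg aware; a schematic registration under assumed names (foo_count/foo_scan are hypothetical callbacks) could look like:

	/* Sketch: a shrinker that participates in per-memcg slab reclaim. */
	static struct shrinker foo_shrinker = {
		.count_objects	= foo_count,	/* hypothetical count callback */
		.scan_objects	= foo_scan,	/* hypothetical scan callback */
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
	};

	static int __init foo_cache_init(void)
	{
		return register_shrinker(&foo_shrinker);
	}

Shrinkers without SHRINKER_MEMCG_AWARE are simply skipped whenever shrink_slab() is invoked for a non-NULL memcg.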
| @@ -401,6 +419,29 @@ out: | |||
| 401 | return freed; | 419 | return freed; |
| 402 | } | 420 | } |
| 403 | 421 | ||
| 422 | void drop_slab_node(int nid) | ||
| 423 | { | ||
| 424 | unsigned long freed; | ||
| 425 | |||
| 426 | do { | ||
| 427 | struct mem_cgroup *memcg = NULL; | ||
| 428 | |||
| 429 | freed = 0; | ||
| 430 | do { | ||
| 431 | freed += shrink_slab(GFP_KERNEL, nid, memcg, | ||
| 432 | 1000, 1000); | ||
| 433 | } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); | ||
| 434 | } while (freed > 10); | ||
| 435 | } | ||
| 436 | |||
| 437 | void drop_slab(void) | ||
| 438 | { | ||
| 439 | int nid; | ||
| 440 | |||
| 441 | for_each_online_node(nid) | ||
| 442 | drop_slab_node(nid); | ||
| 443 | } | ||
| 444 | |||
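A hedged sketch of how the new entry points are meant to be driven; the caller shown here is illustrative, not part of this patch:

	/* Sketch: shrink the slab caches of the local node, or of every node. */
	drop_slab_node(numa_node_id());	/* one node, looping over all memcgs */
	drop_slab();			/* every online node */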
| 404 | static inline int is_page_cache_freeable(struct page *page) | 445 | static inline int is_page_cache_freeable(struct page *page) |
| 405 | { | 446 | { |
| 406 | /* | 447 | /* |
| @@ -497,7 +538,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
| 497 | } | 538 | } |
| 498 | if (mapping->a_ops->writepage == NULL) | 539 | if (mapping->a_ops->writepage == NULL) |
| 499 | return PAGE_ACTIVATE; | 540 | return PAGE_ACTIVATE; |
| 500 | if (!may_write_to_queue(mapping->backing_dev_info, sc)) | 541 | if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) |
| 501 | return PAGE_KEEP; | 542 | return PAGE_KEEP; |
| 502 | 543 | ||
| 503 | if (clear_page_dirty_for_io(page)) { | 544 | if (clear_page_dirty_for_io(page)) { |
| @@ -876,7 +917,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 876 | */ | 917 | */ |
| 877 | mapping = page_mapping(page); | 918 | mapping = page_mapping(page); |
| 878 | if (((dirty || writeback) && mapping && | 919 | if (((dirty || writeback) && mapping && |
| 879 | bdi_write_congested(mapping->backing_dev_info)) || | 920 | bdi_write_congested(inode_to_bdi(mapping->host))) || |
| 880 | (writeback && PageReclaim(page))) | 921 | (writeback && PageReclaim(page))) |
| 881 | nr_congested++; | 922 | nr_congested++; |
| 882 | 923 | ||
| @@ -1903,8 +1944,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, | |||
| 1903 | * latencies, so it's better to scan a minimum amount there as | 1944 | * latencies, so it's better to scan a minimum amount there as |
| 1904 | * well. | 1945 | * well. |
| 1905 | */ | 1946 | */ |
| 1906 | if (current_is_kswapd() && !zone_reclaimable(zone)) | 1947 | if (current_is_kswapd()) { |
| 1907 | force_scan = true; | 1948 | if (!zone_reclaimable(zone)) |
| 1949 | force_scan = true; | ||
| 1950 | if (!mem_cgroup_lruvec_online(lruvec)) | ||
| 1951 | force_scan = true; | ||
| 1952 | } | ||
| 1908 | if (!global_reclaim(sc)) | 1953 | if (!global_reclaim(sc)) |
| 1909 | force_scan = true; | 1954 | force_scan = true; |
| 1910 | 1955 | ||
| @@ -2269,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
| 2269 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, | 2314 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, |
| 2270 | bool is_classzone) | 2315 | bool is_classzone) |
| 2271 | { | 2316 | { |
| 2317 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
| 2272 | unsigned long nr_reclaimed, nr_scanned; | 2318 | unsigned long nr_reclaimed, nr_scanned; |
| 2273 | bool reclaimable = false; | 2319 | bool reclaimable = false; |
| 2274 | 2320 | ||
| @@ -2287,15 +2333,28 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
| 2287 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2333 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
| 2288 | do { | 2334 | do { |
| 2289 | unsigned long lru_pages; | 2335 | unsigned long lru_pages; |
| 2336 | unsigned long scanned; | ||
| 2290 | struct lruvec *lruvec; | 2337 | struct lruvec *lruvec; |
| 2291 | int swappiness; | 2338 | int swappiness; |
| 2292 | 2339 | ||
| 2340 | if (mem_cgroup_low(root, memcg)) { | ||
| 2341 | if (!sc->may_thrash) | ||
| 2342 | continue; | ||
| 2343 | mem_cgroup_events(memcg, MEMCG_LOW, 1); | ||
| 2344 | } | ||
| 2345 | |||
| 2293 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2346 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
| 2294 | swappiness = mem_cgroup_swappiness(memcg); | 2347 | swappiness = mem_cgroup_swappiness(memcg); |
| 2348 | scanned = sc->nr_scanned; | ||
| 2295 | 2349 | ||
| 2296 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); | 2350 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); |
| 2297 | zone_lru_pages += lru_pages; | 2351 | zone_lru_pages += lru_pages; |
| 2298 | 2352 | ||
| 2353 | if (memcg && is_classzone) | ||
| 2354 | shrink_slab(sc->gfp_mask, zone_to_nid(zone), | ||
| 2355 | memcg, sc->nr_scanned - scanned, | ||
| 2356 | lru_pages); | ||
| 2357 | |||
| 2299 | /* | 2358 | /* |
| 2300 | * Direct reclaim and kswapd have to scan all memory | 2359 | * Direct reclaim and kswapd have to scan all memory |
| 2301 | * cgroups to fulfill the overall scan target for the | 2360 | * cgroups to fulfill the overall scan target for the |
| @@ -2311,26 +2370,20 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
| 2311 | mem_cgroup_iter_break(root, memcg); | 2370 | mem_cgroup_iter_break(root, memcg); |
| 2312 | break; | 2371 | break; |
| 2313 | } | 2372 | } |
| 2314 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2373 | } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); |
| 2315 | } while (memcg); | ||
| 2316 | 2374 | ||
| 2317 | /* | 2375 | /* |
| 2318 | * Shrink the slab caches in the same proportion that | 2376 | * Shrink the slab caches in the same proportion that |
| 2319 | * the eligible LRU pages were scanned. | 2377 | * the eligible LRU pages were scanned. |
| 2320 | */ | 2378 | */ |
| 2321 | if (global_reclaim(sc) && is_classzone) { | 2379 | if (global_reclaim(sc) && is_classzone) |
| 2322 | struct reclaim_state *reclaim_state; | 2380 | shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, |
| 2323 | 2381 | sc->nr_scanned - nr_scanned, | |
| 2324 | shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), | 2382 | zone_lru_pages); |
| 2325 | sc->nr_scanned - nr_scanned, | 2383 | |
| 2326 | zone_lru_pages); | 2384 | if (reclaim_state) { |
| 2327 | 2385 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | |
| 2328 | reclaim_state = current->reclaim_state; | 2386 | reclaim_state->reclaimed_slab = 0; |
| 2329 | if (reclaim_state) { | ||
| 2330 | sc->nr_reclaimed += | ||
| 2331 | reclaim_state->reclaimed_slab; | ||
| 2332 | reclaim_state->reclaimed_slab = 0; | ||
| 2333 | } | ||
| 2334 | } | 2387 | } |
| 2335 | 2388 | ||
| 2336 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2389 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
| @@ -2515,10 +2568,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2515 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | 2568 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, |
| 2516 | struct scan_control *sc) | 2569 | struct scan_control *sc) |
| 2517 | { | 2570 | { |
| 2571 | int initial_priority = sc->priority; | ||
| 2518 | unsigned long total_scanned = 0; | 2572 | unsigned long total_scanned = 0; |
| 2519 | unsigned long writeback_threshold; | 2573 | unsigned long writeback_threshold; |
| 2520 | bool zones_reclaimable; | 2574 | bool zones_reclaimable; |
| 2521 | 2575 | retry: | |
| 2522 | delayacct_freepages_start(); | 2576 | delayacct_freepages_start(); |
| 2523 | 2577 | ||
| 2524 | if (global_reclaim(sc)) | 2578 | if (global_reclaim(sc)) |
| @@ -2568,6 +2622,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 2568 | if (sc->compaction_ready) | 2622 | if (sc->compaction_ready) |
| 2569 | return 1; | 2623 | return 1; |
| 2570 | 2624 | ||
| 2625 | /* Untapped cgroup reserves? Don't OOM, retry. */ | ||
| 2626 | if (!sc->may_thrash) { | ||
| 2627 | sc->priority = initial_priority; | ||
| 2628 | sc->may_thrash = 1; | ||
| 2629 | goto retry; | ||
| 2630 | } | ||
| 2631 | |||
| 2571 | /* Any of the zones still reclaimable? Don't OOM. */ | 2632 | /* Any of the zones still reclaimable? Don't OOM. */ |
| 2572 | if (zones_reclaimable) | 2633 | if (zones_reclaimable) |
| 2573 | return 1; | 2634 | return 1; |
| @@ -3175,7 +3236,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
| 3175 | */ | 3236 | */ |
| 3176 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | 3237 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && |
| 3177 | pfmemalloc_watermark_ok(pgdat)) | 3238 | pfmemalloc_watermark_ok(pgdat)) |
| 3178 | wake_up(&pgdat->pfmemalloc_wait); | 3239 | wake_up_all(&pgdat->pfmemalloc_wait); |
| 3179 | 3240 | ||
| 3180 | /* | 3241 | /* |
| 3181 | * Fragmentation may mean that the system cannot be rebalanced | 3242 | * Fragmentation may mean that the system cannot be rebalanced |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1284f89fca08..4f5cd974e11a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -17,6 +17,9 @@ | |||
| 17 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
| 18 | #include <linux/cpumask.h> | 18 | #include <linux/cpumask.h> |
| 19 | #include <linux/vmstat.h> | 19 | #include <linux/vmstat.h> |
| 20 | #include <linux/proc_fs.h> | ||
| 21 | #include <linux/seq_file.h> | ||
| 22 | #include <linux/debugfs.h> | ||
| 20 | #include <linux/sched.h> | 23 | #include <linux/sched.h> |
| 21 | #include <linux/math64.h> | 24 | #include <linux/math64.h> |
| 22 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
| @@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order) | |||
| 670 | } | 673 | } |
| 671 | #endif | 674 | #endif |
| 672 | 675 | ||
| 673 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) | ||
| 674 | #include <linux/proc_fs.h> | ||
| 675 | #include <linux/seq_file.h> | ||
| 676 | |||
| 677 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
| 678 | "Unmovable", | ||
| 679 | "Reclaimable", | ||
| 680 | "Movable", | ||
| 681 | "Reserve", | ||
| 682 | #ifdef CONFIG_CMA | ||
| 683 | "CMA", | ||
| 684 | #endif | ||
| 685 | #ifdef CONFIG_MEMORY_ISOLATION | ||
| 686 | "Isolate", | ||
| 687 | #endif | ||
| 688 | }; | ||
| 689 | |||
| 690 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
| 691 | { | ||
| 692 | pg_data_t *pgdat; | ||
| 693 | loff_t node = *pos; | ||
| 694 | for (pgdat = first_online_pgdat(); | ||
| 695 | pgdat && node; | ||
| 696 | pgdat = next_online_pgdat(pgdat)) | ||
| 697 | --node; | ||
| 698 | |||
| 699 | return pgdat; | ||
| 700 | } | ||
| 701 | |||
| 702 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
| 703 | { | ||
| 704 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 705 | |||
| 706 | (*pos)++; | ||
| 707 | return next_online_pgdat(pgdat); | ||
| 708 | } | ||
| 709 | |||
| 710 | static void frag_stop(struct seq_file *m, void *arg) | ||
| 711 | { | ||
| 712 | } | ||
| 713 | |||
| 714 | /* Walk all the zones in a node and print using a callback */ | ||
| 715 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | ||
| 716 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) | ||
| 717 | { | ||
| 718 | struct zone *zone; | ||
| 719 | struct zone *node_zones = pgdat->node_zones; | ||
| 720 | unsigned long flags; | ||
| 721 | |||
| 722 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
| 723 | if (!populated_zone(zone)) | ||
| 724 | continue; | ||
| 725 | |||
| 726 | spin_lock_irqsave(&zone->lock, flags); | ||
| 727 | print(m, pgdat, zone); | ||
| 728 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 729 | } | ||
| 730 | } | ||
| 731 | #endif | ||
| 732 | |||
| 733 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) | 676 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) |
| 734 | #ifdef CONFIG_ZONE_DMA | 677 | #ifdef CONFIG_ZONE_DMA |
| 735 | #define TEXT_FOR_DMA(xx) xx "_dma", | 678 | #define TEXT_FOR_DMA(xx) xx "_dma", |
| @@ -907,7 +850,66 @@ const char * const vmstat_text[] = { | |||
| 907 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ | 850 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
| 908 | 851 | ||
| 909 | 852 | ||
| 853 | #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ | ||
| 854 | defined(CONFIG_PROC_FS) | ||
| 855 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
| 856 | { | ||
| 857 | pg_data_t *pgdat; | ||
| 858 | loff_t node = *pos; | ||
| 859 | |||
| 860 | for (pgdat = first_online_pgdat(); | ||
| 861 | pgdat && node; | ||
| 862 | pgdat = next_online_pgdat(pgdat)) | ||
| 863 | --node; | ||
| 864 | |||
| 865 | return pgdat; | ||
| 866 | } | ||
| 867 | |||
| 868 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
| 869 | { | ||
| 870 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 871 | |||
| 872 | (*pos)++; | ||
| 873 | return next_online_pgdat(pgdat); | ||
| 874 | } | ||
| 875 | |||
| 876 | static void frag_stop(struct seq_file *m, void *arg) | ||
| 877 | { | ||
| 878 | } | ||
| 879 | |||
| 880 | /* Walk all the zones in a node and print using a callback */ | ||
| 881 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | ||
| 882 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) | ||
| 883 | { | ||
| 884 | struct zone *zone; | ||
| 885 | struct zone *node_zones = pgdat->node_zones; | ||
| 886 | unsigned long flags; | ||
| 887 | |||
| 888 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
| 889 | if (!populated_zone(zone)) | ||
| 890 | continue; | ||
| 891 | |||
| 892 | spin_lock_irqsave(&zone->lock, flags); | ||
| 893 | print(m, pgdat, zone); | ||
| 894 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 895 | } | ||
| 896 | } | ||
| 897 | #endif | ||
| 898 | |||
| 910 | #ifdef CONFIG_PROC_FS | 899 | #ifdef CONFIG_PROC_FS |
| 900 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
| 901 | "Unmovable", | ||
| 902 | "Reclaimable", | ||
| 903 | "Movable", | ||
| 904 | "Reserve", | ||
| 905 | #ifdef CONFIG_CMA | ||
| 906 | "CMA", | ||
| 907 | #endif | ||
| 908 | #ifdef CONFIG_MEMORY_ISOLATION | ||
| 909 | "Isolate", | ||
| 910 | #endif | ||
| 911 | }; | ||
| 912 | |||
| 911 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | 913 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, |
| 912 | struct zone *zone) | 914 | struct zone *zone) |
| 913 | { | 915 | { |
| @@ -1435,8 +1437,8 @@ static void vmstat_shepherd(struct work_struct *w) | |||
| 1435 | if (need_update(cpu) && | 1437 | if (need_update(cpu) && |
| 1436 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) | 1438 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) |
| 1437 | 1439 | ||
| 1438 | schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), | 1440 | schedule_delayed_work_on(cpu, |
| 1439 | __round_jiffies_relative(sysctl_stat_interval, cpu)); | 1441 | &per_cpu(vmstat_work, cpu), 0); |
| 1440 | 1442 | ||
| 1441 | put_online_cpus(); | 1443 | put_online_cpus(); |
| 1442 | 1444 | ||
| @@ -1450,7 +1452,7 @@ static void __init start_shepherd_timer(void) | |||
| 1450 | int cpu; | 1452 | int cpu; |
| 1451 | 1453 | ||
| 1452 | for_each_possible_cpu(cpu) | 1454 | for_each_possible_cpu(cpu) |
| 1453 | INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), | 1455 | INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), |
| 1454 | vmstat_update); | 1456 | vmstat_update); |
| 1455 | 1457 | ||
| 1456 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) | 1458 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) |
| @@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void) | |||
| 1536 | module_init(setup_vmstat) | 1538 | module_init(setup_vmstat) |
| 1537 | 1539 | ||
| 1538 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) | 1540 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) |
| 1539 | #include <linux/debugfs.h> | ||
| 1540 | |||
| 1541 | 1541 | ||
| 1542 | /* | 1542 | /* |
| 1543 | * Return an index indicating how much of the available free memory is | 1543 | * Return an index indicating how much of the available free memory is |
diff --git a/mm/workingset.c b/mm/workingset.c index f7216fa7da27..aa017133744b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c | |||
| @@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, | |||
| 275 | 275 | ||
| 276 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ | 276 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ |
| 277 | local_irq_disable(); | 277 | local_irq_disable(); |
| 278 | shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); | 278 | shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); |
| 279 | local_irq_enable(); | 279 | local_irq_enable(); |
| 280 | 280 | ||
| 281 | pages = node_present_pages(sc->nid); | 281 | pages = node_present_pages(sc->nid); |
| @@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, | |||
| 302 | } | 302 | } |
| 303 | 303 | ||
| 304 | static enum lru_status shadow_lru_isolate(struct list_head *item, | 304 | static enum lru_status shadow_lru_isolate(struct list_head *item, |
| 305 | struct list_lru_one *lru, | ||
| 305 | spinlock_t *lru_lock, | 306 | spinlock_t *lru_lock, |
| 306 | void *arg) | 307 | void *arg) |
| 307 | { | 308 | { |
| @@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, | |||
| 332 | goto out; | 333 | goto out; |
| 333 | } | 334 | } |
| 334 | 335 | ||
| 335 | list_del_init(item); | 336 | list_lru_isolate(lru, item); |
| 336 | spin_unlock(lru_lock); | 337 | spin_unlock(lru_lock); |
| 337 | 338 | ||
| 338 | /* | 339 | /* |
| @@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, | |||
| 376 | 377 | ||
| 377 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ | 378 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ |
| 378 | local_irq_disable(); | 379 | local_irq_disable(); |
| 379 | ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, | 380 | ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, |
| 380 | shadow_lru_isolate, NULL, &sc->nr_to_scan); | 381 | shadow_lru_isolate, NULL); |
| 381 | local_irq_enable(); | 382 | local_irq_enable(); |
| 382 | return ret; | 383 | return ret; |
| 383 | } | 384 | } |
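The workingset conversion above illustrates the general pattern for list_lru-backed shrinkers; a condensed sketch with hypothetical names (foo_lru, foo_isolate) follows:

	/* Sketch: count/scan callbacks of a list_lru-backed shrinker using the
	 * new shrink_control-aware helpers. foo_isolate() takes the extra
	 * struct list_lru_one * argument and calls list_lru_isolate(). */
	static struct list_lru foo_lru;

	static unsigned long foo_count(struct shrinker *shrink,
				       struct shrink_control *sc)
	{
		return list_lru_shrink_count(&foo_lru, sc);
	}

	static unsigned long foo_scan(struct shrinker *shrink,
				      struct shrink_control *sc)
	{
		return list_lru_shrink_walk(&foo_lru, sc, foo_isolate, NULL);
	}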
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
| @@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = { | |||
| 130 | .evict = zbud_zpool_evict | 130 | .evict = zbud_zpool_evict |
| 131 | }; | 131 | }; |
| 132 | 132 | ||
| 133 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | 133 | static void *zbud_zpool_create(char *name, gfp_t gfp, |
| 134 | struct zpool_ops *zpool_ops) | ||
| 134 | { | 135 | { |
| 135 | return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); | 136 | return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); |
| 136 | } | 137 | } |
diff --git a/mm/zpool.c b/mm/zpool.c index 739cdf0d183a..bacdab6e47de 100644 --- a/mm/zpool.c +++ b/mm/zpool.c | |||
| @@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver) | |||
| 129 | /** | 129 | /** |
| 130 | * zpool_create_pool() - Create a new zpool | 130 | * zpool_create_pool() - Create a new zpool |
| 131 | * @type The type of the zpool to create (e.g. zbud, zsmalloc) | 131 | * @type The type of the zpool to create (e.g. zbud, zsmalloc) |
| 132 | * @name The name of the zpool (e.g. zram0, zswap) | ||
| 132 | * @gfp The GFP flags to use when allocating the pool. | 133 | * @gfp The GFP flags to use when allocating the pool. |
| 133 | * @ops The optional ops callback. | 134 | * @ops The optional ops callback. |
| 134 | * | 135 | * |
| @@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver) | |||
| 140 | * | 141 | * |
| 141 | * Returns: New zpool on success, NULL on failure. | 142 | * Returns: New zpool on success, NULL on failure. |
| 142 | */ | 143 | */ |
| 143 | struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) | 144 | struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, |
| 145 | struct zpool_ops *ops) | ||
| 144 | { | 146 | { |
| 145 | struct zpool_driver *driver; | 147 | struct zpool_driver *driver; |
| 146 | struct zpool *zpool; | 148 | struct zpool *zpool; |
| @@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) | |||
| 168 | 170 | ||
| 169 | zpool->type = driver->type; | 171 | zpool->type = driver->type; |
| 170 | zpool->driver = driver; | 172 | zpool->driver = driver; |
| 171 | zpool->pool = driver->create(gfp, ops); | 173 | zpool->pool = driver->create(name, gfp, ops); |
| 172 | zpool->ops = ops; | 174 | zpool->ops = ops; |
| 173 | 175 | ||
| 174 | if (!zpool->pool) { | 176 | if (!zpool->pool) { |
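A hedged usage sketch of the extended creation call, with the pool name chosen per the kerneldoc examples above ("zram0", "zswap"); error handling trimmed:

	/* Sketch: create a named zbud-backed zpool, use it, tear it down. */
	struct zpool *pool = zpool_create_pool("zbud", "zram0", GFP_KERNEL, NULL);

	if (pool) {
		/* ... zpool_malloc()/zpool_free() as before ... */
		zpool_destroy_pool(pool);
	}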
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b72403927aa4..0dec1fa5f656 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
| @@ -91,6 +91,7 @@ | |||
| 91 | #include <linux/hardirq.h> | 91 | #include <linux/hardirq.h> |
| 92 | #include <linux/spinlock.h> | 92 | #include <linux/spinlock.h> |
| 93 | #include <linux/types.h> | 93 | #include <linux/types.h> |
| 94 | #include <linux/debugfs.h> | ||
| 94 | #include <linux/zsmalloc.h> | 95 | #include <linux/zsmalloc.h> |
| 95 | #include <linux/zpool.h> | 96 | #include <linux/zpool.h> |
| 96 | 97 | ||
| @@ -168,6 +169,22 @@ enum fullness_group { | |||
| 168 | ZS_FULL | 169 | ZS_FULL |
| 169 | }; | 170 | }; |
| 170 | 171 | ||
| 172 | enum zs_stat_type { | ||
| 173 | OBJ_ALLOCATED, | ||
| 174 | OBJ_USED, | ||
| 175 | NR_ZS_STAT_TYPE, | ||
| 176 | }; | ||
| 177 | |||
| 178 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 179 | |||
| 180 | static struct dentry *zs_stat_root; | ||
| 181 | |||
| 182 | struct zs_size_stat { | ||
| 183 | unsigned long objs[NR_ZS_STAT_TYPE]; | ||
| 184 | }; | ||
| 185 | |||
| 186 | #endif | ||
| 187 | |||
| 171 | /* | 188 | /* |
| 172 | * number of size_classes | 189 | * number of size_classes |
| 173 | */ | 190 | */ |
| @@ -200,6 +217,10 @@ struct size_class { | |||
| 200 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | 217 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ |
| 201 | int pages_per_zspage; | 218 | int pages_per_zspage; |
| 202 | 219 | ||
| 220 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 221 | struct zs_size_stat stats; | ||
| 222 | #endif | ||
| 223 | |||
| 203 | spinlock_t lock; | 224 | spinlock_t lock; |
| 204 | 225 | ||
| 205 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | 226 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; |
| @@ -217,10 +238,16 @@ struct link_free { | |||
| 217 | }; | 238 | }; |
| 218 | 239 | ||
| 219 | struct zs_pool { | 240 | struct zs_pool { |
| 241 | char *name; | ||
| 242 | |||
| 220 | struct size_class **size_class; | 243 | struct size_class **size_class; |
| 221 | 244 | ||
| 222 | gfp_t flags; /* allocation flags used when growing pool */ | 245 | gfp_t flags; /* allocation flags used when growing pool */ |
| 223 | atomic_long_t pages_allocated; | 246 | atomic_long_t pages_allocated; |
| 247 | |||
| 248 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 249 | struct dentry *stat_dentry; | ||
| 250 | #endif | ||
| 224 | }; | 251 | }; |
| 225 | 252 | ||
| 226 | /* | 253 | /* |
| @@ -246,9 +273,9 @@ struct mapping_area { | |||
| 246 | 273 | ||
| 247 | #ifdef CONFIG_ZPOOL | 274 | #ifdef CONFIG_ZPOOL |
| 248 | 275 | ||
| 249 | static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | 276 | static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) |
| 250 | { | 277 | { |
| 251 | return zs_create_pool(gfp); | 278 | return zs_create_pool(name, gfp); |
| 252 | } | 279 | } |
| 253 | 280 | ||
| 254 | static void zs_zpool_destroy(void *pool) | 281 | static void zs_zpool_destroy(void *pool) |
| @@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | |||
| 942 | return true; | 969 | return true; |
| 943 | } | 970 | } |
| 944 | 971 | ||
| 972 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 973 | |||
| 974 | static inline void zs_stat_inc(struct size_class *class, | ||
| 975 | enum zs_stat_type type, unsigned long cnt) | ||
| 976 | { | ||
| 977 | class->stats.objs[type] += cnt; | ||
| 978 | } | ||
| 979 | |||
| 980 | static inline void zs_stat_dec(struct size_class *class, | ||
| 981 | enum zs_stat_type type, unsigned long cnt) | ||
| 982 | { | ||
| 983 | class->stats.objs[type] -= cnt; | ||
| 984 | } | ||
| 985 | |||
| 986 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 987 | enum zs_stat_type type) | ||
| 988 | { | ||
| 989 | return class->stats.objs[type]; | ||
| 990 | } | ||
| 991 | |||
| 992 | static int __init zs_stat_init(void) | ||
| 993 | { | ||
| 994 | if (!debugfs_initialized()) | ||
| 995 | return -ENODEV; | ||
| 996 | |||
| 997 | zs_stat_root = debugfs_create_dir("zsmalloc", NULL); | ||
| 998 | if (!zs_stat_root) | ||
| 999 | return -ENOMEM; | ||
| 1000 | |||
| 1001 | return 0; | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static void __exit zs_stat_exit(void) | ||
| 1005 | { | ||
| 1006 | debugfs_remove_recursive(zs_stat_root); | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | static int zs_stats_size_show(struct seq_file *s, void *v) | ||
| 1010 | { | ||
| 1011 | int i; | ||
| 1012 | struct zs_pool *pool = s->private; | ||
| 1013 | struct size_class *class; | ||
| 1014 | int objs_per_zspage; | ||
| 1015 | unsigned long obj_allocated, obj_used, pages_used; | ||
| 1016 | unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; | ||
| 1017 | |||
| 1018 | seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", | ||
| 1019 | "obj_allocated", "obj_used", "pages_used"); | ||
| 1020 | |||
| 1021 | for (i = 0; i < zs_size_classes; i++) { | ||
| 1022 | class = pool->size_class[i]; | ||
| 1023 | |||
| 1024 | if (class->index != i) | ||
| 1025 | continue; | ||
| 1026 | |||
| 1027 | spin_lock(&class->lock); | ||
| 1028 | obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); | ||
| 1029 | obj_used = zs_stat_get(class, OBJ_USED); | ||
| 1030 | spin_unlock(&class->lock); | ||
| 1031 | |||
| 1032 | objs_per_zspage = get_maxobj_per_zspage(class->size, | ||
| 1033 | class->pages_per_zspage); | ||
| 1034 | pages_used = obj_allocated / objs_per_zspage * | ||
| 1035 | class->pages_per_zspage; | ||
| 1036 | |||
| 1037 | seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, | ||
| 1038 | class->size, obj_allocated, obj_used, pages_used); | ||
| 1039 | |||
| 1040 | total_objs += obj_allocated; | ||
| 1041 | total_used_objs += obj_used; | ||
| 1042 | total_pages += pages_used; | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | seq_puts(s, "\n"); | ||
| 1046 | seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", | ||
| 1047 | total_objs, total_used_objs, total_pages); | ||
| 1048 | |||
| 1049 | return 0; | ||
| 1050 | } | ||
| 1051 | |||
| 1052 | static int zs_stats_size_open(struct inode *inode, struct file *file) | ||
| 1053 | { | ||
| 1054 | return single_open(file, zs_stats_size_show, inode->i_private); | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | static const struct file_operations zs_stat_size_ops = { | ||
| 1058 | .open = zs_stats_size_open, | ||
| 1059 | .read = seq_read, | ||
| 1060 | .llseek = seq_lseek, | ||
| 1061 | .release = single_release, | ||
| 1062 | }; | ||
| 1063 | |||
| 1064 | static int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 1065 | { | ||
| 1066 | struct dentry *entry; | ||
| 1067 | |||
| 1068 | if (!zs_stat_root) | ||
| 1069 | return -ENODEV; | ||
| 1070 | |||
| 1071 | entry = debugfs_create_dir(name, zs_stat_root); | ||
| 1072 | if (!entry) { | ||
| 1073 | pr_warn("debugfs dir <%s> creation failed\n", name); | ||
| 1074 | return -ENOMEM; | ||
| 1075 | } | ||
| 1076 | pool->stat_dentry = entry; | ||
| 1077 | |||
| 1078 | entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, | ||
| 1079 | pool->stat_dentry, pool, &zs_stat_size_ops); | ||
| 1080 | if (!entry) { | ||
| 1081 | pr_warn("%s: debugfs file entry <%s> creation failed\n", | ||
| 1082 | name, "obj_in_classes"); | ||
| 1083 | return -ENOMEM; | ||
| 1084 | } | ||
| 1085 | |||
| 1086 | return 0; | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | static void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 1090 | { | ||
| 1091 | debugfs_remove_recursive(pool->stat_dentry); | ||
| 1092 | } | ||
| 1093 | |||
| 1094 | #else /* CONFIG_ZSMALLOC_STAT */ | ||
| 1095 | |||
| 1096 | static inline void zs_stat_inc(struct size_class *class, | ||
| 1097 | enum zs_stat_type type, unsigned long cnt) | ||
| 1098 | { | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | static inline void zs_stat_dec(struct size_class *class, | ||
| 1102 | enum zs_stat_type type, unsigned long cnt) | ||
| 1103 | { | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 1107 | enum zs_stat_type type) | ||
| 1108 | { | ||
| 1109 | return 0; | ||
| 1110 | } | ||
| 1111 | |||
| 1112 | static int __init zs_stat_init(void) | ||
| 1113 | { | ||
| 1114 | return 0; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | static void __exit zs_stat_exit(void) | ||
| 1118 | { | ||
| 1119 | } | ||
| 1120 | |||
| 1121 | static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 1122 | { | ||
| 1123 | return 0; | ||
| 1124 | } | ||
| 1125 | |||
| 1126 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 1127 | { | ||
| 1128 | } | ||
| 1129 | |||
| 1130 | #endif | ||
| 1131 | |||
| 945 | unsigned long zs_get_total_pages(struct zs_pool *pool) | 1132 | unsigned long zs_get_total_pages(struct zs_pool *pool) |
| 946 | { | 1133 | { |
| 947 | return atomic_long_read(&pool->pages_allocated); | 1134 | return atomic_long_read(&pool->pages_allocated); |
| @@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1074 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | 1261 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); |
| 1075 | atomic_long_add(class->pages_per_zspage, | 1262 | atomic_long_add(class->pages_per_zspage, |
| 1076 | &pool->pages_allocated); | 1263 | &pool->pages_allocated); |
| 1264 | |||
| 1077 | spin_lock(&class->lock); | 1265 | spin_lock(&class->lock); |
| 1266 | zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
| 1267 | class->size, class->pages_per_zspage)); | ||
| 1078 | } | 1268 | } |
| 1079 | 1269 | ||
| 1080 | obj = (unsigned long)first_page->freelist; | 1270 | obj = (unsigned long)first_page->freelist; |
| @@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1088 | kunmap_atomic(vaddr); | 1278 | kunmap_atomic(vaddr); |
| 1089 | 1279 | ||
| 1090 | first_page->inuse++; | 1280 | first_page->inuse++; |
| 1281 | zs_stat_inc(class, OBJ_USED, 1); | ||
| 1091 | /* Now move the zspage to another fullness group, if required */ | 1282 | /* Now move the zspage to another fullness group, if required */ |
| 1092 | fix_fullness_group(pool, first_page); | 1283 | fix_fullness_group(pool, first_page); |
| 1093 | spin_unlock(&class->lock); | 1284 | spin_unlock(&class->lock); |
| @@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
| 1128 | 1319 | ||
| 1129 | first_page->inuse--; | 1320 | first_page->inuse--; |
| 1130 | fullness = fix_fullness_group(pool, first_page); | 1321 | fullness = fix_fullness_group(pool, first_page); |
| 1322 | |||
| 1323 | zs_stat_dec(class, OBJ_USED, 1); | ||
| 1324 | if (fullness == ZS_EMPTY) | ||
| 1325 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
| 1326 | class->size, class->pages_per_zspage)); | ||
| 1327 | |||
| 1131 | spin_unlock(&class->lock); | 1328 | spin_unlock(&class->lock); |
| 1132 | 1329 | ||
| 1133 | if (fullness == ZS_EMPTY) { | 1330 | if (fullness == ZS_EMPTY) { |
| @@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free); | |||
| 1148 | * On success, a pointer to the newly created pool is returned, | 1345 | * On success, a pointer to the newly created pool is returned, |
| 1149 | * otherwise NULL. | 1346 | * otherwise NULL. |
| 1150 | */ | 1347 | */ |
| 1151 | struct zs_pool *zs_create_pool(gfp_t flags) | 1348 | struct zs_pool *zs_create_pool(char *name, gfp_t flags) |
| 1152 | { | 1349 | { |
| 1153 | int i; | 1350 | int i; |
| 1154 | struct zs_pool *pool; | 1351 | struct zs_pool *pool; |
| @@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags) | |||
| 1158 | if (!pool) | 1355 | if (!pool) |
| 1159 | return NULL; | 1356 | return NULL; |
| 1160 | 1357 | ||
| 1358 | pool->name = kstrdup(name, GFP_KERNEL); | ||
| 1359 | if (!pool->name) { | ||
| 1360 | kfree(pool); | ||
| 1361 | return NULL; | ||
| 1362 | } | ||
| 1363 | |||
| 1161 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), | 1364 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
| 1162 | GFP_KERNEL); | 1365 | GFP_KERNEL); |
| 1163 | if (!pool->size_class) { | 1366 | if (!pool->size_class) { |
| 1367 | kfree(pool->name); | ||
| 1164 | kfree(pool); | 1368 | kfree(pool); |
| 1165 | return NULL; | 1369 | return NULL; |
| 1166 | } | 1370 | } |
| @@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags) | |||
| 1210 | 1414 | ||
| 1211 | pool->flags = flags; | 1415 | pool->flags = flags; |
| 1212 | 1416 | ||
| 1417 | if (zs_pool_stat_create(name, pool)) | ||
| 1418 | goto err; | ||
| 1419 | |||
| 1213 | return pool; | 1420 | return pool; |
| 1214 | 1421 | ||
| 1215 | err: | 1422 | err: |
| @@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
| 1222 | { | 1429 | { |
| 1223 | int i; | 1430 | int i; |
| 1224 | 1431 | ||
| 1432 | zs_pool_stat_destroy(pool); | ||
| 1433 | |||
| 1225 | for (i = 0; i < zs_size_classes; i++) { | 1434 | for (i = 0; i < zs_size_classes; i++) { |
| 1226 | int fg; | 1435 | int fg; |
| 1227 | struct size_class *class = pool->size_class[i]; | 1436 | struct size_class *class = pool->size_class[i]; |
| @@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
| 1242 | } | 1451 | } |
| 1243 | 1452 | ||
| 1244 | kfree(pool->size_class); | 1453 | kfree(pool->size_class); |
| 1454 | kfree(pool->name); | ||
| 1245 | kfree(pool); | 1455 | kfree(pool); |
| 1246 | } | 1456 | } |
| 1247 | EXPORT_SYMBOL_GPL(zs_destroy_pool); | 1457 | EXPORT_SYMBOL_GPL(zs_destroy_pool); |
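Putting the zsmalloc changes together, the pool name now feeds the debugfs hierarchy; a caller sketch with assumed gfp flags (as a zram-style user might pass) follows:

	/* Sketch: with CONFIG_ZSMALLOC_STAT=y the per-class counters appear
	 * under <debugfs>/zsmalloc/<name>/obj_in_classes. */
	struct zs_pool *pool = zs_create_pool("zram0", GFP_NOIO | __GFP_HIGHMEM);

	if (pool) {
		unsigned long handle = zs_malloc(pool, 128);	/* bumps OBJ_USED */

		if (handle)
			zs_free(pool, handle);
		zs_destroy_pool(pool);	/* also removes the debugfs entries */
	}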
| @@ -1250,17 +1460,30 @@ static int __init zs_init(void) | |||
| 1250 | { | 1460 | { |
| 1251 | int ret = zs_register_cpu_notifier(); | 1461 | int ret = zs_register_cpu_notifier(); |
| 1252 | 1462 | ||
| 1253 | if (ret) { | 1463 | if (ret) |
| 1254 | zs_unregister_cpu_notifier(); | 1464 | goto notifier_fail; |
| 1255 | return ret; | ||
| 1256 | } | ||
| 1257 | 1465 | ||
| 1258 | init_zs_size_classes(); | 1466 | init_zs_size_classes(); |
| 1259 | 1467 | ||
| 1260 | #ifdef CONFIG_ZPOOL | 1468 | #ifdef CONFIG_ZPOOL |
| 1261 | zpool_register_driver(&zs_zpool_driver); | 1469 | zpool_register_driver(&zs_zpool_driver); |
| 1262 | #endif | 1470 | #endif |
| 1471 | |||
| 1472 | ret = zs_stat_init(); | ||
| 1473 | if (ret) { | ||
| 1474 | pr_err("zs stat initialization failed\n"); | ||
| 1475 | goto stat_fail; | ||
| 1476 | } | ||
| 1263 | return 0; | 1477 | return 0; |
| 1478 | |||
| 1479 | stat_fail: | ||
| 1480 | #ifdef CONFIG_ZPOOL | ||
| 1481 | zpool_unregister_driver(&zs_zpool_driver); | ||
| 1482 | #endif | ||
| 1483 | notifier_fail: | ||
| 1484 | zs_unregister_cpu_notifier(); | ||
| 1485 | |||
| 1486 | return ret; | ||
| 1264 | } | 1487 | } |
| 1265 | 1488 | ||
| 1266 | static void __exit zs_exit(void) | 1489 | static void __exit zs_exit(void) |
| @@ -1269,6 +1492,8 @@ static void __exit zs_exit(void) | |||
| 1269 | zpool_unregister_driver(&zs_zpool_driver); | 1492 | zpool_unregister_driver(&zs_zpool_driver); |
| 1270 | #endif | 1493 | #endif |
| 1271 | zs_unregister_cpu_notifier(); | 1494 | zs_unregister_cpu_notifier(); |
| 1495 | |||
| 1496 | zs_stat_exit(); | ||
| 1272 | } | 1497 | } |
| 1273 | 1498 | ||
| 1274 | module_init(zs_init); | 1499 | module_init(zs_init); |
diff --git a/mm/zswap.c b/mm/zswap.c index 0cfce9bc51e4..4249e82ff934 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
| @@ -906,11 +906,12 @@ static int __init init_zswap(void) | |||
| 906 | 906 | ||
| 907 | pr_info("loading zswap\n"); | 907 | pr_info("loading zswap\n"); |
| 908 | 908 | ||
| 909 | zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); | 909 | zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, |
| 910 | &zswap_zpool_ops); | ||
| 910 | if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { | 911 | if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { |
| 911 | pr_info("%s zpool not available\n", zswap_zpool_type); | 912 | pr_info("%s zpool not available\n", zswap_zpool_type); |
| 912 | zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; | 913 | zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; |
| 913 | zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, | 914 | zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, |
| 914 | &zswap_zpool_ops); | 915 | &zswap_zpool_ops); |
| 915 | } | 916 | } |
| 916 | if (!zswap_pool) { | 917 | if (!zswap_pool) { |
