Diffstat (limited to 'mm')
48 files changed, 3410 insertions, 1710 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index b2176374b98e..d5c8019c6627 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK | |||
140 | config NO_BOOTMEM | 140 | config NO_BOOTMEM |
141 | boolean | 141 | boolean |
142 | 142 | ||
143 | config MEMORY_ISOLATION | ||
144 | boolean | ||
145 | |||
143 | # eventually, we can have this option just 'select SPARSEMEM' | 146 | # eventually, we can have this option just 'select SPARSEMEM' |
144 | config MEMORY_HOTPLUG | 147 | config MEMORY_HOTPLUG |
145 | bool "Allow for memory hot-add" | 148 | bool "Allow for memory hot-add" |
149 | select MEMORY_ISOLATION | ||
146 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 150 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
147 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG | 151 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
148 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) | 152 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
@@ -272,6 +276,7 @@ config MEMORY_FAILURE | |||
272 | depends on MMU | 276 | depends on MMU |
273 | depends on ARCH_SUPPORTS_MEMORY_FAILURE | 277 | depends on ARCH_SUPPORTS_MEMORY_FAILURE |
274 | bool "Enable recovery from hardware memory errors" | 278 | bool "Enable recovery from hardware memory errors" |
279 | select MEMORY_ISOLATION | ||
275 | help | 280 | help |
276 | Enables code to recover from some memory failures on systems | 281 | Enables code to recover from some memory failures on systems |
277 | with MCA recovery. This allows a system to continue running | 282 | with MCA recovery. This allows a system to continue running |
@@ -389,3 +394,20 @@ config CLEANCACHE | |||
389 | in a negligible performance hit. | 394 | in a negligible performance hit. |
390 | 395 | ||
391 | If unsure, say Y to enable cleancache | 396 | If unsure, say Y to enable cleancache |
397 | |||
398 | config FRONTSWAP | ||
399 | bool "Enable frontswap to cache swap pages if tmem is present" | ||
400 | depends on SWAP | ||
401 | default n | ||
402 | help | ||
403 | Frontswap is so named because it can be thought of as the opposite | ||
404 | of a "backing" store for a swap device. The data is stored into | ||
405 | "transcendent memory", memory that is not directly accessible or | ||
406 | addressable by the kernel and is of unknown and possibly | ||
407 | time-varying size. When space in transcendent memory is available, | ||
408 | a significant swap I/O reduction may be achieved. When none is | ||
409 | available, all frontswap calls are reduced to a single pointer- | ||
410 | compare-against-NULL resulting in a negligible performance hit | ||
411 | and swap data is stored as normal on the matching swap device. | ||
412 | |||
413 | If unsure, say Y to enable frontswap. | ||
diff --git a/mm/Makefile b/mm/Makefile index a156285ce88d..92753e2d82da 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -15,8 +15,9 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | |||
15 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
18 | page_isolation.o mm_init.o mmu_context.o percpu.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o $(mmu-y) | 19 | compaction.o $(mmu-y) |
20 | |||
20 | obj-y += init-mm.o | 21 | obj-y += init-mm.o |
21 | 22 | ||
22 | ifdef CONFIG_NO_BOOTMEM | 23 | ifdef CONFIG_NO_BOOTMEM |
@@ -29,6 +30,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | |||
29 | 30 | ||
30 | obj-$(CONFIG_BOUNCE) += bounce.o | 31 | obj-$(CONFIG_BOUNCE) += bounce.o |
31 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o | 32 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
33 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | ||
32 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 34 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
33 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 35 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
34 | obj-$(CONFIG_NUMA) += mempolicy.o | 36 | obj-$(CONFIG_NUMA) += mempolicy.o |
@@ -47,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o | |||
47 | obj-$(CONFIG_MIGRATION) += migrate.o | 49 | obj-$(CONFIG_MIGRATION) += migrate.o |
48 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 50 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
49 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 51 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
50 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 52 | obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o |
53 | obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o | ||
51 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 54 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
52 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 55 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
53 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 56 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
54 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 57 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
55 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 58 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
59 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dd8e2aafb07e..6b4718e2ee34 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
677 | 677 | ||
678 | bdi->min_ratio = 0; | 678 | bdi->min_ratio = 0; |
679 | bdi->max_ratio = 100; | 679 | bdi->max_ratio = 100; |
680 | bdi->max_prop_frac = PROP_FRAC_BASE; | 680 | bdi->max_prop_frac = FPROP_FRAC_BASE; |
681 | spin_lock_init(&bdi->wb_lock); | 681 | spin_lock_init(&bdi->wb_lock); |
682 | INIT_LIST_HEAD(&bdi->bdi_list); | 682 | INIT_LIST_HEAD(&bdi->bdi_list); |
683 | INIT_LIST_HEAD(&bdi->work_list); | 683 | INIT_LIST_HEAD(&bdi->work_list); |
@@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
700 | bdi->write_bandwidth = INIT_BW; | 700 | bdi->write_bandwidth = INIT_BW; |
701 | bdi->avg_write_bandwidth = INIT_BW; | 701 | bdi->avg_write_bandwidth = INIT_BW; |
702 | 702 | ||
703 | err = prop_local_init_percpu(&bdi->completions); | 703 | err = fprop_local_init_percpu(&bdi->completions); |
704 | 704 | ||
705 | if (err) { | 705 | if (err) { |
706 | err: | 706 | err: |
@@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
744 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 744 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
745 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 745 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
746 | 746 | ||
747 | prop_local_destroy_percpu(&bdi->completions); | 747 | fprop_local_destroy_percpu(&bdi->completions); |
748 | } | 748 | } |
749 | EXPORT_SYMBOL(bdi_destroy); | 749 | EXPORT_SYMBOL(bdi_destroy); |
750 | 750 | ||
@@ -886,3 +886,23 @@ out: | |||
886 | return ret; | 886 | return ret; |
887 | } | 887 | } |
888 | EXPORT_SYMBOL(wait_iff_congested); | 888 | EXPORT_SYMBOL(wait_iff_congested); |
889 | |||
890 | int pdflush_proc_obsolete(struct ctl_table *table, int write, | ||
891 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
892 | { | ||
893 | char kbuf[] = "0\n"; | ||
894 | |||
895 | if (*ppos) { | ||
896 | *lenp = 0; | ||
897 | return 0; | ||
898 | } | ||
899 | |||
900 | if (copy_to_user(buffer, kbuf, sizeof(kbuf))) | ||
901 | return -EFAULT; | ||
902 | printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", | ||
903 | table->procname); | ||
904 | |||
905 | *lenp = 2; | ||
906 | *ppos += *lenp; | ||
907 | return 2; | ||
908 | } | ||
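For reference, the sysctl side of this change points the obsolete vm.nr_pdflush_threads knob at the new stub handler; that wiring lives in kernel/sysctl.c and is not part of this hunk. A minimal sketch of such a table entry, assuming the handler is declared in linux/writeback.h:

/* Hypothetical sketch: routing an obsolete sysctl through the new stub.
 * The real wiring lives in kernel/sysctl.c and is not shown in this hunk. */
#include <linux/sysctl.h>
#include <linux/writeback.h>	/* pdflush_proc_obsolete() declaration (assumed) */

static struct ctl_table legacy_vm_table[] = {
	{
		.procname	= "nr_pdflush_threads",
		.mode		= 0444,
		/* always reads back "0\n" and warns once that the knob is going away */
		.proc_handler	= pdflush_proc_obsolete,
	},
	{ }	/* sentinel */
};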
diff --git a/mm/bootmem.c b/mm/bootmem.c index ec4fcb7a56c8..bcb63ac48cc5 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
698 | return ___alloc_bootmem(size, align, goal, limit); | 698 | return ___alloc_bootmem(size, align, goal, limit); |
699 | } | 699 | } |
700 | 700 | ||
701 | static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | 701 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
702 | unsigned long size, unsigned long align, | 702 | unsigned long size, unsigned long align, |
703 | unsigned long goal, unsigned long limit) | 703 | unsigned long goal, unsigned long limit) |
704 | { | 704 | { |
@@ -710,6 +710,10 @@ again: | |||
710 | if (ptr) | 710 | if (ptr) |
711 | return ptr; | 711 | return ptr; |
712 | 712 | ||
713 | /* do not panic in alloc_bootmem_bdata() */ | ||
714 | if (limit && goal + size > limit) | ||
715 | limit = 0; | ||
716 | |||
713 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); | 717 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); |
714 | if (ptr) | 718 | if (ptr) |
715 | return ptr; | 719 | return ptr; |
diff --git a/mm/bounce.c b/mm/bounce.c index d1be02ca1889..042086775561 100644 --- a/mm/bounce.c +++ b/mm/bounce.c | |||
@@ -24,23 +24,25 @@ | |||
24 | 24 | ||
25 | static mempool_t *page_pool, *isa_page_pool; | 25 | static mempool_t *page_pool, *isa_page_pool; |
26 | 26 | ||
27 | #ifdef CONFIG_HIGHMEM | 27 | #if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) |
28 | static __init int init_emergency_pool(void) | 28 | static __init int init_emergency_pool(void) |
29 | { | 29 | { |
30 | #ifndef CONFIG_MEMORY_HOTPLUG | 30 | #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) |
31 | if (max_pfn <= max_low_pfn) | 31 | if (max_pfn <= max_low_pfn) |
32 | return 0; | 32 | return 0; |
33 | #endif | 33 | #endif |
34 | 34 | ||
35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); | 35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); |
36 | BUG_ON(!page_pool); | 36 | BUG_ON(!page_pool); |
37 | printk("highmem bounce pool size: %d pages\n", POOL_SIZE); | 37 | printk("bounce pool size: %d pages\n", POOL_SIZE); |
38 | 38 | ||
39 | return 0; | 39 | return 0; |
40 | } | 40 | } |
41 | 41 | ||
42 | __initcall(init_emergency_pool); | 42 | __initcall(init_emergency_pool); |
43 | #endif | ||
43 | 44 | ||
45 | #ifdef CONFIG_HIGHMEM | ||
44 | /* | 46 | /* |
45 | * highmem version, map in to vec | 47 | * highmem version, map in to vec |
46 | */ | 48 | */ |
diff --git a/mm/compaction.c b/mm/compaction.c index 7ea259d82a99..e78cb9688421 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone, | |||
422 | pfn -= pageblock_nr_pages) { | 422 | pfn -= pageblock_nr_pages) { |
423 | unsigned long isolated; | 423 | unsigned long isolated; |
424 | 424 | ||
425 | /* | ||
426 | * Skip ahead if another thread is compacting in the area | ||
427 | * simultaneously. If we wrapped around, we can only skip | ||
428 | * ahead if zone->compact_cached_free_pfn also wrapped to | ||
429 | * above our starting point. | ||
430 | */ | ||
431 | if (cc->order > 0 && (!cc->wrapped || | ||
432 | zone->compact_cached_free_pfn > | ||
433 | cc->start_free_pfn)) | ||
434 | pfn = min(pfn, zone->compact_cached_free_pfn); | ||
435 | |||
425 | if (!pfn_valid(pfn)) | 436 | if (!pfn_valid(pfn)) |
426 | continue; | 437 | continue; |
427 | 438 | ||
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone, | |||
461 | * looking for free pages, the search will restart here as | 472 | * looking for free pages, the search will restart here as |
462 | * page migration may have returned some pages to the allocator | 473 | * page migration may have returned some pages to the allocator |
463 | */ | 474 | */ |
464 | if (isolated) | 475 | if (isolated) { |
465 | high_pfn = max(high_pfn, pfn); | 476 | high_pfn = max(high_pfn, pfn); |
477 | if (cc->order > 0) | ||
478 | zone->compact_cached_free_pfn = high_pfn; | ||
479 | } | ||
466 | } | 480 | } |
467 | 481 | ||
468 | /* split_free_page does not map the pages */ | 482 | /* split_free_page does not map the pages */ |
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
556 | return ISOLATE_SUCCESS; | 570 | return ISOLATE_SUCCESS; |
557 | } | 571 | } |
558 | 572 | ||
573 | /* | ||
574 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
575 | * point for full compaction of a zone. Compaction searches for free pages from | ||
576 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
577 | * page block. | ||
578 | */ | ||
579 | static unsigned long start_free_pfn(struct zone *zone) | ||
580 | { | ||
581 | unsigned long free_pfn; | ||
582 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
583 | free_pfn &= ~(pageblock_nr_pages-1); | ||
584 | return free_pfn; | ||
585 | } | ||
586 | |||
559 | static int compact_finished(struct zone *zone, | 587 | static int compact_finished(struct zone *zone, |
560 | struct compact_control *cc) | 588 | struct compact_control *cc) |
561 | { | 589 | { |
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone, | |||
565 | if (fatal_signal_pending(current)) | 593 | if (fatal_signal_pending(current)) |
566 | return COMPACT_PARTIAL; | 594 | return COMPACT_PARTIAL; |
567 | 595 | ||
568 | /* Compaction run completes if the migrate and free scanner meet */ | 596 | /* |
569 | if (cc->free_pfn <= cc->migrate_pfn) | 597 | * A full (order == -1) compaction run starts at the beginning and |
598 | * end of a zone; it completes when the migrate and free scanner meet. | ||
599 | * A partial (order > 0) compaction can start with the free scanner | ||
600 | * at a random point in the zone, and may have to restart. | ||
601 | */ | ||
602 | if (cc->free_pfn <= cc->migrate_pfn) { | ||
603 | if (cc->order > 0 && !cc->wrapped) { | ||
604 | /* We started partway through; restart at the end. */ | ||
605 | unsigned long free_pfn = start_free_pfn(zone); | ||
606 | zone->compact_cached_free_pfn = free_pfn; | ||
607 | cc->free_pfn = free_pfn; | ||
608 | cc->wrapped = 1; | ||
609 | return COMPACT_CONTINUE; | ||
610 | } | ||
611 | return COMPACT_COMPLETE; | ||
612 | } | ||
613 | |||
614 | /* We wrapped around and ended up where we started. */ | ||
615 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
570 | return COMPACT_COMPLETE; | 616 | return COMPACT_COMPLETE; |
571 | 617 | ||
572 | /* | 618 | /* |
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
664 | 710 | ||
665 | /* Setup to move all movable pages to the end of the zone */ | 711 | /* Setup to move all movable pages to the end of the zone */ |
666 | cc->migrate_pfn = zone->zone_start_pfn; | 712 | cc->migrate_pfn = zone->zone_start_pfn; |
667 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | 713 | |
668 | cc->free_pfn &= ~(pageblock_nr_pages-1); | 714 | if (cc->order > 0) { |
715 | /* Incremental compaction. Start where the last one stopped. */ | ||
716 | cc->free_pfn = zone->compact_cached_free_pfn; | ||
717 | cc->start_free_pfn = cc->free_pfn; | ||
718 | } else { | ||
719 | /* Order == -1 starts at the end of the zone. */ | ||
720 | cc->free_pfn = start_free_pfn(zone); | ||
721 | } | ||
669 | 722 | ||
670 | migrate_prep_local(); | 723 | migrate_prep_local(); |
671 | 724 | ||
@@ -701,8 +754,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
701 | if (err) { | 754 | if (err) { |
702 | putback_lru_pages(&cc->migratepages); | 755 | putback_lru_pages(&cc->migratepages); |
703 | cc->nr_migratepages = 0; | 756 | cc->nr_migratepages = 0; |
757 | if (err == -ENOMEM) { | ||
758 | ret = COMPACT_PARTIAL; | ||
759 | goto out; | ||
760 | } | ||
704 | } | 761 | } |
705 | |||
706 | } | 762 | } |
707 | 763 | ||
708 | out: | 764 | out: |
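A small user-space illustration of the rounding start_free_pfn() performs: the free scanner begins at the last pageblock-aligned pfn of the zone. Only the arithmetic mirrors the kernel code; the zone numbers below are made up.

/* User-space demo of the rounding done by start_free_pfn(). */
#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 512;	/* order-9 pageblocks, typical on x86 (assumed) */
	unsigned long zone_start_pfn = 0x10000;	/* made-up zone start */
	unsigned long spanned_pages = 0x7f25;	/* made-up zone span */
	unsigned long free_pfn = zone_start_pfn + spanned_pages;

	free_pfn &= ~(pageblock_nr_pages - 1);	/* same mask as the kernel code */
	printf("free scanner starts at pfn 0x%lx\n", free_pfn);	/* prints 0x17e00 */
	return 0;
}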
diff --git a/mm/fadvise.c b/mm/fadvise.c index 469491e0af79..9b75a045dbf4 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
93 | spin_unlock(&file->f_lock); | 93 | spin_unlock(&file->f_lock); |
94 | break; | 94 | break; |
95 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
96 | if (!mapping->a_ops->readpage) { | ||
97 | ret = -EINVAL; | ||
98 | break; | ||
99 | } | ||
100 | |||
101 | /* First and last PARTIAL page! */ | 96 | /* First and last PARTIAL page! */ |
102 | start_index = offset >> PAGE_CACHE_SHIFT; | 97 | start_index = offset >> PAGE_CACHE_SHIFT; |
103 | end_index = endbyte >> PAGE_CACHE_SHIFT; | 98 | end_index = endbyte >> PAGE_CACHE_SHIFT; |
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
106 | nrpages = end_index - start_index + 1; | 101 | nrpages = end_index - start_index + 1; |
107 | if (!nrpages) | 102 | if (!nrpages) |
108 | nrpages = ~0UL; | 103 | nrpages = ~0UL; |
109 | 104 | ||
110 | ret = force_page_cache_readahead(mapping, file, | 105 | /* |
111 | start_index, | 106 | * Ignore return value because fadvise() shall return |
112 | nrpages); | 107 | * success even if filesystem can't retrieve a hint, |
113 | if (ret > 0) | 108 | */ |
114 | ret = 0; | 109 | force_page_cache_readahead(mapping, file, start_index, |
110 | nrpages); | ||
115 | break; | 111 | break; |
116 | case POSIX_FADV_NOREUSE: | 112 | case POSIX_FADV_NOREUSE: |
117 | break; | 113 | break; |
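From user space, the visible effect of the fadvise change is that POSIX_FADV_WILLNEED no longer fails with EINVAL on filesystems that cannot act on the hint. A hedged illustration (the path is arbitrary; any readable file works):

/* Illustrative user-space check: the WILLNEED hint now returns success
 * even when the filesystem cannot honour it. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);
	int err;

	if (fd < 0)
		return 1;
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
	if (err)	/* posix_fadvise() returns an errno value directly */
		fprintf(stderr, "posix_fadvise: %d\n", err);
	close(fd);
	return err ? 1 : 0;
}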
diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..6b3e71a2cd48 --- /dev/null +++ b/mm/frontswap.c | |||
@@ -0,0 +1,344 @@ | |||
1 | /* | ||
2 | * Frontswap frontend | ||
3 | * | ||
4 | * This code provides the generic "frontend" layer to call a matching | ||
5 | * "backend" driver implementation of frontswap. See | ||
6 | * Documentation/vm/frontswap.txt for more information. | ||
7 | * | ||
8 | * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. | ||
9 | * Author: Dan Magenheimer | ||
10 | * | ||
11 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
12 | */ | ||
13 | |||
14 | #include <linux/mman.h> | ||
15 | #include <linux/swap.h> | ||
16 | #include <linux/swapops.h> | ||
17 | #include <linux/security.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/debugfs.h> | ||
20 | #include <linux/frontswap.h> | ||
21 | #include <linux/swapfile.h> | ||
22 | |||
23 | /* | ||
24 | * frontswap_ops is set by frontswap_register_ops to contain the pointers | ||
25 | * to the frontswap "backend" implementation functions. | ||
26 | */ | ||
27 | static struct frontswap_ops frontswap_ops __read_mostly; | ||
28 | |||
29 | /* | ||
30 | * This global enablement flag reduces overhead on systems where frontswap_ops | ||
31 | * has not been registered, so is preferred to the slower alternative: a | ||
32 | * function call that checks a non-global. | ||
33 | */ | ||
34 | bool frontswap_enabled __read_mostly; | ||
35 | EXPORT_SYMBOL(frontswap_enabled); | ||
36 | |||
37 | /* | ||
38 | * If enabled, frontswap_store will return failure even on success. As | ||
39 | * a result, the swap subsystem will always write the page to swap, in | ||
40 | * effect converting frontswap into a writethrough cache. In this mode, | ||
41 | * there is no direct reduction in swap writes, but a frontswap backend | ||
42 | * can unilaterally "reclaim" any pages in use with no data loss, thus | ||
45 | * providing increased control over maximum memory usage due to frontswap. | ||
44 | */ | ||
45 | static bool frontswap_writethrough_enabled __read_mostly; | ||
46 | |||
47 | #ifdef CONFIG_DEBUG_FS | ||
48 | /* | ||
49 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | ||
50 | * properly configured). These are for information only so are not protected | ||
51 | * against increment races. | ||
52 | */ | ||
53 | static u64 frontswap_loads; | ||
54 | static u64 frontswap_succ_stores; | ||
55 | static u64 frontswap_failed_stores; | ||
56 | static u64 frontswap_invalidates; | ||
57 | |||
58 | static inline void inc_frontswap_loads(void) { | ||
59 | frontswap_loads++; | ||
60 | } | ||
61 | static inline void inc_frontswap_succ_stores(void) { | ||
62 | frontswap_succ_stores++; | ||
63 | } | ||
64 | static inline void inc_frontswap_failed_stores(void) { | ||
65 | frontswap_failed_stores++; | ||
66 | } | ||
67 | static inline void inc_frontswap_invalidates(void) { | ||
68 | frontswap_invalidates++; | ||
69 | } | ||
70 | #else | ||
71 | static inline void inc_frontswap_loads(void) { } | ||
72 | static inline void inc_frontswap_succ_stores(void) { } | ||
73 | static inline void inc_frontswap_failed_stores(void) { } | ||
74 | static inline void inc_frontswap_invalidates(void) { } | ||
75 | #endif | ||
76 | /* | ||
77 | * Register operations for frontswap, returning previous thus allowing | ||
78 | * detection of multiple backends and possible nesting. | ||
79 | */ | ||
80 | struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) | ||
81 | { | ||
82 | struct frontswap_ops old = frontswap_ops; | ||
83 | |||
84 | frontswap_ops = *ops; | ||
85 | frontswap_enabled = true; | ||
86 | return old; | ||
87 | } | ||
88 | EXPORT_SYMBOL(frontswap_register_ops); | ||
89 | |||
90 | /* | ||
91 | * Enable/disable frontswap writethrough (see above). | ||
92 | */ | ||
93 | void frontswap_writethrough(bool enable) | ||
94 | { | ||
95 | frontswap_writethrough_enabled = enable; | ||
96 | } | ||
97 | EXPORT_SYMBOL(frontswap_writethrough); | ||
98 | |||
99 | /* | ||
100 | * Called when a swap device is swapon'd. | ||
101 | */ | ||
102 | void __frontswap_init(unsigned type) | ||
103 | { | ||
104 | struct swap_info_struct *sis = swap_info[type]; | ||
105 | |||
106 | BUG_ON(sis == NULL); | ||
107 | if (sis->frontswap_map == NULL) | ||
108 | return; | ||
109 | frontswap_ops.init(type); | ||
110 | } | ||
111 | EXPORT_SYMBOL(__frontswap_init); | ||
112 | |||
113 | static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) | ||
114 | { | ||
115 | frontswap_clear(sis, offset); | ||
116 | atomic_dec(&sis->frontswap_pages); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * "Store" data from a page to frontswap and associate it with the page's | ||
121 | * swaptype and offset. Page must be locked and in the swap cache. | ||
122 | * If frontswap already contains a page with matching swaptype and | ||
123 | * offset, the frontswap implementation may either overwrite the data and | ||
124 | * return success or invalidate the page from frontswap and return failure. | ||
125 | */ | ||
126 | int __frontswap_store(struct page *page) | ||
127 | { | ||
128 | int ret = -1, dup = 0; | ||
129 | swp_entry_t entry = { .val = page_private(page), }; | ||
130 | int type = swp_type(entry); | ||
131 | struct swap_info_struct *sis = swap_info[type]; | ||
132 | pgoff_t offset = swp_offset(entry); | ||
133 | |||
134 | BUG_ON(!PageLocked(page)); | ||
135 | BUG_ON(sis == NULL); | ||
136 | if (frontswap_test(sis, offset)) | ||
137 | dup = 1; | ||
138 | ret = frontswap_ops.store(type, offset, page); | ||
139 | if (ret == 0) { | ||
140 | frontswap_set(sis, offset); | ||
141 | inc_frontswap_succ_stores(); | ||
142 | if (!dup) | ||
143 | atomic_inc(&sis->frontswap_pages); | ||
144 | } else { | ||
145 | /* | ||
146 | failed dup always results in automatic invalidate of | ||
147 | the (older) page from frontswap | ||
148 | */ | ||
149 | inc_frontswap_failed_stores(); | ||
150 | if (dup) | ||
151 | __frontswap_clear(sis, offset); | ||
152 | } | ||
153 | if (frontswap_writethrough_enabled) | ||
154 | /* report failure so swap also writes to swap device */ | ||
155 | ret = -1; | ||
156 | return ret; | ||
157 | } | ||
158 | EXPORT_SYMBOL(__frontswap_store); | ||
159 | |||
160 | /* | ||
161 | * "Get" data from frontswap associated with swaptype and offset that were | ||
162 | * specified when the data was put to frontswap and use it to fill the | ||
163 | * specified page with data. Page must be locked and in the swap cache. | ||
164 | */ | ||
165 | int __frontswap_load(struct page *page) | ||
166 | { | ||
167 | int ret = -1; | ||
168 | swp_entry_t entry = { .val = page_private(page), }; | ||
169 | int type = swp_type(entry); | ||
170 | struct swap_info_struct *sis = swap_info[type]; | ||
171 | pgoff_t offset = swp_offset(entry); | ||
172 | |||
173 | BUG_ON(!PageLocked(page)); | ||
174 | BUG_ON(sis == NULL); | ||
175 | if (frontswap_test(sis, offset)) | ||
176 | ret = frontswap_ops.load(type, offset, page); | ||
177 | if (ret == 0) | ||
178 | inc_frontswap_loads(); | ||
179 | return ret; | ||
180 | } | ||
181 | EXPORT_SYMBOL(__frontswap_load); | ||
182 | |||
183 | /* | ||
184 | * Invalidate any data from frontswap associated with the specified swaptype | ||
185 | * and offset so that a subsequent "get" will fail. | ||
186 | */ | ||
187 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
188 | { | ||
189 | struct swap_info_struct *sis = swap_info[type]; | ||
190 | |||
191 | BUG_ON(sis == NULL); | ||
192 | if (frontswap_test(sis, offset)) { | ||
193 | frontswap_ops.invalidate_page(type, offset); | ||
194 | __frontswap_clear(sis, offset); | ||
195 | inc_frontswap_invalidates(); | ||
196 | } | ||
197 | } | ||
198 | EXPORT_SYMBOL(__frontswap_invalidate_page); | ||
199 | |||
200 | /* | ||
201 | * Invalidate all data from frontswap associated with all offsets for the | ||
202 | * specified swaptype. | ||
203 | */ | ||
204 | void __frontswap_invalidate_area(unsigned type) | ||
205 | { | ||
206 | struct swap_info_struct *sis = swap_info[type]; | ||
207 | |||
208 | BUG_ON(sis == NULL); | ||
209 | if (sis->frontswap_map == NULL) | ||
210 | return; | ||
211 | frontswap_ops.invalidate_area(type); | ||
212 | atomic_set(&sis->frontswap_pages, 0); | ||
213 | memset(sis->frontswap_map, 0, sis->max / sizeof(long)); | ||
214 | } | ||
215 | EXPORT_SYMBOL(__frontswap_invalidate_area); | ||
216 | |||
217 | static unsigned long __frontswap_curr_pages(void) | ||
218 | { | ||
219 | int type; | ||
220 | unsigned long totalpages = 0; | ||
221 | struct swap_info_struct *si = NULL; | ||
222 | |||
223 | assert_spin_locked(&swap_lock); | ||
224 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
225 | si = swap_info[type]; | ||
226 | totalpages += atomic_read(&si->frontswap_pages); | ||
227 | } | ||
228 | return totalpages; | ||
229 | } | ||
230 | |||
231 | static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | ||
232 | int *swapid) | ||
233 | { | ||
234 | int ret = -EINVAL; | ||
235 | struct swap_info_struct *si = NULL; | ||
236 | int si_frontswap_pages; | ||
237 | unsigned long total_pages_to_unuse = total; | ||
238 | unsigned long pages = 0, pages_to_unuse = 0; | ||
239 | int type; | ||
240 | |||
241 | assert_spin_locked(&swap_lock); | ||
242 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
243 | si = swap_info[type]; | ||
244 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | ||
245 | if (total_pages_to_unuse < si_frontswap_pages) { | ||
246 | pages = pages_to_unuse = total_pages_to_unuse; | ||
247 | } else { | ||
248 | pages = si_frontswap_pages; | ||
249 | pages_to_unuse = 0; /* unuse all */ | ||
250 | } | ||
251 | /* ensure there is enough RAM to fetch pages from frontswap */ | ||
252 | if (security_vm_enough_memory_mm(current->mm, pages)) { | ||
253 | ret = -ENOMEM; | ||
254 | continue; | ||
255 | } | ||
256 | vm_unacct_memory(pages); | ||
257 | *unused = pages_to_unuse; | ||
258 | *swapid = type; | ||
259 | ret = 0; | ||
260 | break; | ||
261 | } | ||
262 | |||
263 | return ret; | ||
264 | } | ||
265 | |||
266 | static int __frontswap_shrink(unsigned long target_pages, | ||
267 | unsigned long *pages_to_unuse, | ||
268 | int *type) | ||
269 | { | ||
270 | unsigned long total_pages = 0, total_pages_to_unuse; | ||
271 | |||
272 | assert_spin_locked(&swap_lock); | ||
273 | |||
274 | total_pages = __frontswap_curr_pages(); | ||
275 | if (total_pages <= target_pages) { | ||
276 | /* Nothing to do */ | ||
277 | *pages_to_unuse = 0; | ||
278 | return 0; | ||
279 | } | ||
280 | total_pages_to_unuse = total_pages - target_pages; | ||
281 | return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); | ||
282 | } | ||
283 | |||
284 | /* | ||
285 | * Frontswap, like a true swap device, may unnecessarily retain pages | ||
286 | * under certain circumstances; "shrink" frontswap is essentially a | ||
287 | * "partial swapoff" and works by calling try_to_unuse to attempt to | ||
288 | * unuse enough frontswap pages to -- subject to memory | ||
289 | * constraints -- reduce the number of pages in frontswap to the | ||
290 | * number given in the parameter target_pages. | ||
291 | */ | ||
292 | void frontswap_shrink(unsigned long target_pages) | ||
293 | { | ||
294 | unsigned long pages_to_unuse = 0; | ||
295 | int type, ret; | ||
296 | |||
297 | /* | ||
298 | * we don't want to hold swap_lock while doing a very | ||
299 | * lengthy try_to_unuse, but swap_list may change | ||
300 | * so restart scan from swap_list.head each time | ||
301 | */ | ||
302 | spin_lock(&swap_lock); | ||
303 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | ||
304 | spin_unlock(&swap_lock); | ||
305 | if (ret == 0 && pages_to_unuse) | ||
306 | try_to_unuse(type, true, pages_to_unuse); | ||
307 | return; | ||
308 | } | ||
309 | EXPORT_SYMBOL(frontswap_shrink); | ||
310 | |||
311 | /* | ||
312 | * Count and return the number of frontswap pages across all | ||
313 | * swap devices. This is exported so that backend drivers can | ||
314 | * determine current usage without reading debugfs. | ||
315 | */ | ||
316 | unsigned long frontswap_curr_pages(void) | ||
317 | { | ||
318 | unsigned long totalpages = 0; | ||
319 | |||
320 | spin_lock(&swap_lock); | ||
321 | totalpages = __frontswap_curr_pages(); | ||
322 | spin_unlock(&swap_lock); | ||
323 | |||
324 | return totalpages; | ||
325 | } | ||
326 | EXPORT_SYMBOL(frontswap_curr_pages); | ||
327 | |||
328 | static int __init init_frontswap(void) | ||
329 | { | ||
330 | #ifdef CONFIG_DEBUG_FS | ||
331 | struct dentry *root = debugfs_create_dir("frontswap", NULL); | ||
332 | if (root == NULL) | ||
333 | return -ENXIO; | ||
334 | debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); | ||
335 | debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); | ||
336 | debugfs_create_u64("failed_stores", S_IRUGO, root, | ||
337 | &frontswap_failed_stores); | ||
338 | debugfs_create_u64("invalidates", S_IRUGO, | ||
339 | root, &frontswap_invalidates); | ||
340 | #endif | ||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | module_init(init_frontswap); | ||
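A minimal sketch of what a backend registration looks like against the API this file exposes. The demo_* names are hypothetical and the callbacks deliberately do nothing, so every store is refused and swap data still goes to the real swap device.

/* Minimal, hypothetical frontswap backend sketch (not a real backend). */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/frontswap.h>

static void demo_init(unsigned type)
{
	/* per-swap-device setup would go here */
}

static int demo_store(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* refuse: caller writes the page to the swap device */
}

static int demo_load(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* nothing was stored, so nothing to load */
}

static void demo_invalidate_page(unsigned type, pgoff_t offset) { }
static void demo_invalidate_area(unsigned type) { }

static struct frontswap_ops demo_ops = {
	.init			= demo_init,
	.store			= demo_store,
	.load			= demo_load,
	.invalidate_page	= demo_invalidate_page,
	.invalidate_area	= demo_invalidate_area,
};

static int __init demo_frontswap_init(void)
{
	struct frontswap_ops old = frontswap_register_ops(&demo_ops);

	if (old.init)
		printk(KERN_INFO "demo_frontswap: replacing an existing backend\n");
	return 0;
}
module_init(demo_frontswap_init);
MODULE_LICENSE("GPL");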
diff --git a/mm/highmem.c b/mm/highmem.c index 57d82c6250c3..d517cd16a6eb 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | |||
94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) | 94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) |
95 | #endif | 95 | #endif |
96 | 96 | ||
97 | struct page *kmap_to_page(void *vaddr) | ||
98 | { | ||
99 | unsigned long addr = (unsigned long)vaddr; | ||
100 | |||
101 | if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) { | ||
102 | int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; | ||
103 | return pte_page(pkmap_page_table[i]); | ||
104 | } | ||
105 | |||
106 | return virt_to_page(addr); | ||
107 | } | ||
108 | |||
97 | static void flush_all_zero_pkmaps(void) | 109 | static void flush_all_zero_pkmaps(void) |
98 | { | 110 | { |
99 | int i; | 111 | int i; |
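A hedged usage sketch of the new kmap_to_page() helper, assuming its declaration lands in linux/highmem.h as part of this series: it checks that a kmap()'d address resolves back to the page that was mapped.

/* Hedged usage sketch for kmap_to_page(); the function name of the demo
 * itself is hypothetical. */
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/bug.h>

static void kmap_to_page_demo(void)
{
	struct page *page = alloc_page(GFP_HIGHUSER);
	void *vaddr;

	if (!page)
		return;
	vaddr = kmap(page);
	WARN_ON(kmap_to_page(vaddr) != page);	/* covers both pkmap and lowmem addresses */
	kunmap(page);
	__free_page(page);
}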
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e198831276a3..bc727122dd44 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -24,17 +24,20 @@ | |||
24 | 24 | ||
25 | #include <asm/page.h> | 25 | #include <asm/page.h> |
26 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
27 | #include <linux/io.h> | 27 | #include <asm/tlb.h> |
28 | 28 | ||
29 | #include <linux/io.h> | ||
29 | #include <linux/hugetlb.h> | 30 | #include <linux/hugetlb.h> |
31 | #include <linux/hugetlb_cgroup.h> | ||
30 | #include <linux/node.h> | 32 | #include <linux/node.h> |
33 | #include <linux/hugetlb_cgroup.h> | ||
31 | #include "internal.h" | 34 | #include "internal.h" |
32 | 35 | ||
33 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
34 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 37 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
35 | unsigned long hugepages_treat_as_movable; | 38 | unsigned long hugepages_treat_as_movable; |
36 | 39 | ||
37 | static int max_hstate; | 40 | int hugetlb_max_hstate __read_mostly; |
38 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
39 | struct hstate hstates[HUGE_MAX_HSTATE]; | 42 | struct hstate hstates[HUGE_MAX_HSTATE]; |
40 | 43 | ||
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate; | |||
45 | static unsigned long __initdata default_hstate_max_huge_pages; | 48 | static unsigned long __initdata default_hstate_max_huge_pages; |
46 | static unsigned long __initdata default_hstate_size; | 49 | static unsigned long __initdata default_hstate_size; |
47 | 50 | ||
48 | #define for_each_hstate(h) \ | ||
49 | for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) | ||
50 | |||
51 | /* | 51 | /* |
52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
53 | */ | 53 | */ |
54 | static DEFINE_SPINLOCK(hugetlb_lock); | 54 | DEFINE_SPINLOCK(hugetlb_lock); |
55 | 55 | ||
56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | 56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) |
57 | { | 57 | { |
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src) | |||
509 | static void enqueue_huge_page(struct hstate *h, struct page *page) | 509 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
510 | { | 510 | { |
511 | int nid = page_to_nid(page); | 511 | int nid = page_to_nid(page); |
512 | list_add(&page->lru, &h->hugepage_freelists[nid]); | 512 | list_move(&page->lru, &h->hugepage_freelists[nid]); |
513 | h->free_huge_pages++; | 513 | h->free_huge_pages++; |
514 | h->free_huge_pages_node[nid]++; | 514 | h->free_huge_pages_node[nid]++; |
515 | } | 515 | } |
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
521 | if (list_empty(&h->hugepage_freelists[nid])) | 521 | if (list_empty(&h->hugepage_freelists[nid])) |
522 | return NULL; | 522 | return NULL; |
523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | 523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); |
524 | list_del(&page->lru); | 524 | list_move(&page->lru, &h->hugepage_activelist); |
525 | set_page_refcounted(page); | 525 | set_page_refcounted(page); |
526 | h->free_huge_pages--; | 526 | h->free_huge_pages--; |
527 | h->free_huge_pages_node[nid]--; | 527 | h->free_huge_pages_node[nid]--; |
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) | |||
593 | 1 << PG_active | 1 << PG_reserved | | 593 | 1 << PG_active | 1 << PG_reserved | |
594 | 1 << PG_private | 1 << PG_writeback); | 594 | 1 << PG_private | 1 << PG_writeback); |
595 | } | 595 | } |
596 | VM_BUG_ON(hugetlb_cgroup_from_page(page)); | ||
596 | set_compound_page_dtor(page, NULL); | 597 | set_compound_page_dtor(page, NULL); |
597 | set_page_refcounted(page); | 598 | set_page_refcounted(page); |
598 | arch_release_hugepage(page); | 599 | arch_release_hugepage(page); |
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page) | |||
625 | page->mapping = NULL; | 626 | page->mapping = NULL; |
626 | BUG_ON(page_count(page)); | 627 | BUG_ON(page_count(page)); |
627 | BUG_ON(page_mapcount(page)); | 628 | BUG_ON(page_mapcount(page)); |
628 | INIT_LIST_HEAD(&page->lru); | ||
629 | 629 | ||
630 | spin_lock(&hugetlb_lock); | 630 | spin_lock(&hugetlb_lock); |
631 | hugetlb_cgroup_uncharge_page(hstate_index(h), | ||
632 | pages_per_huge_page(h), page); | ||
631 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { | 633 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { |
634 | /* remove the page from active list */ | ||
635 | list_del(&page->lru); | ||
632 | update_and_free_page(h, page); | 636 | update_and_free_page(h, page); |
633 | h->surplus_huge_pages--; | 637 | h->surplus_huge_pages--; |
634 | h->surplus_huge_pages_node[nid]--; | 638 | h->surplus_huge_pages_node[nid]--; |
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page) | |||
641 | 645 | ||
642 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 646 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
643 | { | 647 | { |
648 | INIT_LIST_HEAD(&page->lru); | ||
644 | set_compound_page_dtor(page, free_huge_page); | 649 | set_compound_page_dtor(page, free_huge_page); |
645 | spin_lock(&hugetlb_lock); | 650 | spin_lock(&hugetlb_lock); |
651 | set_hugetlb_cgroup(page, NULL); | ||
646 | h->nr_huge_pages++; | 652 | h->nr_huge_pages++; |
647 | h->nr_huge_pages_node[nid]++; | 653 | h->nr_huge_pages_node[nid]++; |
648 | spin_unlock(&hugetlb_lock); | 654 | spin_unlock(&hugetlb_lock); |
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
889 | 895 | ||
890 | spin_lock(&hugetlb_lock); | 896 | spin_lock(&hugetlb_lock); |
891 | if (page) { | 897 | if (page) { |
898 | INIT_LIST_HEAD(&page->lru); | ||
892 | r_nid = page_to_nid(page); | 899 | r_nid = page_to_nid(page); |
893 | set_compound_page_dtor(page, free_huge_page); | 900 | set_compound_page_dtor(page, free_huge_page); |
901 | set_hugetlb_cgroup(page, NULL); | ||
894 | /* | 902 | /* |
895 | * We incremented the global counters already | 903 | * We incremented the global counters already |
896 | */ | 904 | */ |
@@ -993,7 +1001,6 @@ retry: | |||
993 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1001 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
994 | if ((--needed) < 0) | 1002 | if ((--needed) < 0) |
995 | break; | 1003 | break; |
996 | list_del(&page->lru); | ||
997 | /* | 1004 | /* |
998 | * This page is now managed by the hugetlb allocator and has | 1005 | * This page is now managed by the hugetlb allocator and has |
999 | * no users -- drop the buddy allocator's reference. | 1006 | * no users -- drop the buddy allocator's reference. |
@@ -1008,7 +1015,6 @@ free: | |||
1008 | /* Free unnecessary surplus pages to the buddy allocator */ | 1015 | /* Free unnecessary surplus pages to the buddy allocator */ |
1009 | if (!list_empty(&surplus_list)) { | 1016 | if (!list_empty(&surplus_list)) { |
1010 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1017 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
1011 | list_del(&page->lru); | ||
1012 | put_page(page); | 1018 | put_page(page); |
1013 | } | 1019 | } |
1014 | } | 1020 | } |
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1112 | struct hstate *h = hstate_vma(vma); | 1118 | struct hstate *h = hstate_vma(vma); |
1113 | struct page *page; | 1119 | struct page *page; |
1114 | long chg; | 1120 | long chg; |
1121 | int ret, idx; | ||
1122 | struct hugetlb_cgroup *h_cg; | ||
1115 | 1123 | ||
1124 | idx = hstate_index(h); | ||
1116 | /* | 1125 | /* |
1117 | * Processes that did not create the mapping will have no | 1126 | * Processes that did not create the mapping will have no |
1118 | * reserves and will not have accounted against subpool | 1127 | * reserves and will not have accounted against subpool |
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1123 | */ | 1132 | */ |
1124 | chg = vma_needs_reservation(h, vma, addr); | 1133 | chg = vma_needs_reservation(h, vma, addr); |
1125 | if (chg < 0) | 1134 | if (chg < 0) |
1126 | return ERR_PTR(-VM_FAULT_OOM); | 1135 | return ERR_PTR(-ENOMEM); |
1127 | if (chg) | 1136 | if (chg) |
1128 | if (hugepage_subpool_get_pages(spool, chg)) | 1137 | if (hugepage_subpool_get_pages(spool, chg)) |
1129 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1138 | return ERR_PTR(-ENOSPC); |
1130 | 1139 | ||
1140 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | ||
1141 | if (ret) { | ||
1142 | hugepage_subpool_put_pages(spool, chg); | ||
1143 | return ERR_PTR(-ENOSPC); | ||
1144 | } | ||
1131 | spin_lock(&hugetlb_lock); | 1145 | spin_lock(&hugetlb_lock); |
1132 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); | 1146 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); |
1133 | spin_unlock(&hugetlb_lock); | 1147 | if (page) { |
1134 | 1148 | /* update page cgroup details */ | |
1135 | if (!page) { | 1149 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), |
1150 | h_cg, page); | ||
1151 | spin_unlock(&hugetlb_lock); | ||
1152 | } else { | ||
1153 | spin_unlock(&hugetlb_lock); | ||
1136 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1154 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1137 | if (!page) { | 1155 | if (!page) { |
1156 | hugetlb_cgroup_uncharge_cgroup(idx, | ||
1157 | pages_per_huge_page(h), | ||
1158 | h_cg); | ||
1138 | hugepage_subpool_put_pages(spool, chg); | 1159 | hugepage_subpool_put_pages(spool, chg); |
1139 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1160 | return ERR_PTR(-ENOSPC); |
1140 | } | 1161 | } |
1162 | spin_lock(&hugetlb_lock); | ||
1163 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), | ||
1164 | h_cg, page); | ||
1165 | list_move(&page->lru, &h->hugepage_activelist); | ||
1166 | spin_unlock(&hugetlb_lock); | ||
1141 | } | 1167 | } |
1142 | 1168 | ||
1143 | set_page_private(page, (unsigned long)spool); | 1169 | set_page_private(page, (unsigned long)spool); |
1144 | 1170 | ||
1145 | vma_commit_reservation(h, vma, addr); | 1171 | vma_commit_reservation(h, vma, addr); |
1146 | |||
1147 | return page; | 1172 | return page; |
1148 | } | 1173 | } |
1149 | 1174 | ||
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, | |||
1646 | struct attribute_group *hstate_attr_group) | 1671 | struct attribute_group *hstate_attr_group) |
1647 | { | 1672 | { |
1648 | int retval; | 1673 | int retval; |
1649 | int hi = h - hstates; | 1674 | int hi = hstate_index(h); |
1650 | 1675 | ||
1651 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); | 1676 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1652 | if (!hstate_kobjs[hi]) | 1677 | if (!hstate_kobjs[hi]) |
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node) | |||
1741 | if (!nhs->hugepages_kobj) | 1766 | if (!nhs->hugepages_kobj) |
1742 | return; /* no hstate attributes */ | 1767 | return; /* no hstate attributes */ |
1743 | 1768 | ||
1744 | for_each_hstate(h) | 1769 | for_each_hstate(h) { |
1745 | if (nhs->hstate_kobjs[h - hstates]) { | 1770 | int idx = hstate_index(h); |
1746 | kobject_put(nhs->hstate_kobjs[h - hstates]); | 1771 | if (nhs->hstate_kobjs[idx]) { |
1747 | nhs->hstate_kobjs[h - hstates] = NULL; | 1772 | kobject_put(nhs->hstate_kobjs[idx]); |
1773 | nhs->hstate_kobjs[idx] = NULL; | ||
1748 | } | 1774 | } |
1775 | } | ||
1749 | 1776 | ||
1750 | kobject_put(nhs->hugepages_kobj); | 1777 | kobject_put(nhs->hugepages_kobj); |
1751 | nhs->hugepages_kobj = NULL; | 1778 | nhs->hugepages_kobj = NULL; |
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void) | |||
1848 | hugetlb_unregister_all_nodes(); | 1875 | hugetlb_unregister_all_nodes(); |
1849 | 1876 | ||
1850 | for_each_hstate(h) { | 1877 | for_each_hstate(h) { |
1851 | kobject_put(hstate_kobjs[h - hstates]); | 1878 | kobject_put(hstate_kobjs[hstate_index(h)]); |
1852 | } | 1879 | } |
1853 | 1880 | ||
1854 | kobject_put(hugepages_kobj); | 1881 | kobject_put(hugepages_kobj); |
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void) | |||
1869 | if (!size_to_hstate(default_hstate_size)) | 1896 | if (!size_to_hstate(default_hstate_size)) |
1870 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); | 1897 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); |
1871 | } | 1898 | } |
1872 | default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; | 1899 | default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); |
1873 | if (default_hstate_max_huge_pages) | 1900 | if (default_hstate_max_huge_pages) |
1874 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | 1901 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
1875 | 1902 | ||
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1897 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | 1924 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); |
1898 | return; | 1925 | return; |
1899 | } | 1926 | } |
1900 | BUG_ON(max_hstate >= HUGE_MAX_HSTATE); | 1927 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); |
1901 | BUG_ON(order == 0); | 1928 | BUG_ON(order == 0); |
1902 | h = &hstates[max_hstate++]; | 1929 | h = &hstates[hugetlb_max_hstate++]; |
1903 | h->order = order; | 1930 | h->order = order; |
1904 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); | 1931 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); |
1905 | h->nr_huge_pages = 0; | 1932 | h->nr_huge_pages = 0; |
1906 | h->free_huge_pages = 0; | 1933 | h->free_huge_pages = 0; |
1907 | for (i = 0; i < MAX_NUMNODES; ++i) | 1934 | for (i = 0; i < MAX_NUMNODES; ++i) |
1908 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1935 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1936 | INIT_LIST_HEAD(&h->hugepage_activelist); | ||
1909 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); | 1937 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1910 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); | 1938 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1911 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1939 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1912 | huge_page_size(h)/1024); | 1940 | huge_page_size(h)/1024); |
1941 | /* | ||
1942 | * Add cgroup control files only if the huge page consists | ||
1943 | * of more than two normal pages. This is because we use | ||
1944 | * page[2].lru.next for storing cgroup details. | ||
1945 | */ | ||
1946 | if (order >= HUGETLB_CGROUP_MIN_ORDER) | ||
1947 | hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); | ||
1913 | 1948 | ||
1914 | parsed_hstate = h; | 1949 | parsed_hstate = h; |
1915 | } | 1950 | } |
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1920 | static unsigned long *last_mhp; | 1955 | static unsigned long *last_mhp; |
1921 | 1956 | ||
1922 | /* | 1957 | /* |
1923 | * !max_hstate means we haven't parsed a hugepagesz= parameter yet, | 1958 | * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, |
1924 | * so this hugepages= parameter goes to the "default hstate". | 1959 | * so this hugepages= parameter goes to the "default hstate". |
1925 | */ | 1960 | */ |
1926 | if (!max_hstate) | 1961 | if (!hugetlb_max_hstate) |
1927 | mhp = &default_hstate_max_huge_pages; | 1962 | mhp = &default_hstate_max_huge_pages; |
1928 | else | 1963 | else |
1929 | mhp = &parsed_hstate->max_huge_pages; | 1964 | mhp = &parsed_hstate->max_huge_pages; |
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1942 | * But we need to allocate >= MAX_ORDER hstates here early to still | 1977 | * But we need to allocate >= MAX_ORDER hstates here early to still |
1943 | * use the bootmem allocator. | 1978 | * use the bootmem allocator. |
1944 | */ | 1979 | */ |
1945 | if (max_hstate && parsed_hstate->order >= MAX_ORDER) | 1980 | if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) |
1946 | hugetlb_hstate_alloc_pages(parsed_hstate); | 1981 | hugetlb_hstate_alloc_pages(parsed_hstate); |
1947 | 1982 | ||
1948 | last_mhp = mhp; | 1983 | last_mhp = mhp; |
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) | |||
2308 | return 0; | 2343 | return 0; |
2309 | } | 2344 | } |
2310 | 2345 | ||
2311 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2346 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, |
2312 | unsigned long end, struct page *ref_page) | 2347 | unsigned long start, unsigned long end, |
2348 | struct page *ref_page) | ||
2313 | { | 2349 | { |
2350 | int force_flush = 0; | ||
2314 | struct mm_struct *mm = vma->vm_mm; | 2351 | struct mm_struct *mm = vma->vm_mm; |
2315 | unsigned long address; | 2352 | unsigned long address; |
2316 | pte_t *ptep; | 2353 | pte_t *ptep; |
2317 | pte_t pte; | 2354 | pte_t pte; |
2318 | struct page *page; | 2355 | struct page *page; |
2319 | struct page *tmp; | ||
2320 | struct hstate *h = hstate_vma(vma); | 2356 | struct hstate *h = hstate_vma(vma); |
2321 | unsigned long sz = huge_page_size(h); | 2357 | unsigned long sz = huge_page_size(h); |
2322 | 2358 | ||
2323 | /* | ||
2324 | * A page gathering list, protected by per file i_mmap_mutex. The | ||
2325 | * lock is used to avoid list corruption from multiple unmapping | ||
2326 | * of the same page since we are using page->lru. | ||
2327 | */ | ||
2328 | LIST_HEAD(page_list); | ||
2329 | |||
2330 | WARN_ON(!is_vm_hugetlb_page(vma)); | 2359 | WARN_ON(!is_vm_hugetlb_page(vma)); |
2331 | BUG_ON(start & ~huge_page_mask(h)); | 2360 | BUG_ON(start & ~huge_page_mask(h)); |
2332 | BUG_ON(end & ~huge_page_mask(h)); | 2361 | BUG_ON(end & ~huge_page_mask(h)); |
2333 | 2362 | ||
2363 | tlb_start_vma(tlb, vma); | ||
2334 | mmu_notifier_invalidate_range_start(mm, start, end); | 2364 | mmu_notifier_invalidate_range_start(mm, start, end); |
2365 | again: | ||
2335 | spin_lock(&mm->page_table_lock); | 2366 | spin_lock(&mm->page_table_lock); |
2336 | for (address = start; address < end; address += sz) { | 2367 | for (address = start; address < end; address += sz) { |
2337 | ptep = huge_pte_offset(mm, address); | 2368 | ptep = huge_pte_offset(mm, address); |
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2370 | } | 2401 | } |
2371 | 2402 | ||
2372 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 2403 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
2404 | tlb_remove_tlb_entry(tlb, ptep, address); | ||
2373 | if (pte_dirty(pte)) | 2405 | if (pte_dirty(pte)) |
2374 | set_page_dirty(page); | 2406 | set_page_dirty(page); |
2375 | list_add(&page->lru, &page_list); | ||
2376 | 2407 | ||
2408 | page_remove_rmap(page); | ||
2409 | force_flush = !__tlb_remove_page(tlb, page); | ||
2410 | if (force_flush) | ||
2411 | break; | ||
2377 | /* Bail out after unmapping reference page if supplied */ | 2412 | /* Bail out after unmapping reference page if supplied */ |
2378 | if (ref_page) | 2413 | if (ref_page) |
2379 | break; | 2414 | break; |
2380 | } | 2415 | } |
2381 | flush_tlb_range(vma, start, end); | ||
2382 | spin_unlock(&mm->page_table_lock); | 2416 | spin_unlock(&mm->page_table_lock); |
2383 | mmu_notifier_invalidate_range_end(mm, start, end); | 2417 | /* |
2384 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 2418 | * mmu_gather ran out of room to batch pages, we break out of |
2385 | page_remove_rmap(page); | 2419 | * the PTE lock to avoid doing the potential expensive TLB invalidate |
2386 | list_del(&page->lru); | 2420 | * and page-free while holding it. |
2387 | put_page(page); | 2421 | */ |
2422 | if (force_flush) { | ||
2423 | force_flush = 0; | ||
2424 | tlb_flush_mmu(tlb); | ||
2425 | if (address < end && !ref_page) | ||
2426 | goto again; | ||
2388 | } | 2427 | } |
2428 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
2429 | tlb_end_vma(tlb, vma); | ||
2430 | } | ||
2431 | |||
2432 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, | ||
2433 | struct vm_area_struct *vma, unsigned long start, | ||
2434 | unsigned long end, struct page *ref_page) | ||
2435 | { | ||
2436 | __unmap_hugepage_range(tlb, vma, start, end, ref_page); | ||
2437 | |||
2438 | /* | ||
2439 | * Clear this flag so that x86's huge_pmd_share page_table_shareable | ||
2440 | * test will fail on a vma being torn down, and not grab a page table | ||
2441 | * on its way out. We're lucky that the flag has such an appropriate | ||
2442 | * name, and can in fact be safely cleared here. We could clear it | ||
2443 | * before the __unmap_hugepage_range above, but all that's necessary | ||
2444 | * is to clear it before releasing the i_mmap_mutex. This works | ||
2445 | * because in the context this is called, the VMA is about to be | ||
2446 | * destroyed and the i_mmap_mutex is held. | ||
2447 | */ | ||
2448 | vma->vm_flags &= ~VM_MAYSHARE; | ||
2389 | } | 2449 | } |
2390 | 2450 | ||
2391 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2451 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
2392 | unsigned long end, struct page *ref_page) | 2452 | unsigned long end, struct page *ref_page) |
2393 | { | 2453 | { |
2394 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2454 | struct mm_struct *mm; |
2395 | __unmap_hugepage_range(vma, start, end, ref_page); | 2455 | struct mmu_gather tlb; |
2396 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2456 | |
2457 | mm = vma->vm_mm; | ||
2458 | |||
2459 | tlb_gather_mmu(&tlb, mm, 0); | ||
2460 | __unmap_hugepage_range(&tlb, vma, start, end, ref_page); | ||
2461 | tlb_finish_mmu(&tlb, start, end); | ||
2397 | } | 2462 | } |
2398 | 2463 | ||
2399 | /* | 2464 | /* |
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2438 | * from the time of fork. This would look like data corruption | 2503 | * from the time of fork. This would look like data corruption |
2439 | */ | 2504 | */ |
2440 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2505 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
2441 | __unmap_hugepage_range(iter_vma, | 2506 | unmap_hugepage_range(iter_vma, address, |
2442 | address, address + huge_page_size(h), | 2507 | address + huge_page_size(h), page); |
2443 | page); | ||
2444 | } | 2508 | } |
2445 | mutex_unlock(&mapping->i_mmap_mutex); | 2509 | mutex_unlock(&mapping->i_mmap_mutex); |
2446 | 2510 | ||
@@ -2496,6 +2560,7 @@ retry_avoidcopy: | |||
2496 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2560 | new_page = alloc_huge_page(vma, address, outside_reserve); |
2497 | 2561 | ||
2498 | if (IS_ERR(new_page)) { | 2562 | if (IS_ERR(new_page)) { |
2563 | long err = PTR_ERR(new_page); | ||
2499 | page_cache_release(old_page); | 2564 | page_cache_release(old_page); |
2500 | 2565 | ||
2501 | /* | 2566 | /* |
@@ -2524,7 +2589,10 @@ retry_avoidcopy: | |||
2524 | 2589 | ||
2525 | /* Caller expects lock to be held */ | 2590 | /* Caller expects lock to be held */ |
2526 | spin_lock(&mm->page_table_lock); | 2591 | spin_lock(&mm->page_table_lock); |
2527 | return -PTR_ERR(new_page); | 2592 | if (err == -ENOMEM) |
2593 | return VM_FAULT_OOM; | ||
2594 | else | ||
2595 | return VM_FAULT_SIGBUS; | ||
2528 | } | 2596 | } |
2529 | 2597 | ||
2530 | /* | 2598 | /* |
@@ -2642,7 +2710,11 @@ retry: | |||
2642 | goto out; | 2710 | goto out; |
2643 | page = alloc_huge_page(vma, address, 0); | 2711 | page = alloc_huge_page(vma, address, 0); |
2644 | if (IS_ERR(page)) { | 2712 | if (IS_ERR(page)) { |
2645 | ret = -PTR_ERR(page); | 2713 | ret = PTR_ERR(page); |
2714 | if (ret == -ENOMEM) | ||
2715 | ret = VM_FAULT_OOM; | ||
2716 | else | ||
2717 | ret = VM_FAULT_SIGBUS; | ||
2646 | goto out; | 2718 | goto out; |
2647 | } | 2719 | } |
2648 | clear_huge_page(page, address, pages_per_huge_page(h)); | 2720 | clear_huge_page(page, address, pages_per_huge_page(h)); |
@@ -2679,7 +2751,7 @@ retry: | |||
2679 | */ | 2751 | */ |
2680 | if (unlikely(PageHWPoison(page))) { | 2752 | if (unlikely(PageHWPoison(page))) { |
2681 | ret = VM_FAULT_HWPOISON | | 2753 | ret = VM_FAULT_HWPOISON | |
2682 | VM_FAULT_SET_HINDEX(h - hstates); | 2754 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2683 | goto backout_unlocked; | 2755 | goto backout_unlocked; |
2684 | } | 2756 | } |
2685 | } | 2757 | } |
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2752 | return 0; | 2824 | return 0; |
2753 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2825 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
2754 | return VM_FAULT_HWPOISON_LARGE | | 2826 | return VM_FAULT_HWPOISON_LARGE | |
2755 | VM_FAULT_SET_HINDEX(h - hstates); | 2827 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2756 | } | 2828 | } |
2757 | 2829 | ||
2758 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); | 2830 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
2959 | } | 3031 | } |
2960 | } | 3032 | } |
2961 | spin_unlock(&mm->page_table_lock); | 3033 | spin_unlock(&mm->page_table_lock); |
2962 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3034 | /* |
2963 | 3035 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare | |
3036 | * may have cleared our pud entry and done put_page on the page table: | ||
3037 | * once we release i_mmap_mutex, another task can do the final put_page | ||
3038 | * and that page table be reused and filled with junk. | ||
3039 | */ | ||
2964 | flush_tlb_range(vma, start, end); | 3040 | flush_tlb_range(vma, start, end); |
3041 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
2965 | } | 3042 | } |
2966 | 3043 | ||
2967 | int hugetlb_reserve_pages(struct inode *inode, | 3044 | int hugetlb_reserve_pages(struct inode *inode, |
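The two hunks above that replace "-PTR_ERR(new_page)" with an explicit VM_FAULT_OOM / VM_FAULT_SIGBUS decision fix a real bug: PTR_ERR() yields a negative errno, not a fault code, so the fault path previously returned garbage. Below is a minimal userspace sketch of the pattern, with simplified stand-ins for ERR_PTR()/PTR_ERR()/IS_ERR() and invented MY_FAULT_* flags; it is illustrative only, not the kernel API.

/*
 * Illustrative sketch: translate an error-pointer allocation failure into
 * a fault code instead of returning -PTR_ERR() directly.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MY_FAULT_OOM    0x0001
#define MY_FAULT_SIGBUS 0x0002

static inline void *err_ptr(long err)     { return (void *)(intptr_t)err; }
static inline long ptr_err(const void *p) { return (long)(intptr_t)p; }
static inline int is_err(const void *p)   { return (uintptr_t)p >= (uintptr_t)-4095; }

/* Pretend allocator: fails with -ENOMEM or -ENOSPC, like alloc_huge_page(). */
static void *fake_alloc(int fail_mode)
{
        if (fail_mode == 1)
                return err_ptr(-ENOMEM);
        if (fail_mode == 2)
                return err_ptr(-ENOSPC);
        return malloc(4096);
}

/* Map the errno onto a fault code, as the patch above does. */
static int fault_code(void *page)
{
        long err = ptr_err(page);

        return (err == -ENOMEM) ? MY_FAULT_OOM : MY_FAULT_SIGBUS;
}

int main(void)
{
        for (int mode = 0; mode <= 2; mode++) {
                void *page = fake_alloc(mode);

                if (is_err(page)) {
                        printf("mode %d -> fault 0x%x\n", mode, fault_code(page));
                } else {
                        printf("mode %d -> allocated\n", mode);
                        free(page);
                }
        }
        return 0;
}

The key point is that the errno is inspected and mapped to a fault code; only -ENOMEM becomes an OOM fault, everything else is treated as SIGBUS.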
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c new file mode 100644 index 000000000000..a3f358fb8a0c --- /dev/null +++ b/mm/hugetlb_cgroup.c | |||
@@ -0,0 +1,418 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Copyright IBM Corporation, 2012 | ||
4 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of version 2.1 of the GNU Lesser General Public License | ||
8 | * as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope that it would be useful, but | ||
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/cgroup.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/hugetlb.h> | ||
19 | #include <linux/hugetlb_cgroup.h> | ||
20 | |||
21 | struct hugetlb_cgroup { | ||
22 | struct cgroup_subsys_state css; | ||
23 | /* | ||
24 | * the counter to account for hugepages from hugetlb. | ||
25 | */ | ||
26 | struct res_counter hugepage[HUGE_MAX_HSTATE]; | ||
27 | }; | ||
28 | |||
29 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
30 | #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) | ||
31 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | ||
32 | |||
33 | struct cgroup_subsys hugetlb_subsys __read_mostly; | ||
34 | static struct hugetlb_cgroup *root_h_cgroup __read_mostly; | ||
35 | |||
36 | static inline | ||
37 | struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) | ||
38 | { | ||
39 | return container_of(s, struct hugetlb_cgroup, css); | ||
40 | } | ||
41 | |||
42 | static inline | ||
43 | struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) | ||
44 | { | ||
45 | return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup, | ||
46 | hugetlb_subsys_id)); | ||
47 | } | ||
48 | |||
49 | static inline | ||
50 | struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) | ||
51 | { | ||
52 | return hugetlb_cgroup_from_css(task_subsys_state(task, | ||
53 | hugetlb_subsys_id)); | ||
54 | } | ||
55 | |||
56 | static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) | ||
57 | { | ||
58 | return (h_cg == root_h_cgroup); | ||
59 | } | ||
60 | |||
61 | static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) | ||
62 | { | ||
63 | if (!cg->parent) | ||
64 | return NULL; | ||
65 | return hugetlb_cgroup_from_cgroup(cg->parent); | ||
66 | } | ||
67 | |||
68 | static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) | ||
69 | { | ||
70 | int idx; | ||
71 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg); | ||
72 | |||
73 | for (idx = 0; idx < hugetlb_max_hstate; idx++) { | ||
74 | if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) | ||
75 | return true; | ||
76 | } | ||
77 | return false; | ||
78 | } | ||
79 | |||
80 | static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) | ||
81 | { | ||
82 | int idx; | ||
83 | struct cgroup *parent_cgroup; | ||
84 | struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup; | ||
85 | |||
86 | h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); | ||
87 | if (!h_cgroup) | ||
88 | return ERR_PTR(-ENOMEM); | ||
89 | |||
90 | parent_cgroup = cgroup->parent; | ||
91 | if (parent_cgroup) { | ||
92 | parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup); | ||
93 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
94 | res_counter_init(&h_cgroup->hugepage[idx], | ||
95 | &parent_h_cgroup->hugepage[idx]); | ||
96 | } else { | ||
97 | root_h_cgroup = h_cgroup; | ||
98 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
99 | res_counter_init(&h_cgroup->hugepage[idx], NULL); | ||
100 | } | ||
101 | return &h_cgroup->css; | ||
102 | } | ||
103 | |||
104 | static void hugetlb_cgroup_destroy(struct cgroup *cgroup) | ||
105 | { | ||
106 | struct hugetlb_cgroup *h_cgroup; | ||
107 | |||
108 | h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); | ||
109 | kfree(h_cgroup); | ||
110 | } | ||
111 | |||
112 | |||
113 | /* | ||
114 | * Should be called with hugetlb_lock held. | ||
115 | * Since we are holding hugetlb_lock, pages cannot get moved off the | ||
116 | * active list or uncharged from the cgroup, so there is no need to | ||
117 | * take a page reference or test whether the page is active here. | ||
118 | * This function cannot fail. | ||
119 | */ | ||
120 | static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, | ||
121 | struct page *page) | ||
122 | { | ||
123 | int csize; | ||
124 | struct res_counter *counter; | ||
125 | struct res_counter *fail_res; | ||
126 | struct hugetlb_cgroup *page_hcg; | ||
127 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
128 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup); | ||
129 | |||
130 | page_hcg = hugetlb_cgroup_from_page(page); | ||
131 | /* | ||
132 | * We can have pages on the active list that belong to no cgroup, | ||
133 | * i.e. hugepages of fewer than 3 pages. We can safely | ||
134 | * ignore those pages. | ||
135 | */ | ||
136 | if (!page_hcg || page_hcg != h_cg) | ||
137 | goto out; | ||
138 | |||
139 | csize = PAGE_SIZE << compound_order(page); | ||
140 | if (!parent) { | ||
141 | parent = root_h_cgroup; | ||
142 | /* root has no limit */ | ||
143 | res_counter_charge_nofail(&parent->hugepage[idx], | ||
144 | csize, &fail_res); | ||
145 | } | ||
146 | counter = &h_cg->hugepage[idx]; | ||
147 | res_counter_uncharge_until(counter, counter->parent, csize); | ||
148 | |||
149 | set_hugetlb_cgroup(page, parent); | ||
150 | out: | ||
151 | return; | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to | ||
156 | * the parent cgroup. | ||
157 | */ | ||
158 | static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | ||
159 | { | ||
160 | struct hstate *h; | ||
161 | struct page *page; | ||
162 | int ret = 0, idx = 0; | ||
163 | |||
164 | do { | ||
165 | if (cgroup_task_count(cgroup) || | ||
166 | !list_empty(&cgroup->children)) { | ||
167 | ret = -EBUSY; | ||
168 | goto out; | ||
169 | } | ||
170 | for_each_hstate(h) { | ||
171 | spin_lock(&hugetlb_lock); | ||
172 | list_for_each_entry(page, &h->hugepage_activelist, lru) | ||
173 | hugetlb_cgroup_move_parent(idx, cgroup, page); | ||
174 | |||
175 | spin_unlock(&hugetlb_lock); | ||
176 | idx++; | ||
177 | } | ||
178 | cond_resched(); | ||
179 | } while (hugetlb_cgroup_have_usage(cgroup)); | ||
180 | out: | ||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | ||
185 | struct hugetlb_cgroup **ptr) | ||
186 | { | ||
187 | int ret = 0; | ||
188 | struct res_counter *fail_res; | ||
189 | struct hugetlb_cgroup *h_cg = NULL; | ||
190 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
191 | |||
192 | if (hugetlb_cgroup_disabled()) | ||
193 | goto done; | ||
194 | /* | ||
195 | * We don't charge any cgroup if the compound page has fewer | ||
196 | * than 3 pages. | ||
197 | */ | ||
198 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
199 | goto done; | ||
200 | again: | ||
201 | rcu_read_lock(); | ||
202 | h_cg = hugetlb_cgroup_from_task(current); | ||
203 | if (!css_tryget(&h_cg->css)) { | ||
204 | rcu_read_unlock(); | ||
205 | goto again; | ||
206 | } | ||
207 | rcu_read_unlock(); | ||
208 | |||
209 | ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); | ||
210 | css_put(&h_cg->css); | ||
211 | done: | ||
212 | *ptr = h_cg; | ||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | /* Should be called with hugetlb_lock held */ | ||
217 | void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, | ||
218 | struct hugetlb_cgroup *h_cg, | ||
219 | struct page *page) | ||
220 | { | ||
221 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
222 | return; | ||
223 | |||
224 | set_hugetlb_cgroup(page, h_cg); | ||
225 | return; | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * Should be called with hugetlb_lock held | ||
230 | */ | ||
231 | void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | ||
232 | struct page *page) | ||
233 | { | ||
234 | struct hugetlb_cgroup *h_cg; | ||
235 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
236 | |||
237 | if (hugetlb_cgroup_disabled()) | ||
238 | return; | ||
239 | VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); | ||
240 | h_cg = hugetlb_cgroup_from_page(page); | ||
241 | if (unlikely(!h_cg)) | ||
242 | return; | ||
243 | set_hugetlb_cgroup(page, NULL); | ||
244 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
245 | return; | ||
246 | } | ||
247 | |||
248 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | ||
249 | struct hugetlb_cgroup *h_cg) | ||
250 | { | ||
251 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
252 | |||
253 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
254 | return; | ||
255 | |||
256 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
257 | return; | ||
258 | |||
259 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
260 | return; | ||
261 | } | ||
262 | |||
263 | static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, | ||
264 | struct file *file, char __user *buf, | ||
265 | size_t nbytes, loff_t *ppos) | ||
266 | { | ||
267 | u64 val; | ||
268 | char str[64]; | ||
269 | int idx, name, len; | ||
270 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
271 | |||
272 | idx = MEMFILE_IDX(cft->private); | ||
273 | name = MEMFILE_ATTR(cft->private); | ||
274 | |||
275 | val = res_counter_read_u64(&h_cg->hugepage[idx], name); | ||
276 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | ||
277 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
278 | } | ||
279 | |||
280 | static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, | ||
281 | const char *buffer) | ||
282 | { | ||
283 | int idx, name, ret; | ||
284 | unsigned long long val; | ||
285 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
286 | |||
287 | idx = MEMFILE_IDX(cft->private); | ||
288 | name = MEMFILE_ATTR(cft->private); | ||
289 | |||
290 | switch (name) { | ||
291 | case RES_LIMIT: | ||
292 | if (hugetlb_cgroup_is_root(h_cg)) { | ||
293 | /* Can't set limit on root */ | ||
294 | ret = -EINVAL; | ||
295 | break; | ||
296 | } | ||
297 | /* This function does all necessary parse...reuse it */ | ||
298 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
299 | if (ret) | ||
300 | break; | ||
301 | ret = res_counter_set_limit(&h_cg->hugepage[idx], val); | ||
302 | break; | ||
303 | default: | ||
304 | ret = -EINVAL; | ||
305 | break; | ||
306 | } | ||
307 | return ret; | ||
308 | } | ||
309 | |||
310 | static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) | ||
311 | { | ||
312 | int idx, name, ret = 0; | ||
313 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
314 | |||
315 | idx = MEMFILE_IDX(event); | ||
316 | name = MEMFILE_ATTR(event); | ||
317 | |||
318 | switch (name) { | ||
319 | case RES_MAX_USAGE: | ||
320 | res_counter_reset_max(&h_cg->hugepage[idx]); | ||
321 | break; | ||
322 | case RES_FAILCNT: | ||
323 | res_counter_reset_failcnt(&h_cg->hugepage[idx]); | ||
324 | break; | ||
325 | default: | ||
326 | ret = -EINVAL; | ||
327 | break; | ||
328 | } | ||
329 | return ret; | ||
330 | } | ||
331 | |||
332 | static char *mem_fmt(char *buf, int size, unsigned long hsize) | ||
333 | { | ||
334 | if (hsize >= (1UL << 30)) | ||
335 | snprintf(buf, size, "%luGB", hsize >> 30); | ||
336 | else if (hsize >= (1UL << 20)) | ||
337 | snprintf(buf, size, "%luMB", hsize >> 20); | ||
338 | else | ||
339 | snprintf(buf, size, "%luKB", hsize >> 10); | ||
340 | return buf; | ||
341 | } | ||
342 | |||
343 | int __init hugetlb_cgroup_file_init(int idx) | ||
344 | { | ||
345 | char buf[32]; | ||
346 | struct cftype *cft; | ||
347 | struct hstate *h = &hstates[idx]; | ||
348 | |||
349 | /* format the size */ | ||
350 | mem_fmt(buf, 32, huge_page_size(h)); | ||
351 | |||
352 | /* Add the limit file */ | ||
353 | cft = &h->cgroup_files[0]; | ||
354 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); | ||
355 | cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); | ||
356 | cft->read = hugetlb_cgroup_read; | ||
357 | cft->write_string = hugetlb_cgroup_write; | ||
358 | |||
359 | /* Add the usage file */ | ||
360 | cft = &h->cgroup_files[1]; | ||
361 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); | ||
362 | cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); | ||
363 | cft->read = hugetlb_cgroup_read; | ||
364 | |||
365 | /* Add the MAX usage file */ | ||
366 | cft = &h->cgroup_files[2]; | ||
367 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); | ||
368 | cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); | ||
369 | cft->trigger = hugetlb_cgroup_reset; | ||
370 | cft->read = hugetlb_cgroup_read; | ||
371 | |||
372 | /* Add the failcnt file */ | ||
373 | cft = &h->cgroup_files[3]; | ||
374 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); | ||
375 | cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); | ||
376 | cft->trigger = hugetlb_cgroup_reset; | ||
377 | cft->read = hugetlb_cgroup_read; | ||
378 | |||
379 | /* NULL terminate the last cft */ | ||
380 | cft = &h->cgroup_files[4]; | ||
381 | memset(cft, 0, sizeof(*cft)); | ||
382 | |||
383 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); | ||
384 | |||
385 | return 0; | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * hugetlb_lock will make sure a parallel cgroup rmdir won't happen | ||
390 | * when we migrate hugepages | ||
391 | */ | ||
392 | void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) | ||
393 | { | ||
394 | struct hugetlb_cgroup *h_cg; | ||
395 | struct hstate *h = page_hstate(oldhpage); | ||
396 | |||
397 | if (hugetlb_cgroup_disabled()) | ||
398 | return; | ||
399 | |||
400 | VM_BUG_ON(!PageHuge(oldhpage)); | ||
401 | spin_lock(&hugetlb_lock); | ||
402 | h_cg = hugetlb_cgroup_from_page(oldhpage); | ||
403 | set_hugetlb_cgroup(oldhpage, NULL); | ||
404 | |||
405 | /* move the h_cg details to new cgroup */ | ||
406 | set_hugetlb_cgroup(newhpage, h_cg); | ||
407 | list_move(&newhpage->lru, &h->hugepage_activelist); | ||
408 | spin_unlock(&hugetlb_lock); | ||
409 | return; | ||
410 | } | ||
411 | |||
412 | struct cgroup_subsys hugetlb_subsys = { | ||
413 | .name = "hugetlb", | ||
414 | .create = hugetlb_cgroup_create, | ||
415 | .pre_destroy = hugetlb_cgroup_pre_destroy, | ||
416 | .destroy = hugetlb_cgroup_destroy, | ||
417 | .subsys_id = hugetlb_subsys_id, | ||
418 | }; | ||
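Each per-hstate control file created by hugetlb_cgroup_file_init() packs two values into cft->private: the hstate index in the upper 16 bits and the resource attribute in the lower 16 bits, which hugetlb_cgroup_read(), _write() and _reset() later unpack with MEMFILE_IDX()/MEMFILE_ATTR(). A standalone sketch of that encoding, using placeholder RES_* values rather than the kernel's enum:

/*
 * Illustrative sketch of the MEMFILE_PRIVATE()/MEMFILE_IDX()/MEMFILE_ATTR()
 * packing used by the hugetlb cgroup control files above.
 */
#include <assert.h>
#include <stdio.h>

#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

/* placeholder attribute values, not the kernel's RES_* constants */
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

int main(void)
{
        /* e.g. hstate index 1, failcnt attribute */
        int priv = MEMFILE_PRIVATE(1, RES_FAILCNT);

        assert(MEMFILE_IDX(priv) == 1);
        assert(MEMFILE_ATTR(priv) == RES_FAILCNT);
        printf("private=%#x idx=%d attr=%d\n", priv,
               MEMFILE_IDX(priv), MEMFILE_ATTR(priv));
        return 0;
}

With mem_fmt() supplying the size string, the resulting control files end up with names along the lines of hugetlb.2MB.limit_in_bytes or hugetlb.1GB.failcnt, one set per supported huge page size.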
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index cc448bb983ba..3a61efc518d5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -123,7 +123,7 @@ static int pfn_inject_init(void) | |||
123 | if (!dentry) | 123 | if (!dentry) |
124 | goto fail; | 124 | goto fail; |
125 | 125 | ||
126 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 126 | #ifdef CONFIG_MEMCG_SWAP |
127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | 127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, |
128 | hwpoison_dir, &hwpoison_filter_memcg); | 128 | hwpoison_dir, &hwpoison_filter_memcg); |
129 | if (!dentry) | 129 | if (!dentry) |

diff --git a/mm/internal.h b/mm/internal.h index 2ba87fbfb75b..3314f79d775a 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -118,8 +118,14 @@ struct compact_control { | |||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | 118 | unsigned long nr_freepages; /* Number of isolated free pages */ |
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 119 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 120 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
122 | bool sync; /* Synchronous migration */ | 123 | bool sync; /* Synchronous migration */ |
124 | bool wrapped; /* Order > 0 compactions are | ||
125 | incremental, once free_pfn | ||
126 | and migrate_pfn meet, we restart | ||
127 | from the top of the zone; | ||
128 | remember we wrapped around. */ | ||
123 | 129 | ||
124 | int order; /* order a direct compactor needs */ | 130 | int order; /* order a direct compactor needs */ |
125 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
@@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable; | |||
347 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | 353 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, |
348 | unsigned long, unsigned long, | 354 | unsigned long, unsigned long, |
349 | unsigned long, unsigned long); | 355 | unsigned long, unsigned long); |
356 | |||
357 | extern void set_pageblock_order(void); | ||
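The new start_free_pfn and wrapped fields in struct compact_control describe an incremental free-page scan: the free scanner resumes from a cached position, and once it meets the migration scanner it restarts from the top of the zone, remembering that it wrapped so the pass can end when it gets back to where it started. A simplified userspace model of that wrap-around bookkeeping follows; the stride, the fixed migration scanner and the helper name are invented for the example and are not the kernel implementation.

/*
 * Simplified model of an incremental, wrapping free-page scan.
 */
#include <stdbool.h>
#include <stdio.h>

struct scan_state {
        unsigned long zone_start, zone_end;  /* zone boundaries in pfns */
        unsigned long migrate_pfn;           /* migration scanner, moves up */
        unsigned long free_pfn;              /* free scanner, moves down */
        unsigned long start_free_pfn;        /* where this pass began */
        bool wrapped;
};

/* One step of the free scanner; returns true while the pass should continue. */
static bool free_scan_step(struct scan_state *s, unsigned long stride)
{
        if (s->free_pfn <= s->migrate_pfn + stride) {
                if (s->wrapped)
                        return false;            /* scanners met again: done */
                s->free_pfn = s->zone_end;       /* restart from top of zone */
                s->wrapped = true;
                return true;
        }
        s->free_pfn -= stride;
        /* after wrapping, stop once we pass the original starting point */
        if (s->wrapped && s->free_pfn <= s->start_free_pfn)
                return false;
        return true;
}

int main(void)
{
        struct scan_state s = {
                .zone_start = 0, .zone_end = 1024,
                .migrate_pfn = 0, .free_pfn = 512, .start_free_pfn = 512,
        };
        int steps = 0;

        while (free_scan_step(&s, 32))
                steps++;
        printf("scanned %d chunks, wrapped=%d, stopped at pfn %lu\n",
               steps, s.wrapped, s.free_pfn);
        return 0;
}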
diff --git a/mm/madvise.c b/mm/madvise.c index deff1b64a08c..14d260fa0d17 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/ksm.h> | 16 | #include <linux/ksm.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/file.h> | ||
18 | 19 | ||
19 | /* | 20 | /* |
20 | * Any behaviour which results in changes to the vma->vm_flags needs to | 21 | * Any behaviour which results in changes to the vma->vm_flags needs to |
@@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
204 | { | 205 | { |
205 | loff_t offset; | 206 | loff_t offset; |
206 | int error; | 207 | int error; |
208 | struct file *f; | ||
207 | 209 | ||
208 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 210 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
209 | 211 | ||
210 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 212 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) |
211 | return -EINVAL; | 213 | return -EINVAL; |
212 | 214 | ||
213 | if (!vma->vm_file || !vma->vm_file->f_mapping | 215 | f = vma->vm_file; |
214 | || !vma->vm_file->f_mapping->host) { | 216 | |
217 | if (!f || !f->f_mapping || !f->f_mapping->host) { | ||
215 | return -EINVAL; | 218 | return -EINVAL; |
216 | } | 219 | } |
217 | 220 | ||
@@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
221 | offset = (loff_t)(start - vma->vm_start) | 224 | offset = (loff_t)(start - vma->vm_start) |
222 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 225 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
223 | 226 | ||
224 | /* filesystem's fallocate may need to take i_mutex */ | 227 | /* |
228 | * Filesystem's fallocate may need to take i_mutex. We need to | ||
229 | * explicitly grab a reference because the vma (and hence the | ||
230 | * vma's reference to the file) can go away as soon as we drop | ||
231 | * mmap_sem. | ||
232 | */ | ||
233 | get_file(f); | ||
225 | up_read(¤t->mm->mmap_sem); | 234 | up_read(¤t->mm->mmap_sem); |
226 | error = do_fallocate(vma->vm_file, | 235 | error = do_fallocate(f, |
227 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, | 236 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, |
228 | offset, end - start); | 237 | offset, end - start); |
238 | fput(f); | ||
229 | down_read(¤t->mm->mmap_sem); | 239 | down_read(¤t->mm->mmap_sem); |
230 | return error; | 240 | return error; |
231 | } | 241 | } |
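The madvise_remove() change takes its own reference on the file with get_file() before up_read(&mm->mmap_sem), because once mmap_sem is dropped the vma, and with it the vma's reference to the file, can go away. The sketch below shows the same pin-then-unlock pattern in plain userspace C; the refcounted object, lock and helpers are invented stand-ins for struct file, mmap_sem and get_file()/fput().

/*
 * Illustrative sketch: take a private reference on an object before
 * dropping the lock that keeps it alive, so it cannot vanish while the
 * lock is released for a slow operation.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct object {
        atomic_int refcount;
        const char *name;
};

static struct object *obj_get(struct object *o)
{
        atomic_fetch_add(&o->refcount, 1);
        return o;
}

static void obj_put(struct object *o)
{
        if (atomic_fetch_sub(&o->refcount, 1) == 1) {
                printf("freeing %s\n", o->name);
                free(o);
        }
}

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

static void slow_operation(struct object *o)
{
        /* stands in for do_fallocate(): may sleep, must not hold map_lock */
        printf("operating on %s\n", o->name);
}

int main(void)
{
        struct object *o = malloc(sizeof(*o));

        atomic_init(&o->refcount, 1);
        o->name = "file";

        pthread_rwlock_rdlock(&map_lock);
        struct object *ref = obj_get(o);    /* like get_file(vma->vm_file) */
        pthread_rwlock_unlock(&map_lock);   /* like up_read(&mm->mmap_sem) */

        slow_operation(ref);                /* object pinned by our reference */

        obj_put(ref);                       /* like fput(f) */
        pthread_rwlock_rdlock(&map_lock);   /* like down_read(&mm->mmap_sem) */
        pthread_rwlock_unlock(&map_lock);
        obj_put(o);                         /* drop the original reference */
        return 0;
}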
diff --git a/mm/memblock.c b/mm/memblock.c index 952123eba433..4d9393c7edc9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
143 | MAX_NUMNODES); | 143 | MAX_NUMNODES); |
144 | } | 144 | } |
145 | 145 | ||
146 | /* | ||
147 | * Free memblock.reserved.regions | ||
148 | */ | ||
149 | int __init_memblock memblock_free_reserved_regions(void) | ||
150 | { | ||
151 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
152 | return 0; | ||
153 | |||
154 | return memblock_free(__pa(memblock.reserved.regions), | ||
155 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * Reserve memblock.reserved.regions | ||
160 | */ | ||
161 | int __init_memblock memblock_reserve_reserved_regions(void) | ||
162 | { | ||
163 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
164 | return 0; | ||
165 | |||
166 | return memblock_reserve(__pa(memblock.reserved.regions), | ||
167 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
168 | } | ||
169 | |||
170 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 146 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
171 | { | 147 | { |
172 | type->total_size -= type->regions[r].size; | 148 | type->total_size -= type->regions[r].size; |
@@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
184 | } | 160 | } |
185 | } | 161 | } |
186 | 162 | ||
187 | static int __init_memblock memblock_double_array(struct memblock_type *type) | 163 | phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( |
164 | phys_addr_t *addr) | ||
165 | { | ||
166 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
167 | return 0; | ||
168 | |||
169 | *addr = __pa(memblock.reserved.regions); | ||
170 | |||
171 | return PAGE_ALIGN(sizeof(struct memblock_region) * | ||
172 | memblock.reserved.max); | ||
173 | } | ||
174 | |||
175 | /** | ||
176 | * memblock_double_array - double the size of the memblock regions array | ||
177 | * @type: memblock type of the regions array being doubled | ||
178 | * @new_area_start: starting address of memory range to avoid overlap with | ||
179 | * @new_area_size: size of memory range to avoid overlap with | ||
180 | * | ||
181 | * Double the size of the @type regions array. If memblock is being used to | ||
182 | * allocate memory for a new reserved regions array and there is a previously | ||
183 | * allocated memory range [@new_area_start,@new_area_start+@new_area_size] | ||
184 | * waiting to be reserved, ensure the memory used by the new array does | ||
185 | * not overlap. | ||
186 | * | ||
187 | * RETURNS: | ||
188 | * 0 on success, -1 on failure. | ||
189 | */ | ||
190 | static int __init_memblock memblock_double_array(struct memblock_type *type, | ||
191 | phys_addr_t new_area_start, | ||
192 | phys_addr_t new_area_size) | ||
188 | { | 193 | { |
189 | struct memblock_region *new_array, *old_array; | 194 | struct memblock_region *new_array, *old_array; |
195 | phys_addr_t old_alloc_size, new_alloc_size; | ||
190 | phys_addr_t old_size, new_size, addr; | 196 | phys_addr_t old_size, new_size, addr; |
191 | int use_slab = slab_is_available(); | 197 | int use_slab = slab_is_available(); |
192 | int *in_slab; | 198 | int *in_slab; |
@@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
200 | /* Calculate new doubled size */ | 206 | /* Calculate new doubled size */ |
201 | old_size = type->max * sizeof(struct memblock_region); | 207 | old_size = type->max * sizeof(struct memblock_region); |
202 | new_size = old_size << 1; | 208 | new_size = old_size << 1; |
209 | /* | ||
210 | * We need to allocate the new array aligned to PAGE_SIZE, | ||
211 | * so that we can free it completely later. | ||
212 | */ | ||
213 | old_alloc_size = PAGE_ALIGN(old_size); | ||
214 | new_alloc_size = PAGE_ALIGN(new_size); | ||
203 | 215 | ||
204 | /* Retrieve the slab flag */ | 216 | /* Retrieve the slab flag */ |
205 | if (type == &memblock.memory) | 217 | if (type == &memblock.memory) |
@@ -210,19 +222,30 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
210 | /* Try to find some space for it. | 222 | /* Try to find some space for it. |
211 | * | 223 | * |
212 | * WARNING: We assume that either slab_is_available() and we use it or | 224 | * WARNING: We assume that either slab_is_available() and we use it or |
213 | * we use MEMBLOCK for allocations. That means that this is unsafe to use | 225 | * we use MEMBLOCK for allocations. That means that this is unsafe to |
214 | * when bootmem is currently active (unless bootmem itself is implemented | 226 | * use when bootmem is currently active (unless bootmem itself is |
215 | * on top of MEMBLOCK which isn't the case yet) | 227 | * implemented on top of MEMBLOCK which isn't the case yet) |
216 | * | 228 | * |
217 | * This should however not be an issue for now, as we currently only | 229 | * This should however not be an issue for now, as we currently only |
218 | * call into MEMBLOCK while it's still active, or much later when slab is | 230 | * call into MEMBLOCK while it's still active, or much later when slab |
219 | * active for memory hotplug operations | 231 | * is active for memory hotplug operations |
220 | */ | 232 | */ |
221 | if (use_slab) { | 233 | if (use_slab) { |
222 | new_array = kmalloc(new_size, GFP_KERNEL); | 234 | new_array = kmalloc(new_size, GFP_KERNEL); |
223 | addr = new_array ? __pa(new_array) : 0; | 235 | addr = new_array ? __pa(new_array) : 0; |
224 | } else { | 236 | } else { |
225 | addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); | 237 | /* only exclude range when trying to double reserved.regions */ |
238 | if (type != &memblock.reserved) | ||
239 | new_area_start = new_area_size = 0; | ||
240 | |||
241 | addr = memblock_find_in_range(new_area_start + new_area_size, | ||
242 | memblock.current_limit, | ||
243 | new_alloc_size, PAGE_SIZE); | ||
244 | if (!addr && new_area_size) | ||
245 | addr = memblock_find_in_range(0, | ||
246 | min(new_area_start, memblock.current_limit), | ||
247 | new_alloc_size, PAGE_SIZE); | ||
248 | |||
226 | new_array = addr ? __va(addr) : 0; | 249 | new_array = addr ? __va(addr) : 0; |
227 | } | 250 | } |
228 | if (!addr) { | 251 | if (!addr) { |
@@ -231,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
231 | return -1; | 254 | return -1; |
232 | } | 255 | } |
233 | 256 | ||
234 | memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", | 257 | memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", |
235 | memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); | 258 | memblock_type_name(type), type->max * 2, (u64)addr, |
259 | (u64)addr + new_size - 1); | ||
236 | 260 | ||
237 | /* Found space, we now need to move the array over before | 261 | /* |
238 | * we add the reserved region since it may be our reserved | 262 | * Found space, we now need to move the array over before we add the |
239 | * array itself that is full. | 263 | * reserved region since it may be our reserved array itself that is |
264 | * full. | ||
240 | */ | 265 | */ |
241 | memcpy(new_array, type->regions, old_size); | 266 | memcpy(new_array, type->regions, old_size); |
242 | memset(new_array + type->max, 0, old_size); | 267 | memset(new_array + type->max, 0, old_size); |
@@ -244,20 +269,19 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
244 | type->regions = new_array; | 269 | type->regions = new_array; |
245 | type->max <<= 1; | 270 | type->max <<= 1; |
246 | 271 | ||
247 | /* Free old array. We needn't free it if the array is the | 272 | /* Free old array. We needn't free it if the array is the static one */ |
248 | * static one | ||
249 | */ | ||
250 | if (*in_slab) | 273 | if (*in_slab) |
251 | kfree(old_array); | 274 | kfree(old_array); |
252 | else if (old_array != memblock_memory_init_regions && | 275 | else if (old_array != memblock_memory_init_regions && |
253 | old_array != memblock_reserved_init_regions) | 276 | old_array != memblock_reserved_init_regions) |
254 | memblock_free(__pa(old_array), old_size); | 277 | memblock_free(__pa(old_array), old_alloc_size); |
255 | 278 | ||
256 | /* Reserve the new array if that comes from the memblock. | 279 | /* |
257 | * Otherwise, we needn't do it | 280 | * Reserve the new array if that comes from the memblock. Otherwise, we |
281 | * needn't do it | ||
258 | */ | 282 | */ |
259 | if (!use_slab) | 283 | if (!use_slab) |
260 | BUG_ON(memblock_reserve(addr, new_size)); | 284 | BUG_ON(memblock_reserve(addr, new_alloc_size)); |
261 | 285 | ||
262 | /* Update slab flag */ | 286 | /* Update slab flag */ |
263 | *in_slab = use_slab; | 287 | *in_slab = use_slab; |
@@ -399,7 +423,7 @@ repeat: | |||
399 | */ | 423 | */ |
400 | if (!insert) { | 424 | if (!insert) { |
401 | while (type->cnt + nr_new > type->max) | 425 | while (type->cnt + nr_new > type->max) |
402 | if (memblock_double_array(type) < 0) | 426 | if (memblock_double_array(type, obase, size) < 0) |
403 | return -ENOMEM; | 427 | return -ENOMEM; |
404 | insert = true; | 428 | insert = true; |
405 | goto repeat; | 429 | goto repeat; |
@@ -450,7 +474,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
450 | 474 | ||
451 | /* we'll create at most two more regions */ | 475 | /* we'll create at most two more regions */ |
452 | while (type->cnt + 2 > type->max) | 476 | while (type->cnt + 2 > type->max) |
453 | if (memblock_double_array(type) < 0) | 477 | if (memblock_double_array(type, base, size) < 0) |
454 | return -ENOMEM; | 478 | return -ENOMEM; |
455 | 479 | ||
456 | for (i = 0; i < type->cnt; i++) { | 480 | for (i = 0; i < type->cnt; i++) { |
@@ -540,9 +564,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
540 | * __next_free_mem_range - next function for for_each_free_mem_range() | 564 | * __next_free_mem_range - next function for for_each_free_mem_range() |
541 | * @idx: pointer to u64 loop variable | 565 | * @idx: pointer to u64 loop variable |
542 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 566 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
543 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 567 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
544 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 568 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
545 | * @p_nid: ptr to int for nid of the range, can be %NULL | 569 | * @out_nid: ptr to int for nid of the range, can be %NULL |
546 | * | 570 | * |
547 | * Find the first free area from *@idx which matches @nid, fill the out | 571 | * Find the first free area from *@idx which matches @nid, fill the out |
548 | * parameters, and update *@idx for the next iteration. The lower 32bit of | 572 | * parameters, and update *@idx for the next iteration. The lower 32bit of |
@@ -616,9 +640,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
616 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 640 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() |
617 | * @idx: pointer to u64 loop variable | 641 | * @idx: pointer to u64 loop variable |
618 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 642 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
619 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 643 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
620 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 644 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
621 | * @p_nid: ptr to int for nid of the range, can be %NULL | 645 | * @out_nid: ptr to int for nid of the range, can be %NULL |
622 | * | 646 | * |
623 | * Reverse of __next_free_mem_range(). | 647 | * Reverse of __next_free_mem_range(). |
624 | */ | 648 | */ |
@@ -867,6 +891,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) | |||
867 | return memblock_search(&memblock.memory, addr) != -1; | 891 | return memblock_search(&memblock.memory, addr) != -1; |
868 | } | 892 | } |
869 | 893 | ||
894 | /** | ||
895 | * memblock_is_region_memory - check if a region is a subset of memory | ||
896 | * @base: base of region to check | ||
897 | * @size: size of region to check | ||
898 | * | ||
899 | * Check if the region [@base, @base+@size) is a subset of a memory block. | ||
900 | * | ||
901 | * RETURNS: | ||
902 | * 0 if false, non-zero if true | ||
903 | */ | ||
870 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) | 904 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) |
871 | { | 905 | { |
872 | int idx = memblock_search(&memblock.memory, base); | 906 | int idx = memblock_search(&memblock.memory, base); |
@@ -879,6 +913,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size | |||
879 | memblock.memory.regions[idx].size) >= end; | 913 | memblock.memory.regions[idx].size) >= end; |
880 | } | 914 | } |
881 | 915 | ||
916 | /** | ||
917 | * memblock_is_region_reserved - check if a region intersects reserved memory | ||
918 | * @base: base of region to check | ||
919 | * @size: size of region to check | ||
920 | * | ||
921 | * Check if the region [@base, @base+@size) intersects a reserved memory block. | ||
922 | * | ||
923 | * RETURNS: | ||
924 | * 0 if false, non-zero if true | ||
925 | */ | ||
882 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) | 926 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) |
883 | { | 927 | { |
884 | memblock_cap_size(base, &size); | 928 | memblock_cap_size(base, &size); |
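memblock_double_array() now rounds both the old and the new array sizes up to PAGE_SIZE, and when growing memblock.reserved it searches for the new array first above the range that is about to be reserved and only then below it, so the two allocations cannot overlap. A toy model of that placement decision follows; find_range() is a fake bump search standing in for memblock_find_in_range(), and the addresses are made up.

/*
 * Toy model of placing a doubled regions array while avoiding a pending
 * reservation range.
 */
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Fake search: lowest page-aligned address in [start, end) that fits. */
static unsigned long find_range(unsigned long start, unsigned long end,
                                unsigned long size)
{
        unsigned long base = PAGE_ALIGN(start);

        return (base + size <= end) ? base : 0;
}

static unsigned long place_new_array(unsigned long limit,
                                     unsigned long new_area_start,
                                     unsigned long new_area_size,
                                     unsigned long new_size)
{
        unsigned long new_alloc_size = PAGE_ALIGN(new_size);
        unsigned long addr;

        /* first try above the range waiting to be reserved ... */
        addr = find_range(new_area_start + new_area_size, limit,
                          new_alloc_size);
        if (!addr && new_area_size)
                /* ... then fall back to anywhere below it */
                addr = find_range(0,
                                  new_area_start < limit ? new_area_start : limit,
                                  new_alloc_size);
        return addr;
}

int main(void)
{
        /* pending reservation at 1MB..1MB+64KB, array needs ~6000 bytes */
        unsigned long addr = place_new_array(1UL << 26, 1UL << 20,
                                             64 * 1024, 6000);

        printf("new array at %#lx (alloc size %lu)\n",
               addr, PAGE_ALIGN(6000UL));
        return 0;
}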
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac35bccadb7b..795e525afaba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; | |||
61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
63 | 63 | ||
64 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 64 | #ifdef CONFIG_MEMCG_SWAP |
65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
66 | int do_swap_account __read_mostly; | 66 | int do_swap_account __read_mostly; |
67 | 67 | ||
68 | /* for remember boot option*/ | 68 | /* for remember boot option*/ |
69 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED | 69 | #ifdef CONFIG_MEMCG_SWAP_ENABLED |
70 | static int really_do_swap_account __initdata = 1; | 70 | static int really_do_swap_account __initdata = 1; |
71 | #else | 71 | #else |
72 | static int really_do_swap_account __initdata = 0; | 72 | static int really_do_swap_account __initdata = 0; |
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index { | |||
87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 90 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ |
91 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
92 | }; | 92 | }; |
93 | 93 | ||
@@ -378,9 +378,7 @@ static bool move_file(void) | |||
378 | 378 | ||
379 | enum charge_type { | 379 | enum charge_type { |
380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
381 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 381 | MEM_CGROUP_CHARGE_TYPE_ANON, |
382 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | ||
383 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | ||
384 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | 382 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ |
385 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ | 383 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ |
386 | NR_CHARGE_TYPE, | 384 | NR_CHARGE_TYPE, |
@@ -407,8 +405,14 @@ enum charge_type { | |||
407 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 405 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
408 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 406 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
409 | 407 | ||
408 | static inline | ||
409 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | ||
410 | { | ||
411 | return container_of(s, struct mem_cgroup, css); | ||
412 | } | ||
413 | |||
410 | /* Writing them here to avoid exposing memcg's inner layout */ | 414 | /* Writing them here to avoid exposing memcg's inner layout */ |
411 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 415 | #ifdef CONFIG_MEMCG_KMEM |
412 | #include <net/sock.h> | 416 | #include <net/sock.h> |
413 | #include <net/ip.h> | 417 | #include <net/ip.h> |
414 | 418 | ||
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
467 | } | 471 | } |
468 | EXPORT_SYMBOL(tcp_proto_cgroup); | 472 | EXPORT_SYMBOL(tcp_proto_cgroup); |
469 | #endif /* CONFIG_INET */ | 473 | #endif /* CONFIG_INET */ |
470 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ | 474 | #endif /* CONFIG_MEMCG_KMEM */ |
471 | 475 | ||
472 | #if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) | 476 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
473 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 477 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
474 | { | 478 | { |
475 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | 479 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) |
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
703 | bool charge) | 707 | bool charge) |
704 | { | 708 | { |
705 | int val = (charge) ? 1 : -1; | 709 | int val = (charge) ? 1 : -1; |
706 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 710 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); |
707 | } | 711 | } |
708 | 712 | ||
709 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | 713 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
864 | 868 | ||
865 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 869 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
866 | { | 870 | { |
867 | return container_of(cgroup_subsys_state(cont, | 871 | return mem_cgroup_from_css( |
868 | mem_cgroup_subsys_id), struct mem_cgroup, | 872 | cgroup_subsys_state(cont, mem_cgroup_subsys_id)); |
869 | css); | ||
870 | } | 873 | } |
871 | 874 | ||
872 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 875 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
879 | if (unlikely(!p)) | 882 | if (unlikely(!p)) |
880 | return NULL; | 883 | return NULL; |
881 | 884 | ||
882 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 885 | return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); |
883 | struct mem_cgroup, css); | ||
884 | } | 886 | } |
885 | 887 | ||
886 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 888 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
966 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); | 968 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); |
967 | if (css) { | 969 | if (css) { |
968 | if (css == &root->css || css_tryget(css)) | 970 | if (css == &root->css || css_tryget(css)) |
969 | memcg = container_of(css, | 971 | memcg = mem_cgroup_from_css(css); |
970 | struct mem_cgroup, css); | ||
971 | } else | 972 | } else |
972 | id = 0; | 973 | id = 0; |
973 | rcu_read_unlock(); | 974 | rcu_read_unlock(); |
@@ -1148,7 +1149,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | |||
1148 | { | 1149 | { |
1149 | if (root_memcg == memcg) | 1150 | if (root_memcg == memcg) |
1150 | return true; | 1151 | return true; |
1151 | if (!root_memcg->use_hierarchy) | 1152 | if (!root_memcg->use_hierarchy || !memcg) |
1152 | return false; | 1153 | return false; |
1153 | return css_is_ancestor(&memcg->css, &root_memcg->css); | 1154 | return css_is_ancestor(&memcg->css, &root_memcg->css); |
1154 | } | 1155 | } |
@@ -1234,7 +1235,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) | |||
1234 | 1235 | ||
1235 | /** | 1236 | /** |
1236 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup | 1237 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
1237 | * @mem: the memory cgroup | 1238 | * @memcg: the memory cgroup |
1238 | * | 1239 | * |
1239 | * Returns the maximum amount of memory @mem can be charged with, in | 1240 | * Returns the maximum amount of memory @mem can be charged with, in |
1240 | * pages. | 1241 | * pages. |
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) | |||
1454 | /* | 1455 | /* |
1455 | * Return the memory (and swap, if configured) limit for a memcg. | 1456 | * Return the memory (and swap, if configured) limit for a memcg. |
1456 | */ | 1457 | */ |
1457 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1458 | static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1458 | { | 1459 | { |
1459 | u64 limit; | 1460 | u64 limit; |
1460 | u64 memsw; | 1461 | u64 memsw; |
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1470 | return min(limit, memsw); | 1471 | return min(limit, memsw); |
1471 | } | 1472 | } |
1472 | 1473 | ||
1474 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
1475 | int order) | ||
1476 | { | ||
1477 | struct mem_cgroup *iter; | ||
1478 | unsigned long chosen_points = 0; | ||
1479 | unsigned long totalpages; | ||
1480 | unsigned int points = 0; | ||
1481 | struct task_struct *chosen = NULL; | ||
1482 | |||
1483 | /* | ||
1484 | * If current has a pending SIGKILL, then automatically select it. The | ||
1485 | * goal is to allow it to allocate so that it may quickly exit and free | ||
1486 | * its memory. | ||
1487 | */ | ||
1488 | if (fatal_signal_pending(current)) { | ||
1489 | set_thread_flag(TIF_MEMDIE); | ||
1490 | return; | ||
1491 | } | ||
1492 | |||
1493 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
1494 | totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | ||
1495 | for_each_mem_cgroup_tree(iter, memcg) { | ||
1496 | struct cgroup *cgroup = iter->css.cgroup; | ||
1497 | struct cgroup_iter it; | ||
1498 | struct task_struct *task; | ||
1499 | |||
1500 | cgroup_iter_start(cgroup, &it); | ||
1501 | while ((task = cgroup_iter_next(cgroup, &it))) { | ||
1502 | switch (oom_scan_process_thread(task, totalpages, NULL, | ||
1503 | false)) { | ||
1504 | case OOM_SCAN_SELECT: | ||
1505 | if (chosen) | ||
1506 | put_task_struct(chosen); | ||
1507 | chosen = task; | ||
1508 | chosen_points = ULONG_MAX; | ||
1509 | get_task_struct(chosen); | ||
1510 | /* fall through */ | ||
1511 | case OOM_SCAN_CONTINUE: | ||
1512 | continue; | ||
1513 | case OOM_SCAN_ABORT: | ||
1514 | cgroup_iter_end(cgroup, &it); | ||
1515 | mem_cgroup_iter_break(memcg, iter); | ||
1516 | if (chosen) | ||
1517 | put_task_struct(chosen); | ||
1518 | return; | ||
1519 | case OOM_SCAN_OK: | ||
1520 | break; | ||
1521 | }; | ||
1522 | points = oom_badness(task, memcg, NULL, totalpages); | ||
1523 | if (points > chosen_points) { | ||
1524 | if (chosen) | ||
1525 | put_task_struct(chosen); | ||
1526 | chosen = task; | ||
1527 | chosen_points = points; | ||
1528 | get_task_struct(chosen); | ||
1529 | } | ||
1530 | } | ||
1531 | cgroup_iter_end(cgroup, &it); | ||
1532 | } | ||
1533 | |||
1534 | if (!chosen) | ||
1535 | return; | ||
1536 | points = chosen_points * 1000 / totalpages; | ||
1537 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, | ||
1538 | NULL, "Memory cgroup out of memory"); | ||
1539 | } | ||
1540 | |||
1473 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | 1541 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, |
1474 | gfp_t gfp_mask, | 1542 | gfp_t gfp_mask, |
1475 | unsigned long flags) | 1543 | unsigned long flags) |
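The mem_cgroup_out_of_memory() hunk above selects an OOM victim by walking every task in the memcg hierarchy, letting oom_scan_process_thread() force-select a task or abort the scan, and otherwise keeping the task with the highest oom_badness() score. A compact userspace sketch of that selection loop, with made-up scores and decisions in place of the real helpers:

/*
 * Illustrative sketch of a scan-and-keep-maximum victim selection loop.
 */
#include <stdio.h>

enum scan_result { SCAN_OK, SCAN_SELECT, SCAN_CONTINUE, SCAN_ABORT };

struct candidate {
        const char *name;
        unsigned long badness;       /* stands in for oom_badness() */
        enum scan_result decision;   /* stands in for oom_scan_process_thread() */
};

static const struct candidate *pick_victim(const struct candidate *c, int n)
{
        const struct candidate *chosen = NULL;
        unsigned long chosen_points = 0;

        for (int i = 0; i < n; i++) {
                switch (c[i].decision) {
                case SCAN_SELECT:
                        chosen = &c[i];                  /* e.g. already dying */
                        chosen_points = (unsigned long)-1;
                        continue;
                case SCAN_CONTINUE:
                        continue;                        /* unkillable, skip */
                case SCAN_ABORT:
                        return NULL;                     /* someone is exiting */
                case SCAN_OK:
                        break;
                }
                if (c[i].badness > chosen_points) {
                        chosen_points = c[i].badness;
                        chosen = &c[i];
                }
        }
        return chosen;
}

int main(void)
{
        struct candidate tasks[] = {
                { "init-like", 10,  SCAN_CONTINUE },
                { "small",     120, SCAN_OK },
                { "big",       900, SCAN_OK },
        };
        const struct candidate *v = pick_victim(tasks, 3);

        printf("victim: %s\n", v ? v->name : "(none)");
        return 0;
}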
@@ -1508,7 +1576,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
1508 | 1576 | ||
1509 | /** | 1577 | /** |
1510 | * test_mem_cgroup_node_reclaimable | 1578 | * test_mem_cgroup_node_reclaimable |
1511 | * @mem: the target memcg | 1579 | * @memcg: the target memcg |
1512 | * @nid: the node ID to be checked. | 1580 | * @nid: the node ID to be checked. |
1513 | * @noswap : specify true here if the user wants flle only information. | 1581 | * @noswap : specify true here if the user wants flle only information. |
1514 | * | 1582 | * |
@@ -1899,7 +1967,7 @@ again: | |||
1899 | return; | 1967 | return; |
1900 | /* | 1968 | /* |
1901 | * If this memory cgroup is not under account moving, we don't | 1969 | * If this memory cgroup is not under account moving, we don't |
1902 | * need to take move_lock_page_cgroup(). Because we already hold | 1970 | * need to take move_lock_mem_cgroup(). Because we already hold |
1903 | * rcu_read_lock(), any calls to move_account will be delayed until | 1971 | * rcu_read_lock(), any calls to move_account will be delayed until |
1904 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | 1972 | * rcu_read_unlock() if mem_cgroup_stolen() == true. |
1905 | */ | 1973 | */ |
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | |||
1921 | /* | 1989 | /* |
1922 | * It's guaranteed that pc->mem_cgroup never changes while | 1990 | * It's guaranteed that pc->mem_cgroup never changes while |
1923 | * lock is held because a routine modifies pc->mem_cgroup | 1991 | * lock is held because a routine modifies pc->mem_cgroup |
1924 | * should take move_lock_page_cgroup(). | 1992 | * should take move_lock_mem_cgroup(). |
1925 | */ | 1993 | */ |
1926 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); | 1994 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); |
1927 | } | 1995 | } |
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2268 | * We always charge the cgroup the mm_struct belongs to. | 2336 | * We always charge the cgroup the mm_struct belongs to. |
2269 | * The mm_struct's mem_cgroup changes on task migration if the | 2337 | * The mm_struct's mem_cgroup changes on task migration if the |
2270 | * thread group leader migrates. It's possible that mm is not | 2338 | * thread group leader migrates. It's possible that mm is not |
2271 | * set, if so charge the init_mm (happens for pagecache usage). | 2339 | * set, if so charge the root memcg (happens for pagecache usage). |
2272 | */ | 2340 | */ |
2273 | if (!*ptr && !mm) | 2341 | if (!*ptr && !mm) |
2274 | *ptr = root_mem_cgroup; | 2342 | *ptr = root_mem_cgroup; |
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
2429 | css = css_lookup(&mem_cgroup_subsys, id); | 2497 | css = css_lookup(&mem_cgroup_subsys, id); |
2430 | if (!css) | 2498 | if (!css) |
2431 | return NULL; | 2499 | return NULL; |
2432 | return container_of(css, struct mem_cgroup, css); | 2500 | return mem_cgroup_from_css(css); |
2433 | } | 2501 | } |
2434 | 2502 | ||
2435 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2503 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2473 | bool anon; | 2541 | bool anon; |
2474 | 2542 | ||
2475 | lock_page_cgroup(pc); | 2543 | lock_page_cgroup(pc); |
2476 | if (unlikely(PageCgroupUsed(pc))) { | 2544 | VM_BUG_ON(PageCgroupUsed(pc)); |
2477 | unlock_page_cgroup(pc); | ||
2478 | __mem_cgroup_cancel_charge(memcg, nr_pages); | ||
2479 | return; | ||
2480 | } | ||
2481 | /* | 2545 | /* |
2482 | * we don't need page_cgroup_lock about tail pages, because they are not | 2546 |
2483 | * accessed by any other context at this point. | 2547 | * accessed by any other context at this point. |
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2519 | spin_unlock_irq(&zone->lru_lock); | 2583 | spin_unlock_irq(&zone->lru_lock); |
2520 | } | 2584 | } |
2521 | 2585 | ||
2522 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2586 | if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) |
2523 | anon = true; | 2587 | anon = true; |
2524 | else | 2588 | else |
2525 | anon = false; | 2589 | anon = false; |
@@ -2644,8 +2708,7 @@ out: | |||
2644 | 2708 | ||
2645 | static int mem_cgroup_move_parent(struct page *page, | 2709 | static int mem_cgroup_move_parent(struct page *page, |
2646 | struct page_cgroup *pc, | 2710 | struct page_cgroup *pc, |
2647 | struct mem_cgroup *child, | 2711 | struct mem_cgroup *child) |
2648 | gfp_t gfp_mask) | ||
2649 | { | 2712 | { |
2650 | struct mem_cgroup *parent; | 2713 | struct mem_cgroup *parent; |
2651 | unsigned int nr_pages; | 2714 | unsigned int nr_pages; |
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2728 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 2791 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
2729 | VM_BUG_ON(!mm); | 2792 | VM_BUG_ON(!mm); |
2730 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2793 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2731 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2794 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2732 | } | ||
2733 | |||
2734 | static void | ||
2735 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | ||
2736 | enum charge_type ctype); | ||
2737 | |||
2738 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | ||
2739 | gfp_t gfp_mask) | ||
2740 | { | ||
2741 | struct mem_cgroup *memcg = NULL; | ||
2742 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2743 | int ret; | ||
2744 | |||
2745 | if (mem_cgroup_disabled()) | ||
2746 | return 0; | ||
2747 | if (PageCompound(page)) | ||
2748 | return 0; | ||
2749 | |||
2750 | if (unlikely(!mm)) | ||
2751 | mm = &init_mm; | ||
2752 | if (!page_is_file_cache(page)) | ||
2753 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2754 | |||
2755 | if (!PageSwapCache(page)) | ||
2756 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2757 | else { /* page is swapcache/shmem */ | ||
2758 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); | ||
2759 | if (!ret) | ||
2760 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2761 | } | ||
2762 | return ret; | ||
2763 | } | 2795 | } |
2764 | 2796 | ||
2765 | /* | 2797 | /* |
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2768 | * struct page_cgroup is acquired. This refcnt will be consumed by | 2800 | * struct page_cgroup is acquired. This refcnt will be consumed by |
2769 | * "commit()" or removed by "cancel()" | 2801 | * "commit()" or removed by "cancel()" |
2770 | */ | 2802 | */ |
2771 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2803 | static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
2772 | struct page *page, | 2804 | struct page *page, |
2773 | gfp_t mask, struct mem_cgroup **memcgp) | 2805 | gfp_t mask, |
2806 | struct mem_cgroup **memcgp) | ||
2774 | { | 2807 | { |
2775 | struct mem_cgroup *memcg; | 2808 | struct mem_cgroup *memcg; |
2809 | struct page_cgroup *pc; | ||
2776 | int ret; | 2810 | int ret; |
2777 | 2811 | ||
2778 | *memcgp = NULL; | 2812 | pc = lookup_page_cgroup(page); |
2779 | |||
2780 | if (mem_cgroup_disabled()) | ||
2781 | return 0; | ||
2782 | |||
2783 | if (!do_swap_account) | ||
2784 | goto charge_cur_mm; | ||
2785 | /* | 2813 | /* |
2786 | * A racing thread's fault, or swapoff, may have already updated | 2814 | * Every swap fault against a single page tries to charge the |
2787 | * the pte, and even removed page from swap cache: in those cases | 2815 | * page, bail as early as possible. shmem_unuse() encounters |
2788 | * do_swap_page()'s pte_same() test will fail; but there's also a | 2816 | * already charged pages, too. The USED bit is protected by |
2789 | * KSM case which does need to charge the page. | 2817 | * the page lock, which serializes swap cache removal, which |
2818 | * in turn serializes uncharging. | ||
2790 | */ | 2819 | */ |
2791 | if (!PageSwapCache(page)) | 2820 | if (PageCgroupUsed(pc)) |
2821 | return 0; | ||
2822 | if (!do_swap_account) | ||
2792 | goto charge_cur_mm; | 2823 | goto charge_cur_mm; |
2793 | memcg = try_get_mem_cgroup_from_page(page); | 2824 | memcg = try_get_mem_cgroup_from_page(page); |
2794 | if (!memcg) | 2825 | if (!memcg) |
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2800 | ret = 0; | 2831 | ret = 0; |
2801 | return ret; | 2832 | return ret; |
2802 | charge_cur_mm: | 2833 | charge_cur_mm: |
2803 | if (unlikely(!mm)) | ||
2804 | mm = &init_mm; | ||
2805 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); | 2834 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); |
2806 | if (ret == -EINTR) | 2835 | if (ret == -EINTR) |
2807 | ret = 0; | 2836 | ret = 0; |
2808 | return ret; | 2837 | return ret; |
2809 | } | 2838 | } |
2810 | 2839 | ||
2840 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | ||
2841 | gfp_t gfp_mask, struct mem_cgroup **memcgp) | ||
2842 | { | ||
2843 | *memcgp = NULL; | ||
2844 | if (mem_cgroup_disabled()) | ||
2845 | return 0; | ||
2846 | /* | ||
2847 | * A racing thread's fault, or swapoff, may have already | ||
2848 | * updated the pte, and even removed page from swap cache: in | ||
2849 | * those cases unuse_pte()'s pte_same() test will fail; but | ||
2850 | * there's also a KSM case which does need to charge the page. | ||
2851 | */ | ||
2852 | if (!PageSwapCache(page)) { | ||
2853 | int ret; | ||
2854 | |||
2855 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); | ||
2856 | if (ret == -EINTR) | ||
2857 | ret = 0; | ||
2858 | return ret; | ||
2859 | } | ||
2860 | return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); | ||
2861 | } | ||
2862 | |||
2863 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | ||
2864 | { | ||
2865 | if (mem_cgroup_disabled()) | ||
2866 | return; | ||
2867 | if (!memcg) | ||
2868 | return; | ||
2869 | __mem_cgroup_cancel_charge(memcg, 1); | ||
2870 | } | ||
2871 | |||
2811 | static void | 2872 | static void |
2812 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | 2873 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, |
2813 | enum charge_type ctype) | 2874 | enum charge_type ctype) |
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page, | |||
2842 | struct mem_cgroup *memcg) | 2903 | struct mem_cgroup *memcg) |
2843 | { | 2904 | { |
2844 | __mem_cgroup_commit_charge_swapin(page, memcg, | 2905 | __mem_cgroup_commit_charge_swapin(page, memcg, |
2845 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2906 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2846 | } | 2907 | } |
2847 | 2908 | ||
2848 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | 2909 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2910 | gfp_t gfp_mask) | ||
2849 | { | 2911 | { |
2912 | struct mem_cgroup *memcg = NULL; | ||
2913 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2914 | int ret; | ||
2915 | |||
2850 | if (mem_cgroup_disabled()) | 2916 | if (mem_cgroup_disabled()) |
2851 | return; | 2917 | return 0; |
2852 | if (!memcg) | 2918 | if (PageCompound(page)) |
2853 | return; | 2919 | return 0; |
2854 | __mem_cgroup_cancel_charge(memcg, 1); | 2920 | |
2921 | if (!PageSwapCache(page)) | ||
2922 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2923 | else { /* page is swapcache/shmem */ | ||
2924 | ret = __mem_cgroup_try_charge_swapin(mm, page, | ||
2925 | gfp_mask, &memcg); | ||
2926 | if (!ret) | ||
2927 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2928 | } | ||
2929 | return ret; | ||
2855 | } | 2930 | } |
2856 | 2931 | ||
2857 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, | 2932 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, |
@@ -2911,7 +2986,8 @@ direct_uncharge: | |||
2911 | * uncharge if !page_mapped(page) | 2986 | * uncharge if !page_mapped(page) |
2912 | */ | 2987 | */ |
2913 | static struct mem_cgroup * | 2988 | static struct mem_cgroup * |
2914 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2989 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, |
2990 | bool end_migration) | ||
2915 | { | 2991 | { |
2916 | struct mem_cgroup *memcg = NULL; | 2992 | struct mem_cgroup *memcg = NULL; |
2917 | unsigned int nr_pages = 1; | 2993 | unsigned int nr_pages = 1; |
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2921 | if (mem_cgroup_disabled()) | 2997 | if (mem_cgroup_disabled()) |
2922 | return NULL; | 2998 | return NULL; |
2923 | 2999 | ||
2924 | if (PageSwapCache(page)) | 3000 | VM_BUG_ON(PageSwapCache(page)); |
2925 | return NULL; | ||
2926 | 3001 | ||
2927 | if (PageTransHuge(page)) { | 3002 | if (PageTransHuge(page)) { |
2928 | nr_pages <<= compound_order(page); | 3003 | nr_pages <<= compound_order(page); |
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2945 | anon = PageAnon(page); | 3020 | anon = PageAnon(page); |
2946 | 3021 | ||
2947 | switch (ctype) { | 3022 | switch (ctype) { |
2948 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 3023 | case MEM_CGROUP_CHARGE_TYPE_ANON: |
2949 | /* | 3024 | /* |
2950 | * Generally PageAnon tells if it's the anon statistics to be | 3025 | * Generally PageAnon tells if it's the anon statistics to be |
2951 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is | 3026 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is |
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2955 | /* fallthrough */ | 3030 | /* fallthrough */ |
2956 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 3031 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2957 | /* See mem_cgroup_prepare_migration() */ | 3032 | /* See mem_cgroup_prepare_migration() */ |
2958 | if (page_mapped(page) || PageCgroupMigration(pc)) | 3033 | if (page_mapped(page)) |
3034 | goto unlock_out; | ||
3035 | /* | ||
3036 | * Pages under migration may not be uncharged. But | ||
3037 | * end_migration() /must/ be the one uncharging the | ||
3038 | * unused post-migration page and so it has to call | ||
3039 | * here with the migration bit still set. See the | ||
3040 | * res_counter handling below. | ||
3041 | */ | ||
3042 | if (!end_migration && PageCgroupMigration(pc)) | ||
2959 | goto unlock_out; | 3043 | goto unlock_out; |
2960 | break; | 3044 | break; |
2961 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 3045 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2989 | mem_cgroup_swap_statistics(memcg, true); | 3073 | mem_cgroup_swap_statistics(memcg, true); |
2990 | mem_cgroup_get(memcg); | 3074 | mem_cgroup_get(memcg); |
2991 | } | 3075 | } |
2992 | if (!mem_cgroup_is_root(memcg)) | 3076 | /* |
3077 | * Migration does not charge the res_counter for the | ||
3078 | * replacement page, so leave it alone when phasing out the | ||
3079 | * page that is unused after the migration. | ||
3080 | */ | ||
3081 | if (!end_migration && !mem_cgroup_is_root(memcg)) | ||
2993 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); | 3082 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); |
2994 | 3083 | ||
2995 | return memcg; | 3084 | return memcg; |
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
3005 | if (page_mapped(page)) | 3094 | if (page_mapped(page)) |
3006 | return; | 3095 | return; |
3007 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 3096 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
3008 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 3097 | if (PageSwapCache(page)) |
3098 | return; | ||
3099 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); | ||
3009 | } | 3100 | } |
3010 | 3101 | ||
3011 | void mem_cgroup_uncharge_cache_page(struct page *page) | 3102 | void mem_cgroup_uncharge_cache_page(struct page *page) |
3012 | { | 3103 | { |
3013 | VM_BUG_ON(page_mapped(page)); | 3104 | VM_BUG_ON(page_mapped(page)); |
3014 | VM_BUG_ON(page->mapping); | 3105 | VM_BUG_ON(page->mapping); |
3015 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 3106 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); |
3016 | } | 3107 | } |
3017 | 3108 | ||
3018 | /* | 3109 | /* |
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3076 | if (!swapout) /* this was a swap cache but the swap is unused ! */ | 3167 | if (!swapout) /* this was a swap cache but the swap is unused ! */ |
3077 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; | 3168 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; |
3078 | 3169 | ||
3079 | memcg = __mem_cgroup_uncharge_common(page, ctype); | 3170 | memcg = __mem_cgroup_uncharge_common(page, ctype, false); |
3080 | 3171 | ||
3081 | /* | 3172 | /* |
3082 | * record memcg information, if swapout && memcg != NULL, | 3173 | * record memcg information, if swapout && memcg != NULL, |
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3087 | } | 3178 | } |
3088 | #endif | 3179 | #endif |
3089 | 3180 | ||
3090 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3181 | #ifdef CONFIG_MEMCG_SWAP |
3091 | /* | 3182 | /* |
3092 | * called from swap_entry_free(). remove record in swap_cgroup and | 3183 | * called from swap_entry_free(). remove record in swap_cgroup and |
3093 | * uncharge "memsw" account. | 3184 | * uncharge "memsw" account. |
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3166 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 3257 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
3167 | * page belongs to. | 3258 | * page belongs to. |
3168 | */ | 3259 | */ |
3169 | int mem_cgroup_prepare_migration(struct page *page, | 3260 | void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, |
3170 | struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) | 3261 | struct mem_cgroup **memcgp) |
3171 | { | 3262 | { |
3172 | struct mem_cgroup *memcg = NULL; | 3263 | struct mem_cgroup *memcg = NULL; |
3173 | struct page_cgroup *pc; | 3264 | struct page_cgroup *pc; |
3174 | enum charge_type ctype; | 3265 | enum charge_type ctype; |
3175 | int ret = 0; | ||
3176 | 3266 | ||
3177 | *memcgp = NULL; | 3267 | *memcgp = NULL; |
3178 | 3268 | ||
3179 | VM_BUG_ON(PageTransHuge(page)); | 3269 | VM_BUG_ON(PageTransHuge(page)); |
3180 | if (mem_cgroup_disabled()) | 3270 | if (mem_cgroup_disabled()) |
3181 | return 0; | 3271 | return; |
3182 | 3272 | ||
3183 | pc = lookup_page_cgroup(page); | 3273 | pc = lookup_page_cgroup(page); |
3184 | lock_page_cgroup(pc); | 3274 | lock_page_cgroup(pc); |
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3223 | * we return here. | 3313 | * we return here. |
3224 | */ | 3314 | */ |
3225 | if (!memcg) | 3315 | if (!memcg) |
3226 | return 0; | 3316 | return; |
3227 | 3317 | ||
3228 | *memcgp = memcg; | 3318 | *memcgp = memcg; |
3229 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); | ||
3230 | css_put(&memcg->css);/* drop extra refcnt */ | ||
3231 | if (ret) { | ||
3232 | if (PageAnon(page)) { | ||
3233 | lock_page_cgroup(pc); | ||
3234 | ClearPageCgroupMigration(pc); | ||
3235 | unlock_page_cgroup(pc); | ||
3236 | /* | ||
3237 | * The old page may be fully unmapped while we kept it. | ||
3238 | */ | ||
3239 | mem_cgroup_uncharge_page(page); | ||
3240 | } | ||
3241 | /* we'll need to revisit this error code (we have -EINTR) */ | ||
3242 | return -ENOMEM; | ||
3243 | } | ||
3244 | /* | 3319 | /* |
3245 | * We charge new page before it's used/mapped. So, even if unlock_page() | 3320 | * We charge new page before it's used/mapped. So, even if unlock_page() |
3246 | * is called before end_migration, we can catch all events on this new | 3321 | * is called before end_migration, we can catch all events on this new |
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3248 | * mapcount will be finally 0 and we call uncharge in end_migration(). | 3323 | * mapcount will be finally 0 and we call uncharge in end_migration(). |
3249 | */ | 3324 | */ |
3250 | if (PageAnon(page)) | 3325 | if (PageAnon(page)) |
3251 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | 3326 | ctype = MEM_CGROUP_CHARGE_TYPE_ANON; |
3252 | else if (page_is_file_cache(page)) | ||
3253 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
3254 | else | 3327 | else |
3255 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3328 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3329 | /* | ||
3330 | * The page is committed to the memcg, but it's not actually | ||
3331 | * charged to the res_counter since we plan on replacing the | ||
3332 | * old one and only one page is going to be left afterwards. | ||
3333 | */ | ||
3256 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 3334 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); |
3257 | return ret; | ||
3258 | } | 3335 | } |
3259 | 3336 | ||
3260 | /* remove redundant charge if migration failed*/ | 3337 | /* remove redundant charge if migration failed*/ |
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3276 | used = newpage; | 3353 | used = newpage; |
3277 | unused = oldpage; | 3354 | unused = oldpage; |
3278 | } | 3355 | } |
3356 | anon = PageAnon(used); | ||
3357 | __mem_cgroup_uncharge_common(unused, | ||
3358 | anon ? MEM_CGROUP_CHARGE_TYPE_ANON | ||
3359 | : MEM_CGROUP_CHARGE_TYPE_CACHE, | ||
3360 | true); | ||
3361 | css_put(&memcg->css); | ||
3279 | /* | 3362 | /* |
3280 | * We disallowed uncharge of pages under migration because mapcount | 3363 | * We disallowed uncharge of pages under migration because mapcount |
3281 | * of the page goes down to zero, temporarily. | 3364 |
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3285 | lock_page_cgroup(pc); | 3368 | lock_page_cgroup(pc); |
3286 | ClearPageCgroupMigration(pc); | 3369 | ClearPageCgroupMigration(pc); |
3287 | unlock_page_cgroup(pc); | 3370 | unlock_page_cgroup(pc); |
3288 | anon = PageAnon(used); | ||
3289 | __mem_cgroup_uncharge_common(unused, | ||
3290 | anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED | ||
3291 | : MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
3292 | 3371 | ||
3293 | /* | 3372 | /* |
3294 | * If a page is a file cache, radix-tree replacement is very atomic | 3373 | * If a page is a file cache, radix-tree replacement is very atomic |
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3340 | */ | 3419 | */ |
3341 | if (!memcg) | 3420 | if (!memcg) |
3342 | return; | 3421 | return; |
3343 | |||
3344 | if (PageSwapBacked(oldpage)) | ||
3345 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
3346 | |||
3347 | /* | 3422 | /* |
3348 | * Even if newpage->mapping was NULL before starting replacement, | 3423 | * Even if newpage->mapping was NULL before starting replacement, |
3349 | * the newpage may be on LRU(or pagevec for LRU) already. We lock | 3424 | * the newpage may be on LRU(or pagevec for LRU) already. We lock |
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3418 | /* | 3493 | /* |
3419 | * Rather than hide all in some function, I do this in | 3494 | * Rather than hide all in some function, I do this in |
3420 | * open coded manner. You see what this really does. | 3495 | * open coded manner. You see what this really does. |
3421 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3496 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3422 | */ | 3497 | */ |
3423 | mutex_lock(&set_limit_mutex); | 3498 | mutex_lock(&set_limit_mutex); |
3424 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3499 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3479 | /* | 3554 | /* |
3480 | * Rather than hide all in some function, I do this in | 3555 | * Rather than hide all in some function, I do this in |
3481 | * open coded manner. You see what this really does. | 3556 | * open coded manner. You see what this really does. |
3482 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3557 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3483 | */ | 3558 | */ |
3484 | mutex_lock(&set_limit_mutex); | 3559 | mutex_lock(&set_limit_mutex); |
3485 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3560 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3611 | } | 3686 | } |
3612 | 3687 | ||
3613 | /* | 3688 | /* |
3614 | * This routine traverse page_cgroup in given list and drop them all. | 3689 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't |
3615 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 3690 | * reclaim the pages page themselves - it just removes the page_cgroups. |
3691 | * Returns true if some page_cgroups were not freed, indicating that the caller | ||
3692 | * must retry this operation. | ||
3616 | */ | 3693 | */ |
3617 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 3694 | static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3618 | int node, int zid, enum lru_list lru) | 3695 | int node, int zid, enum lru_list lru) |
3619 | { | 3696 | { |
3620 | struct mem_cgroup_per_zone *mz; | 3697 | struct mem_cgroup_per_zone *mz; |
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3622 | struct list_head *list; | 3699 | struct list_head *list; |
3623 | struct page *busy; | 3700 | struct page *busy; |
3624 | struct zone *zone; | 3701 | struct zone *zone; |
3625 | int ret = 0; | ||
3626 | 3702 | ||
3627 | zone = &NODE_DATA(node)->node_zones[zid]; | 3703 | zone = &NODE_DATA(node)->node_zones[zid]; |
3628 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3704 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3636 | struct page_cgroup *pc; | 3712 | struct page_cgroup *pc; |
3637 | struct page *page; | 3713 | struct page *page; |
3638 | 3714 | ||
3639 | ret = 0; | ||
3640 | spin_lock_irqsave(&zone->lru_lock, flags); | 3715 | spin_lock_irqsave(&zone->lru_lock, flags); |
3641 | if (list_empty(list)) { | 3716 | if (list_empty(list)) { |
3642 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3717 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3653 | 3728 | ||
3654 | pc = lookup_page_cgroup(page); | 3729 | pc = lookup_page_cgroup(page); |
3655 | 3730 | ||
3656 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); | 3731 | if (mem_cgroup_move_parent(page, pc, memcg)) { |
3657 | if (ret == -ENOMEM || ret == -EINTR) | ||
3658 | break; | ||
3659 | |||
3660 | if (ret == -EBUSY || ret == -EINVAL) { | ||
3661 | /* found lock contention or "pc" is obsolete. */ | 3732 | /* found lock contention or "pc" is obsolete. */ |
3662 | busy = page; | 3733 | busy = page; |
3663 | cond_resched(); | 3734 | cond_resched(); |
3664 | } else | 3735 | } else |
3665 | busy = NULL; | 3736 | busy = NULL; |
3666 | } | 3737 | } |
3667 | 3738 | return !list_empty(list); | |
3668 | if (!ret && !list_empty(list)) | ||
3669 | return -EBUSY; | ||
3670 | return ret; | ||
3671 | } | 3739 | } |
3672 | 3740 | ||
3673 | /* | 3741 | /* |
@@ -3692,9 +3760,6 @@ move_account: | |||
3692 | ret = -EBUSY; | 3760 | ret = -EBUSY; |
3693 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 3761 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
3694 | goto out; | 3762 | goto out; |
3695 | ret = -EINTR; | ||
3696 | if (signal_pending(current)) | ||
3697 | goto out; | ||
3698 | /* This is for making all *used* pages to be on LRU. */ | 3763 | /* This is for making all *used* pages to be on LRU. */ |
3699 | lru_add_drain_all(); | 3764 | lru_add_drain_all(); |
3700 | drain_all_stock_sync(memcg); | 3765 | drain_all_stock_sync(memcg); |
@@ -3715,9 +3780,6 @@ move_account: | |||
3715 | } | 3780 | } |
3716 | mem_cgroup_end_move(memcg); | 3781 | mem_cgroup_end_move(memcg); |
3717 | memcg_oom_recover(memcg); | 3782 | memcg_oom_recover(memcg); |
3718 | /* it seems parent cgroup doesn't have enough mem */ | ||
3719 | if (ret == -ENOMEM) | ||
3720 | goto try_to_free; | ||
3721 | cond_resched(); | 3783 | cond_resched(); |
3722 | /* "ret" should also be checked to ensure all lists are empty. */ | 3784 | /* "ret" should also be checked to ensure all lists are empty. */ |
3723 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); | 3785 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); |
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3779 | parent_memcg = mem_cgroup_from_cont(parent); | 3841 | parent_memcg = mem_cgroup_from_cont(parent); |
3780 | 3842 | ||
3781 | cgroup_lock(); | 3843 | cgroup_lock(); |
3844 | |||
3845 | if (memcg->use_hierarchy == val) | ||
3846 | goto out; | ||
3847 | |||
3782 | /* | 3848 | /* |
3783 | * If parent's use_hierarchy is set, we can't make any modifications | 3849 | * If parent's use_hierarchy is set, we can't make any modifications |
3784 | * in the child subtrees. If it is unset, then the change can | 3850 | * in the child subtrees. If it is unset, then the change can |
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3795 | retval = -EBUSY; | 3861 | retval = -EBUSY; |
3796 | } else | 3862 | } else |
3797 | retval = -EINVAL; | 3863 | retval = -EINVAL; |
3864 | |||
3865 | out: | ||
3798 | cgroup_unlock(); | 3866 | cgroup_unlock(); |
3799 | 3867 | ||
3800 | return retval; | 3868 | return retval; |
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
3831 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | 3899 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); |
3832 | 3900 | ||
3833 | if (swap) | 3901 | if (swap) |
3834 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); | 3902 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); |
3835 | 3903 | ||
3836 | return val << PAGE_SHIFT; | 3904 | return val << PAGE_SHIFT; |
3837 | } | 3905 | } |
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
4015 | #endif | 4083 | #endif |
4016 | 4084 | ||
4017 | #ifdef CONFIG_NUMA | 4085 | #ifdef CONFIG_NUMA |
4018 | static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, | 4086 | static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, |
4019 | struct seq_file *m) | 4087 | struct seq_file *m) |
4020 | { | 4088 | { |
4021 | int nid; | 4089 | int nid; |
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) | |||
4074 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 4142 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
4075 | } | 4143 | } |
4076 | 4144 | ||
4077 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4145 | static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, |
4078 | struct seq_file *m) | 4146 | struct seq_file *m) |
4079 | { | 4147 | { |
4080 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4148 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4082 | unsigned int i; | 4150 | unsigned int i; |
4083 | 4151 | ||
4084 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4152 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4085 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) | 4153 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4086 | continue; | 4154 | continue; |
4087 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], | 4155 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], |
4088 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); | 4156 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); |
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4109 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4177 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4110 | long long val = 0; | 4178 | long long val = 0; |
4111 | 4179 | ||
4112 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) | 4180 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4113 | continue; | 4181 | continue; |
4114 | for_each_mem_cgroup_tree(mi, memcg) | 4182 | for_each_mem_cgroup_tree(mi, memcg) |
4115 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; | 4183 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; |
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4533 | return 0; | 4601 | return 0; |
4534 | } | 4602 | } |
4535 | 4603 | ||
4536 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 4604 | #ifdef CONFIG_MEMCG_KMEM |
4537 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 4605 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4538 | { | 4606 | { |
4539 | return mem_cgroup_sockets_init(memcg, ss); | 4607 | return mem_cgroup_sockets_init(memcg, ss); |
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = { | |||
4588 | }, | 4656 | }, |
4589 | { | 4657 | { |
4590 | .name = "stat", | 4658 | .name = "stat", |
4591 | .read_seq_string = mem_control_stat_show, | 4659 | .read_seq_string = memcg_stat_show, |
4592 | }, | 4660 | }, |
4593 | { | 4661 | { |
4594 | .name = "force_empty", | 4662 | .name = "force_empty", |
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = { | |||
4620 | #ifdef CONFIG_NUMA | 4688 | #ifdef CONFIG_NUMA |
4621 | { | 4689 | { |
4622 | .name = "numa_stat", | 4690 | .name = "numa_stat", |
4623 | .read_seq_string = mem_control_numa_stat_show, | 4691 | .read_seq_string = memcg_numa_stat_show, |
4624 | }, | 4692 | }, |
4625 | #endif | 4693 | #endif |
4626 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4694 | #ifdef CONFIG_MEMCG_SWAP |
4627 | { | 4695 | { |
4628 | .name = "memsw.usage_in_bytes", | 4696 | .name = "memsw.usage_in_bytes", |
4629 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 4697 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
4810 | } | 4878 | } |
4811 | EXPORT_SYMBOL(parent_mem_cgroup); | 4879 | EXPORT_SYMBOL(parent_mem_cgroup); |
4812 | 4880 | ||
4813 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4881 | #ifdef CONFIG_MEMCG_SWAP |
4814 | static void __init enable_swap_cgroup(void) | 4882 | static void __init enable_swap_cgroup(void) |
4815 | { | 4883 | { |
4816 | if (!mem_cgroup_disabled() && really_do_swap_account) | 4884 | if (!mem_cgroup_disabled() && really_do_swap_account) |
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
5541 | .__DEPRECATED_clear_css_refs = true, | 5609 | .__DEPRECATED_clear_css_refs = true, |
5542 | }; | 5610 | }; |
5543 | 5611 | ||
5544 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 5612 | #ifdef CONFIG_MEMCG_SWAP |
5545 | static int __init enable_swap_account(char *s) | 5613 | static int __init enable_swap_account(char *s) |
5546 | { | 5614 | { |
5547 | /* consider enabled if no parameter or 1 is given */ | 5615 | /* consider enabled if no parameter or 1 is given */ |
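The memcontrol.c changes above restructure swap-in charging around the existing try/commit/cancel steps: __mem_cgroup_try_charge_swapin() bails out early when the page_cgroup is already marked used, and mem_cgroup_cancel_charge_swapin() rolls back a reservation that never gets committed. As a rough userspace illustration of that transactional pattern (hypothetical names and a toy counter, not the kernel's res_counter API), the sketch below reserves first, commits only if the fault still wins the race, and cancels otherwise.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for a res_counter: usage must stay within limit. */
struct counter {
	long usage;
	long limit;
};

/* Try step: reserve the charge up front; nothing is committed yet. */
static bool try_charge(struct counter *c, long nr)
{
	if (c->usage + nr > c->limit)
		return false;		/* over limit, caller must back off */
	c->usage += nr;
	return true;
}

/* Commit step: the reservation becomes owned by the page; counter unchanged. */
static void commit_charge(struct counter *c, long nr)
{
	(void)c;
	(void)nr;
}

/* Cancel step: undo a reservation that was never committed. */
static void cancel_charge(struct counter *c, long nr)
{
	c->usage -= nr;
}

int main(void)
{
	struct counter memcg = { .usage = 0, .limit = 4 };

	if (try_charge(&memcg, 1)) {
		bool fault_won_race = true;	/* e.g. the pte is still the same */

		if (fault_won_race)
			commit_charge(&memcg, 1);
		else
			cancel_charge(&memcg, 1);
	}
	printf("usage=%ld\n", memcg.usage);
	return 0;
}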
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ab1e7145e290..a6e2141a6610 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p) | |||
128 | * can only guarantee that the page either belongs to the memcg tasks, or is | 128 | * can only guarantee that the page either belongs to the memcg tasks, or is |
129 | * a freed page. | 129 | * a freed page. |
130 | */ | 130 | */ |
131 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 131 | #ifdef CONFIG_MEMCG_SWAP |
132 | u64 hwpoison_filter_memcg; | 132 | u64 hwpoison_filter_memcg; |
133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | 133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); |
134 | static int hwpoison_filter_task(struct page *p) | 134 | static int hwpoison_filter_task(struct page *p) |
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, | |||
345 | * Also when FAIL is set do a force kill because something went | 345 | * Also when FAIL is set do a force kill because something went |
346 | * wrong earlier. | 346 | * wrong earlier. |
347 | */ | 347 | */ |
348 | static void kill_procs(struct list_head *to_kill, int doit, int trapno, | 348 | static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, |
349 | int fail, struct page *page, unsigned long pfn, | 349 | int fail, struct page *page, unsigned long pfn, |
350 | int flags) | 350 | int flags) |
351 | { | 351 | { |
352 | struct to_kill *tk, *next; | 352 | struct to_kill *tk, *next; |
353 | 353 | ||
354 | list_for_each_entry_safe (tk, next, to_kill, nd) { | 354 | list_for_each_entry_safe (tk, next, to_kill, nd) { |
355 | if (doit) { | 355 | if (forcekill) { |
356 | /* | 356 | /* |
357 | * In case something went wrong with munmapping | 357 | * In case something went wrong with munmapping |
358 | * make sure the process doesn't catch the | 358 | * make sure the process doesn't catch the |
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
858 | struct address_space *mapping; | 858 | struct address_space *mapping; |
859 | LIST_HEAD(tokill); | 859 | LIST_HEAD(tokill); |
860 | int ret; | 860 | int ret; |
861 | int kill = 1; | 861 | int kill = 1, forcekill; |
862 | struct page *hpage = compound_head(p); | 862 | struct page *hpage = compound_head(p); |
863 | struct page *ppage; | 863 | struct page *ppage; |
864 | 864 | ||
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
888 | * be called inside page lock (it's recommended but not enforced). | 888 | * be called inside page lock (it's recommended but not enforced). |
889 | */ | 889 | */ |
890 | mapping = page_mapping(hpage); | 890 | mapping = page_mapping(hpage); |
891 | if (!PageDirty(hpage) && mapping && | 891 | if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && |
892 | mapping_cap_writeback_dirty(mapping)) { | 892 | mapping_cap_writeback_dirty(mapping)) { |
893 | if (page_mkclean(hpage)) { | 893 | if (page_mkclean(hpage)) { |
894 | SetPageDirty(hpage); | 894 | SetPageDirty(hpage); |
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
965 | * Now that the dirty bit has been propagated to the | 965 | * Now that the dirty bit has been propagated to the |
966 | * struct page and all unmaps done we can decide if | 966 | * struct page and all unmaps done we can decide if |
967 | * killing is needed or not. Only kill when the page | 967 | * killing is needed or not. Only kill when the page |
968 | * was dirty, otherwise the tokill list is merely | 968 | * was dirty or the process is not restartable, |
969 | * otherwise the tokill list is merely | ||
969 | * freed. When there was a problem unmapping earlier | 970 | * freed. When there was a problem unmapping earlier |
970 | * use a more force-full uncatchable kill to prevent | 971 | * use a more force-full uncatchable kill to prevent |
971 | * any accesses to the poisoned memory. | 972 | * any accesses to the poisoned memory. |
972 | */ | 973 | */ |
973 | kill_procs(&tokill, !!PageDirty(ppage), trapno, | 974 | forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); |
975 | kill_procs(&tokill, forcekill, trapno, | ||
974 | ret != SWAP_SUCCESS, p, pfn, flags); | 976 | ret != SWAP_SUCCESS, p, pfn, flags); |
975 | 977 | ||
976 | return ret; | 978 | return ret; |
@@ -1414,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1414 | int ret; | 1416 | int ret; |
1415 | unsigned long pfn = page_to_pfn(page); | 1417 | unsigned long pfn = page_to_pfn(page); |
1416 | struct page *hpage = compound_head(page); | 1418 | struct page *hpage = compound_head(page); |
1417 | LIST_HEAD(pagelist); | ||
1418 | 1419 | ||
1419 | ret = get_any_page(page, pfn, flags); | 1420 | ret = get_any_page(page, pfn, flags); |
1420 | if (ret < 0) | 1421 | if (ret < 0) |
@@ -1429,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1429 | } | 1430 | } |
1430 | 1431 | ||
1431 | /* Keep page count to indicate a given hugepage is isolated. */ | 1432 | /* Keep page count to indicate a given hugepage is isolated. */ |
1432 | 1433 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, | |
1433 | list_add(&hpage->lru, &pagelist); | 1434 | MIGRATE_SYNC); |
1434 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, | 1435 | put_page(hpage); |
1435 | true); | ||
1436 | if (ret) { | 1436 | if (ret) { |
1437 | struct page *page1, *page2; | ||
1438 | list_for_each_entry_safe(page1, page2, &pagelist, lru) | ||
1439 | put_page(page1); | ||
1440 | |||
1441 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1437 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1442 | pfn, ret, page->flags); | 1438 | pfn, ret, page->flags); |
1443 | if (ret > 0) | ||
1444 | ret = -EIO; | ||
1445 | return ret; | 1439 | return ret; |
1446 | } | 1440 | } |
1447 | done: | 1441 | done: |
1448 | if (!PageHWPoison(hpage)) | 1442 | if (!PageHWPoison(hpage)) |
1449 | atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); | 1443 | atomic_long_add(1 << compound_trans_order(hpage), |
1444 | &mce_bad_pages); | ||
1450 | set_page_hwpoison_huge_page(hpage); | 1445 | set_page_hwpoison_huge_page(hpage); |
1451 | dequeue_hwpoisoned_huge_page(hpage); | 1446 | dequeue_hwpoisoned_huge_page(hpage); |
1452 | /* keep elevated page count for bad page */ | 1447 | /* keep elevated page count for bad page */ |
@@ -1561,7 +1556,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1561 | page_is_file_cache(page)); | 1556 | page_is_file_cache(page)); |
1562 | list_add(&page->lru, &pagelist); | 1557 | list_add(&page->lru, &pagelist); |
1563 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1558 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1564 | 0, MIGRATE_SYNC); | 1559 | false, MIGRATE_SYNC); |
1565 | if (ret) { | 1560 | if (ret) { |
1566 | putback_lru_pages(&pagelist); | 1561 | putback_lru_pages(&pagelist); |
1567 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1562 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
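In the memory-failure.c hunks the kill decision changes from "only when the page was dirty" to "when the page was dirty or the caller passed MF_MUST_KILL", and the page_mkclean() shortcut is skipped in the must-kill case. A minimal sketch of that predicate follows; the flag value here is illustrative rather than taken from the kernel headers.

#include <stdbool.h>
#include <stdio.h>

#define MF_MUST_KILL	0x4	/* illustrative flag bit, not the kernel's definition */

/*
 * Force an uncatchable kill when the poisoned page was dirty (its data is
 * already lost) or when the caller explicitly demands a kill.
 */
static bool forcekill(bool page_dirty, unsigned int flags)
{
	return page_dirty || (flags & MF_MUST_KILL);
}

int main(void)
{
	printf("clean page, default flags: %d\n", forcekill(false, 0));
	printf("clean page, MF_MUST_KILL : %d\n", forcekill(false, MF_MUST_KILL));
	printf("dirty page, default flags: %d\n", forcekill(true, 0));
	return 0;
}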
diff --git a/mm/memory.c b/mm/memory.c index 1b7dc662bf9f..482f089765ff 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) | |||
206 | tlb->mm = mm; | 206 | tlb->mm = mm; |
207 | 207 | ||
208 | tlb->fullmm = fullmm; | 208 | tlb->fullmm = fullmm; |
209 | tlb->start = -1UL; | ||
210 | tlb->end = 0; | ||
209 | tlb->need_flush = 0; | 211 | tlb->need_flush = 0; |
210 | tlb->fast_mode = (num_possible_cpus() == 1); | 212 | tlb->fast_mode = (num_possible_cpus() == 1); |
211 | tlb->local.next = NULL; | 213 | tlb->local.next = NULL; |
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e | |||
248 | { | 250 | { |
249 | struct mmu_gather_batch *batch, *next; | 251 | struct mmu_gather_batch *batch, *next; |
250 | 252 | ||
253 | tlb->start = start; | ||
254 | tlb->end = end; | ||
251 | tlb_flush_mmu(tlb); | 255 | tlb_flush_mmu(tlb); |
252 | 256 | ||
253 | /* keep the page table cache within bounds */ | 257 | /* keep the page table cache within bounds */ |
@@ -1204,6 +1208,11 @@ again: | |||
1204 | */ | 1208 | */ |
1205 | if (force_flush) { | 1209 | if (force_flush) { |
1206 | force_flush = 0; | 1210 | force_flush = 0; |
1211 | |||
1212 | #ifdef HAVE_GENERIC_MMU_GATHER | ||
1213 | tlb->start = addr; | ||
1214 | tlb->end = end; | ||
1215 | #endif | ||
1207 | tlb_flush_mmu(tlb); | 1216 | tlb_flush_mmu(tlb); |
1208 | if (addr != end) | 1217 | if (addr != end) |
1209 | goto again; | 1218 | goto again; |
@@ -1225,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1225 | next = pmd_addr_end(addr, end); | 1234 | next = pmd_addr_end(addr, end); |
1226 | if (pmd_trans_huge(*pmd)) { | 1235 | if (pmd_trans_huge(*pmd)) { |
1227 | if (next - addr != HPAGE_PMD_SIZE) { | 1236 | if (next - addr != HPAGE_PMD_SIZE) { |
1228 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1237 | #ifdef CONFIG_DEBUG_VM |
1238 | if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { | ||
1239 | pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", | ||
1240 | __func__, addr, end, | ||
1241 | vma->vm_start, | ||
1242 | vma->vm_end); | ||
1243 | BUG(); | ||
1244 | } | ||
1245 | #endif | ||
1229 | split_huge_page_pmd(vma->vm_mm, pmd); | 1246 | split_huge_page_pmd(vma->vm_mm, pmd); |
1230 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1247 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1231 | goto next; | 1248 | goto next; |
@@ -1326,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1326 | * Since no pte has actually been setup, it is | 1343 | * Since no pte has actually been setup, it is |
1327 | * safe to do nothing in this case. | 1344 | * safe to do nothing in this case. |
1328 | */ | 1345 | */ |
1329 | if (vma->vm_file) | 1346 | if (vma->vm_file) { |
1330 | unmap_hugepage_range(vma, start, end, NULL); | 1347 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
1348 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); | ||
1349 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
1350 | } | ||
1331 | } else | 1351 | } else |
1332 | unmap_page_range(tlb, vma, start, end, details); | 1352 | unmap_page_range(tlb, vma, start, end, details); |
1333 | } | 1353 | } |
@@ -1366,7 +1386,7 @@ void unmap_vmas(struct mmu_gather *tlb, | |||
1366 | /** | 1386 | /** |
1367 | * zap_page_range - remove user pages in a given range | 1387 | * zap_page_range - remove user pages in a given range |
1368 | * @vma: vm_area_struct holding the applicable pages | 1388 | * @vma: vm_area_struct holding the applicable pages |
1369 | * @address: starting address of pages to zap | 1389 | * @start: starting address of pages to zap |
1370 | * @size: number of bytes to zap | 1390 | * @size: number of bytes to zap |
1371 | * @details: details of nonlinear truncation or shared cache invalidation | 1391 | * @details: details of nonlinear truncation or shared cache invalidation |
1372 | * | 1392 | * |
@@ -3921,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip) | |||
3921 | free_page((unsigned long)buf); | 3941 | free_page((unsigned long)buf); |
3922 | } | 3942 | } |
3923 | } | 3943 | } |
3924 | up_read(¤t->mm->mmap_sem); | 3944 | up_read(&mm->mmap_sem); |
3925 | } | 3945 | } |
3926 | 3946 | ||
3927 | #ifdef CONFIG_PROVE_LOCKING | 3947 | #ifdef CONFIG_PROVE_LOCKING |
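The memory.c changes seed the mmu_gather with an empty range (start = -1UL, end = 0) and record the span actually being torn down before each TLB flush. The standalone sketch below mimics that min/max range bookkeeping with made-up names; it is not the kernel's mmu_gather structure.

#include <stdio.h>

/* Toy range accumulator mirroring the tlb->start/tlb->end bookkeeping. */
struct gather {
	unsigned long start;	/* lowest address seen so far */
	unsigned long end;	/* one past the highest address seen */
};

static void gather_init(struct gather *g)
{
	g->start = -1UL;	/* "empty" range: start > end */
	g->end = 0;
}

static void gather_add(struct gather *g, unsigned long addr, unsigned long end)
{
	if (addr < g->start)
		g->start = addr;
	if (end > g->end)
		g->end = end;
}

static void gather_flush(struct gather *g)
{
	if (g->start < g->end)	/* only flush when something was gathered */
		printf("flush [%#lx, %#lx)\n", g->start, g->end);
	gather_init(g);		/* start a fresh, empty range */
}

int main(void)
{
	struct gather g;

	gather_init(&g);
	gather_add(&g, 0x1000, 0x3000);
	gather_add(&g, 0x8000, 0x9000);
	gather_flush(&g);	/* flushes the union [0x1000, 0x9000) */
	gather_flush(&g);	/* nothing gathered, no flush */
	return 0;
}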
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0d7e3ec8e0f3..3ad25f9d1fc1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
512 | 512 | ||
513 | zone->present_pages += onlined_pages; | 513 | zone->present_pages += onlined_pages; |
514 | zone->zone_pgdat->node_present_pages += onlined_pages; | 514 | zone->zone_pgdat->node_present_pages += onlined_pages; |
515 | if (need_zonelists_rebuild) | 515 | if (onlined_pages) { |
516 | build_all_zonelists(zone); | 516 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
517 | else | 517 | if (need_zonelists_rebuild) |
518 | zone_pcp_update(zone); | 518 | build_all_zonelists(NULL, zone); |
519 | else | ||
520 | zone_pcp_update(zone); | ||
521 | } | ||
519 | 522 | ||
520 | mutex_unlock(&zonelists_mutex); | 523 | mutex_unlock(&zonelists_mutex); |
521 | 524 | ||
522 | init_per_zone_wmark_min(); | 525 | init_per_zone_wmark_min(); |
523 | 526 | ||
524 | if (onlined_pages) { | 527 | if (onlined_pages) |
525 | kswapd_run(zone_to_nid(zone)); | 528 | kswapd_run(zone_to_nid(zone)); |
526 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | ||
527 | } | ||
528 | 529 | ||
529 | vm_total_pages = nr_free_pagecache_pages(); | 530 | vm_total_pages = nr_free_pagecache_pages(); |
530 | 531 | ||
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
562 | * to access not-initialized zonelist, build here. | 563 | * to access not-initialized zonelist, build here. |
563 | */ | 564 | */ |
564 | mutex_lock(&zonelists_mutex); | 565 | mutex_lock(&zonelists_mutex); |
565 | build_all_zonelists(NULL); | 566 | build_all_zonelists(pgdat, NULL); |
566 | mutex_unlock(&zonelists_mutex); | 567 | mutex_unlock(&zonelists_mutex); |
567 | 568 | ||
568 | return pgdat; | 569 | return pgdat; |
@@ -618,7 +619,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
618 | pgdat = hotadd_new_pgdat(nid, start); | 619 | pgdat = hotadd_new_pgdat(nid, start); |
619 | ret = -ENOMEM; | 620 | ret = -ENOMEM; |
620 | if (!pgdat) | 621 | if (!pgdat) |
621 | goto out; | 622 | goto error; |
622 | new_pgdat = 1; | 623 | new_pgdat = 1; |
623 | } | 624 | } |
624 | 625 | ||
@@ -965,6 +966,9 @@ repeat: | |||
965 | 966 | ||
966 | init_per_zone_wmark_min(); | 967 | init_per_zone_wmark_min(); |
967 | 968 | ||
969 | if (!populated_zone(zone)) | ||
970 | zone_pcp_reset(zone); | ||
971 | |||
968 | if (!node_present_pages(node)) { | 972 | if (!node_present_pages(node)) { |
969 | node_clear_state(node, N_HIGH_MEMORY); | 973 | node_clear_state(node, N_HIGH_MEMORY); |
970 | kswapd_stop(node); | 974 | kswapd_stop(node); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f15c1b24ca18..bd92431d4c49 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1177 | if (!list_empty(&pagelist)) { | 1177 | if (!list_empty(&pagelist)) { |
1178 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1178 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1179 | (unsigned long)vma, | 1179 | (unsigned long)vma, |
1180 | false, true); | 1180 | false, MIGRATE_SYNC); |
1181 | if (nr_failed) | 1181 | if (nr_failed) |
1182 | putback_lru_pages(&pagelist); | 1182 | putback_lru_pages(&pagelist); |
1183 | } | 1183 | } |
@@ -1602,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1602 | * task can change its policy. The system default policy requires no | 1602 |
1603 | * such protection. | 1603 | * such protection. |
1604 | */ | 1604 | */ |
1605 | unsigned slab_node(struct mempolicy *policy) | 1605 | unsigned slab_node(void) |
1606 | { | 1606 | { |
1607 | struct mempolicy *policy; | ||
1608 | |||
1609 | if (in_interrupt()) | ||
1610 | return numa_node_id(); | ||
1611 | |||
1612 | policy = current->mempolicy; | ||
1607 | if (!policy || policy->flags & MPOL_F_LOCAL) | 1613 | if (!policy || policy->flags & MPOL_F_LOCAL) |
1608 | return numa_node_id(); | 1614 | return numa_node_id(); |
1609 | 1615 | ||
diff --git a/mm/migrate.c b/mm/migrate.c index be26d5cbe56b..77ed2d773705 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | ||
36 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
682 | { | 683 | { |
683 | int rc = -EAGAIN; | 684 | int rc = -EAGAIN; |
684 | int remap_swapcache = 1; | 685 | int remap_swapcache = 1; |
685 | int charge = 0; | ||
686 | struct mem_cgroup *mem; | 686 | struct mem_cgroup *mem; |
687 | struct anon_vma *anon_vma = NULL; | 687 | struct anon_vma *anon_vma = NULL; |
688 | 688 | ||
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
724 | } | 724 | } |
725 | 725 | ||
726 | /* charge against new page */ | 726 | /* charge against new page */ |
727 | charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); | 727 | mem_cgroup_prepare_migration(page, newpage, &mem); |
728 | if (charge == -ENOMEM) { | ||
729 | rc = -ENOMEM; | ||
730 | goto unlock; | ||
731 | } | ||
732 | BUG_ON(charge); | ||
733 | 728 | ||
734 | if (PageWriteback(page)) { | 729 | if (PageWriteback(page)) { |
735 | /* | 730 | /* |
@@ -819,8 +814,7 @@ skip_unmap: | |||
819 | put_anon_vma(anon_vma); | 814 | put_anon_vma(anon_vma); |
820 | 815 | ||
821 | uncharge: | 816 | uncharge: |
822 | if (!charge) | 817 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
823 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | ||
824 | unlock: | 818 | unlock: |
825 | unlock_page(page); | 819 | unlock_page(page); |
826 | out: | 820 | out: |
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
931 | 925 | ||
932 | if (anon_vma) | 926 | if (anon_vma) |
933 | put_anon_vma(anon_vma); | 927 | put_anon_vma(anon_vma); |
934 | unlock_page(hpage); | ||
935 | 928 | ||
936 | out: | 929 | if (!rc) |
937 | if (rc != -EAGAIN) { | 930 | hugetlb_cgroup_migrate(hpage, new_hpage); |
938 | list_del(&hpage->lru); | ||
939 | put_page(hpage); | ||
940 | } | ||
941 | 931 | ||
932 | unlock_page(hpage); | ||
933 | out: | ||
942 | put_page(new_hpage); | 934 | put_page(new_hpage); |
943 | |||
944 | if (result) { | 935 | if (result) { |
945 | if (rc) | 936 | if (rc) |
946 | *result = rc; | 937 | *result = rc; |
@@ -1016,48 +1007,32 @@ out: | |||
1016 | return nr_failed + retry; | 1007 | return nr_failed + retry; |
1017 | } | 1008 | } |
1018 | 1009 | ||
1019 | int migrate_huge_pages(struct list_head *from, | 1010 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
1020 | new_page_t get_new_page, unsigned long private, bool offlining, | 1011 | unsigned long private, bool offlining, |
1021 | enum migrate_mode mode) | 1012 | enum migrate_mode mode) |
1022 | { | 1013 | { |
1023 | int retry = 1; | 1014 | int pass, rc; |
1024 | int nr_failed = 0; | 1015 | |
1025 | int pass = 0; | 1016 | for (pass = 0; pass < 10; pass++) { |
1026 | struct page *page; | 1017 | rc = unmap_and_move_huge_page(get_new_page, |
1027 | struct page *page2; | 1018 | private, hpage, pass > 2, offlining, |
1028 | int rc; | 1019 | mode); |
1029 | 1020 | switch (rc) { | |
1030 | for (pass = 0; pass < 10 && retry; pass++) { | 1021 | case -ENOMEM: |
1031 | retry = 0; | 1022 | goto out; |
1032 | 1023 | case -EAGAIN: | |
1033 | list_for_each_entry_safe(page, page2, from, lru) { | 1024 | /* try again */ |
1034 | cond_resched(); | 1025 | cond_resched(); |
1035 | 1026 | break; | |
1036 | rc = unmap_and_move_huge_page(get_new_page, | 1027 | case 0: |
1037 | private, page, pass > 2, offlining, | 1028 | goto out; |
1038 | mode); | 1029 | default: |
1039 | 1030 | rc = -EIO; | |
1040 | switch(rc) { | 1031 | goto out; |
1041 | case -ENOMEM: | ||
1042 | goto out; | ||
1043 | case -EAGAIN: | ||
1044 | retry++; | ||
1045 | break; | ||
1046 | case 0: | ||
1047 | break; | ||
1048 | default: | ||
1049 | /* Permanent failure */ | ||
1050 | nr_failed++; | ||
1051 | break; | ||
1052 | } | ||
1053 | } | 1032 | } |
1054 | } | 1033 | } |
1055 | rc = 0; | ||
1056 | out: | 1034 | out: |
1057 | if (rc) | 1035 | return rc; |
1058 | return rc; | ||
1059 | |||
1060 | return nr_failed + retry; | ||
1061 | } | 1036 | } |
1062 | 1037 | ||
1063 | #ifdef CONFIG_NUMA | 1038 | #ifdef CONFIG_NUMA |
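migrate.c replaces the list-based migrate_huge_pages() with a single-page migrate_huge_page() built around a bounded retry loop: -EAGAIN is retried for up to ten passes, success or -ENOMEM ends the loop immediately, and any other failure is reported as -EIO. A generic userspace sketch of that control flow, with a placeholder operation standing in for unmap_and_move_huge_page():

#include <errno.h>
#include <stdio.h>

/* Placeholder for unmap_and_move_huge_page(): fails twice, then succeeds. */
static int try_operation(int pass)
{
	return pass < 2 ? -EAGAIN : 0;
}

/* Retry transient failures a bounded number of times, bail on hard errors. */
static int migrate_one(void)
{
	int pass, rc = -EAGAIN;

	for (pass = 0; pass < 10; pass++) {
		rc = try_operation(pass);
		switch (rc) {
		case -EAGAIN:
			continue;	/* transient: try another pass */
		case 0:
		case -ENOMEM:
			return rc;	/* success, or give up immediately */
		default:
			return -EIO;	/* any other failure is permanent */
		}
	}
	return rc;	/* still -EAGAIN after all passes */
}

int main(void)
{
	printf("migrate_one() = %d\n", migrate_one());
	return 0;
}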
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
943 | const unsigned long stack_flags | 943 | const unsigned long stack_flags |
944 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | 944 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); |
945 | 945 | ||
946 | mm->total_vm += pages; | ||
947 | |||
946 | if (file) { | 948 | if (file) { |
947 | mm->shared_vm += pages; | 949 | mm->shared_vm += pages; |
948 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | 950 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) |
@@ -1347,7 +1349,6 @@ munmap_back: | |||
1347 | out: | 1349 | out: |
1348 | perf_event_mmap(vma); | 1350 | perf_event_mmap(vma); |
1349 | 1351 | ||
1350 | mm->total_vm += len >> PAGE_SHIFT; | ||
1351 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1352 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1352 | if (vm_flags & VM_LOCKED) { | 1353 | if (vm_flags & VM_LOCKED) { |
1353 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 1354 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
@@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1707 | return -ENOMEM; | 1708 | return -ENOMEM; |
1708 | 1709 | ||
1709 | /* Ok, everything looks good - let it rip */ | 1710 | /* Ok, everything looks good - let it rip */ |
1710 | mm->total_vm += grow; | ||
1711 | if (vma->vm_flags & VM_LOCKED) | 1711 | if (vma->vm_flags & VM_LOCKED) |
1712 | mm->locked_vm += grow; | 1712 | mm->locked_vm += grow; |
1713 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | 1713 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); |
@@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | |||
1889 | 1889 | ||
1890 | if (vma->vm_flags & VM_ACCOUNT) | 1890 | if (vma->vm_flags & VM_ACCOUNT) |
1891 | nr_accounted += nrpages; | 1891 | nr_accounted += nrpages; |
1892 | mm->total_vm -= nrpages; | ||
1893 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1892 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
1894 | vma = remove_vma(vma); | 1893 | vma = remove_vma(vma); |
1895 | } while (vma); | 1894 | } while (vma); |
@@ -2345,9 +2344,6 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2345 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2344 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
2346 | return -ENOMEM; | 2345 | return -ENOMEM; |
2347 | 2346 | ||
2348 | if (vma->vm_file && uprobe_mmap(vma)) | ||
2349 | return -EINVAL; | ||
2350 | |||
2351 | vma_link(mm, vma, prev, rb_link, rb_parent); | 2347 | vma_link(mm, vma, prev, rb_link, rb_parent); |
2352 | return 0; | 2348 | return 0; |
2353 | } | 2349 | } |
@@ -2418,9 +2414,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2418 | if (new_vma->vm_file) { | 2414 | if (new_vma->vm_file) { |
2419 | get_file(new_vma->vm_file); | 2415 | get_file(new_vma->vm_file); |
2420 | 2416 | ||
2421 | if (uprobe_mmap(new_vma)) | ||
2422 | goto out_free_mempol; | ||
2423 | |||
2424 | if (vma->vm_flags & VM_EXECUTABLE) | 2417 | if (vma->vm_flags & VM_EXECUTABLE) |
2425 | added_exe_file_vma(mm); | 2418 | added_exe_file_vma(mm); |
2426 | } | 2419 | } |
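The hunks above (and the mremap.c ones further down) stop updating mm->total_vm at each call site and let vm_stat_account() own all of the per-mm counters, with callers passing a positive page count on map and a negative one on unmap. A toy version of that consolidation, using illustrative flag names rather than the kernel's VM_* bits:

#include <stdio.h>

/* Toy per-mm counters; only the accounting helper ever touches them. */
struct mm_stats {
	long total_vm;
	long shared_vm;
	long exec_vm;
};

#define VMA_FILE	0x1	/* illustrative flags, not the kernel's VM_* bits */
#define VMA_EXEC	0x2

/* Single choke point: callers pass +pages on map and -pages on unmap. */
static void vm_stat_account(struct mm_stats *mm, unsigned int flags, long pages)
{
	mm->total_vm += pages;		/* previously duplicated at every call site */
	if (flags & VMA_FILE)
		mm->shared_vm += pages;
	if (flags & VMA_EXEC)
		mm->exec_vm += pages;
}

int main(void)
{
	struct mm_stats mm = { 0, 0, 0 };

	vm_stat_account(&mm, VMA_FILE | VMA_EXEC, 16);	/* map a shared library */
	vm_stat_account(&mm, VMA_FILE | VMA_EXEC, -16);	/* matching unmap */
	printf("total_vm=%ld shared_vm=%ld exec_vm=%ld\n",
	       mm.total_vm, mm.shared_vm, mm.exec_vm);
	return 0;
}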
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9a611d3a1848..862b60822d9f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -33,6 +33,24 @@ | |||
33 | void __mmu_notifier_release(struct mm_struct *mm) | 33 | void __mmu_notifier_release(struct mm_struct *mm) |
34 | { | 34 | { |
35 | struct mmu_notifier *mn; | 35 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | ||
37 | |||
38 | /* | ||
39 | * RCU here will block mmu_notifier_unregister until | ||
40 | * ->release returns. | ||
41 | */ | ||
42 | rcu_read_lock(); | ||
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
44 | /* | ||
45 | * if ->release runs before mmu_notifier_unregister it | ||
46 | * must be handled as it's the only way for the driver | ||
47 | * to flush all existing sptes and stop the driver | ||
48 | * from establishing any more sptes before all the | ||
49 | * pages in the mm are freed. | ||
50 | */ | ||
51 | if (mn->ops->release) | ||
52 | mn->ops->release(mn, mm); | ||
53 | rcu_read_unlock(); | ||
36 | 54 | ||
37 | spin_lock(&mm->mmu_notifier_mm->lock); | 55 | spin_lock(&mm->mmu_notifier_mm->lock); |
38 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
46 | * mmu_notifier_unregister to return. | 64 | * mmu_notifier_unregister to return. |
47 | */ | 65 | */ |
48 | hlist_del_init_rcu(&mn->hlist); | 66 | hlist_del_init_rcu(&mn->hlist); |
49 | /* | ||
50 | * RCU here will block mmu_notifier_unregister until | ||
51 | * ->release returns. | ||
52 | */ | ||
53 | rcu_read_lock(); | ||
54 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
55 | /* | ||
56 | * if ->release runs before mmu_notifier_unregister it | ||
57 | * must be handled as it's the only way for the driver | ||
58 | * to flush all existing sptes and stop the driver | ||
59 | * from establishing any more sptes before all the | ||
60 | * pages in the mm are freed. | ||
61 | */ | ||
62 | if (mn->ops->release) | ||
63 | mn->ops->release(mn, mm); | ||
64 | rcu_read_unlock(); | ||
65 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
66 | } | 67 | } |
67 | spin_unlock(&mm->mmu_notifier_mm->lock); | 68 | spin_unlock(&mm->mmu_notifier_mm->lock); |
68 | 69 | ||
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
284 | { | 285 | { |
285 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 286 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
286 | 287 | ||
287 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 288 | if (!hlist_unhashed(&mn->hlist)) { |
289 | hlist_del_rcu(&mn->hlist); | ||
290 | |||
291 | /* | 289 | /* |
292 | * RCU here will force exit_mmap to wait ->release to finish | 290 | * RCU here will force exit_mmap to wait ->release to finish |
293 | * before freeing the pages. | 291 | * before freeing the pages. |
294 | */ | 292 | */ |
295 | rcu_read_lock(); | 293 | rcu_read_lock(); |
296 | spin_unlock(&mm->mmu_notifier_mm->lock); | 294 | |
297 | /* | 295 | /* |
298 | * exit_mmap will block in mmu_notifier_release to | 296 | * exit_mmap will block in mmu_notifier_release to |
299 | * guarantee ->release is called before freeing the | 297 | * guarantee ->release is called before freeing the |
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
302 | if (mn->ops->release) | 300 | if (mn->ops->release) |
303 | mn->ops->release(mn, mm); | 301 | mn->ops->release(mn, mm); |
304 | rcu_read_unlock(); | 302 | rcu_read_unlock(); |
305 | } else | 303 | |
304 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
305 | hlist_del_rcu(&mn->hlist); | ||
306 | spin_unlock(&mm->mmu_notifier_mm->lock); | 306 | spin_unlock(&mm->mmu_notifier_mm->lock); |
307 | } | ||
307 | 308 | ||
308 | /* | 309 | /* |
309 | * Wait any running method to finish, of course including | 310 | * Wait any running method to finish, of course including |
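The mmu_notifier.c rework changes the ordering in __mmu_notifier_release() and mmu_notifier_unregister(): the ->release callbacks now run first (under rcu_read_lock() in the kernel), and the entry is only unhashed from the list afterwards, under the spinlock. The sketch below is a deliberately simplified, single-threaded illustration of that call-then-unlink ordering; it leaves out RCU and locking entirely.

#include <stdio.h>

/* Deliberately simplified: a plain singly linked list, no RCU, no locking. */
struct notifier {
	const char *name;
	void (*release)(struct notifier *n);
	struct notifier *next;
};

static void report(struct notifier *n)
{
	printf("release callback for %s\n", n->name);
}

/* Phase 1: let every registered notifier flush its state. */
static void call_releases(struct notifier *head)
{
	struct notifier *n;

	for (n = head; n; n = n->next)
		if (n->release)
			n->release(n);
}

/* Phase 2: only now tear the list down. */
static struct notifier *unlink_all(struct notifier *head)
{
	while (head) {
		struct notifier *next = head->next;

		head->next = NULL;
		head = next;
	}
	return NULL;
}

int main(void)
{
	struct notifier b = { "driver-b", report, NULL };
	struct notifier a = { "driver-a", report, &b };
	struct notifier *list = &a;

	call_releases(list);	/* callbacks run while entries are still linked */
	list = unlink_all(list);
	return list == NULL ? 0 : 1;
}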
diff --git a/mm/mmzone.c b/mm/mmzone.c index 6830eab5bf09..3cef80f6ac79 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone) | |||
96 | for_each_lru(lru) | 96 | for_each_lru(lru) |
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); |
98 | 98 | ||
99 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 99 | #ifdef CONFIG_MEMCG |
100 | lruvec->zone = zone; | 100 | lruvec->zone = zone; |
101 | #endif | 101 | #endif |
102 | } | 102 | } |
diff --git a/mm/mremap.c b/mm/mremap.c index 21fed202ddad..cc06d0e48d05 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
260 | * If this were a serious issue, we'd add a flag to do_munmap(). | 260 | * If this were a serious issue, we'd add a flag to do_munmap(). |
261 | */ | 261 | */ |
262 | hiwater_vm = mm->hiwater_vm; | 262 | hiwater_vm = mm->hiwater_vm; |
263 | mm->total_vm += new_len >> PAGE_SHIFT; | ||
264 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); | 263 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); |
265 | 264 | ||
266 | if (do_munmap(mm, old_addr, old_len) < 0) { | 265 | if (do_munmap(mm, old_addr, old_len) < 0) { |
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
497 | goto out; | 496 | goto out; |
498 | } | 497 | } |
499 | 498 | ||
500 | mm->total_vm += pages; | ||
501 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 499 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
502 | if (vma->vm_flags & VM_LOCKED) { | 500 | if (vma->vm_flags & VM_LOCKED) { |
503 | mm->locked_vm += pages; | 501 | mm->locked_vm += pages; |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index d23415c001bc..405573010f99 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) | |||
105 | __free_pages_bootmem(pfn_to_page(i), 0); | 105 | __free_pages_bootmem(pfn_to_page(i), 0); |
106 | } | 106 | } |
107 | 107 | ||
108 | static unsigned long __init __free_memory_core(phys_addr_t start, | ||
109 | phys_addr_t end) | ||
110 | { | ||
111 | unsigned long start_pfn = PFN_UP(start); | ||
112 | unsigned long end_pfn = min_t(unsigned long, | ||
113 | PFN_DOWN(end), max_low_pfn); | ||
114 | |||
115 | if (start_pfn > end_pfn) | ||
116 | return 0; | ||
117 | |||
118 | __free_pages_memory(start_pfn, end_pfn); | ||
119 | |||
120 | return end_pfn - start_pfn; | ||
121 | } | ||
122 | |||
108 | unsigned long __init free_low_memory_core_early(int nodeid) | 123 | unsigned long __init free_low_memory_core_early(int nodeid) |
109 | { | 124 | { |
110 | unsigned long count = 0; | 125 | unsigned long count = 0; |
111 | phys_addr_t start, end; | 126 | phys_addr_t start, end, size; |
112 | u64 i; | 127 | u64 i; |
113 | 128 | ||
114 | /* free reserved array temporarily so that it's treated as free area */ | 129 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) |
115 | memblock_free_reserved_regions(); | 130 | count += __free_memory_core(start, end); |
116 | 131 | ||
117 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { | 132 | /* free range that is used for reserved array if we allocate it */ |
118 | unsigned long start_pfn = PFN_UP(start); | 133 | size = get_allocated_memblock_reserved_regions_info(&start); |
119 | unsigned long end_pfn = min_t(unsigned long, | 134 | if (size) |
120 | PFN_DOWN(end), max_low_pfn); | 135 | count += __free_memory_core(start, start + size); |
121 | if (start_pfn < end_pfn) { | ||
122 | __free_pages_memory(start_pfn, end_pfn); | ||
123 | count += end_pfn - start_pfn; | ||
124 | } | ||
125 | } | ||
126 | 136 | ||
127 | /* put region array back? */ | ||
128 | memblock_reserve_reserved_regions(); | ||
129 | return count; | 137 | return count; |
130 | } | 138 | } |
131 | 139 | ||
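As an illustrative aside, the clamping that the new __free_memory_core() performs — round the start of a physical range up to a page frame, round the end down, cap it at max_low_pfn, and bail out if the range collapses — can be exercised in a stand-alone C sketch. PAGE_SHIFT, pfn_up()/pfn_down() and the max_low_pfn argument below are local stand-ins, not the kernel helpers:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round a physical address up/down to a page frame number. */
static unsigned long pfn_up(uint64_t addr)   { return (addr + PAGE_SIZE - 1) >> PAGE_SHIFT; }
static unsigned long pfn_down(uint64_t addr) { return addr >> PAGE_SHIFT; }

/* Mirrors the clamping in __free_memory_core(): returns how many page
 * frames of the range would actually be handed to the page allocator. */
static unsigned long free_memory_core(uint64_t start, uint64_t end,
                                      unsigned long max_low_pfn)
{
        unsigned long start_pfn = pfn_up(start);
        unsigned long end_pfn = pfn_down(end);

        if (end_pfn > max_low_pfn)      /* never free above lowmem */
                end_pfn = max_low_pfn;
        if (start_pfn > end_pfn)        /* range collapsed after rounding */
                return 0;
        return end_pfn - start_pfn;     /* __free_pages_memory() would run here */
}

int main(void)
{
        /* A range that starts mid-page: the first partial page is skipped. */
        printf("%lu pages\n", free_memory_core(0x100800, 0x200000, 0x100000));
        return 0;
}

The same helper is reused for the second call site above, which frees the range backing the memblock reserved-regions array instead of temporarily dropping and re-adding the reservations.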
@@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
274 | return ___alloc_bootmem(size, align, goal, limit); | 282 | return ___alloc_bootmem(size, align, goal, limit); |
275 | } | 283 | } |
276 | 284 | ||
277 | static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | 285 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
278 | unsigned long size, | 286 | unsigned long size, |
279 | unsigned long align, | 287 | unsigned long align, |
280 | unsigned long goal, | 288 | unsigned long goal, |
diff --git a/mm/nommu.c b/mm/nommu.c index c4acfbc09972..d4b0c10872de 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1486 | 1486 | ||
1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
1488 | 1488 | ||
1489 | ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); | 1489 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
1490 | 1490 | ||
1491 | if (file) | 1491 | if (file) |
1492 | fput(file); | 1492 | fput(file); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ed0e19677360..198600861638 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | 183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, |
184 | const nodemask_t *nodemask, unsigned long totalpages) | 184 | const nodemask_t *nodemask, unsigned long totalpages) |
185 | { | 185 | { |
186 | unsigned long points; | 186 | long points; |
187 | long adj; | ||
187 | 188 | ||
188 | if (oom_unkillable_task(p, memcg, nodemask)) | 189 | if (oom_unkillable_task(p, memcg, nodemask)) |
189 | return 0; | 190 | return 0; |
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
192 | if (!p) | 193 | if (!p) |
193 | return 0; | 194 | return 0; |
194 | 195 | ||
195 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | 196 | adj = p->signal->oom_score_adj; |
197 | if (adj == OOM_SCORE_ADJ_MIN) { | ||
196 | task_unlock(p); | 198 | task_unlock(p); |
197 | return 0; | 199 | return 0; |
198 | } | 200 | } |
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
210 | * implementation used by LSMs. | 212 | * implementation used by LSMs. |
211 | */ | 213 | */ |
212 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) | 214 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
213 | points -= 30 * totalpages / 1000; | 215 | adj -= 30; |
214 | 216 | ||
215 | /* | 217 | /* Normalize to oom_score_adj units */ |
216 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may | 218 | adj *= totalpages / 1000; |
217 | * either completely disable oom killing or always prefer a certain | 219 | points += adj; |
218 | * task. | ||
219 | */ | ||
220 | points += p->signal->oom_score_adj * totalpages / 1000; | ||
221 | 220 | ||
222 | /* | 221 | /* |
223 | * Never return 0 for an eligible task regardless of the root bonus and | 222 | * Never return 0 for an eligible task regardless of the root bonus and |
224 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). | 223 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). |
225 | */ | 224 | */ |
226 | return points ? points : 1; | 225 | return points > 0 ? points : 1; |
227 | } | 226 | } |
228 | 227 | ||
229 | /* | 228 | /* |
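A minimal sketch of the reworked scoring arithmetic, assuming a single base_pages value standing in for the rss + swap + page-table baseline and a plain flag standing in for CAP_SYS_ADMIN (neither name is from the kernel): the root bonus and oom_score_adj are folded into one signed adjustment, scaled to page units, and the result is clamped so an eligible task never scores zero.

#include <stdio.h>

#define OOM_SCORE_ADJ_MIN (-1000)

/* Illustrative re-creation of the new oom_badness() arithmetic. */
static unsigned long badness(long base_pages, long oom_score_adj,
                             int has_cap_sys_admin, unsigned long totalpages)
{
        long points = base_pages;
        long adj = oom_score_adj;

        if (adj == OOM_SCORE_ADJ_MIN)   /* task is exempt from oom killing */
                return 0;
        if (has_cap_sys_admin)          /* root bonus, now applied in adj units */
                adj -= 30;

        adj *= totalpages / 1000;       /* normalize to oom_score_adj units */
        points += adj;

        /* Never return 0 for an eligible task, even with a large negative adj. */
        return points > 0 ? points : 1;
}

int main(void)
{
        unsigned long totalpages = 1000000;

        printf("plain task:        %lu\n", badness(50000, 0, 0, totalpages));
        printf("root task:         %lu\n", badness(50000, 0, 1, totalpages));
        printf("deprioritized adj: %lu\n", badness(50000, -500, 0, totalpages));
        return 0;
}

The switch from unsigned to signed points is what makes the final "points > 0 ? points : 1" clamp meaningful: with the old unsigned arithmetic a large negative adjustment could wrap around instead of bottoming out at 1.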
@@ -289,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
289 | } | 288 | } |
290 | #endif | 289 | #endif |
291 | 290 | ||
291 | enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | ||
292 | unsigned long totalpages, const nodemask_t *nodemask, | ||
293 | bool force_kill) | ||
294 | { | ||
295 | if (task->exit_state) | ||
296 | return OOM_SCAN_CONTINUE; | ||
297 | if (oom_unkillable_task(task, NULL, nodemask)) | ||
298 | return OOM_SCAN_CONTINUE; | ||
299 | |||
300 | /* | ||
301 | * This task already has access to memory reserves and is being killed. | ||
302 | * Don't allow any other task to have access to the reserves. | ||
303 | */ | ||
304 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | ||
305 | if (unlikely(frozen(task))) | ||
306 | __thaw_task(task); | ||
307 | if (!force_kill) | ||
308 | return OOM_SCAN_ABORT; | ||
309 | } | ||
310 | if (!task->mm) | ||
311 | return OOM_SCAN_CONTINUE; | ||
312 | |||
313 | if (task->flags & PF_EXITING) { | ||
314 | /* | ||
315 | * If task is current and is in the process of releasing memory, | ||
316 | * allow the "kill" to set TIF_MEMDIE, which will allow it to | ||
317 | * access memory reserves. Otherwise, it may stall forever. | ||
318 | * | ||
319 | * The iteration isn't broken here, however, in case other | ||
320 | * threads are found to have already been oom killed. | ||
321 | */ | ||
322 | if (task == current) | ||
323 | return OOM_SCAN_SELECT; | ||
324 | else if (!force_kill) { | ||
325 | /* | ||
326 | * If this task is not being ptraced on exit, then wait | ||
327 | * for it to finish before killing some other task | ||
328 | * unnecessarily. | ||
329 | */ | ||
330 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
331 | return OOM_SCAN_ABORT; | ||
332 | } | ||
333 | } | ||
334 | return OOM_SCAN_OK; | ||
335 | } | ||
336 | |||
292 | /* | 337 | /* |
293 | * Simple selection loop. We chose the process with the highest | 338 | * Simple selection loop. We chose the process with the highest |
294 | * number of 'points'. We expect the caller will lock the tasklist. | 339 | * number of 'points'. |
295 | * | 340 | * |
296 | * (not docbooked, we don't want this one cluttering up the manual) | 341 | * (not docbooked, we don't want this one cluttering up the manual) |
297 | */ | 342 | */ |
298 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 343 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
299 | unsigned long totalpages, struct mem_cgroup *memcg, | 344 | unsigned long totalpages, const nodemask_t *nodemask, |
300 | const nodemask_t *nodemask, bool force_kill) | 345 | bool force_kill) |
301 | { | 346 | { |
302 | struct task_struct *g, *p; | 347 | struct task_struct *g, *p; |
303 | struct task_struct *chosen = NULL; | 348 | struct task_struct *chosen = NULL; |
304 | unsigned long chosen_points = 0; | 349 | unsigned long chosen_points = 0; |
305 | 350 | ||
351 | rcu_read_lock(); | ||
306 | do_each_thread(g, p) { | 352 | do_each_thread(g, p) { |
307 | unsigned int points; | 353 | unsigned int points; |
308 | 354 | ||
309 | if (p->exit_state) | 355 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
310 | continue; | 356 | force_kill)) { |
311 | if (oom_unkillable_task(p, memcg, nodemask)) | 357 | case OOM_SCAN_SELECT: |
312 | continue; | 358 | chosen = p; |
313 | 359 | chosen_points = ULONG_MAX; | |
314 | /* | 360 | /* fall through */ |
315 | * This task already has access to memory reserves and is | 361 | case OOM_SCAN_CONTINUE: |
316 | * being killed. Don't allow any other task access to the | ||
317 | * memory reserve. | ||
318 | * | ||
319 | * Note: this may have a chance of deadlock if it gets | ||
320 | * blocked waiting for another task which itself is waiting | ||
321 | * for memory. Is there a better alternative? | ||
322 | */ | ||
323 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { | ||
324 | if (unlikely(frozen(p))) | ||
325 | __thaw_task(p); | ||
326 | if (!force_kill) | ||
327 | return ERR_PTR(-1UL); | ||
328 | } | ||
329 | if (!p->mm) | ||
330 | continue; | 362 | continue; |
331 | 363 | case OOM_SCAN_ABORT: | |
332 | if (p->flags & PF_EXITING) { | 364 | rcu_read_unlock(); |
333 | /* | 365 | return ERR_PTR(-1UL); |
334 | * If p is the current task and is in the process of | 366 | case OOM_SCAN_OK: |
335 | * releasing memory, we allow the "kill" to set | 367 | break; |
336 | * TIF_MEMDIE, which will allow it to gain access to | 368 | }; |
337 | * memory reserves. Otherwise, it may stall forever. | 369 | points = oom_badness(p, NULL, nodemask, totalpages); |
338 | * | ||
339 | * The loop isn't broken here, however, in case other | ||
340 | * threads are found to have already been oom killed. | ||
341 | */ | ||
342 | if (p == current) { | ||
343 | chosen = p; | ||
344 | chosen_points = ULONG_MAX; | ||
345 | } else if (!force_kill) { | ||
346 | /* | ||
347 | * If this task is not being ptraced on exit, | ||
348 | * then wait for it to finish before killing | ||
349 | * some other task unnecessarily. | ||
350 | */ | ||
351 | if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) | ||
352 | return ERR_PTR(-1UL); | ||
353 | } | ||
354 | } | ||
355 | |||
356 | points = oom_badness(p, memcg, nodemask, totalpages); | ||
357 | if (points > chosen_points) { | 370 | if (points > chosen_points) { |
358 | chosen = p; | 371 | chosen = p; |
359 | chosen_points = points; | 372 | chosen_points = points; |
360 | } | 373 | } |
361 | } while_each_thread(g, p); | 374 | } while_each_thread(g, p); |
375 | if (chosen) | ||
376 | get_task_struct(chosen); | ||
377 | rcu_read_unlock(); | ||
362 | 378 | ||
363 | *ppoints = chosen_points * 1000 / totalpages; | 379 | *ppoints = chosen_points * 1000 / totalpages; |
364 | return chosen; | 380 | return chosen; |
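The selection loop now delegates the per-thread policy to oom_scan_process_thread() and simply acts on its verdict, running under rcu_read_lock() instead of requiring the caller to hold tasklist_lock and pinning the winner with get_task_struct() before the lock is dropped. A compact stand-alone sketch of that control flow over an array of toy candidates (the verdicts and scores are hard-coded stand-ins for oom_scan_process_thread() and oom_badness(), not kernel data):

#include <stdio.h>
#include <limits.h>

enum oom_scan_t { OOM_SCAN_OK, OOM_SCAN_CONTINUE, OOM_SCAN_ABORT, OOM_SCAN_SELECT };

struct candidate {
        const char *name;
        enum oom_scan_t verdict;        /* what oom_scan_process_thread() would say */
        unsigned long points;           /* what oom_badness() would return */
};

/* Returns the chosen candidate, or NULL when the scan must be aborted. */
static const struct candidate *select_bad_process(const struct candidate *c, int n)
{
        const struct candidate *chosen = NULL;
        unsigned long chosen_points = 0;

        for (int i = 0; i < n; i++) {
                switch (c[i].verdict) {
                case OOM_SCAN_SELECT:           /* e.g. current task already exiting */
                        chosen = &c[i];
                        chosen_points = ULONG_MAX;
                        /* fall through */
                case OOM_SCAN_CONTINUE:         /* unkillable or already dead */
                        continue;
                case OOM_SCAN_ABORT:            /* someone already holds TIF_MEMDIE */
                        return NULL;
                case OOM_SCAN_OK:
                        break;
                }
                if (c[i].points > chosen_points) {
                        chosen = &c[i];
                        chosen_points = c[i].points;
                }
        }
        return chosen;
}

int main(void)
{
        struct candidate tasks[] = {
                { "init",    OOM_SCAN_CONTINUE, 0 },
                { "daemon",  OOM_SCAN_OK,       1200 },
                { "browser", OOM_SCAN_OK,       98000 },
        };
        const struct candidate *victim = select_bad_process(tasks, 3);

        printf("victim: %s\n", victim ? victim->name : "(abort)");
        return 0;
}

The kernel version additionally takes a reference on the chosen task while still inside the RCU read section, which is why the memcg-specific variant and the tasklist_lock requirement could be dropped from the callers.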
@@ -366,23 +382,22 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
366 | 382 | ||
367 | /** | 383 | /** |
368 | * dump_tasks - dump current memory state of all system tasks | 384 | * dump_tasks - dump current memory state of all system tasks |
369 | * @mem: current's memory controller, if constrained | 385 | * @memcg: current's memory controller, if constrained |
370 | * @nodemask: nodemask passed to page allocator for mempolicy ooms | 386 | * @nodemask: nodemask passed to page allocator for mempolicy ooms |
371 | * | 387 | * |
372 | * Dumps the current memory state of all eligible tasks. Tasks not in the same | 388 | * Dumps the current memory state of all eligible tasks. Tasks not in the same |
373 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes | 389 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes |
374 | * are not shown. | 390 | * are not shown. |
375 | * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj | 391 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, |
376 | * value, oom_score_adj value, and name. | 392 | * swapents, oom_score_adj value, and name. |
377 | * | ||
378 | * Call with tasklist_lock read-locked. | ||
379 | */ | 393 | */ |
380 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) | 394 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) |
381 | { | 395 | { |
382 | struct task_struct *p; | 396 | struct task_struct *p; |
383 | struct task_struct *task; | 397 | struct task_struct *task; |
384 | 398 | ||
385 | pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); | 399 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); |
400 | rcu_read_lock(); | ||
386 | for_each_process(p) { | 401 | for_each_process(p) { |
387 | if (oom_unkillable_task(p, memcg, nodemask)) | 402 | if (oom_unkillable_task(p, memcg, nodemask)) |
388 | continue; | 403 | continue; |
@@ -397,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
397 | continue; | 412 | continue; |
398 | } | 413 | } |
399 | 414 | ||
400 | pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", | 415 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", |
401 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 416 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
402 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 417 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
403 | task_cpu(task), task->signal->oom_adj, | 418 | task->mm->nr_ptes, |
419 | get_mm_counter(task->mm, MM_SWAPENTS), | ||
404 | task->signal->oom_score_adj, task->comm); | 420 | task->signal->oom_score_adj, task->comm); |
405 | task_unlock(task); | 421 | task_unlock(task); |
406 | } | 422 | } |
423 | rcu_read_unlock(); | ||
407 | } | 424 | } |
408 | 425 | ||
409 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | 426 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, |
@@ -424,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
424 | } | 441 | } |
425 | 442 | ||
426 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 443 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
427 | static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | 444 | /* |
428 | unsigned int points, unsigned long totalpages, | 445 | * Must be called while holding a reference to p, which will be released upon |
429 | struct mem_cgroup *memcg, nodemask_t *nodemask, | 446 | * returning. |
430 | const char *message) | 447 | */ |
448 | void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | ||
449 | unsigned int points, unsigned long totalpages, | ||
450 | struct mem_cgroup *memcg, nodemask_t *nodemask, | ||
451 | const char *message) | ||
431 | { | 452 | { |
432 | struct task_struct *victim = p; | 453 | struct task_struct *victim = p; |
433 | struct task_struct *child; | 454 | struct task_struct *child; |
@@ -443,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
443 | */ | 464 | */ |
444 | if (p->flags & PF_EXITING) { | 465 | if (p->flags & PF_EXITING) { |
445 | set_tsk_thread_flag(p, TIF_MEMDIE); | 466 | set_tsk_thread_flag(p, TIF_MEMDIE); |
467 | put_task_struct(p); | ||
446 | return; | 468 | return; |
447 | } | 469 | } |
448 | 470 | ||
@@ -460,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
460 | * parent. This attempts to lose the minimal amount of work done while | 482 | * parent. This attempts to lose the minimal amount of work done while |
461 | * still freeing memory. | 483 | * still freeing memory. |
462 | */ | 484 | */ |
485 | read_lock(&tasklist_lock); | ||
463 | do { | 486 | do { |
464 | list_for_each_entry(child, &t->children, sibling) { | 487 | list_for_each_entry(child, &t->children, sibling) { |
465 | unsigned int child_points; | 488 | unsigned int child_points; |
@@ -472,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
472 | child_points = oom_badness(child, memcg, nodemask, | 495 | child_points = oom_badness(child, memcg, nodemask, |
473 | totalpages); | 496 | totalpages); |
474 | if (child_points > victim_points) { | 497 | if (child_points > victim_points) { |
498 | put_task_struct(victim); | ||
475 | victim = child; | 499 | victim = child; |
476 | victim_points = child_points; | 500 | victim_points = child_points; |
501 | get_task_struct(victim); | ||
477 | } | 502 | } |
478 | } | 503 | } |
479 | } while_each_thread(p, t); | 504 | } while_each_thread(p, t); |
505 | read_unlock(&tasklist_lock); | ||
480 | 506 | ||
481 | victim = find_lock_task_mm(victim); | 507 | rcu_read_lock(); |
482 | if (!victim) | 508 | p = find_lock_task_mm(victim); |
509 | if (!p) { | ||
510 | rcu_read_unlock(); | ||
511 | put_task_struct(victim); | ||
483 | return; | 512 | return; |
513 | } else if (victim != p) { | ||
514 | get_task_struct(p); | ||
515 | put_task_struct(victim); | ||
516 | victim = p; | ||
517 | } | ||
484 | 518 | ||
485 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 519 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
486 | mm = victim->mm; | 520 | mm = victim->mm; |
@@ -511,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
511 | task_unlock(p); | 545 | task_unlock(p); |
512 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | 546 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
513 | } | 547 | } |
548 | rcu_read_unlock(); | ||
514 | 549 | ||
515 | set_tsk_thread_flag(victim, TIF_MEMDIE); | 550 | set_tsk_thread_flag(victim, TIF_MEMDIE); |
516 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 551 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
552 | put_task_struct(victim); | ||
517 | } | 553 | } |
518 | #undef K | 554 | #undef K |
519 | 555 | ||
520 | /* | 556 | /* |
521 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 557 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
522 | */ | 558 | */ |
523 | static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 559 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, |
524 | int order, const nodemask_t *nodemask) | 560 | int order, const nodemask_t *nodemask) |
525 | { | 561 | { |
526 | if (likely(!sysctl_panic_on_oom)) | 562 | if (likely(!sysctl_panic_on_oom)) |
527 | return; | 563 | return; |
@@ -534,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
534 | if (constraint != CONSTRAINT_NONE) | 570 | if (constraint != CONSTRAINT_NONE) |
535 | return; | 571 | return; |
536 | } | 572 | } |
537 | read_lock(&tasklist_lock); | ||
538 | dump_header(NULL, gfp_mask, order, NULL, nodemask); | 573 | dump_header(NULL, gfp_mask, order, NULL, nodemask); |
539 | read_unlock(&tasklist_lock); | ||
540 | panic("Out of memory: %s panic_on_oom is enabled\n", | 574 | panic("Out of memory: %s panic_on_oom is enabled\n", |
541 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); | 575 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); |
542 | } | 576 | } |
543 | 577 | ||
544 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
545 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
546 | int order) | ||
547 | { | ||
548 | unsigned long limit; | ||
549 | unsigned int points = 0; | ||
550 | struct task_struct *p; | ||
551 | |||
552 | /* | ||
553 | * If current has a pending SIGKILL, then automatically select it. The | ||
554 | * goal is to allow it to allocate so that it may quickly exit and free | ||
555 | * its memory. | ||
556 | */ | ||
557 | if (fatal_signal_pending(current)) { | ||
558 | set_thread_flag(TIF_MEMDIE); | ||
559 | return; | ||
560 | } | ||
561 | |||
562 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
563 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | ||
564 | read_lock(&tasklist_lock); | ||
565 | p = select_bad_process(&points, limit, memcg, NULL, false); | ||
566 | if (p && PTR_ERR(p) != -1UL) | ||
567 | oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, | ||
568 | "Memory cgroup out of memory"); | ||
569 | read_unlock(&tasklist_lock); | ||
570 | } | ||
571 | #endif | ||
572 | |||
573 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | 578 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); |
574 | 579 | ||
575 | int register_oom_notifier(struct notifier_block *nb) | 580 | int register_oom_notifier(struct notifier_block *nb) |
@@ -691,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
691 | struct task_struct *p; | 696 | struct task_struct *p; |
692 | unsigned long totalpages; | 697 | unsigned long totalpages; |
693 | unsigned long freed = 0; | 698 | unsigned long freed = 0; |
694 | unsigned int points; | 699 | unsigned int uninitialized_var(points); |
695 | enum oom_constraint constraint = CONSTRAINT_NONE; | 700 | enum oom_constraint constraint = CONSTRAINT_NONE; |
696 | int killed = 0; | 701 | int killed = 0; |
697 | 702 | ||
@@ -719,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
719 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; | 724 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; |
720 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); | 725 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); |
721 | 726 | ||
722 | read_lock(&tasklist_lock); | 727 | if (sysctl_oom_kill_allocating_task && current->mm && |
723 | if (sysctl_oom_kill_allocating_task && | ||
724 | !oom_unkillable_task(current, NULL, nodemask) && | 728 | !oom_unkillable_task(current, NULL, nodemask) && |
725 | current->mm) { | 729 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { |
730 | get_task_struct(current); | ||
726 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, | 731 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, |
727 | nodemask, | 732 | nodemask, |
728 | "Out of memory (oom_kill_allocating_task)"); | 733 | "Out of memory (oom_kill_allocating_task)"); |
729 | goto out; | 734 | goto out; |
730 | } | 735 | } |
731 | 736 | ||
732 | p = select_bad_process(&points, totalpages, NULL, mpol_mask, | 737 | p = select_bad_process(&points, totalpages, mpol_mask, force_kill); |
733 | force_kill); | ||
734 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 738 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
735 | if (!p) { | 739 | if (!p) { |
736 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); | 740 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); |
737 | read_unlock(&tasklist_lock); | ||
738 | panic("Out of memory and no killable processes...\n"); | 741 | panic("Out of memory and no killable processes...\n"); |
739 | } | 742 | } |
740 | if (PTR_ERR(p) != -1UL) { | 743 | if (PTR_ERR(p) != -1UL) { |
@@ -743,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
743 | killed = 1; | 746 | killed = 1; |
744 | } | 747 | } |
745 | out: | 748 | out: |
746 | read_unlock(&tasklist_lock); | ||
747 | |||
748 | /* | 749 | /* |
749 | * Give "p" a good chance of killing itself before we | 750 | * Give the killed threads a good chance of exiting before trying to |
750 | * retry to allocate memory unless "p" is current | 751 | * allocate memory again. |
751 | */ | 752 | */ |
752 | if (killed && !test_thread_flag(TIF_MEMDIE)) | 753 | if (killed) |
753 | schedule_timeout_uninterruptible(1); | 754 | schedule_timeout_killable(1); |
754 | } | 755 | } |
755 | 756 | ||
756 | /* | 757 | /* |
@@ -765,6 +766,5 @@ void pagefault_out_of_memory(void) | |||
765 | out_of_memory(NULL, 0, 0, NULL, false); | 766 | out_of_memory(NULL, 0, 0, NULL, false); |
766 | clear_system_oom(); | 767 | clear_system_oom(); |
767 | } | 768 | } |
768 | if (!test_thread_flag(TIF_MEMDIE)) | 769 | schedule_timeout_killable(1); |
769 | schedule_timeout_uninterruptible(1); | ||
770 | } | 770 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 93d8d2f7108c..e5363f34e025 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ | 35 | #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ |
36 | #include <linux/pagevec.h> | 36 | #include <linux/pagevec.h> |
37 | #include <linux/timer.h> | ||
37 | #include <trace/events/writeback.h> | 38 | #include <trace/events/writeback.h> |
38 | 39 | ||
39 | /* | 40 | /* |
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit; | |||
135 | * measured in page writeback completions. | 136 | * measured in page writeback completions. |
136 | * | 137 | * |
137 | */ | 138 | */ |
138 | static struct prop_descriptor vm_completions; | 139 | static struct fprop_global writeout_completions; |
140 | |||
141 | static void writeout_period(unsigned long t); | ||
142 | /* Timer for aging of writeout_completions */ | ||
143 | static struct timer_list writeout_period_timer = | ||
144 | TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); | ||
145 | static unsigned long writeout_period_time = 0; | ||
146 | |||
147 | /* | ||
148 | * Length of period for aging writeout fractions of bdis. This is an | ||
149 | * arbitrarily chosen number. The longer the period, the slower fractions will | ||
150 | * reflect changes in current writeout rate. | ||
151 | */ | ||
152 | #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) | ||
139 | 153 | ||
140 | /* | 154 | /* |
141 | * Work out the current dirty-memory clamping and background writeout | 155 | * Work out the current dirty-memory clamping and background writeout |
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone) | |||
322 | zone_page_state(zone, NR_WRITEBACK) <= limit; | 336 | zone_page_state(zone, NR_WRITEBACK) <= limit; |
323 | } | 337 | } |
324 | 338 | ||
325 | /* | ||
326 | * couple the period to the dirty_ratio: | ||
327 | * | ||
328 | * period/2 ~ roundup_pow_of_two(dirty limit) | ||
329 | */ | ||
330 | static int calc_period_shift(void) | ||
331 | { | ||
332 | unsigned long dirty_total; | ||
333 | |||
334 | if (vm_dirty_bytes) | ||
335 | dirty_total = vm_dirty_bytes / PAGE_SIZE; | ||
336 | else | ||
337 | dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / | ||
338 | 100; | ||
339 | return 2 + ilog2(dirty_total - 1); | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * update the period when the dirty threshold changes. | ||
344 | */ | ||
345 | static void update_completion_period(void) | ||
346 | { | ||
347 | int shift = calc_period_shift(); | ||
348 | prop_change_shift(&vm_completions, shift); | ||
349 | |||
350 | writeback_set_ratelimit(); | ||
351 | } | ||
352 | |||
353 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 339 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
354 | void __user *buffer, size_t *lenp, | 340 | void __user *buffer, size_t *lenp, |
355 | loff_t *ppos) | 341 | loff_t *ppos) |
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, | |||
383 | 369 | ||
384 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 370 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
385 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 371 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
386 | update_completion_period(); | 372 | writeback_set_ratelimit(); |
387 | vm_dirty_bytes = 0; | 373 | vm_dirty_bytes = 0; |
388 | } | 374 | } |
389 | return ret; | 375 | return ret; |
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
398 | 384 | ||
399 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | 385 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
400 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | 386 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { |
401 | update_completion_period(); | 387 | writeback_set_ratelimit(); |
402 | vm_dirty_ratio = 0; | 388 | vm_dirty_ratio = 0; |
403 | } | 389 | } |
404 | return ret; | 390 | return ret; |
405 | } | 391 | } |
406 | 392 | ||
393 | static unsigned long wp_next_time(unsigned long cur_time) | ||
394 | { | ||
395 | cur_time += VM_COMPLETIONS_PERIOD_LEN; | ||
396 | /* 0 has a special meaning... */ | ||
397 | if (!cur_time) | ||
398 | return 1; | ||
399 | return cur_time; | ||
400 | } | ||
401 | |||
407 | /* | 402 | /* |
408 | * Increment the BDI's writeout completion count and the global writeout | 403 | * Increment the BDI's writeout completion count and the global writeout |
409 | * completion count. Called from test_clear_page_writeback(). | 404 | * completion count. Called from test_clear_page_writeback(). |
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
411 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | 406 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
412 | { | 407 | { |
413 | __inc_bdi_stat(bdi, BDI_WRITTEN); | 408 | __inc_bdi_stat(bdi, BDI_WRITTEN); |
414 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, | 409 | __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, |
415 | bdi->max_prop_frac); | 410 | bdi->max_prop_frac); |
411 | /* First event after period switching was turned off? */ | ||
412 | if (!unlikely(writeout_period_time)) { | ||
413 | /* | ||
414 | * We can race with other __bdi_writeout_inc calls here but | ||
415 | * it does not cause any harm since the resulting time when | ||
416 | * timer will fire and what is in writeout_period_time will be | ||
417 | * roughly the same. | ||
418 | */ | ||
419 | writeout_period_time = wp_next_time(jiffies); | ||
420 | mod_timer(&writeout_period_timer, writeout_period_time); | ||
421 | } | ||
416 | } | 422 | } |
417 | 423 | ||
418 | void bdi_writeout_inc(struct backing_dev_info *bdi) | 424 | void bdi_writeout_inc(struct backing_dev_info *bdi) |
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc); | |||
431 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | 437 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
432 | long *numerator, long *denominator) | 438 | long *numerator, long *denominator) |
433 | { | 439 | { |
434 | prop_fraction_percpu(&vm_completions, &bdi->completions, | 440 | fprop_fraction_percpu(&writeout_completions, &bdi->completions, |
435 | numerator, denominator); | 441 | numerator, denominator); |
436 | } | 442 | } |
437 | 443 | ||
438 | /* | 444 | /* |
445 | * On idle system, we can be called long after we scheduled because we use | ||
446 | * deferred timers so count with missed periods. | ||
447 | */ | ||
448 | static void writeout_period(unsigned long t) | ||
449 | { | ||
450 | int miss_periods = (jiffies - writeout_period_time) / | ||
451 | VM_COMPLETIONS_PERIOD_LEN; | ||
452 | |||
453 | if (fprop_new_period(&writeout_completions, miss_periods + 1)) { | ||
454 | writeout_period_time = wp_next_time(writeout_period_time + | ||
455 | miss_periods * VM_COMPLETIONS_PERIOD_LEN); | ||
456 | mod_timer(&writeout_period_timer, writeout_period_time); | ||
457 | } else { | ||
458 | /* | ||
459 | * Aging has zeroed all fractions. Stop wasting CPU on period | ||
460 | * updates. | ||
461 | */ | ||
462 | writeout_period_time = 0; | ||
463 | } | ||
464 | } | ||
465 | |||
466 | /* | ||
439 | * bdi_min_ratio keeps the sum of the minimum dirty shares of all | 467 | * bdi_min_ratio keeps the sum of the minimum dirty shares of all |
440 | * registered backing devices, which, for obvious reasons, can not | 468 | * registered backing devices, which, for obvious reasons, can not |
441 | * exceed 100%. | 469 | * exceed 100%. |
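Because the new writeout_period_timer is deferred, it can fire long after the period it was scheduled for, so writeout_period() works out how many whole periods were missed and ages the fractions once per missed period plus once for the period that just ended; wp_next_time() reserves the value 0 to mean "timer not armed". A small sketch of just that bookkeeping, with the re-arm/park decision (driven by fprop_new_period() in the kernel) left out and HZ chosen arbitrarily:

#include <stdio.h>

#define HZ 250UL
#define VM_COMPLETIONS_PERIOD_LEN (3 * HZ)

/* 0 is reserved to mean "period timer not armed", so skip it on wrap. */
static unsigned long wp_next_time(unsigned long cur_time)
{
        cur_time += VM_COMPLETIONS_PERIOD_LEN;
        return cur_time ? cur_time : 1;
}

/* Mirrors the missed-period handling in writeout_period(). */
static void writeout_period(unsigned long now, unsigned long *period_time)
{
        unsigned long miss_periods =
                (now - *period_time) / VM_COMPLETIONS_PERIOD_LEN;

        printf("aging fractions %lu time(s)\n", miss_periods + 1);
        *period_time = wp_next_time(*period_time +
                                    miss_periods * VM_COMPLETIONS_PERIOD_LEN);
        printf("next period expires at %lu\n", *period_time);
}

int main(void)
{
        unsigned long period_time = 1000;       /* when the timer was due */
        unsigned long now = 1000 + 5 * VM_COMPLETIONS_PERIOD_LEN + 17;

        writeout_period(now, &period_time);     /* fired five periods late */
        return 0;
}

Parking the timer (writeout_period_time = 0) when aging has zeroed every fraction is what lets an idle system avoid the periodic wakeups the old prop_descriptor code implied.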
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) | |||
475 | ret = -EINVAL; | 503 | ret = -EINVAL; |
476 | } else { | 504 | } else { |
477 | bdi->max_ratio = max_ratio; | 505 | bdi->max_ratio = max_ratio; |
478 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; | 506 | bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; |
479 | } | 507 | } |
480 | spin_unlock_bh(&bdi_lock); | 508 | spin_unlock_bh(&bdi_lock); |
481 | 509 | ||
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | |||
918 | * bdi->dirty_ratelimit = balanced_dirty_ratelimit; | 946 | * bdi->dirty_ratelimit = balanced_dirty_ratelimit; |
919 | * | 947 | * |
920 | * However to get a more stable dirty_ratelimit, the below elaborated | 948 | * However to get a more stable dirty_ratelimit, the below elaborated |
921 | * code makes use of task_ratelimit to filter out sigular points and | 949 | * code makes use of task_ratelimit to filter out singular points and |
922 | * limit the step size. | 950 | * limit the step size. |
923 | * | 951 | * |
924 | * The below code essentially only uses the relative value of | 952 | * The below code essentially only uses the relative value of |
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | |||
941 | * feel and care are stable dirty rate and small position error. | 969 | * feel and care are stable dirty rate and small position error. |
942 | * | 970 | * |
943 | * |task_ratelimit - dirty_ratelimit| is used to limit the step size | 971 | * |task_ratelimit - dirty_ratelimit| is used to limit the step size |
944 | * and filter out the sigular points of balanced_dirty_ratelimit. Which | 972 | * and filter out the singular points of balanced_dirty_ratelimit. Which |
945 | * keeps jumping around randomly and can even leap far away at times | 973 | * keeps jumping around randomly and can even leap far away at times |
946 | * due to the small 200ms estimation period of dirty_rate (we want to | 974 | * due to the small 200ms estimation period of dirty_rate (we want to |
947 | * keep that period small to reduce time lags). | 975 | * keep that period small to reduce time lags). |
@@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { | |||
1606 | */ | 1634 | */ |
1607 | void __init page_writeback_init(void) | 1635 | void __init page_writeback_init(void) |
1608 | { | 1636 | { |
1609 | int shift; | ||
1610 | |||
1611 | writeback_set_ratelimit(); | 1637 | writeback_set_ratelimit(); |
1612 | register_cpu_notifier(&ratelimit_nb); | 1638 | register_cpu_notifier(&ratelimit_nb); |
1613 | 1639 | ||
1614 | shift = calc_period_shift(); | 1640 | fprop_global_init(&writeout_completions); |
1615 | prop_descriptor_init(&vm_completions, shift); | ||
1616 | } | 1641 | } |
1617 | 1642 | ||
1618 | /** | 1643 | /** |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44030096da63..889532b8e6c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -51,7 +51,6 @@ | |||
51 | #include <linux/page_cgroup.h> | 51 | #include <linux/page_cgroup.h> |
52 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
53 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
54 | #include <linux/memory.h> | ||
55 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
56 | #include <trace/events/kmem.h> | 55 | #include <trace/events/kmem.h> |
57 | #include <linux/ftrace_event.h> | 56 | #include <linux/ftrace_event.h> |
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
219 | 218 | ||
220 | int page_group_by_mobility_disabled __read_mostly; | 219 | int page_group_by_mobility_disabled __read_mostly; |
221 | 220 | ||
222 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 221 | /* |
222 | * NOTE: | ||
223 | * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. | ||
224 | * Instead, use {un}set_pageblock_isolate. | ||
225 | */ | ||
226 | void set_pageblock_migratetype(struct page *page, int migratetype) | ||
223 | { | 227 | { |
224 | 228 | ||
225 | if (unlikely(page_group_by_mobility_disabled)) | 229 | if (unlikely(page_group_by_mobility_disabled)) |
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone, | |||
954 | return pages_moved; | 958 | return pages_moved; |
955 | } | 959 | } |
956 | 960 | ||
957 | static int move_freepages_block(struct zone *zone, struct page *page, | 961 | int move_freepages_block(struct zone *zone, struct page *page, |
958 | int migratetype) | 962 | int migratetype) |
959 | { | 963 | { |
960 | unsigned long start_pfn, end_pfn; | 964 | unsigned long start_pfn, end_pfn; |
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1158 | to_drain = pcp->batch; | 1162 | to_drain = pcp->batch; |
1159 | else | 1163 | else |
1160 | to_drain = pcp->count; | 1164 | to_drain = pcp->count; |
1161 | free_pcppages_bulk(zone, to_drain, pcp); | 1165 | if (to_drain > 0) { |
1162 | pcp->count -= to_drain; | 1166 | free_pcppages_bulk(zone, to_drain, pcp); |
1167 | pcp->count -= to_drain; | ||
1168 | } | ||
1163 | local_irq_restore(flags); | 1169 | local_irq_restore(flags); |
1164 | } | 1170 | } |
1165 | #endif | 1171 | #endif |
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str) | |||
1529 | } | 1535 | } |
1530 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 1536 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
1531 | 1537 | ||
1532 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1538 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1533 | { | 1539 | { |
1534 | if (order < fail_page_alloc.min_order) | 1540 | if (order < fail_page_alloc.min_order) |
1535 | return 0; | 1541 | return false; |
1536 | if (gfp_mask & __GFP_NOFAIL) | 1542 | if (gfp_mask & __GFP_NOFAIL) |
1537 | return 0; | 1543 | return false; |
1538 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 1544 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
1539 | return 0; | 1545 | return false; |
1540 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | 1546 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) |
1541 | return 0; | 1547 | return false; |
1542 | 1548 | ||
1543 | return should_fail(&fail_page_alloc.attr, 1 << order); | 1549 | return should_fail(&fail_page_alloc.attr, 1 << order); |
1544 | } | 1550 | } |
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs); | |||
1578 | 1584 | ||
1579 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 1585 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
1580 | 1586 | ||
1581 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1587 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1582 | { | 1588 | { |
1583 | return 0; | 1589 | return false; |
1584 | } | 1590 | } |
1585 | 1591 | ||
1586 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1592 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1594 | { | 1600 | { |
1595 | /* free_pages my go negative - that's OK */ | 1601 | /* free_pages my go negative - that's OK */ |
1596 | long min = mark; | 1602 | long min = mark; |
1603 | long lowmem_reserve = z->lowmem_reserve[classzone_idx]; | ||
1597 | int o; | 1604 | int o; |
1598 | 1605 | ||
1599 | free_pages -= (1 << order) - 1; | 1606 | free_pages -= (1 << order) - 1; |
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1602 | if (alloc_flags & ALLOC_HARDER) | 1609 | if (alloc_flags & ALLOC_HARDER) |
1603 | min -= min / 4; | 1610 | min -= min / 4; |
1604 | 1611 | ||
1605 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1612 | if (free_pages <= min + lowmem_reserve) |
1606 | return false; | 1613 | return false; |
1607 | for (o = 0; o < order; o++) { | 1614 | for (o = 0; o < order; o++) { |
1608 | /* At the next order, this order's pages become unavailable */ | 1615 | /* At the next order, this order's pages become unavailable */ |
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1617 | return true; | 1624 | return true; |
1618 | } | 1625 | } |
1619 | 1626 | ||
1627 | #ifdef CONFIG_MEMORY_ISOLATION | ||
1628 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1629 | { | ||
1630 | if (unlikely(zone->nr_pageblock_isolate)) | ||
1631 | return zone->nr_pageblock_isolate * pageblock_nr_pages; | ||
1632 | return 0; | ||
1633 | } | ||
1634 | #else | ||
1635 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1636 | { | ||
1637 | return 0; | ||
1638 | } | ||
1639 | #endif | ||
1640 | |||
1620 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1641 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1621 | int classzone_idx, int alloc_flags) | 1642 | int classzone_idx, int alloc_flags) |
1622 | { | 1643 | { |
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1632 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | 1653 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
1633 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | 1654 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
1634 | 1655 | ||
1656 | /* | ||
1657 | * If the zone has MIGRATE_ISOLATE type free pages, we should consider | ||
1658 | * it. nr_zone_isolate_freepages is never accurate so kswapd might not | ||
1659 | * sleep although it could do so. But this is more desirable for memory | ||
1660 | * hotplug than sleeping which can cause a livelock in the direct | ||
1661 | * reclaim path. | ||
1662 | */ | ||
1663 | free_pages -= nr_zone_isolate_freepages(z); | ||
1635 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1664 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1636 | free_pages); | 1665 | free_pages); |
1637 | } | 1666 | } |
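A rough user-space model of the watermark check after these two hunks, with the lowmem reserve, the per-order deduction, and the new isolated-freepage deduction folded into one function for illustration (the kernel splits the last one into zone_watermark_ok_safe(), and the ALLOC_HIGH/ALLOC_HARDER adjustments are omitted here):

#include <stdio.h>
#include <stdbool.h>

/* Simplified __zone_watermark_ok(): the zone passes only if, after
 * deducting pages unusable for this request (isolated pageblocks,
 * the lowmem reserve, and lower-order free pages), enough free pages
 * remain above the watermark. */
static bool zone_watermark_ok(long free_pages, long mark, long lowmem_reserve,
                              long isolated_free, int order,
                              const long *free_at_order)
{
        long min = mark;

        free_pages -= isolated_free;            /* MIGRATE_ISOLATE blocks don't count */
        free_pages -= (1L << order) - 1;

        if (free_pages <= min + lowmem_reserve)
                return false;

        for (int o = 0; o < order; o++) {
                /* Pages at lower orders can't satisfy this request. */
                free_pages -= free_at_order[o] << o;
                min >>= 1;                      /* higher orders get a relaxed min */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        long free_at_order[] = { 300, 100, 40, 10 };

        printf("order-2 ok:                      %d\n",
               zone_watermark_ok(1024, 256, 128, 0, 2, free_at_order));
        printf("order-2 ok with 512 isolated:    %d\n",
               zone_watermark_ok(1024, 256, 128, 512, 2, free_at_order));
        return 0;
}

The second call shows why the deduction matters for hotplug: pages sitting in isolated pageblocks still inflate NR_FREE_PAGES, so without subtracting them kswapd would believe the watermark is met even though none of those pages can be allocated.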
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2087 | 2116 | ||
2088 | page = get_page_from_freelist(gfp_mask, nodemask, | 2117 | page = get_page_from_freelist(gfp_mask, nodemask, |
2089 | order, zonelist, high_zoneidx, | 2118 | order, zonelist, high_zoneidx, |
2090 | alloc_flags, preferred_zone, | 2119 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2091 | migratetype); | 2120 | preferred_zone, migratetype); |
2092 | if (page) { | 2121 | if (page) { |
2093 | preferred_zone->compact_considered = 0; | 2122 | preferred_zone->compact_considered = 0; |
2094 | preferred_zone->compact_defer_shift = 0; | 2123 | preferred_zone->compact_defer_shift = 0; |
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2180 | retry: | 2209 | retry: |
2181 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2210 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2182 | zonelist, high_zoneidx, | 2211 | zonelist, high_zoneidx, |
2183 | alloc_flags, preferred_zone, | 2212 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2184 | migratetype); | 2213 | preferred_zone, migratetype); |
2185 | 2214 | ||
2186 | /* | 2215 | /* |
2187 | * If an allocation failed after direct reclaim, it could be because | 2216 | * If an allocation failed after direct reclaim, it could be because |
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2265 | alloc_flags |= ALLOC_HARDER; | 2294 | alloc_flags |= ALLOC_HARDER; |
2266 | 2295 | ||
2267 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 2296 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
2268 | if (!in_interrupt() && | 2297 | if (gfp_mask & __GFP_MEMALLOC) |
2269 | ((current->flags & PF_MEMALLOC) || | 2298 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2270 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2299 | else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) |
2300 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
2301 | else if (!in_interrupt() && | ||
2302 | ((current->flags & PF_MEMALLOC) || | ||
2303 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
2271 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2304 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2272 | } | 2305 | } |
2273 | 2306 | ||
2274 | return alloc_flags; | 2307 | return alloc_flags; |
2275 | } | 2308 | } |
2276 | 2309 | ||
2310 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | ||
2311 | { | ||
2312 | return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); | ||
2313 | } | ||
2314 | |||
2277 | static inline struct page * | 2315 | static inline struct page * |
2278 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2316 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2279 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2317 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
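The reworked ladder in gfp_to_alloc_flags(), and the new gfp_pfmemalloc_allowed() helper that merely reports its outcome, can be summarised as: __GFP_MEMALLOC always qualifies, PF_MEMALLOC qualifies from softirq, and otherwise PF_MEMALLOC or TIF_MEMDIE qualify only outside interrupt context. A hedged stand-alone sketch of that decision (the GFP_* values and struct ctx below are local stand-ins, not the kernel's flags):

#include <stdio.h>
#include <stdbool.h>

#define GFP_NOMEMALLOC  0x1
#define GFP_MEMALLOC    0x2

struct ctx {
        bool pf_memalloc;        /* current->flags & PF_MEMALLOC */
        bool tif_memdie;         /* task was picked by the OOM killer */
        bool in_interrupt;
        bool in_serving_softirq;
};

/* Decide whether this allocation may ignore the watermarks, i.e. dip
 * into the pfmemalloc reserves. */
static bool pfmemalloc_allowed(unsigned int gfp_mask, const struct ctx *c)
{
        if (gfp_mask & GFP_NOMEMALLOC)
                return false;                    /* explicitly forbidden */
        if (gfp_mask & GFP_MEMALLOC)
                return true;                     /* caller vouches it is freeing memory */
        if (c->in_serving_softirq && c->pf_memalloc)
                return true;                     /* e.g. memory-freeing receive path */
        if (!c->in_interrupt && (c->pf_memalloc || c->tif_memdie))
                return true;                     /* reclaim context or OOM victim */
        return false;
}

int main(void)
{
        struct ctx softirq = { .pf_memalloc = true, .in_interrupt = true,
                               .in_serving_softirq = true };
        struct ctx normal  = { 0 };

        printf("softirq + PF_MEMALLOC: %d\n", pfmemalloc_allowed(0, &softirq));
        printf("plain task:            %d\n", pfmemalloc_allowed(0, &normal));
        printf("__GFP_MEMALLOC:        %d\n", pfmemalloc_allowed(GFP_MEMALLOC, &normal));
        return 0;
}

Pages obtained this way are flagged via page->pfmemalloc further down in the slow path, signalling to the caller that they were carved out of the reserves and should not be used for ordinary purposes.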
@@ -2340,11 +2378,27 @@ rebalance: | |||
2340 | 2378 | ||
2341 | /* Allocate without watermarks if the context allows */ | 2379 | /* Allocate without watermarks if the context allows */ |
2342 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | 2380 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
2381 | /* | ||
2382 | * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds | ||
2383 | * the allocation is high priority and these type of | ||
2384 | * allocations are system rather than user orientated | ||
2385 | */ | ||
2386 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | ||
2387 | |||
2343 | page = __alloc_pages_high_priority(gfp_mask, order, | 2388 | page = __alloc_pages_high_priority(gfp_mask, order, |
2344 | zonelist, high_zoneidx, nodemask, | 2389 | zonelist, high_zoneidx, nodemask, |
2345 | preferred_zone, migratetype); | 2390 | preferred_zone, migratetype); |
2346 | if (page) | 2391 | if (page) { |
2392 | /* | ||
2393 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
2394 | * necessary to allocate the page. The expectation is | ||
2395 | * that the caller is taking steps that will free more | ||
2396 | * memory. The caller should avoid the page being used | ||
2397 | * for !PFMEMALLOC purposes. | ||
2398 | */ | ||
2399 | page->pfmemalloc = true; | ||
2347 | goto got_pg; | 2400 | goto got_pg; |
2401 | } | ||
2348 | } | 2402 | } |
2349 | 2403 | ||
2350 | /* Atomic allocations - we can't balance anything */ | 2404 | /* Atomic allocations - we can't balance anything */ |
@@ -2463,8 +2517,8 @@ nopage: | |||
2463 | got_pg: | 2517 | got_pg: |
2464 | if (kmemcheck_enabled) | 2518 | if (kmemcheck_enabled) |
2465 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | 2519 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); |
2466 | return page; | ||
2467 | 2520 | ||
2521 | return page; | ||
2468 | } | 2522 | } |
2469 | 2523 | ||
2470 | /* | 2524 | /* |
@@ -2515,6 +2569,8 @@ retry_cpuset: | |||
2515 | page = __alloc_pages_slowpath(gfp_mask, order, | 2569 | page = __alloc_pages_slowpath(gfp_mask, order, |
2516 | zonelist, high_zoneidx, nodemask, | 2570 | zonelist, high_zoneidx, nodemask, |
2517 | preferred_zone, migratetype); | 2571 | preferred_zone, migratetype); |
2572 | else | ||
2573 | page->pfmemalloc = false; | ||
2518 | 2574 | ||
2519 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2575 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2520 | 2576 | ||
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
3030 | user_zonelist_order = oldval; | 3086 | user_zonelist_order = oldval; |
3031 | } else if (oldval != user_zonelist_order) { | 3087 | } else if (oldval != user_zonelist_order) { |
3032 | mutex_lock(&zonelists_mutex); | 3088 | mutex_lock(&zonelists_mutex); |
3033 | build_all_zonelists(NULL); | 3089 | build_all_zonelists(NULL, NULL); |
3034 | mutex_unlock(&zonelists_mutex); | 3090 | mutex_unlock(&zonelists_mutex); |
3035 | } | 3091 | } |
3036 | } | 3092 | } |
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone); | |||
3409 | DEFINE_MUTEX(zonelists_mutex); | 3465 | DEFINE_MUTEX(zonelists_mutex); |
3410 | 3466 | ||
3411 | /* return values int ....just for stop_machine() */ | 3467 | /* return values int ....just for stop_machine() */ |
3412 | static __init_refok int __build_all_zonelists(void *data) | 3468 | static int __build_all_zonelists(void *data) |
3413 | { | 3469 | { |
3414 | int nid; | 3470 | int nid; |
3415 | int cpu; | 3471 | int cpu; |
3472 | pg_data_t *self = data; | ||
3416 | 3473 | ||
3417 | #ifdef CONFIG_NUMA | 3474 | #ifdef CONFIG_NUMA |
3418 | memset(node_load, 0, sizeof(node_load)); | 3475 | memset(node_load, 0, sizeof(node_load)); |
3419 | #endif | 3476 | #endif |
3477 | |||
3478 | if (self && !node_online(self->node_id)) { | ||
3479 | build_zonelists(self); | ||
3480 | build_zonelist_cache(self); | ||
3481 | } | ||
3482 | |||
3420 | for_each_online_node(nid) { | 3483 | for_each_online_node(nid) { |
3421 | pg_data_t *pgdat = NODE_DATA(nid); | 3484 | pg_data_t *pgdat = NODE_DATA(nid); |
3422 | 3485 | ||
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data) | |||
3461 | * Called with zonelists_mutex held always | 3524 | * Called with zonelists_mutex held always |
3462 | * unless system_state == SYSTEM_BOOTING. | 3525 | * unless system_state == SYSTEM_BOOTING. |
3463 | */ | 3526 | */ |
3464 | void __ref build_all_zonelists(void *data) | 3527 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
3465 | { | 3528 | { |
3466 | set_zonelist_order(); | 3529 | set_zonelist_order(); |
3467 | 3530 | ||
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data) | |||
3473 | /* we have to stop all cpus to guarantee there is no user | 3536 | /* we have to stop all cpus to guarantee there is no user |
3474 | of zonelist */ | 3537 | of zonelist */ |
3475 | #ifdef CONFIG_MEMORY_HOTPLUG | 3538 | #ifdef CONFIG_MEMORY_HOTPLUG |
3476 | if (data) | 3539 | if (zone) |
3477 | setup_zone_pageset((struct zone *)data); | 3540 | setup_zone_pageset(zone); |
3478 | #endif | 3541 | #endif |
3479 | stop_machine(__build_all_zonelists, NULL, NULL); | 3542 | stop_machine(__build_all_zonelists, pgdat, NULL); |
3480 | /* cpuset refresh routine should be here */ | 3543 | /* cpuset refresh routine should be here */ |
3481 | } | 3544 | } |
3482 | vm_total_pages = nr_free_pagecache_pages(); | 3545 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) | |||
3746 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 3809 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
3747 | #endif | 3810 | #endif |
3748 | 3811 | ||
3749 | static int zone_batchsize(struct zone *zone) | 3812 | static int __meminit zone_batchsize(struct zone *zone) |
3750 | { | 3813 | { |
3751 | #ifdef CONFIG_MMU | 3814 | #ifdef CONFIG_MMU |
3752 | int batch; | 3815 | int batch; |
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3828 | pcp->batch = PAGE_SHIFT * 8; | 3891 | pcp->batch = PAGE_SHIFT * 8; |
3829 | } | 3892 | } |
3830 | 3893 | ||
3831 | static void setup_zone_pageset(struct zone *zone) | 3894 | static void __meminit setup_zone_pageset(struct zone *zone) |
3832 | { | 3895 | { |
3833 | int cpu; | 3896 | int cpu; |
3834 | 3897 | ||
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
3901 | return 0; | 3964 | return 0; |
3902 | } | 3965 | } |
3903 | 3966 | ||
3904 | static int __zone_pcp_update(void *data) | ||
3905 | { | ||
3906 | struct zone *zone = data; | ||
3907 | int cpu; | ||
3908 | unsigned long batch = zone_batchsize(zone), flags; | ||
3909 | |||
3910 | for_each_possible_cpu(cpu) { | ||
3911 | struct per_cpu_pageset *pset; | ||
3912 | struct per_cpu_pages *pcp; | ||
3913 | |||
3914 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
3915 | pcp = &pset->pcp; | ||
3916 | |||
3917 | local_irq_save(flags); | ||
3918 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
3919 | setup_pageset(pset, batch); | ||
3920 | local_irq_restore(flags); | ||
3921 | } | ||
3922 | return 0; | ||
3923 | } | ||
3924 | |||
3925 | void zone_pcp_update(struct zone *zone) | ||
3926 | { | ||
3927 | stop_machine(__zone_pcp_update, zone, NULL); | ||
3928 | } | ||
3929 | |||
3930 | static __meminit void zone_pcp_init(struct zone *zone) | 3967 | static __meminit void zone_pcp_init(struct zone *zone) |
3931 | { | 3968 | { |
3932 | /* | 3969 | /* |
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
3942 | zone_batchsize(zone)); | 3979 | zone_batchsize(zone)); |
3943 | } | 3980 | } |
3944 | 3981 | ||
3945 | __meminit int init_currently_empty_zone(struct zone *zone, | 3982 | int __meminit init_currently_empty_zone(struct zone *zone, |
3946 | unsigned long zone_start_pfn, | 3983 | unsigned long zone_start_pfn, |
3947 | unsigned long size, | 3984 | unsigned long size, |
3948 | enum memmap_context context) | 3985 | enum memmap_context context) |
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, | |||
4301 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4338 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4302 | 4339 | ||
4303 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4340 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4304 | static inline void __init set_pageblock_order(void) | 4341 | void __init set_pageblock_order(void) |
4305 | { | 4342 | { |
4306 | unsigned int order; | 4343 | unsigned int order; |
4307 | 4344 | ||
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void) | |||
4329 | * include/linux/pageblock-flags.h for the values of pageblock_order based on | 4366 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4330 | * the kernel config | 4367 | * the kernel config |
4331 | */ | 4368 | */ |
4332 | static inline void set_pageblock_order(void) | 4369 | void __init set_pageblock_order(void) |
4333 | { | 4370 | { |
4334 | } | 4371 | } |
4335 | 4372 | ||
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void) | |||
4340 | * - mark all pages reserved | 4377 | * - mark all pages reserved |
4341 | * - mark all memory queues empty | 4378 | * - mark all memory queues empty |
4342 | * - clear the memory bitmaps | 4379 | * - clear the memory bitmaps |
4380 | * | ||
4381 | * NOTE: pgdat should get zeroed by caller. | ||
4343 | */ | 4382 | */ |
4344 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 4383 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
4345 | unsigned long *zones_size, unsigned long *zholes_size) | 4384 | unsigned long *zones_size, unsigned long *zholes_size) |
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4350 | int ret; | 4389 | int ret; |
4351 | 4390 | ||
4352 | pgdat_resize_init(pgdat); | 4391 | pgdat_resize_init(pgdat); |
4353 | pgdat->nr_zones = 0; | ||
4354 | init_waitqueue_head(&pgdat->kswapd_wait); | 4392 | init_waitqueue_head(&pgdat->kswapd_wait); |
4355 | pgdat->kswapd_max_order = 0; | 4393 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4356 | pgdat_page_cgroup_init(pgdat); | 4394 | pgdat_page_cgroup_init(pgdat); |
4357 | 4395 | ||
4358 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4396 | for (j = 0; j < MAX_NR_ZONES; j++) { |
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4394 | 4432 | ||
4395 | zone->spanned_pages = size; | 4433 | zone->spanned_pages = size; |
4396 | zone->present_pages = realsize; | 4434 | zone->present_pages = realsize; |
4435 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4436 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4437 | zone->spanned_pages; | ||
4438 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4439 | #endif | ||
4397 | #ifdef CONFIG_NUMA | 4440 | #ifdef CONFIG_NUMA |
4398 | zone->node = nid; | 4441 | zone->node = nid; |
4399 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4442 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4408 | 4451 | ||
4409 | zone_pcp_init(zone); | 4452 | zone_pcp_init(zone); |
4410 | lruvec_init(&zone->lruvec, zone); | 4453 | lruvec_init(&zone->lruvec, zone); |
4411 | zap_zone_vm_stats(zone); | ||
4412 | zone->flags = 0; | ||
4413 | if (!size) | 4454 | if (!size) |
4414 | continue; | 4455 | continue; |
4415 | 4456 | ||
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4469 | { | 4510 | { |
4470 | pg_data_t *pgdat = NODE_DATA(nid); | 4511 | pg_data_t *pgdat = NODE_DATA(nid); |
4471 | 4512 | ||
4513 | /* pg_data_t should be reset to zero when it's allocated */ | ||
4514 | WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx); | ||
4515 | |||
4472 | pgdat->node_id = nid; | 4516 | pgdat->node_id = nid; |
4473 | pgdat->node_start_pfn = node_start_pfn; | 4517 | pgdat->node_start_pfn = node_start_pfn; |
4474 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4518 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
@@ -4750,7 +4794,7 @@ out: | |||
4750 | } | 4794 | } |
4751 | 4795 | ||
4752 | /* Any regular memory on that node ? */ | 4796 | /* Any regular memory on that node ? */ |
4753 | static void check_for_regular_memory(pg_data_t *pgdat) | 4797 | static void __init check_for_regular_memory(pg_data_t *pgdat) |
4754 | { | 4798 | { |
4755 | #ifdef CONFIG_HIGHMEM | 4799 | #ifdef CONFIG_HIGHMEM |
4756 | enum zone_type zone_type; | 4800 | enum zone_type zone_type; |
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5468 | } | 5512 | } |
5469 | 5513 | ||
5470 | /* | 5514 | /* |
5471 | * This is designed as sub function...plz see page_isolation.c also. | 5515 | * This function checks whether pageblock includes unmovable pages or not. |
5472 | set/clear page block's type to be ISOLATE. | 5516 | If @count is not zero, it is okay to include fewer than @count unmovable pages |
5473 | * page allocater never alloc memory from ISOLATE block. | 5517 | * |
5518 | PageLRU check without isolation or lru_lock could race so that | ||
5519 | MIGRATE_MOVABLE block might include unmovable pages. It means you can't | ||
5520 | expect this function to be exact. | ||
5474 | */ | 5521 | */ |
5475 | 5522 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | |
5476 | static int | ||
5477 | __count_immobile_pages(struct zone *zone, struct page *page, int count) | ||
5478 | { | 5523 | { |
5479 | unsigned long pfn, iter, found; | 5524 | unsigned long pfn, iter, found; |
5480 | int mt; | 5525 | int mt; |
5481 | 5526 | ||
5482 | /* | 5527 | /* |
5483 | * For avoiding noise data, lru_add_drain_all() should be called | 5528 | * For avoiding noise data, lru_add_drain_all() should be called |
5484 | * If ZONE_MOVABLE, the zone never contains immobile pages | 5529 | * If ZONE_MOVABLE, the zone never contains unmovable pages |
5485 | */ | 5530 | */ |
5486 | if (zone_idx(zone) == ZONE_MOVABLE) | 5531 | if (zone_idx(zone) == ZONE_MOVABLE) |
5487 | return true; | 5532 | return false; |
5488 | mt = get_pageblock_migratetype(page); | 5533 | mt = get_pageblock_migratetype(page); |
5489 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) | 5534 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) |
5490 | return true; | 5535 | return false; |
5491 | 5536 | ||
5492 | pfn = page_to_pfn(page); | 5537 | pfn = page_to_pfn(page); |
5493 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { | 5538 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { |
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5497 | continue; | 5542 | continue; |
5498 | 5543 | ||
5499 | page = pfn_to_page(check); | 5544 | page = pfn_to_page(check); |
5500 | if (!page_count(page)) { | 5545 | /* |
5546 | * We can't use page_count without pinning the page | ||
5547 | * because another CPU can free the compound page. | ||
5548 | * This check already skips compound tails of THP | ||
5549 | * because their page->_count is zero at all times. | ||
5550 | */ | ||
5551 | if (!atomic_read(&page->_count)) { | ||
5501 | if (PageBuddy(page)) | 5552 | if (PageBuddy(page)) |
5502 | iter += (1 << page_order(page)) - 1; | 5553 | iter += (1 << page_order(page)) - 1; |
5503 | continue; | 5554 | continue; |
5504 | } | 5555 | } |
5556 | |||
5505 | if (!PageLRU(page)) | 5557 | if (!PageLRU(page)) |
5506 | found++; | 5558 | found++; |
5507 | /* | 5559 | /* |
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5518 | * page at boot. | 5570 | * page at boot. |
5519 | */ | 5571 | */ |
5520 | if (found > count) | 5572 | if (found > count) |
5521 | return false; | 5573 | return true; |
5522 | } | 5574 | } |
5523 | return true; | 5575 | return false; |
5524 | } | 5576 | } |
5525 | 5577 | ||
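Note the renamed helper also inverts its return value: __count_immobile_pages() returned true when the block looked safe to isolate, while has_unmovable_pages() returns true when it does not, tolerating up to @count apparent unmovable pages. A standalone sketch of that thresholded scan (the page model below is invented for illustration; it is not struct page):

#include <stdbool.h>
#include <stdio.h>

struct fake_page { bool on_lru; };   /* stand-in for struct page */

/* true if more than 'count' pages in the block look unmovable */
static bool has_unmovable_pages_model(const struct fake_page *blk,
                                      unsigned long nr, int count)
{
    unsigned long i, found = 0;

    for (i = 0; i < nr; i++) {
        if (!blk[i].on_lru && ++found > (unsigned long)count)
            return true;
    }
    return false;
}

int main(void)
{
    struct fake_page blk[4] = { {true}, {false}, {true}, {true} };

    /* one non-LRU page is tolerated when count == 1, but not when count == 0 */
    printf("%d %d\n", has_unmovable_pages_model(blk, 4, 1),
                      has_unmovable_pages_model(blk, 4, 0));
    return 0;
}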
5526 | bool is_pageblock_removable_nolock(struct page *page) | 5578 | bool is_pageblock_removable_nolock(struct page *page) |
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5544 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5596 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5545 | return false; | 5597 | return false; |
5546 | 5598 | ||
5547 | return __count_immobile_pages(zone, page, 0); | 5599 | return !has_unmovable_pages(zone, page, 0); |
5548 | } | ||
5549 | |||
5550 | int set_migratetype_isolate(struct page *page) | ||
5551 | { | ||
5552 | struct zone *zone; | ||
5553 | unsigned long flags, pfn; | ||
5554 | struct memory_isolate_notify arg; | ||
5555 | int notifier_ret; | ||
5556 | int ret = -EBUSY; | ||
5557 | |||
5558 | zone = page_zone(page); | ||
5559 | |||
5560 | spin_lock_irqsave(&zone->lock, flags); | ||
5561 | |||
5562 | pfn = page_to_pfn(page); | ||
5563 | arg.start_pfn = pfn; | ||
5564 | arg.nr_pages = pageblock_nr_pages; | ||
5565 | arg.pages_found = 0; | ||
5566 | |||
5567 | /* | ||
5568 | * It may be possible to isolate a pageblock even if the | ||
5569 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
5570 | * notifier chain is used by balloon drivers to return the | ||
5571 | * number of pages in a range that are held by the balloon | ||
5572 | * driver to shrink memory. If all the pages are accounted for | ||
5573 | * by balloons, are free, or on the LRU, isolation can continue. | ||
5574 | * Later, for example, when memory hotplug notifier runs, these | ||
5575 | * pages reported as "can be isolated" should be isolated(freed) | ||
5576 | * by the balloon driver through the memory notifier chain. | ||
5577 | */ | ||
5578 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | ||
5579 | notifier_ret = notifier_to_errno(notifier_ret); | ||
5580 | if (notifier_ret) | ||
5581 | goto out; | ||
5582 | /* | ||
5583 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | ||
5584 | * We just check MOVABLE pages. | ||
5585 | */ | ||
5586 | if (__count_immobile_pages(zone, page, arg.pages_found)) | ||
5587 | ret = 0; | ||
5588 | |||
5589 | /* | ||
5590 | * immobile means "not-on-lru" paes. If immobile is larger than | ||
5591 | * removable-by-driver pages reported by notifier, we'll fail. | ||
5592 | */ | ||
5593 | |||
5594 | out: | ||
5595 | if (!ret) { | ||
5596 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
5597 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
5598 | } | ||
5599 | |||
5600 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5601 | if (!ret) | ||
5602 | drain_all_pages(); | ||
5603 | return ret; | ||
5604 | } | ||
5605 | |||
5606 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | ||
5607 | { | ||
5608 | struct zone *zone; | ||
5609 | unsigned long flags; | ||
5610 | zone = page_zone(page); | ||
5611 | spin_lock_irqsave(&zone->lock, flags); | ||
5612 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
5613 | goto out; | ||
5614 | set_pageblock_migratetype(page, migratetype); | ||
5615 | move_freepages_block(zone, page, migratetype); | ||
5616 | out: | ||
5617 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5618 | } | 5600 | } |
5619 | 5601 | ||
5620 | #ifdef CONFIG_CMA | 5602 | #ifdef CONFIG_CMA |
@@ -5635,7 +5617,12 @@ static struct page * | |||
5635 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | 5617 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, |
5636 | int **resultp) | 5618 | int **resultp) |
5637 | { | 5619 | { |
5638 | return alloc_page(GFP_HIGHUSER_MOVABLE); | 5620 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; |
5621 | |||
5622 | if (PageHighMem(page)) | ||
5623 | gfp_mask |= __GFP_HIGHMEM; | ||
5624 | |||
5625 | return alloc_page(gfp_mask); | ||
5639 | } | 5626 | } |
5640 | 5627 | ||
5641 | /* [start, end) must belong to a single zone. */ | 5628 | /* [start, end) must belong to a single zone. */ |
@@ -5864,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) | |||
5864 | } | 5851 | } |
5865 | #endif | 5852 | #endif |
5866 | 5853 | ||
5854 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
5855 | static int __meminit __zone_pcp_update(void *data) | ||
5856 | { | ||
5857 | struct zone *zone = data; | ||
5858 | int cpu; | ||
5859 | unsigned long batch = zone_batchsize(zone), flags; | ||
5860 | |||
5861 | for_each_possible_cpu(cpu) { | ||
5862 | struct per_cpu_pageset *pset; | ||
5863 | struct per_cpu_pages *pcp; | ||
5864 | |||
5865 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
5866 | pcp = &pset->pcp; | ||
5867 | |||
5868 | local_irq_save(flags); | ||
5869 | if (pcp->count > 0) | ||
5870 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
5871 | setup_pageset(pset, batch); | ||
5872 | local_irq_restore(flags); | ||
5873 | } | ||
5874 | return 0; | ||
5875 | } | ||
5876 | |||
5877 | void __meminit zone_pcp_update(struct zone *zone) | ||
5878 | { | ||
5879 | stop_machine(__zone_pcp_update, zone, NULL); | ||
5880 | } | ||
5881 | #endif | ||
5882 | |||
5867 | #ifdef CONFIG_MEMORY_HOTREMOVE | 5883 | #ifdef CONFIG_MEMORY_HOTREMOVE |
5884 | void zone_pcp_reset(struct zone *zone) | ||
5885 | { | ||
5886 | unsigned long flags; | ||
5887 | |||
5888 | /* avoid races with drain_pages() */ | ||
5889 | local_irq_save(flags); | ||
5890 | if (zone->pageset != &boot_pageset) { | ||
5891 | free_percpu(zone->pageset); | ||
5892 | zone->pageset = &boot_pageset; | ||
5893 | } | ||
5894 | local_irq_restore(flags); | ||
5895 | } | ||
5896 | |||
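zone_pcp_reset() frees the zone's dynamic per-cpu pagesets and points the zone back at the static boot_pageset, with interrupts off so drain_pages() cannot race against the teardown. The swing-back-to-a-static-fallback pattern, reduced to plain C (types and names below are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

struct pageset { int count; };

static struct pageset boot_pageset;          /* static bootstrap set, never freed */

struct zone_model {
    struct pageset *pageset;                 /* dynamic set, or &boot_pageset */
};

static void zone_pcp_reset_model(struct zone_model *z)
{
    /* in the kernel this section runs under local_irq_save() */
    if (z->pageset != &boot_pageset) {
        free(z->pageset);
        z->pageset = &boot_pageset;          /* never leave a dangling pointer */
    }
}

int main(void)
{
    struct zone_model z = { .pageset = malloc(sizeof(struct pageset)) };

    zone_pcp_reset_model(&z);
    printf("back to boot set: %d\n", z.pageset == &boot_pageset);
    return 0;
}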
5868 | /* | 5897 | /* |
5869 | * All pages in the range must be isolated before calling this. | 5898 | * All pages in the range must be isolated before calling this. |
5870 | */ | 5899 | */ |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1ccbd714059c..5ddad0c6daa6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | |||
317 | #endif | 317 | #endif |
318 | 318 | ||
319 | 319 | ||
320 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 320 | #ifdef CONFIG_MEMCG_SWAP |
321 | 321 | ||
322 | static DEFINE_MUTEX(swap_cgroup_mutex); | 322 | static DEFINE_MUTEX(swap_cgroup_mutex); |
323 | struct swap_cgroup_ctrl { | 323 | struct swap_cgroup_ctrl { |
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | |||
392 | 392 | ||
393 | /** | 393 | /** |
394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | 394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. |
395 | * @end: swap entry to be cmpxchged | 395 | * @ent: swap entry to be cmpxchged |
396 | * @old: old id | 396 | * @old: old id |
397 | * @new: new id | 397 | * @new: new id |
398 | * | 398 | * |
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | |||
422 | /** | 422 | /** |
423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
424 | * @ent: swap entry to be recorded into | 424 | * @ent: swap entry to be recorded into |
425 | * @mem: mem_cgroup to be recorded | 425 | * @id: mem_cgroup to be recorded |
426 | * | 426 | * |
427 | * Returns old value at success, 0 at failure. | 427 | * Returns old value at success, 0 at failure. |
428 | * (Of course, old value can be 0.) | 428 | * (Of course, old value can be 0.) |
diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611e..78eee32ee486 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -17,7 +17,9 @@ | |||
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
20 | #include <linux/buffer_head.h> | ||
20 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
22 | #include <linux/frontswap.h> | ||
21 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
22 | 24 | ||
23 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 25 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
@@ -85,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err) | |||
85 | bio_put(bio); | 87 | bio_put(bio); |
86 | } | 88 | } |
87 | 89 | ||
90 | int generic_swapfile_activate(struct swap_info_struct *sis, | ||
91 | struct file *swap_file, | ||
92 | sector_t *span) | ||
93 | { | ||
94 | struct address_space *mapping = swap_file->f_mapping; | ||
95 | struct inode *inode = mapping->host; | ||
96 | unsigned blocks_per_page; | ||
97 | unsigned long page_no; | ||
98 | unsigned blkbits; | ||
99 | sector_t probe_block; | ||
100 | sector_t last_block; | ||
101 | sector_t lowest_block = -1; | ||
102 | sector_t highest_block = 0; | ||
103 | int nr_extents = 0; | ||
104 | int ret; | ||
105 | |||
106 | blkbits = inode->i_blkbits; | ||
107 | blocks_per_page = PAGE_SIZE >> blkbits; | ||
108 | |||
109 | /* | ||
110 | * Map all the blocks into the extent list. This code doesn't try | ||
111 | * to be very smart. | ||
112 | */ | ||
113 | probe_block = 0; | ||
114 | page_no = 0; | ||
115 | last_block = i_size_read(inode) >> blkbits; | ||
116 | while ((probe_block + blocks_per_page) <= last_block && | ||
117 | page_no < sis->max) { | ||
118 | unsigned block_in_page; | ||
119 | sector_t first_block; | ||
120 | |||
121 | first_block = bmap(inode, probe_block); | ||
122 | if (first_block == 0) | ||
123 | goto bad_bmap; | ||
124 | |||
125 | /* | ||
126 | * It must be PAGE_SIZE aligned on-disk | ||
127 | */ | ||
128 | if (first_block & (blocks_per_page - 1)) { | ||
129 | probe_block++; | ||
130 | goto reprobe; | ||
131 | } | ||
132 | |||
133 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
134 | block_in_page++) { | ||
135 | sector_t block; | ||
136 | |||
137 | block = bmap(inode, probe_block + block_in_page); | ||
138 | if (block == 0) | ||
139 | goto bad_bmap; | ||
140 | if (block != first_block + block_in_page) { | ||
141 | /* Discontiguity */ | ||
142 | probe_block++; | ||
143 | goto reprobe; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | first_block >>= (PAGE_SHIFT - blkbits); | ||
148 | if (page_no) { /* exclude the header page */ | ||
149 | if (first_block < lowest_block) | ||
150 | lowest_block = first_block; | ||
151 | if (first_block > highest_block) | ||
152 | highest_block = first_block; | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
157 | */ | ||
158 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
159 | if (ret < 0) | ||
160 | goto out; | ||
161 | nr_extents += ret; | ||
162 | page_no++; | ||
163 | probe_block += blocks_per_page; | ||
164 | reprobe: | ||
165 | continue; | ||
166 | } | ||
167 | ret = nr_extents; | ||
168 | *span = 1 + highest_block - lowest_block; | ||
169 | if (page_no == 0) | ||
170 | page_no = 1; /* force Empty message */ | ||
171 | sis->max = page_no; | ||
172 | sis->pages = page_no - 1; | ||
173 | sis->highest_bit = page_no - 1; | ||
174 | out: | ||
175 | return ret; | ||
176 | bad_bmap: | ||
177 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
178 | ret = -EINVAL; | ||
179 | goto out; | ||
180 | } | ||
181 | |||
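generic_swapfile_activate() walks the swap file block by block via bmap() and only accepts a page whose blocks_per_page blocks are physically contiguous and whose first block is PAGE_SIZE-aligned on disk; a hole fails swapon, and a misaligned or discontiguous run just advances probe_block. A compact model of that contiguity and alignment test over an invented block map (the array stands in for calling bmap() on the swap file's inode):

#include <stdbool.h>
#include <stdio.h>

/* bmap[] maps logical block -> physical block; 0 marks a hole */
static bool page_run_ok(const unsigned long *bmap, unsigned long probe,
                        unsigned blocks_per_page)
{
    unsigned long first = bmap[probe];
    unsigned i;

    if (first == 0)                          /* hole in the file */
        return false;
    if (first & (blocks_per_page - 1))       /* not PAGE_SIZE aligned on disk */
        return false;
    for (i = 1; i < blocks_per_page; i++)
        if (bmap[probe + i] != first + i)    /* discontiguous run */
            return false;
    return true;
}

int main(void)
{
    /* 4 blocks per page; the second page's run is discontiguous */
    unsigned long bmap[8] = { 8, 9, 10, 11, 16, 17, 99, 19 };

    printf("%d %d\n", page_run_ok(bmap, 0, 4), page_run_ok(bmap, 4, 4));
    return 0;
}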
88 | /* | 182 | /* |
89 | * We may have stale swap cache pages in memory: notice | 183 | * We may have stale swap cache pages in memory: notice |
90 | * them here and get rid of the unnecessary final write. | 184 | * them here and get rid of the unnecessary final write. |
@@ -93,11 +187,45 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
93 | { | 187 | { |
94 | struct bio *bio; | 188 | struct bio *bio; |
95 | int ret = 0, rw = WRITE; | 189 | int ret = 0, rw = WRITE; |
190 | struct swap_info_struct *sis = page_swap_info(page); | ||
96 | 191 | ||
97 | if (try_to_free_swap(page)) { | 192 | if (try_to_free_swap(page)) { |
98 | unlock_page(page); | 193 | unlock_page(page); |
99 | goto out; | 194 | goto out; |
100 | } | 195 | } |
196 | if (frontswap_store(page) == 0) { | ||
197 | set_page_writeback(page); | ||
198 | unlock_page(page); | ||
199 | end_page_writeback(page); | ||
200 | goto out; | ||
201 | } | ||
202 | |||
203 | if (sis->flags & SWP_FILE) { | ||
204 | struct kiocb kiocb; | ||
205 | struct file *swap_file = sis->swap_file; | ||
206 | struct address_space *mapping = swap_file->f_mapping; | ||
207 | struct iovec iov = { | ||
208 | .iov_base = kmap(page), | ||
209 | .iov_len = PAGE_SIZE, | ||
210 | }; | ||
211 | |||
212 | init_sync_kiocb(&kiocb, swap_file); | ||
213 | kiocb.ki_pos = page_file_offset(page); | ||
214 | kiocb.ki_left = PAGE_SIZE; | ||
215 | kiocb.ki_nbytes = PAGE_SIZE; | ||
216 | |||
217 | unlock_page(page); | ||
218 | ret = mapping->a_ops->direct_IO(KERNEL_WRITE, | ||
219 | &kiocb, &iov, | ||
220 | kiocb.ki_pos, 1); | ||
221 | kunmap(page); | ||
222 | if (ret == PAGE_SIZE) { | ||
223 | count_vm_event(PSWPOUT); | ||
224 | ret = 0; | ||
225 | } | ||
226 | return ret; | ||
227 | } | ||
228 | |||
101 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); | 229 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
102 | if (bio == NULL) { | 230 | if (bio == NULL) { |
103 | set_page_dirty(page); | 231 | set_page_dirty(page); |
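The new swap_writepage() ordering is: try frontswap_store() first, then the SWP_FILE direct_IO path for file-backed swap, and only then fall back to the usual bio submission against the block device. A sketch of that three-way dispatch with the backends reduced to flags (the enum and helper below are illustrative stand-ins, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum swap_backend { BACKEND_FRONTSWAP, BACKEND_FILE, BACKEND_BIO };

/* model of the swap_writepage() dispatch order */
static enum swap_backend pick_write_path(bool frontswap_took_it, bool swp_file)
{
    if (frontswap_took_it)
        return BACKEND_FRONTSWAP;
    if (swp_file)
        return BACKEND_FILE;
    return BACKEND_BIO;
}

int main(void)
{
    printf("%d %d %d\n",
           pick_write_path(true,  true),    /* frontswap wins even for SWP_FILE */
           pick_write_path(false, true),    /* file-backed swap -> direct_IO    */
           pick_write_path(false, false));  /* block device -> bio              */
    return 0;
}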
@@ -119,9 +247,26 @@ int swap_readpage(struct page *page) | |||
119 | { | 247 | { |
120 | struct bio *bio; | 248 | struct bio *bio; |
121 | int ret = 0; | 249 | int ret = 0; |
250 | struct swap_info_struct *sis = page_swap_info(page); | ||
122 | 251 | ||
123 | VM_BUG_ON(!PageLocked(page)); | 252 | VM_BUG_ON(!PageLocked(page)); |
124 | VM_BUG_ON(PageUptodate(page)); | 253 | VM_BUG_ON(PageUptodate(page)); |
254 | if (frontswap_load(page) == 0) { | ||
255 | SetPageUptodate(page); | ||
256 | unlock_page(page); | ||
257 | goto out; | ||
258 | } | ||
259 | |||
260 | if (sis->flags & SWP_FILE) { | ||
261 | struct file *swap_file = sis->swap_file; | ||
262 | struct address_space *mapping = swap_file->f_mapping; | ||
263 | |||
264 | ret = mapping->a_ops->readpage(swap_file, page); | ||
265 | if (!ret) | ||
266 | count_vm_event(PSWPIN); | ||
267 | return ret; | ||
268 | } | ||
269 | |||
125 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 270 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
126 | if (bio == NULL) { | 271 | if (bio == NULL) { |
127 | unlock_page(page); | 272 | unlock_page(page); |
@@ -133,3 +278,15 @@ int swap_readpage(struct page *page) | |||
133 | out: | 278 | out: |
134 | return ret; | 279 | return ret; |
135 | } | 280 | } |
281 | |||
282 | int swap_set_page_dirty(struct page *page) | ||
283 | { | ||
284 | struct swap_info_struct *sis = page_swap_info(page); | ||
285 | |||
286 | if (sis->flags & SWP_FILE) { | ||
287 | struct address_space *mapping = sis->swap_file->f_mapping; | ||
288 | return mapping->a_ops->set_page_dirty(page); | ||
289 | } else { | ||
290 | return __set_page_dirty_no_writeback(page); | ||
291 | } | ||
292 | } | ||
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c9f04774f2b8..247d1f175739 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -5,8 +5,101 @@ | |||
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/page-isolation.h> | 6 | #include <linux/page-isolation.h> |
7 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
8 | #include <linux/memory.h> | ||
8 | #include "internal.h" | 9 | #include "internal.h" |
9 | 10 | ||
11 | /* called while holding zone->lock */ | ||
12 | static void set_pageblock_isolate(struct page *page) | ||
13 | { | ||
14 | if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) | ||
15 | return; | ||
16 | |||
17 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
18 | page_zone(page)->nr_pageblock_isolate++; | ||
19 | } | ||
20 | |||
21 | /* called while holding zone->lock */ | ||
22 | static void restore_pageblock_isolate(struct page *page, int migratetype) | ||
23 | { | ||
24 | struct zone *zone = page_zone(page); | ||
25 | if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) | ||
26 | return; | ||
27 | |||
28 | BUG_ON(zone->nr_pageblock_isolate <= 0); | ||
29 | set_pageblock_migratetype(page, migratetype); | ||
30 | zone->nr_pageblock_isolate--; | ||
31 | } | ||
32 | |||
33 | int set_migratetype_isolate(struct page *page) | ||
34 | { | ||
35 | struct zone *zone; | ||
36 | unsigned long flags, pfn; | ||
37 | struct memory_isolate_notify arg; | ||
38 | int notifier_ret; | ||
39 | int ret = -EBUSY; | ||
40 | |||
41 | zone = page_zone(page); | ||
42 | |||
43 | spin_lock_irqsave(&zone->lock, flags); | ||
44 | |||
45 | pfn = page_to_pfn(page); | ||
46 | arg.start_pfn = pfn; | ||
47 | arg.nr_pages = pageblock_nr_pages; | ||
48 | arg.pages_found = 0; | ||
49 | |||
50 | /* | ||
51 | * It may be possible to isolate a pageblock even if the | ||
52 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
53 | * notifier chain is used by balloon drivers to return the | ||
54 | * number of pages in a range that are held by the balloon | ||
55 | * driver to shrink memory. If all the pages are accounted for | ||
56 | * by balloons, are free, or on the LRU, isolation can continue. | ||
57 | * Later, for example, when memory hotplug notifier runs, these | ||
58 | * pages reported as "can be isolated" should be isolated(freed) | ||
59 | * by the balloon driver through the memory notifier chain. | ||
60 | */ | ||
61 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | ||
62 | notifier_ret = notifier_to_errno(notifier_ret); | ||
63 | if (notifier_ret) | ||
64 | goto out; | ||
65 | /* | ||
66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | ||
67 | * We just check MOVABLE pages. | ||
68 | */ | ||
69 | if (!has_unmovable_pages(zone, page, arg.pages_found)) | ||
70 | ret = 0; | ||
71 | |||
72 | /* | ||
73 | * immobile means "not-on-lru" pages. If immobile is larger than | ||
74 | * removable-by-driver pages reported by notifier, we'll fail. | ||
75 | */ | ||
76 | |||
77 | out: | ||
78 | if (!ret) { | ||
79 | set_pageblock_isolate(page); | ||
80 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
81 | } | ||
82 | |||
83 | spin_unlock_irqrestore(&zone->lock, flags); | ||
84 | if (!ret) | ||
85 | drain_all_pages(); | ||
86 | return ret; | ||
87 | } | ||
88 | |||
89 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | ||
90 | { | ||
91 | struct zone *zone; | ||
92 | unsigned long flags; | ||
93 | zone = page_zone(page); | ||
94 | spin_lock_irqsave(&zone->lock, flags); | ||
95 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
96 | goto out; | ||
97 | move_freepages_block(zone, page, migratetype); | ||
98 | restore_pageblock_isolate(page, migratetype); | ||
99 | out: | ||
100 | spin_unlock_irqrestore(&zone->lock, flags); | ||
101 | } | ||
102 | |||
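The new set_pageblock_isolate()/restore_pageblock_isolate() pair keeps zone->nr_pageblock_isolate in step with the number of pageblocks whose migratetype is MIGRATE_ISOLATE: the counter only moves on a real state transition, and both helpers run under zone->lock. A reduced model of that guarded transition (locking elided, names illustrative):

#include <assert.h>
#include <stdio.h>

enum mt { MIGRATE_MOVABLE, MIGRATE_ISOLATE };

struct zone_model  { long nr_isolate; };
struct block_model { enum mt type; };

static void set_isolate(struct zone_model *z, struct block_model *b)
{
    if (b->type == MIGRATE_ISOLATE)     /* already isolated: no double count */
        return;
    b->type = MIGRATE_ISOLATE;
    z->nr_isolate++;
}

static void restore_isolate(struct zone_model *z, struct block_model *b, enum mt to)
{
    if (b->type != MIGRATE_ISOLATE)     /* nothing to undo */
        return;
    assert(z->nr_isolate > 0);
    b->type = to;
    z->nr_isolate--;
}

int main(void)
{
    struct zone_model  z = { 0 };
    struct block_model b = { MIGRATE_MOVABLE };

    set_isolate(&z, &b);
    set_isolate(&z, &b);                /* idempotent: still counted once */
    restore_isolate(&z, &b, MIGRATE_MOVABLE);
    printf("nr_isolate = %ld\n", z.nr_isolate);   /* 0 */
    return 0;
}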
10 | static inline struct page * | 103 | static inline struct page * |
11 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) | 104 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) |
12 | { | 105 | { |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index aa9701e12714..6c118d012bb5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
162 | 162 | ||
163 | /** | 163 | /** |
164 | * walk_page_range - walk a memory map's page tables with a callback | 164 | * walk_page_range - walk a memory map's page tables with a callback |
165 | * @mm: memory map to walk | ||
166 | * @addr: starting address | 165 | * @addr: starting address |
167 | * @end: ending address | 166 | * @end: ending address |
168 | * @walk: set of callbacks to invoke for each level of the tree | 167 | * @walk: set of callbacks to invoke for each level of the tree |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 405d331804c3..3707c71ae4cd 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -360,7 +360,6 @@ err_free: | |||
360 | * @chunk: chunk to depopulate | 360 | * @chunk: chunk to depopulate |
361 | * @off: offset to the area to depopulate | 361 | * @off: offset to the area to depopulate |
362 | * @size: size of the area to depopulate in bytes | 362 | * @size: size of the area to depopulate in bytes |
363 | * @flush: whether to flush cache and tlb or not | ||
364 | * | 363 | * |
365 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | 364 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
366 | * from @chunk. If @flush is true, vcache is flushed before unmapping | 365 | * from @chunk. If @flush is true, vcache is flushed before unmapping |
diff --git a/mm/shmem.c b/mm/shmem.c index c244e93a70fa..d4e184e2a38e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping, | |||
264 | } | 264 | } |
265 | 265 | ||
266 | /* | 266 | /* |
267 | * Sometimes, before we decide whether to proceed or to fail, we must check | ||
268 | * that an entry was not already brought back from swap by a racing thread. | ||
269 | * | ||
270 | * Checking page is not enough: by the time a SwapCache page is locked, it | ||
271 | * might be reused, and again be SwapCache, using the same swap as before. | ||
272 | */ | ||
273 | static bool shmem_confirm_swap(struct address_space *mapping, | ||
274 | pgoff_t index, swp_entry_t swap) | ||
275 | { | ||
276 | void *item; | ||
277 | |||
278 | rcu_read_lock(); | ||
279 | item = radix_tree_lookup(&mapping->page_tree, index); | ||
280 | rcu_read_unlock(); | ||
281 | return item == swp_to_radix_entry(swap); | ||
282 | } | ||
283 | |||
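shmem_confirm_swap() guards against the race the comment above spells out: locking the page is not enough, because by then the SwapCache page may have been reused for the same swap slot, so the radix_tree slot itself is re-read and compared with the expected swap entry. A toy model of that recheck (the "mapping" below is just an array of encoded slot values, and the odd-value encoding is an illustration, not the real exceptional-entry format):

#include <stdbool.h>
#include <stdio.h>

/* encode a swap entry as an odd value so it cannot collide with a pointer */
static unsigned long swp_to_slot(unsigned long swp_val)
{
    return (swp_val << 1) | 1;
}

static bool confirm_swap(const unsigned long *mapping, unsigned long index,
                         unsigned long swp_val)
{
    return mapping[index] == swp_to_slot(swp_val);
}

int main(void)
{
    unsigned long mapping[2];

    mapping[0] = swp_to_slot(42);        /* slot still holds our swap entry  */
    mapping[1] = 0x1000;                 /* raced: slot now holds a page     */

    printf("%d %d\n", confirm_swap(mapping, 0, 42), confirm_swap(mapping, 1, 42));
    return 0;
}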
284 | /* | ||
267 | * Like add_to_page_cache_locked, but error if expected item has gone. | 285 | * Like add_to_page_cache_locked, but error if expected item has gone. |
268 | */ | 286 | */ |
269 | static int shmem_add_to_page_cache(struct page *page, | 287 | static int shmem_add_to_page_cache(struct page *page, |
270 | struct address_space *mapping, | 288 | struct address_space *mapping, |
271 | pgoff_t index, gfp_t gfp, void *expected) | 289 | pgoff_t index, gfp_t gfp, void *expected) |
272 | { | 290 | { |
273 | int error = 0; | 291 | int error; |
274 | 292 | ||
275 | VM_BUG_ON(!PageLocked(page)); | 293 | VM_BUG_ON(!PageLocked(page)); |
276 | VM_BUG_ON(!PageSwapBacked(page)); | 294 | VM_BUG_ON(!PageSwapBacked(page)); |
277 | 295 | ||
296 | page_cache_get(page); | ||
297 | page->mapping = mapping; | ||
298 | page->index = index; | ||
299 | |||
300 | spin_lock_irq(&mapping->tree_lock); | ||
278 | if (!expected) | 301 | if (!expected) |
279 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | 302 | error = radix_tree_insert(&mapping->page_tree, index, page); |
303 | else | ||
304 | error = shmem_radix_tree_replace(mapping, index, expected, | ||
305 | page); | ||
280 | if (!error) { | 306 | if (!error) { |
281 | page_cache_get(page); | 307 | mapping->nrpages++; |
282 | page->mapping = mapping; | 308 | __inc_zone_page_state(page, NR_FILE_PAGES); |
283 | page->index = index; | 309 | __inc_zone_page_state(page, NR_SHMEM); |
284 | 310 | spin_unlock_irq(&mapping->tree_lock); | |
285 | spin_lock_irq(&mapping->tree_lock); | 311 | } else { |
286 | if (!expected) | 312 | page->mapping = NULL; |
287 | error = radix_tree_insert(&mapping->page_tree, | 313 | spin_unlock_irq(&mapping->tree_lock); |
288 | index, page); | 314 | page_cache_release(page); |
289 | else | ||
290 | error = shmem_radix_tree_replace(mapping, index, | ||
291 | expected, page); | ||
292 | if (!error) { | ||
293 | mapping->nrpages++; | ||
294 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
295 | __inc_zone_page_state(page, NR_SHMEM); | ||
296 | spin_unlock_irq(&mapping->tree_lock); | ||
297 | } else { | ||
298 | page->mapping = NULL; | ||
299 | spin_unlock_irq(&mapping->tree_lock); | ||
300 | page_cache_release(page); | ||
301 | } | ||
302 | if (!expected) | ||
303 | radix_tree_preload_end(); | ||
304 | } | 315 | } |
305 | if (error) | ||
306 | mem_cgroup_uncharge_cache_page(page); | ||
307 | return error; | 316 | return error; |
308 | } | 317 | } |
309 | 318 | ||
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
683 | mutex_lock(&shmem_swaplist_mutex); | 692 | mutex_lock(&shmem_swaplist_mutex); |
684 | /* | 693 | /* |
685 | * We needed to drop mutex to make that restrictive page | 694 | * We needed to drop mutex to make that restrictive page |
686 | * allocation; but the inode might already be freed by now, | 695 | * allocation, but the inode might have been freed while we |
687 | * and we cannot refer to inode or mapping or info to check. | 696 | * dropped it: although a racing shmem_evict_inode() cannot |
688 | * However, we do hold page lock on the PageSwapCache page, | 697 | * complete without emptying the radix_tree, our page lock |
689 | * so can check if that still has our reference remaining. | 698 | * on this swapcache page is not enough to prevent that - |
699 | * free_swap_and_cache() of our swap entry will only | ||
700 | * trylock_page(), removing swap from radix_tree whatever. | ||
701 | * | ||
702 | * We must not proceed to shmem_add_to_page_cache() if the | ||
703 | * inode has been freed, but of course we cannot rely on | ||
704 | * inode or mapping or info to check that. However, we can | ||
705 | * safely check if our swap entry is still in use (and here | ||
706 | * it can't have got reused for another page): if it's still | ||
707 | * in use, then the inode cannot have been freed yet, and we | ||
708 | * can safely proceed (if it's no longer in use, that tells | ||
709 | * nothing about the inode, but we don't need to unuse swap). | ||
690 | */ | 710 | */ |
691 | if (!page_swapcount(*pagep)) | 711 | if (!page_swapcount(*pagep)) |
692 | error = -ENOENT; | 712 | error = -ENOENT; |
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
730 | 750 | ||
731 | /* | 751 | /* |
732 | * There's a faint possibility that swap page was replaced before | 752 | * There's a faint possibility that swap page was replaced before |
733 | * caller locked it: it will come back later with the right page. | 753 | * caller locked it: caller will come back later with the right page. |
734 | */ | 754 | */ |
735 | if (unlikely(!PageSwapCache(page))) | 755 | if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) |
736 | goto out; | 756 | goto out; |
737 | 757 | ||
738 | /* | 758 | /* |
@@ -909,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, | |||
909 | 929 | ||
910 | /* Create a pseudo vma that just contains the policy */ | 930 | /* Create a pseudo vma that just contains the policy */ |
911 | pvma.vm_start = 0; | 931 | pvma.vm_start = 0; |
912 | pvma.vm_pgoff = index; | 932 | /* Bias interleave by inode number to distribute better across nodes */ |
933 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
913 | pvma.vm_ops = NULL; | 934 | pvma.vm_ops = NULL; |
914 | pvma.vm_policy = spol; | 935 | pvma.vm_policy = spol; |
915 | return swapin_readahead(swap, gfp, &pvma, 0); | 936 | return swapin_readahead(swap, gfp, &pvma, 0); |
@@ -922,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
922 | 943 | ||
923 | /* Create a pseudo vma that just contains the policy */ | 944 | /* Create a pseudo vma that just contains the policy */ |
924 | pvma.vm_start = 0; | 945 | pvma.vm_start = 0; |
925 | pvma.vm_pgoff = index; | 946 | /* Bias interleave by inode number to distribute better across nodes */ |
947 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
926 | pvma.vm_ops = NULL; | 948 | pvma.vm_ops = NULL; |
927 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); | 949 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
928 | 950 | ||
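Both pseudo-vma builders above now add the inode number into vm_pgoff, so that under an interleave mempolicy different tmpfs files start their round-robin on different nodes instead of all landing index 0 on node 0. The effect, reduced to the modulo arithmetic (a deliberate simplification of real mempolicy interleaving):

#include <stdio.h>

/* node chosen for a page under a simple interleave policy */
static unsigned interleave_node(unsigned long pgoff, unsigned long ino,
                                unsigned nr_nodes)
{
    return (pgoff + ino) % nr_nodes;     /* bias by inode number */
}

int main(void)
{
    unsigned nr_nodes = 4;
    unsigned long index = 0;

    /* index 0 of four different files now lands on four different nodes */
    for (unsigned long ino = 100; ino < 104; ino++)
        printf("ino %lu -> node %u\n", ino, interleave_node(index, ino, nr_nodes));
    return 0;
}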
@@ -995,21 +1017,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
995 | newpage = shmem_alloc_page(gfp, info, index); | 1017 | newpage = shmem_alloc_page(gfp, info, index); |
996 | if (!newpage) | 1018 | if (!newpage) |
997 | return -ENOMEM; | 1019 | return -ENOMEM; |
998 | VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); | ||
999 | 1020 | ||
1000 | *pagep = newpage; | ||
1001 | page_cache_get(newpage); | 1021 | page_cache_get(newpage); |
1002 | copy_highpage(newpage, oldpage); | 1022 | copy_highpage(newpage, oldpage); |
1023 | flush_dcache_page(newpage); | ||
1003 | 1024 | ||
1004 | VM_BUG_ON(!PageLocked(oldpage)); | ||
1005 | __set_page_locked(newpage); | 1025 | __set_page_locked(newpage); |
1006 | VM_BUG_ON(!PageUptodate(oldpage)); | ||
1007 | SetPageUptodate(newpage); | 1026 | SetPageUptodate(newpage); |
1008 | VM_BUG_ON(!PageSwapBacked(oldpage)); | ||
1009 | SetPageSwapBacked(newpage); | 1027 | SetPageSwapBacked(newpage); |
1010 | VM_BUG_ON(!swap_index); | ||
1011 | set_page_private(newpage, swap_index); | 1028 | set_page_private(newpage, swap_index); |
1012 | VM_BUG_ON(!PageSwapCache(oldpage)); | ||
1013 | SetPageSwapCache(newpage); | 1029 | SetPageSwapCache(newpage); |
1014 | 1030 | ||
1015 | /* | 1031 | /* |
@@ -1019,13 +1035,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1019 | spin_lock_irq(&swap_mapping->tree_lock); | 1035 | spin_lock_irq(&swap_mapping->tree_lock); |
1020 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, | 1036 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, |
1021 | newpage); | 1037 | newpage); |
1022 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 1038 | if (!error) { |
1023 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | 1039 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
1040 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | ||
1041 | } | ||
1024 | spin_unlock_irq(&swap_mapping->tree_lock); | 1042 | spin_unlock_irq(&swap_mapping->tree_lock); |
1025 | BUG_ON(error); | ||
1026 | 1043 | ||
1027 | mem_cgroup_replace_page_cache(oldpage, newpage); | 1044 | if (unlikely(error)) { |
1028 | lru_cache_add_anon(newpage); | 1045 | /* |
1046 | * Is this possible? I think not, now that our callers check | ||
1047 | * both PageSwapCache and page_private after getting page lock; | ||
1048 | * but be defensive. Reverse old to newpage for clear and free. | ||
1049 | */ | ||
1050 | oldpage = newpage; | ||
1051 | } else { | ||
1052 | mem_cgroup_replace_page_cache(oldpage, newpage); | ||
1053 | lru_cache_add_anon(newpage); | ||
1054 | *pagep = newpage; | ||
1055 | } | ||
1029 | 1056 | ||
1030 | ClearPageSwapCache(oldpage); | 1057 | ClearPageSwapCache(oldpage); |
1031 | set_page_private(oldpage, 0); | 1058 | set_page_private(oldpage, 0); |
@@ -1033,7 +1060,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1033 | unlock_page(oldpage); | 1060 | unlock_page(oldpage); |
1034 | page_cache_release(oldpage); | 1061 | page_cache_release(oldpage); |
1035 | page_cache_release(oldpage); | 1062 | page_cache_release(oldpage); |
1036 | return 0; | 1063 | return error; |
1037 | } | 1064 | } |
1038 | 1065 | ||
1039 | /* | 1066 | /* |
@@ -1107,9 +1134,10 @@ repeat: | |||
1107 | 1134 | ||
1108 | /* We have to do this with page locked to prevent races */ | 1135 | /* We have to do this with page locked to prevent races */ |
1109 | lock_page(page); | 1136 | lock_page(page); |
1110 | if (!PageSwapCache(page) || page->mapping) { | 1137 | if (!PageSwapCache(page) || page_private(page) != swap.val || |
1138 | !shmem_confirm_swap(mapping, index, swap)) { | ||
1111 | error = -EEXIST; /* try again */ | 1139 | error = -EEXIST; /* try again */ |
1112 | goto failed; | 1140 | goto unlock; |
1113 | } | 1141 | } |
1114 | if (!PageUptodate(page)) { | 1142 | if (!PageUptodate(page)) { |
1115 | error = -EIO; | 1143 | error = -EIO; |
@@ -1125,9 +1153,12 @@ repeat: | |||
1125 | 1153 | ||
1126 | error = mem_cgroup_cache_charge(page, current->mm, | 1154 | error = mem_cgroup_cache_charge(page, current->mm, |
1127 | gfp & GFP_RECLAIM_MASK); | 1155 | gfp & GFP_RECLAIM_MASK); |
1128 | if (!error) | 1156 | if (!error) { |
1129 | error = shmem_add_to_page_cache(page, mapping, index, | 1157 | error = shmem_add_to_page_cache(page, mapping, index, |
1130 | gfp, swp_to_radix_entry(swap)); | 1158 | gfp, swp_to_radix_entry(swap)); |
1159 | /* We already confirmed swap, and make no allocation */ | ||
1160 | VM_BUG_ON(error); | ||
1161 | } | ||
1131 | if (error) | 1162 | if (error) |
1132 | goto failed; | 1163 | goto failed; |
1133 | 1164 | ||
@@ -1164,11 +1195,18 @@ repeat: | |||
1164 | __set_page_locked(page); | 1195 | __set_page_locked(page); |
1165 | error = mem_cgroup_cache_charge(page, current->mm, | 1196 | error = mem_cgroup_cache_charge(page, current->mm, |
1166 | gfp & GFP_RECLAIM_MASK); | 1197 | gfp & GFP_RECLAIM_MASK); |
1167 | if (!error) | ||
1168 | error = shmem_add_to_page_cache(page, mapping, index, | ||
1169 | gfp, NULL); | ||
1170 | if (error) | 1198 | if (error) |
1171 | goto decused; | 1199 | goto decused; |
1200 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | ||
1201 | if (!error) { | ||
1202 | error = shmem_add_to_page_cache(page, mapping, index, | ||
1203 | gfp, NULL); | ||
1204 | radix_tree_preload_end(); | ||
1205 | } | ||
1206 | if (error) { | ||
1207 | mem_cgroup_uncharge_cache_page(page); | ||
1208 | goto decused; | ||
1209 | } | ||
1172 | lru_cache_add_anon(page); | 1210 | lru_cache_add_anon(page); |
1173 | 1211 | ||
1174 | spin_lock(&info->lock); | 1212 | spin_lock(&info->lock); |
@@ -1228,14 +1266,10 @@ decused: | |||
1228 | unacct: | 1266 | unacct: |
1229 | shmem_unacct_blocks(info->flags, 1); | 1267 | shmem_unacct_blocks(info->flags, 1); |
1230 | failed: | 1268 | failed: |
1231 | if (swap.val && error != -EINVAL) { | 1269 | if (swap.val && error != -EINVAL && |
1232 | struct page *test = find_get_page(mapping, index); | 1270 | !shmem_confirm_swap(mapping, index, swap)) |
1233 | if (test && !radix_tree_exceptional_entry(test)) | 1271 | error = -EEXIST; |
1234 | page_cache_release(test); | 1272 | unlock: |
1235 | /* Have another try if the entry has changed */ | ||
1236 | if (test != swp_to_radix_entry(swap)) | ||
1237 | error = -EEXIST; | ||
1238 | } | ||
1239 | if (page) { | 1273 | if (page) { |
1240 | unlock_page(page); | 1274 | unlock_page(page); |
1241 | page_cache_release(page); | 1275 | page_cache_release(page); |
@@ -1247,7 +1281,7 @@ failed: | |||
1247 | spin_unlock(&info->lock); | 1281 | spin_unlock(&info->lock); |
1248 | goto repeat; | 1282 | goto repeat; |
1249 | } | 1283 | } |
1250 | if (error == -EEXIST) | 1284 | if (error == -EEXIST) /* from above or from radix_tree_insert */ |
1251 | goto repeat; | 1285 | goto repeat; |
1252 | return error; | 1286 | return error; |
1253 | } | 1287 | } |
@@ -1675,98 +1709,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1675 | return error; | 1709 | return error; |
1676 | } | 1710 | } |
1677 | 1711 | ||
1678 | /* | ||
1679 | * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. | ||
1680 | */ | ||
1681 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | ||
1682 | pgoff_t index, pgoff_t end, int origin) | ||
1683 | { | ||
1684 | struct page *page; | ||
1685 | struct pagevec pvec; | ||
1686 | pgoff_t indices[PAGEVEC_SIZE]; | ||
1687 | bool done = false; | ||
1688 | int i; | ||
1689 | |||
1690 | pagevec_init(&pvec, 0); | ||
1691 | pvec.nr = 1; /* start small: we may be there already */ | ||
1692 | while (!done) { | ||
1693 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | ||
1694 | pvec.nr, pvec.pages, indices); | ||
1695 | if (!pvec.nr) { | ||
1696 | if (origin == SEEK_DATA) | ||
1697 | index = end; | ||
1698 | break; | ||
1699 | } | ||
1700 | for (i = 0; i < pvec.nr; i++, index++) { | ||
1701 | if (index < indices[i]) { | ||
1702 | if (origin == SEEK_HOLE) { | ||
1703 | done = true; | ||
1704 | break; | ||
1705 | } | ||
1706 | index = indices[i]; | ||
1707 | } | ||
1708 | page = pvec.pages[i]; | ||
1709 | if (page && !radix_tree_exceptional_entry(page)) { | ||
1710 | if (!PageUptodate(page)) | ||
1711 | page = NULL; | ||
1712 | } | ||
1713 | if (index >= end || | ||
1714 | (page && origin == SEEK_DATA) || | ||
1715 | (!page && origin == SEEK_HOLE)) { | ||
1716 | done = true; | ||
1717 | break; | ||
1718 | } | ||
1719 | } | ||
1720 | shmem_deswap_pagevec(&pvec); | ||
1721 | pagevec_release(&pvec); | ||
1722 | pvec.nr = PAGEVEC_SIZE; | ||
1723 | cond_resched(); | ||
1724 | } | ||
1725 | return index; | ||
1726 | } | ||
1727 | |||
1728 | static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) | ||
1729 | { | ||
1730 | struct address_space *mapping; | ||
1731 | struct inode *inode; | ||
1732 | pgoff_t start, end; | ||
1733 | loff_t new_offset; | ||
1734 | |||
1735 | if (origin != SEEK_DATA && origin != SEEK_HOLE) | ||
1736 | return generic_file_llseek_size(file, offset, origin, | ||
1737 | MAX_LFS_FILESIZE); | ||
1738 | mapping = file->f_mapping; | ||
1739 | inode = mapping->host; | ||
1740 | mutex_lock(&inode->i_mutex); | ||
1741 | /* We're holding i_mutex so we can access i_size directly */ | ||
1742 | |||
1743 | if (offset < 0) | ||
1744 | offset = -EINVAL; | ||
1745 | else if (offset >= inode->i_size) | ||
1746 | offset = -ENXIO; | ||
1747 | else { | ||
1748 | start = offset >> PAGE_CACHE_SHIFT; | ||
1749 | end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1750 | new_offset = shmem_seek_hole_data(mapping, start, end, origin); | ||
1751 | new_offset <<= PAGE_CACHE_SHIFT; | ||
1752 | if (new_offset > offset) { | ||
1753 | if (new_offset < inode->i_size) | ||
1754 | offset = new_offset; | ||
1755 | else if (origin == SEEK_DATA) | ||
1756 | offset = -ENXIO; | ||
1757 | else | ||
1758 | offset = inode->i_size; | ||
1759 | } | ||
1760 | } | ||
1761 | |||
1762 | if (offset >= 0 && offset != file->f_pos) { | ||
1763 | file->f_pos = offset; | ||
1764 | file->f_version = 0; | ||
1765 | } | ||
1766 | mutex_unlock(&inode->i_mutex); | ||
1767 | return offset; | ||
1768 | } | ||
1769 | |||
1770 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | 1712 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, |
1771 | loff_t len) | 1713 | loff_t len) |
1772 | { | 1714 | { |
@@ -1937,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
1937 | } | 1879 | } |
1938 | 1880 | ||
1939 | static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, | 1881 | static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, |
1940 | struct nameidata *nd) | 1882 | bool excl) |
1941 | { | 1883 | { |
1942 | return shmem_mknod(dir, dentry, mode | S_IFREG, 0); | 1884 | return shmem_mknod(dir, dentry, mode | S_IFREG, 0); |
1943 | } | 1885 | } |
@@ -2770,7 +2712,7 @@ static const struct address_space_operations shmem_aops = { | |||
2770 | static const struct file_operations shmem_file_operations = { | 2712 | static const struct file_operations shmem_file_operations = { |
2771 | .mmap = shmem_mmap, | 2713 | .mmap = shmem_mmap, |
2772 | #ifdef CONFIG_TMPFS | 2714 | #ifdef CONFIG_TMPFS |
2773 | .llseek = shmem_file_llseek, | 2715 | .llseek = generic_file_llseek, |
2774 | .read = do_sync_read, | 2716 | .read = do_sync_read, |
2775 | .write = do_sync_write, | 2717 | .write = do_sync_write, |
2776 | .aio_read = shmem_file_aio_read, | 2718 | .aio_read = shmem_file_aio_read, |
@@ -68,7 +68,7 @@ | |||
68 | * Further notes from the original documentation: | 68 | * Further notes from the original documentation: |
69 | * | 69 | * |
70 | * 11 April '97. Started multi-threading - markhe | 70 | * 11 April '97. Started multi-threading - markhe |
71 | * The global cache-chain is protected by the mutex 'cache_chain_mutex'. | 71 | * The global cache-chain is protected by the mutex 'slab_mutex'. |
72 | * The sem is only needed when accessing/extending the cache-chain, which | 72 | * The sem is only needed when accessing/extending the cache-chain, which |
73 | * can never happen inside an interrupt (kmem_cache_create(), | 73 | * can never happen inside an interrupt (kmem_cache_create(), |
74 | * kmem_cache_shrink() and kmem_cache_reap()). | 74 | * kmem_cache_shrink() and kmem_cache_reap()). |
@@ -87,6 +87,7 @@ | |||
87 | */ | 87 | */ |
88 | 88 | ||
89 | #include <linux/slab.h> | 89 | #include <linux/slab.h> |
90 | #include "slab.h" | ||
90 | #include <linux/mm.h> | 91 | #include <linux/mm.h> |
91 | #include <linux/poison.h> | 92 | #include <linux/poison.h> |
92 | #include <linux/swap.h> | 93 | #include <linux/swap.h> |
@@ -117,12 +118,16 @@ | |||
117 | #include <linux/memory.h> | 118 | #include <linux/memory.h> |
118 | #include <linux/prefetch.h> | 119 | #include <linux/prefetch.h> |
119 | 120 | ||
121 | #include <net/sock.h> | ||
122 | |||
120 | #include <asm/cacheflush.h> | 123 | #include <asm/cacheflush.h> |
121 | #include <asm/tlbflush.h> | 124 | #include <asm/tlbflush.h> |
122 | #include <asm/page.h> | 125 | #include <asm/page.h> |
123 | 126 | ||
124 | #include <trace/events/kmem.h> | 127 | #include <trace/events/kmem.h> |
125 | 128 | ||
129 | #include "internal.h" | ||
130 | |||
126 | /* | 131 | /* |
127 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. | 132 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. |
128 | * 0 for faster, smaller code (especially in the critical paths). | 133 | * 0 for faster, smaller code (especially in the critical paths). |
@@ -151,6 +156,12 @@ | |||
151 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN | 156 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN |
152 | #endif | 157 | #endif |
153 | 158 | ||
159 | /* | ||
160 | * true if a page was allocated from pfmemalloc reserves for network-based | ||
161 | * swap | ||
162 | */ | ||
163 | static bool pfmemalloc_active __read_mostly; | ||
164 | |||
154 | /* Legal flag mask for kmem_cache_create(). */ | 165 | /* Legal flag mask for kmem_cache_create(). */ |
155 | #if DEBUG | 166 | #if DEBUG |
156 | # define CREATE_MASK (SLAB_RED_ZONE | \ | 167 | # define CREATE_MASK (SLAB_RED_ZONE | \ |
@@ -256,9 +267,30 @@ struct array_cache { | |||
256 | * Must have this definition in here for the proper | 267 | * Must have this definition in here for the proper |
257 | * alignment of array_cache. Also simplifies accessing | 268 | * alignment of array_cache. Also simplifies accessing |
258 | * the entries. | 269 | * the entries. |
270 | * | ||
271 | * Entries should not be directly dereferenced as | ||
272 | * entries belonging to slabs marked pfmemalloc will | ||
273 | * have the low bit SLAB_OBJ_PFMEMALLOC set | ||
259 | */ | 274 | */ |
260 | }; | 275 | }; |
261 | 276 | ||
277 | #define SLAB_OBJ_PFMEMALLOC 1 | ||
278 | static inline bool is_obj_pfmemalloc(void *objp) | ||
279 | { | ||
280 | return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; | ||
281 | } | ||
282 | |||
283 | static inline void set_obj_pfmemalloc(void **objp) | ||
284 | { | ||
285 | *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); | ||
286 | return; | ||
287 | } | ||
288 | |||
289 | static inline void clear_obj_pfmemalloc(void **objp) | ||
290 | { | ||
291 | *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); | ||
292 | } | ||
293 | |||
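The helpers above tag an object pointer by setting its lowest bit, which is free to use because slab objects are at least word-aligned; the tag must be stripped before the pointer is dereferenced. A standalone demonstration of the same trick (plain malloc stands in for the slab allocator):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define OBJ_TAG 1UL          /* lowest pointer bit, free on aligned objects */

static int  is_tagged(void *p)  { return (unsigned long)p & OBJ_TAG; }
static void set_tag(void **p)   { *p = (void *)((unsigned long)*p | OBJ_TAG); }
static void clear_tag(void **p) { *p = (void *)((unsigned long)*p & ~OBJ_TAG); }

int main(void)
{
    void *obj = malloc(32);   /* malloc returns suitably aligned memory */

    assert(!is_tagged(obj));
    set_tag(&obj);
    assert(is_tagged(obj));
    clear_tag(&obj);          /* must strip the tag before using the pointer */
    printf("tag round-trip ok: %p\n", obj);
    free(obj);
    return 0;
}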
262 | /* | 294 | /* |
263 | * bootstrap: The caches do not work without cpuarrays anymore, but the | 295 | * bootstrap: The caches do not work without cpuarrays anymore, but the |
264 | * cpuarrays are allocated from the generic caches... | 296 | * cpuarrays are allocated from the generic caches... |
@@ -424,8 +456,8 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
424 | * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: | 456 | * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: |
425 | * redzone word. | 457 | * redzone word. |
426 | * cachep->obj_offset: The real object. | 458 | * cachep->obj_offset: The real object. |
427 | * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] | 459 | * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] |
428 | * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address | 460 | * cachep->size - 1* BYTES_PER_WORD: last caller address |
429 | * [BYTES_PER_WORD long] | 461 | * [BYTES_PER_WORD long] |
430 | */ | 462 | */ |
431 | static int obj_offset(struct kmem_cache *cachep) | 463 | static int obj_offset(struct kmem_cache *cachep) |
@@ -433,11 +465,6 @@ static int obj_offset(struct kmem_cache *cachep) | |||
433 | return cachep->obj_offset; | 465 | return cachep->obj_offset; |
434 | } | 466 | } |
435 | 467 | ||
436 | static int obj_size(struct kmem_cache *cachep) | ||
437 | { | ||
438 | return cachep->obj_size; | ||
439 | } | ||
440 | |||
441 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) | 468 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) |
442 | { | 469 | { |
443 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 470 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
@@ -449,23 +476,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) | |||
449 | { | 476 | { |
450 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 477 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
451 | if (cachep->flags & SLAB_STORE_USER) | 478 | if (cachep->flags & SLAB_STORE_USER) |
452 | return (unsigned long long *)(objp + cachep->buffer_size - | 479 | return (unsigned long long *)(objp + cachep->size - |
453 | sizeof(unsigned long long) - | 480 | sizeof(unsigned long long) - |
454 | REDZONE_ALIGN); | 481 | REDZONE_ALIGN); |
455 | return (unsigned long long *) (objp + cachep->buffer_size - | 482 | return (unsigned long long *) (objp + cachep->size - |
456 | sizeof(unsigned long long)); | 483 | sizeof(unsigned long long)); |
457 | } | 484 | } |
458 | 485 | ||
459 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) | 486 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) |
460 | { | 487 | { |
461 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); | 488 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); |
462 | return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); | 489 | return (void **)(objp + cachep->size - BYTES_PER_WORD); |
463 | } | 490 | } |
464 | 491 | ||
465 | #else | 492 | #else |
466 | 493 | ||
467 | #define obj_offset(x) 0 | 494 | #define obj_offset(x) 0 |
468 | #define obj_size(cachep) (cachep->buffer_size) | ||
469 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) | 495 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
470 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) | 496 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
471 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) | 497 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) |
@@ -475,7 +501,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
475 | #ifdef CONFIG_TRACING | 501 | #ifdef CONFIG_TRACING |
476 | size_t slab_buffer_size(struct kmem_cache *cachep) | 502 | size_t slab_buffer_size(struct kmem_cache *cachep) |
477 | { | 503 | { |
478 | return cachep->buffer_size; | 504 | return cachep->size; |
479 | } | 505 | } |
480 | EXPORT_SYMBOL(slab_buffer_size); | 506 | EXPORT_SYMBOL(slab_buffer_size); |
481 | #endif | 507 | #endif |
@@ -489,56 +515,37 @@ EXPORT_SYMBOL(slab_buffer_size); | |||
489 | static int slab_max_order = SLAB_MAX_ORDER_LO; | 515 | static int slab_max_order = SLAB_MAX_ORDER_LO; |
490 | static bool slab_max_order_set __initdata; | 516 | static bool slab_max_order_set __initdata; |
491 | 517 | ||
492 | /* | ||
493 | * Functions for storing/retrieving the cachep and or slab from the page | ||
494 | * allocator. These are used to find the slab an obj belongs to. With kfree(), | ||
495 | * these are used to find the cache which an obj belongs to. | ||
496 | */ | ||
497 | static inline void page_set_cache(struct page *page, struct kmem_cache *cache) | ||
498 | { | ||
499 | page->lru.next = (struct list_head *)cache; | ||
500 | } | ||
501 | |||
502 | static inline struct kmem_cache *page_get_cache(struct page *page) | 518 | static inline struct kmem_cache *page_get_cache(struct page *page) |
503 | { | 519 | { |
504 | page = compound_head(page); | 520 | page = compound_head(page); |
505 | BUG_ON(!PageSlab(page)); | 521 | BUG_ON(!PageSlab(page)); |
506 | return (struct kmem_cache *)page->lru.next; | 522 | return page->slab_cache; |
507 | } | ||
508 | |||
509 | static inline void page_set_slab(struct page *page, struct slab *slab) | ||
510 | { | ||
511 | page->lru.prev = (struct list_head *)slab; | ||
512 | } | ||
513 | |||
514 | static inline struct slab *page_get_slab(struct page *page) | ||
515 | { | ||
516 | BUG_ON(!PageSlab(page)); | ||
517 | return (struct slab *)page->lru.prev; | ||
518 | } | 523 | } |
519 | 524 | ||
520 | static inline struct kmem_cache *virt_to_cache(const void *obj) | 525 | static inline struct kmem_cache *virt_to_cache(const void *obj) |
521 | { | 526 | { |
522 | struct page *page = virt_to_head_page(obj); | 527 | struct page *page = virt_to_head_page(obj); |
523 | return page_get_cache(page); | 528 | return page->slab_cache; |
524 | } | 529 | } |
525 | 530 | ||
526 | static inline struct slab *virt_to_slab(const void *obj) | 531 | static inline struct slab *virt_to_slab(const void *obj) |
527 | { | 532 | { |
528 | struct page *page = virt_to_head_page(obj); | 533 | struct page *page = virt_to_head_page(obj); |
529 | return page_get_slab(page); | 534 | |
535 | VM_BUG_ON(!PageSlab(page)); | ||
536 | return page->slab_page; | ||
530 | } | 537 | } |
531 | 538 | ||
532 | static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, | 539 | static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, |
533 | unsigned int idx) | 540 | unsigned int idx) |
534 | { | 541 | { |
535 | return slab->s_mem + cache->buffer_size * idx; | 542 | return slab->s_mem + cache->size * idx; |
536 | } | 543 | } |
537 | 544 | ||
538 | /* | 545 | /* |
539 | * We want to avoid an expensive divide : (offset / cache->buffer_size) | 546 | * We want to avoid an expensive divide : (offset / cache->size) |
540 | * Using the fact that buffer_size is a constant for a particular cache, | 547 | * Using the fact that size is a constant for a particular cache, |
541 | * we can replace (offset / cache->buffer_size) by | 548 | * we can replace (offset / cache->size) by |
542 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) | 549 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) |
543 | */ | 550 | */ |
544 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, | 551 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, |
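obj_to_index() would otherwise divide an offset by the object size on every lookup, so the cache precomputes a fixed-point reciprocal and replaces the division with a multiply and shift. A self-checking sketch of that technique (this shows the general method; the kernel's reciprocal_divide() helper may differ in detail):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* precompute r ~= 2^32 / d so that (a * r) >> 32 == a / d for the offsets used here */
static uint32_t reciprocal_value(uint32_t d)
{
    return (uint32_t)(((1ULL << 32) + d - 1) / d);
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
    return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
    uint32_t size = 192;                       /* e.g. object size in bytes */
    uint32_t r = reciprocal_value(size);

    /* verify the shortcut matches real division for object-aligned offsets */
    for (uint32_t off = 0; off < (1u << 20); off += size)
        assert(reciprocal_divide(off, r) == off / size);
    printf("reciprocal divide ok (r=%u)\n", r);
    return 0;
}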
@@ -584,33 +591,12 @@ static struct kmem_cache cache_cache = { | |||
584 | .batchcount = 1, | 591 | .batchcount = 1, |
585 | .limit = BOOT_CPUCACHE_ENTRIES, | 592 | .limit = BOOT_CPUCACHE_ENTRIES, |
586 | .shared = 1, | 593 | .shared = 1, |
587 | .buffer_size = sizeof(struct kmem_cache), | 594 | .size = sizeof(struct kmem_cache), |
588 | .name = "kmem_cache", | 595 | .name = "kmem_cache", |
589 | }; | 596 | }; |
590 | 597 | ||
591 | #define BAD_ALIEN_MAGIC 0x01020304ul | 598 | #define BAD_ALIEN_MAGIC 0x01020304ul |
592 | 599 | ||
593 | /* | ||
594 | * chicken and egg problem: delay the per-cpu array allocation | ||
595 | * until the general caches are up. | ||
596 | */ | ||
597 | static enum { | ||
598 | NONE, | ||
599 | PARTIAL_AC, | ||
600 | PARTIAL_L3, | ||
601 | EARLY, | ||
602 | LATE, | ||
603 | FULL | ||
604 | } g_cpucache_up; | ||
605 | |||
606 | /* | ||
607 | * used by boot code to determine if it can use slab based allocator | ||
608 | */ | ||
609 | int slab_is_available(void) | ||
610 | { | ||
611 | return g_cpucache_up >= EARLY; | ||
612 | } | ||
613 | |||
614 | #ifdef CONFIG_LOCKDEP | 600 | #ifdef CONFIG_LOCKDEP |
615 | 601 | ||
616 | /* | 602 | /* |
@@ -676,7 +662,7 @@ static void init_node_lock_keys(int q) | |||
676 | { | 662 | { |
677 | struct cache_sizes *s = malloc_sizes; | 663 | struct cache_sizes *s = malloc_sizes; |
678 | 664 | ||
679 | if (g_cpucache_up < LATE) | 665 | if (slab_state < UP) |
680 | return; | 666 | return; |
681 | 667 | ||
682 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { | 668 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
@@ -716,12 +702,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | |||
716 | } | 702 | } |
717 | #endif | 703 | #endif |
718 | 704 | ||
719 | /* | ||
720 | * Guard access to the cache-chain. | ||
721 | */ | ||
722 | static DEFINE_MUTEX(cache_chain_mutex); | ||
723 | static struct list_head cache_chain; | ||
724 | |||
725 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); | 705 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); |
726 | 706 | ||
727 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 707 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
@@ -951,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
951 | return nc; | 931 | return nc; |
952 | } | 932 | } |
953 | 933 | ||
934 | static inline bool is_slab_pfmemalloc(struct slab *slabp) | ||
935 | { | ||
936 | struct page *page = virt_to_page(slabp->s_mem); | ||
937 | |||
938 | return PageSlabPfmemalloc(page); | ||
939 | } | ||
940 | |||
941 | /* Clears pfmemalloc_active if no slabs have pfmalloc set */ | ||
942 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, | ||
943 | struct array_cache *ac) | ||
944 | { | ||
945 | struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()]; | ||
946 | struct slab *slabp; | ||
947 | unsigned long flags; | ||
948 | |||
949 | if (!pfmemalloc_active) | ||
950 | return; | ||
951 | |||
952 | spin_lock_irqsave(&l3->list_lock, flags); | ||
953 | list_for_each_entry(slabp, &l3->slabs_full, list) | ||
954 | if (is_slab_pfmemalloc(slabp)) | ||
955 | goto out; | ||
956 | |||
957 | list_for_each_entry(slabp, &l3->slabs_partial, list) | ||
958 | if (is_slab_pfmemalloc(slabp)) | ||
959 | goto out; | ||
960 | |||
961 | list_for_each_entry(slabp, &l3->slabs_free, list) | ||
962 | if (is_slab_pfmemalloc(slabp)) | ||
963 | goto out; | ||
964 | |||
965 | pfmemalloc_active = false; | ||
966 | out: | ||
967 | spin_unlock_irqrestore(&l3->list_lock, flags); | ||
968 | } | ||
969 | |||
970 | static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
971 | gfp_t flags, bool force_refill) | ||
972 | { | ||
973 | int i; | ||
974 | void *objp = ac->entry[--ac->avail]; | ||
975 | |||
976 | /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ | ||
977 | if (unlikely(is_obj_pfmemalloc(objp))) { | ||
978 | struct kmem_list3 *l3; | ||
979 | |||
980 | if (gfp_pfmemalloc_allowed(flags)) { | ||
981 | clear_obj_pfmemalloc(&objp); | ||
982 | return objp; | ||
983 | } | ||
984 | |||
985 | /* The caller cannot use PFMEMALLOC objects, find another one */ | ||
986 | for (i = 1; i < ac->avail; i++) { | ||
987 | /* If a !PFMEMALLOC object is found, swap them */ | ||
988 | if (!is_obj_pfmemalloc(ac->entry[i])) { | ||
989 | objp = ac->entry[i]; | ||
990 | ac->entry[i] = ac->entry[ac->avail]; | ||
991 | ac->entry[ac->avail] = objp; | ||
992 | return objp; | ||
993 | } | ||
994 | } | ||
995 | |||
996 | /* | ||
997 | * If there are empty slabs on the slabs_free list and we are | ||
998 | * being forced to refill the cache, mark this one !pfmemalloc. | ||
999 | */ | ||
1000 | l3 = cachep->nodelists[numa_mem_id()]; | ||
1001 | if (!list_empty(&l3->slabs_free) && force_refill) { | ||
1002 | struct slab *slabp = virt_to_slab(objp); | ||
1003 | ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem)); | ||
1004 | clear_obj_pfmemalloc(&objp); | ||
1005 | recheck_pfmemalloc_active(cachep, ac); | ||
1006 | return objp; | ||
1007 | } | ||
1008 | |||
1009 | /* No !PFMEMALLOC objects available */ | ||
1010 | ac->avail++; | ||
1011 | objp = NULL; | ||
1012 | } | ||
1013 | |||
1014 | return objp; | ||
1015 | } | ||
1016 | |||
1017 | static inline void *ac_get_obj(struct kmem_cache *cachep, | ||
1018 | struct array_cache *ac, gfp_t flags, bool force_refill) | ||
1019 | { | ||
1020 | void *objp; | ||
1021 | |||
1022 | if (unlikely(sk_memalloc_socks())) | ||
1023 | objp = __ac_get_obj(cachep, ac, flags, force_refill); | ||
1024 | else | ||
1025 | objp = ac->entry[--ac->avail]; | ||
1026 | |||
1027 | return objp; | ||
1028 | } | ||
1029 | |||
1030 | static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1031 | void *objp) | ||
1032 | { | ||
1033 | if (unlikely(pfmemalloc_active)) { | ||
1034 | /* Some pfmemalloc slabs exist, check if this is one */ | ||
1035 | struct page *page = virt_to_page(objp); | ||
1036 | if (PageSlabPfmemalloc(page)) | ||
1037 | set_obj_pfmemalloc(&objp); | ||
1038 | } | ||
1039 | |||
1040 | return objp; | ||
1041 | } | ||
1042 | |||
1043 | static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1044 | void *objp) | ||
1045 | { | ||
1046 | if (unlikely(sk_memalloc_socks())) | ||
1047 | objp = __ac_put_obj(cachep, ac, objp); | ||
1048 | |||
1049 | ac->entry[ac->avail++] = objp; | ||
1050 | } | ||
1051 | |||
954 | /* | 1052 | /* |
955 | * Transfer objects in one arraycache to another. | 1053 | * Transfer objects in one arraycache to another. |
956 | * Locking must be handled by the caller. | 1054 | * Locking must be handled by the caller. |
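The ac_get_obj()/ac_put_obj() wrappers above depend on tagging the object pointer itself while it sits in the per-CPU array: set_obj_pfmemalloc() folds a flag into a spare low bit of the address (slab objects are at least word-aligned), is_obj_pfmemalloc() tests it, and clear_obj_pfmemalloc() strips it before the pointer is handed out. Those helpers are defined elsewhere in the patch; the standalone sketch below only illustrates the tagging idea, and the OBJ_PFMEMALLOC mask name is invented here.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define OBJ_PFMEMALLOC  0x1UL   /* spare low bit: objects are word-aligned */

    static inline bool is_obj_pfmemalloc(void *objp)
    {
            return (uintptr_t)objp & OBJ_PFMEMALLOC;
    }

    static inline void set_obj_pfmemalloc(void **objp)
    {
            *objp = (void *)((uintptr_t)*objp | OBJ_PFMEMALLOC);
    }

    static inline void clear_obj_pfmemalloc(void **objp)
    {
            *objp = (void *)((uintptr_t)*objp & ~OBJ_PFMEMALLOC);
    }

    int main(void)
    {
            long storage;                   /* stand-in for a slab object */
            void *objp = &storage;

            set_obj_pfmemalloc(&objp);      /* mark while cached in the array */
            assert(is_obj_pfmemalloc(objp));
            clear_obj_pfmemalloc(&objp);    /* strip the tag before handing out */
            assert(objp == (void *)&storage);
            return 0;
    }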
@@ -1127,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1127 | STATS_INC_ACOVERFLOW(cachep); | 1225 | STATS_INC_ACOVERFLOW(cachep); |
1128 | __drain_alien_cache(cachep, alien, nodeid); | 1226 | __drain_alien_cache(cachep, alien, nodeid); |
1129 | } | 1227 | } |
1130 | alien->entry[alien->avail++] = objp; | 1228 | ac_put_obj(cachep, alien, objp); |
1131 | spin_unlock(&alien->lock); | 1229 | spin_unlock(&alien->lock); |
1132 | } else { | 1230 | } else { |
1133 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); | 1231 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); |
@@ -1145,7 +1243,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1145 | * When hotplugging memory or a cpu, existing nodelists are not replaced if | 1243 | * When hotplugging memory or a cpu, existing nodelists are not replaced if |
1146 | * already in use. | 1244 | * already in use. |
1147 | * | 1245 | * |
1148 | * Must hold cache_chain_mutex. | 1246 | * Must hold slab_mutex. |
1149 | */ | 1247 | */ |
1150 | static int init_cache_nodelists_node(int node) | 1248 | static int init_cache_nodelists_node(int node) |
1151 | { | 1249 | { |
@@ -1153,7 +1251,7 @@ static int init_cache_nodelists_node(int node) | |||
1153 | struct kmem_list3 *l3; | 1251 | struct kmem_list3 *l3; |
1154 | const int memsize = sizeof(struct kmem_list3); | 1252 | const int memsize = sizeof(struct kmem_list3); |
1155 | 1253 | ||
1156 | list_for_each_entry(cachep, &cache_chain, next) { | 1254 | list_for_each_entry(cachep, &slab_caches, list) { |
1157 | /* | 1255 | /* |
1158 | * Set up the size64 kmemlist for cpu before we can | 1256 | * Set up the size64 kmemlist for cpu before we can |
1159 | * begin anything. Make sure some other cpu on this | 1257 | * begin anything. Make sure some other cpu on this |
@@ -1169,7 +1267,7 @@ static int init_cache_nodelists_node(int node) | |||
1169 | 1267 | ||
1170 | /* | 1268 | /* |
1171 | * The l3s don't come and go as CPUs come and | 1269 | * The l3s don't come and go as CPUs come and |
1172 | * go. cache_chain_mutex is sufficient | 1270 | * go. slab_mutex is sufficient |
1173 | * protection here. | 1271 | * protection here. |
1174 | */ | 1272 | */ |
1175 | cachep->nodelists[node] = l3; | 1273 | cachep->nodelists[node] = l3; |
@@ -1191,7 +1289,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1191 | int node = cpu_to_mem(cpu); | 1289 | int node = cpu_to_mem(cpu); |
1192 | const struct cpumask *mask = cpumask_of_node(node); | 1290 | const struct cpumask *mask = cpumask_of_node(node); |
1193 | 1291 | ||
1194 | list_for_each_entry(cachep, &cache_chain, next) { | 1292 | list_for_each_entry(cachep, &slab_caches, list) { |
1195 | struct array_cache *nc; | 1293 | struct array_cache *nc; |
1196 | struct array_cache *shared; | 1294 | struct array_cache *shared; |
1197 | struct array_cache **alien; | 1295 | struct array_cache **alien; |
@@ -1241,7 +1339,7 @@ free_array_cache: | |||
1241 | * the respective cache's slabs, now we can go ahead and | 1339 | * the respective cache's slabs, now we can go ahead and |
1242 | * shrink each nodelist to its limit. | 1340 | * shrink each nodelist to its limit. |
1243 | */ | 1341 | */ |
1244 | list_for_each_entry(cachep, &cache_chain, next) { | 1342 | list_for_each_entry(cachep, &slab_caches, list) { |
1245 | l3 = cachep->nodelists[node]; | 1343 | l3 = cachep->nodelists[node]; |
1246 | if (!l3) | 1344 | if (!l3) |
1247 | continue; | 1345 | continue; |
@@ -1270,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1270 | * Now we can go ahead with allocating the shared arrays and | 1368 | * Now we can go ahead with allocating the shared arrays and |
1271 | * array caches | 1369 | * array caches |
1272 | */ | 1370 | */ |
1273 | list_for_each_entry(cachep, &cache_chain, next) { | 1371 | list_for_each_entry(cachep, &slab_caches, list) { |
1274 | struct array_cache *nc; | 1372 | struct array_cache *nc; |
1275 | struct array_cache *shared = NULL; | 1373 | struct array_cache *shared = NULL; |
1276 | struct array_cache **alien = NULL; | 1374 | struct array_cache **alien = NULL; |
@@ -1338,9 +1436,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1338 | switch (action) { | 1436 | switch (action) { |
1339 | case CPU_UP_PREPARE: | 1437 | case CPU_UP_PREPARE: |
1340 | case CPU_UP_PREPARE_FROZEN: | 1438 | case CPU_UP_PREPARE_FROZEN: |
1341 | mutex_lock(&cache_chain_mutex); | 1439 | mutex_lock(&slab_mutex); |
1342 | err = cpuup_prepare(cpu); | 1440 | err = cpuup_prepare(cpu); |
1343 | mutex_unlock(&cache_chain_mutex); | 1441 | mutex_unlock(&slab_mutex); |
1344 | break; | 1442 | break; |
1345 | case CPU_ONLINE: | 1443 | case CPU_ONLINE: |
1346 | case CPU_ONLINE_FROZEN: | 1444 | case CPU_ONLINE_FROZEN: |
@@ -1350,7 +1448,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1350 | case CPU_DOWN_PREPARE: | 1448 | case CPU_DOWN_PREPARE: |
1351 | case CPU_DOWN_PREPARE_FROZEN: | 1449 | case CPU_DOWN_PREPARE_FROZEN: |
1352 | /* | 1450 | /* |
1353 | * Shutdown cache reaper. Note that the cache_chain_mutex is | 1451 | * Shutdown cache reaper. Note that the slab_mutex is |
1354 | * held so that if cache_reap() is invoked it cannot do | 1452 | * held so that if cache_reap() is invoked it cannot do |
1355 | * anything expensive but will only modify reap_work | 1453 | * anything expensive but will only modify reap_work |
1356 | * and reschedule the timer. | 1454 | * and reschedule the timer. |
@@ -1377,9 +1475,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1377 | #endif | 1475 | #endif |
1378 | case CPU_UP_CANCELED: | 1476 | case CPU_UP_CANCELED: |
1379 | case CPU_UP_CANCELED_FROZEN: | 1477 | case CPU_UP_CANCELED_FROZEN: |
1380 | mutex_lock(&cache_chain_mutex); | 1478 | mutex_lock(&slab_mutex); |
1381 | cpuup_canceled(cpu); | 1479 | cpuup_canceled(cpu); |
1382 | mutex_unlock(&cache_chain_mutex); | 1480 | mutex_unlock(&slab_mutex); |
1383 | break; | 1481 | break; |
1384 | } | 1482 | } |
1385 | return notifier_from_errno(err); | 1483 | return notifier_from_errno(err); |
@@ -1395,14 +1493,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = { | |||
1395 | * Returns -EBUSY if all objects cannot be drained so that the node is not | 1493 | * Returns -EBUSY if all objects cannot be drained so that the node is not |
1396 | * removed. | 1494 | * removed. |
1397 | * | 1495 | * |
1398 | * Must hold cache_chain_mutex. | 1496 | * Must hold slab_mutex. |
1399 | */ | 1497 | */ |
1400 | static int __meminit drain_cache_nodelists_node(int node) | 1498 | static int __meminit drain_cache_nodelists_node(int node) |
1401 | { | 1499 | { |
1402 | struct kmem_cache *cachep; | 1500 | struct kmem_cache *cachep; |
1403 | int ret = 0; | 1501 | int ret = 0; |
1404 | 1502 | ||
1405 | list_for_each_entry(cachep, &cache_chain, next) { | 1503 | list_for_each_entry(cachep, &slab_caches, list) { |
1406 | struct kmem_list3 *l3; | 1504 | struct kmem_list3 *l3; |
1407 | 1505 | ||
1408 | l3 = cachep->nodelists[node]; | 1506 | l3 = cachep->nodelists[node]; |
@@ -1433,14 +1531,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self, | |||
1433 | 1531 | ||
1434 | switch (action) { | 1532 | switch (action) { |
1435 | case MEM_GOING_ONLINE: | 1533 | case MEM_GOING_ONLINE: |
1436 | mutex_lock(&cache_chain_mutex); | 1534 | mutex_lock(&slab_mutex); |
1437 | ret = init_cache_nodelists_node(nid); | 1535 | ret = init_cache_nodelists_node(nid); |
1438 | mutex_unlock(&cache_chain_mutex); | 1536 | mutex_unlock(&slab_mutex); |
1439 | break; | 1537 | break; |
1440 | case MEM_GOING_OFFLINE: | 1538 | case MEM_GOING_OFFLINE: |
1441 | mutex_lock(&cache_chain_mutex); | 1539 | mutex_lock(&slab_mutex); |
1442 | ret = drain_cache_nodelists_node(nid); | 1540 | ret = drain_cache_nodelists_node(nid); |
1443 | mutex_unlock(&cache_chain_mutex); | 1541 | mutex_unlock(&slab_mutex); |
1444 | break; | 1542 | break; |
1445 | case MEM_ONLINE: | 1543 | case MEM_ONLINE: |
1446 | case MEM_OFFLINE: | 1544 | case MEM_OFFLINE: |
@@ -1544,8 +1642,8 @@ void __init kmem_cache_init(void) | |||
1544 | node = numa_mem_id(); | 1642 | node = numa_mem_id(); |
1545 | 1643 | ||
1546 | /* 1) create the cache_cache */ | 1644 | /* 1) create the cache_cache */ |
1547 | INIT_LIST_HEAD(&cache_chain); | 1645 | INIT_LIST_HEAD(&slab_caches); |
1548 | list_add(&cache_cache.next, &cache_chain); | 1646 | list_add(&cache_cache.list, &slab_caches); |
1549 | cache_cache.colour_off = cache_line_size(); | 1647 | cache_cache.colour_off = cache_line_size(); |
1550 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; | 1648 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; |
1551 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | 1649 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; |
@@ -1553,18 +1651,16 @@ void __init kmem_cache_init(void) | |||
1553 | /* | 1651 | /* |
1554 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1652 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1555 | */ | 1653 | */ |
1556 | cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1654 | cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1557 | nr_node_ids * sizeof(struct kmem_list3 *); | 1655 | nr_node_ids * sizeof(struct kmem_list3 *); |
1558 | #if DEBUG | 1656 | cache_cache.object_size = cache_cache.size; |
1559 | cache_cache.obj_size = cache_cache.buffer_size; | 1657 | cache_cache.size = ALIGN(cache_cache.size, |
1560 | #endif | ||
1561 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, | ||
1562 | cache_line_size()); | 1658 | cache_line_size()); |
1563 | cache_cache.reciprocal_buffer_size = | 1659 | cache_cache.reciprocal_buffer_size = |
1564 | reciprocal_value(cache_cache.buffer_size); | 1660 | reciprocal_value(cache_cache.size); |
1565 | 1661 | ||
1566 | for (order = 0; order < MAX_ORDER; order++) { | 1662 | for (order = 0; order < MAX_ORDER; order++) { |
1567 | cache_estimate(order, cache_cache.buffer_size, | 1663 | cache_estimate(order, cache_cache.size, |
1568 | cache_line_size(), 0, &left_over, &cache_cache.num); | 1664 | cache_line_size(), 0, &left_over, &cache_cache.num); |
1569 | if (cache_cache.num) | 1665 | if (cache_cache.num) |
1570 | break; | 1666 | break; |
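The bootstrap sizing above exploits the layout of struct kmem_cache: the per-CPU array[] is the trailing member, sized for nr_cpu_ids slots, and the nodelists[] pointer block is parked directly behind it (see the &cachep->array[nr_cpu_ids] assignment later in this diff), so the real footprint is the end of array[] plus nr_node_ids node pointers, rounded up to a cache line. A small userspace illustration with stand-in types; the struct body and the 64-byte line size are assumptions.

    #include <stddef.h>
    #include <stdio.h>

    struct node_list;                       /* stand-in for struct kmem_list3 */

    struct fake_cache {                     /* stand-in for struct kmem_cache */
            unsigned int size;
            struct node_list **nodelists;   /* will point just past array[] */
            void *array[1];                 /* really nr_cpu_ids entries */
    };

    int main(void)
    {
            size_t nr_cpu_ids = 8, nr_node_ids = 2, line = 64;

            /* equivalent to offsetof(struct fake_cache, array[nr_cpu_ids]) */
            size_t sz = offsetof(struct fake_cache, array) +
                        nr_cpu_ids * sizeof(void *) +
                        nr_node_ids * sizeof(struct node_list *);

            sz = (sz + line - 1) & ~(line - 1); /* ALIGN(sz, cache_line_size()) */
            printf("bootstrap kmem_cache footprint: %zu bytes\n", sz);
            return 0;
    }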
@@ -1585,7 +1681,7 @@ void __init kmem_cache_init(void) | |||
1585 | * bug. | 1681 | * bug. |
1586 | */ | 1682 | */ |
1587 | 1683 | ||
1588 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, | 1684 | sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, |
1589 | sizes[INDEX_AC].cs_size, | 1685 | sizes[INDEX_AC].cs_size, |
1590 | ARCH_KMALLOC_MINALIGN, | 1686 | ARCH_KMALLOC_MINALIGN, |
1591 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1687 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
@@ -1593,7 +1689,7 @@ void __init kmem_cache_init(void) | |||
1593 | 1689 | ||
1594 | if (INDEX_AC != INDEX_L3) { | 1690 | if (INDEX_AC != INDEX_L3) { |
1595 | sizes[INDEX_L3].cs_cachep = | 1691 | sizes[INDEX_L3].cs_cachep = |
1596 | kmem_cache_create(names[INDEX_L3].name, | 1692 | __kmem_cache_create(names[INDEX_L3].name, |
1597 | sizes[INDEX_L3].cs_size, | 1693 | sizes[INDEX_L3].cs_size, |
1598 | ARCH_KMALLOC_MINALIGN, | 1694 | ARCH_KMALLOC_MINALIGN, |
1599 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1695 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
@@ -1611,14 +1707,14 @@ void __init kmem_cache_init(void) | |||
1611 | * allow tighter packing of the smaller caches. | 1707 | * allow tighter packing of the smaller caches. |
1612 | */ | 1708 | */ |
1613 | if (!sizes->cs_cachep) { | 1709 | if (!sizes->cs_cachep) { |
1614 | sizes->cs_cachep = kmem_cache_create(names->name, | 1710 | sizes->cs_cachep = __kmem_cache_create(names->name, |
1615 | sizes->cs_size, | 1711 | sizes->cs_size, |
1616 | ARCH_KMALLOC_MINALIGN, | 1712 | ARCH_KMALLOC_MINALIGN, |
1617 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1713 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1618 | NULL); | 1714 | NULL); |
1619 | } | 1715 | } |
1620 | #ifdef CONFIG_ZONE_DMA | 1716 | #ifdef CONFIG_ZONE_DMA |
1621 | sizes->cs_dmacachep = kmem_cache_create( | 1717 | sizes->cs_dmacachep = __kmem_cache_create( |
1622 | names->name_dma, | 1718 | names->name_dma, |
1623 | sizes->cs_size, | 1719 | sizes->cs_size, |
1624 | ARCH_KMALLOC_MINALIGN, | 1720 | ARCH_KMALLOC_MINALIGN, |
@@ -1676,27 +1772,27 @@ void __init kmem_cache_init(void) | |||
1676 | } | 1772 | } |
1677 | } | 1773 | } |
1678 | 1774 | ||
1679 | g_cpucache_up = EARLY; | 1775 | slab_state = UP; |
1680 | } | 1776 | } |
1681 | 1777 | ||
1682 | void __init kmem_cache_init_late(void) | 1778 | void __init kmem_cache_init_late(void) |
1683 | { | 1779 | { |
1684 | struct kmem_cache *cachep; | 1780 | struct kmem_cache *cachep; |
1685 | 1781 | ||
1686 | g_cpucache_up = LATE; | 1782 | slab_state = UP; |
1687 | 1783 | ||
1688 | /* Annotate slab for lockdep -- annotate the malloc caches */ | 1784 | /* Annotate slab for lockdep -- annotate the malloc caches */ |
1689 | init_lock_keys(); | 1785 | init_lock_keys(); |
1690 | 1786 | ||
1691 | /* 6) resize the head arrays to their final sizes */ | 1787 | /* 6) resize the head arrays to their final sizes */ |
1692 | mutex_lock(&cache_chain_mutex); | 1788 | mutex_lock(&slab_mutex); |
1693 | list_for_each_entry(cachep, &cache_chain, next) | 1789 | list_for_each_entry(cachep, &slab_caches, list) |
1694 | if (enable_cpucache(cachep, GFP_NOWAIT)) | 1790 | if (enable_cpucache(cachep, GFP_NOWAIT)) |
1695 | BUG(); | 1791 | BUG(); |
1696 | mutex_unlock(&cache_chain_mutex); | 1792 | mutex_unlock(&slab_mutex); |
1697 | 1793 | ||
1698 | /* Done! */ | 1794 | /* Done! */ |
1699 | g_cpucache_up = FULL; | 1795 | slab_state = FULL; |
1700 | 1796 | ||
1701 | /* | 1797 | /* |
1702 | * Register a cpu startup notifier callback that initializes | 1798 | * Register a cpu startup notifier callback that initializes |
@@ -1727,6 +1823,9 @@ static int __init cpucache_init(void) | |||
1727 | */ | 1823 | */ |
1728 | for_each_online_cpu(cpu) | 1824 | for_each_online_cpu(cpu) |
1729 | start_cpu_timer(cpu); | 1825 | start_cpu_timer(cpu); |
1826 | |||
1827 | /* Done! */ | ||
1828 | slab_state = FULL; | ||
1730 | return 0; | 1829 | return 0; |
1731 | } | 1830 | } |
1732 | __initcall(cpucache_init); | 1831 | __initcall(cpucache_init); |
@@ -1743,7 +1842,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1743 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 1842 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
1744 | nodeid, gfpflags); | 1843 | nodeid, gfpflags); |
1745 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", | 1844 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", |
1746 | cachep->name, cachep->buffer_size, cachep->gfporder); | 1845 | cachep->name, cachep->size, cachep->gfporder); |
1747 | 1846 | ||
1748 | for_each_online_node(node) { | 1847 | for_each_online_node(node) { |
1749 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; | 1848 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; |
@@ -1798,7 +1897,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1798 | flags |= __GFP_COMP; | 1897 | flags |= __GFP_COMP; |
1799 | #endif | 1898 | #endif |
1800 | 1899 | ||
1801 | flags |= cachep->gfpflags; | 1900 | flags |= cachep->allocflags; |
1802 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1901 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1803 | flags |= __GFP_RECLAIMABLE; | 1902 | flags |= __GFP_RECLAIMABLE; |
1804 | 1903 | ||
@@ -1809,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1809 | return NULL; | 1908 | return NULL; |
1810 | } | 1909 | } |
1811 | 1910 | ||
1911 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ | ||
1912 | if (unlikely(page->pfmemalloc)) | ||
1913 | pfmemalloc_active = true; | ||
1914 | |||
1812 | nr_pages = (1 << cachep->gfporder); | 1915 | nr_pages = (1 << cachep->gfporder); |
1813 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1916 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1814 | add_zone_page_state(page_zone(page), | 1917 | add_zone_page_state(page_zone(page), |
@@ -1816,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1816 | else | 1919 | else |
1817 | add_zone_page_state(page_zone(page), | 1920 | add_zone_page_state(page_zone(page), |
1818 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1921 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1819 | for (i = 0; i < nr_pages; i++) | 1922 | for (i = 0; i < nr_pages; i++) { |
1820 | __SetPageSlab(page + i); | 1923 | __SetPageSlab(page + i); |
1821 | 1924 | ||
1925 | if (page->pfmemalloc) | ||
1926 | SetPageSlabPfmemalloc(page + i); | ||
1927 | } | ||
1928 | |||
1822 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1929 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1823 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1930 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
1824 | 1931 | ||
@@ -1850,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1850 | NR_SLAB_UNRECLAIMABLE, nr_freed); | 1957 | NR_SLAB_UNRECLAIMABLE, nr_freed); |
1851 | while (i--) { | 1958 | while (i--) { |
1852 | BUG_ON(!PageSlab(page)); | 1959 | BUG_ON(!PageSlab(page)); |
1960 | __ClearPageSlabPfmemalloc(page); | ||
1853 | __ClearPageSlab(page); | 1961 | __ClearPageSlab(page); |
1854 | page++; | 1962 | page++; |
1855 | } | 1963 | } |
@@ -1874,7 +1982,7 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
1874 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, | 1982 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, |
1875 | unsigned long caller) | 1983 | unsigned long caller) |
1876 | { | 1984 | { |
1877 | int size = obj_size(cachep); | 1985 | int size = cachep->object_size; |
1878 | 1986 | ||
1879 | addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; | 1987 | addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; |
1880 | 1988 | ||
@@ -1906,7 +2014,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, | |||
1906 | 2014 | ||
1907 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) | 2015 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) |
1908 | { | 2016 | { |
1909 | int size = obj_size(cachep); | 2017 | int size = cachep->object_size; |
1910 | addr = &((char *)addr)[obj_offset(cachep)]; | 2018 | addr = &((char *)addr)[obj_offset(cachep)]; |
1911 | 2019 | ||
1912 | memset(addr, val, size); | 2020 | memset(addr, val, size); |
@@ -1966,7 +2074,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | |||
1966 | printk("\n"); | 2074 | printk("\n"); |
1967 | } | 2075 | } |
1968 | realobj = (char *)objp + obj_offset(cachep); | 2076 | realobj = (char *)objp + obj_offset(cachep); |
1969 | size = obj_size(cachep); | 2077 | size = cachep->object_size; |
1970 | for (i = 0; i < size && lines; i += 16, lines--) { | 2078 | for (i = 0; i < size && lines; i += 16, lines--) { |
1971 | int limit; | 2079 | int limit; |
1972 | limit = 16; | 2080 | limit = 16; |
@@ -1983,7 +2091,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1983 | int lines = 0; | 2091 | int lines = 0; |
1984 | 2092 | ||
1985 | realobj = (char *)objp + obj_offset(cachep); | 2093 | realobj = (char *)objp + obj_offset(cachep); |
1986 | size = obj_size(cachep); | 2094 | size = cachep->object_size; |
1987 | 2095 | ||
1988 | for (i = 0; i < size; i++) { | 2096 | for (i = 0; i < size; i++) { |
1989 | char exp = POISON_FREE; | 2097 | char exp = POISON_FREE; |
@@ -2047,10 +2155,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab | |||
2047 | 2155 | ||
2048 | if (cachep->flags & SLAB_POISON) { | 2156 | if (cachep->flags & SLAB_POISON) { |
2049 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2157 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2050 | if (cachep->buffer_size % PAGE_SIZE == 0 && | 2158 | if (cachep->size % PAGE_SIZE == 0 && |
2051 | OFF_SLAB(cachep)) | 2159 | OFF_SLAB(cachep)) |
2052 | kernel_map_pages(virt_to_page(objp), | 2160 | kernel_map_pages(virt_to_page(objp), |
2053 | cachep->buffer_size / PAGE_SIZE, 1); | 2161 | cachep->size / PAGE_SIZE, 1); |
2054 | else | 2162 | else |
2055 | check_poison_obj(cachep, objp); | 2163 | check_poison_obj(cachep, objp); |
2056 | #else | 2164 | #else |
@@ -2194,10 +2302,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2194 | 2302 | ||
2195 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | 2303 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) |
2196 | { | 2304 | { |
2197 | if (g_cpucache_up == FULL) | 2305 | if (slab_state >= FULL) |
2198 | return enable_cpucache(cachep, gfp); | 2306 | return enable_cpucache(cachep, gfp); |
2199 | 2307 | ||
2200 | if (g_cpucache_up == NONE) { | 2308 | if (slab_state == DOWN) { |
2201 | /* | 2309 | /* |
2202 | * Note: the first kmem_cache_create must create the cache | 2310 | * Note: the first kmem_cache_create must create the cache |
2203 | * that's used by kmalloc(24), otherwise the creation of | 2311 | * that's used by kmalloc(24), otherwise the creation of |
@@ -2212,16 +2320,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2212 | */ | 2320 | */ |
2213 | set_up_list3s(cachep, SIZE_AC); | 2321 | set_up_list3s(cachep, SIZE_AC); |
2214 | if (INDEX_AC == INDEX_L3) | 2322 | if (INDEX_AC == INDEX_L3) |
2215 | g_cpucache_up = PARTIAL_L3; | 2323 | slab_state = PARTIAL_L3; |
2216 | else | 2324 | else |
2217 | g_cpucache_up = PARTIAL_AC; | 2325 | slab_state = PARTIAL_ARRAYCACHE; |
2218 | } else { | 2326 | } else { |
2219 | cachep->array[smp_processor_id()] = | 2327 | cachep->array[smp_processor_id()] = |
2220 | kmalloc(sizeof(struct arraycache_init), gfp); | 2328 | kmalloc(sizeof(struct arraycache_init), gfp); |
2221 | 2329 | ||
2222 | if (g_cpucache_up == PARTIAL_AC) { | 2330 | if (slab_state == PARTIAL_ARRAYCACHE) { |
2223 | set_up_list3s(cachep, SIZE_L3); | 2331 | set_up_list3s(cachep, SIZE_L3); |
2224 | g_cpucache_up = PARTIAL_L3; | 2332 | slab_state = PARTIAL_L3; |
2225 | } else { | 2333 | } else { |
2226 | int node; | 2334 | int node; |
2227 | for_each_online_node(node) { | 2335 | for_each_online_node(node) { |
@@ -2247,7 +2355,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2247 | } | 2355 | } |
2248 | 2356 | ||
2249 | /** | 2357 | /** |
2250 | * kmem_cache_create - Create a cache. | 2358 | * __kmem_cache_create - Create a cache. |
2251 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 2359 | * @name: A string which is used in /proc/slabinfo to identify this cache. |
2252 | * @size: The size of objects to be created in this cache. | 2360 | * @size: The size of objects to be created in this cache. |
2253 | * @align: The required alignment for the objects. | 2361 | * @align: The required alignment for the objects. |
@@ -2274,59 +2382,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2274 | * as davem. | 2382 | * as davem. |
2275 | */ | 2383 | */ |
2276 | struct kmem_cache * | 2384 | struct kmem_cache * |
2277 | kmem_cache_create (const char *name, size_t size, size_t align, | 2385 | __kmem_cache_create (const char *name, size_t size, size_t align, |
2278 | unsigned long flags, void (*ctor)(void *)) | 2386 | unsigned long flags, void (*ctor)(void *)) |
2279 | { | 2387 | { |
2280 | size_t left_over, slab_size, ralign; | 2388 | size_t left_over, slab_size, ralign; |
2281 | struct kmem_cache *cachep = NULL, *pc; | 2389 | struct kmem_cache *cachep = NULL; |
2282 | gfp_t gfp; | 2390 | gfp_t gfp; |
2283 | 2391 | ||
2284 | /* | ||
2285 | * Sanity checks... these are all serious usage bugs. | ||
2286 | */ | ||
2287 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || | ||
2288 | size > KMALLOC_MAX_SIZE) { | ||
2289 | printk(KERN_ERR "%s: Early error in slab %s\n", __func__, | ||
2290 | name); | ||
2291 | BUG(); | ||
2292 | } | ||
2293 | |||
2294 | /* | ||
2295 | * We use cache_chain_mutex to ensure a consistent view of | ||
2296 | * cpu_online_mask as well. Please see cpuup_callback | ||
2297 | */ | ||
2298 | if (slab_is_available()) { | ||
2299 | get_online_cpus(); | ||
2300 | mutex_lock(&cache_chain_mutex); | ||
2301 | } | ||
2302 | |||
2303 | list_for_each_entry(pc, &cache_chain, next) { | ||
2304 | char tmp; | ||
2305 | int res; | ||
2306 | |||
2307 | /* | ||
2308 | * This happens when the module gets unloaded and doesn't | ||
2309 | * destroy its slab cache and no-one else reuses the vmalloc | ||
2310 | * area of the module. Print a warning. | ||
2311 | */ | ||
2312 | res = probe_kernel_address(pc->name, tmp); | ||
2313 | if (res) { | ||
2314 | printk(KERN_ERR | ||
2315 | "SLAB: cache with size %d has lost its name\n", | ||
2316 | pc->buffer_size); | ||
2317 | continue; | ||
2318 | } | ||
2319 | |||
2320 | if (!strcmp(pc->name, name)) { | ||
2321 | printk(KERN_ERR | ||
2322 | "kmem_cache_create: duplicate cache %s\n", name); | ||
2323 | dump_stack(); | ||
2324 | goto oops; | ||
2325 | } | ||
2326 | } | ||
2327 | |||
2328 | #if DEBUG | 2392 | #if DEBUG |
2329 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
2330 | #if FORCED_DEBUG | 2393 | #if FORCED_DEBUG |
2331 | /* | 2394 | /* |
2332 | * Enable redzoning and last user accounting, except for caches with | 2395 | * Enable redzoning and last user accounting, except for caches with |
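The sanity checks, the duplicate-name scan and the get_online_cpus()/mutex handling deleted in this hunk move out of the allocator: the rename to __kmem_cache_create() implies a common kmem_cache_create() wrapper in the shared slab code (presumably mm/slab_common.c) that takes slab_mutex, performs the allocator-independent checks once, and only then calls into the SLAB-specific __kmem_cache_create(). A hedged sketch of such a wrapper, not the verbatim implementation:

    /* Sketch of the assumed common-layer wrapper implied by this rename. */
    struct kmem_cache *kmem_cache_create(const char *name, size_t size,
                                         size_t align, unsigned long flags,
                                         void (*ctor)(void *))
    {
            struct kmem_cache *s = NULL;

            get_online_cpus();
            mutex_lock(&slab_mutex);

            /* allocator-independent sanity checks, done once */
            if (!name || in_interrupt() || size < sizeof(void *) ||
                size > KMALLOC_MAX_SIZE)
                    goto out_locked;

            s = __kmem_cache_create(name, size, align, flags, ctor);

    out_locked:
            mutex_unlock(&slab_mutex);
            put_online_cpus();

            if (!s && (flags & SLAB_PANIC))
                    panic("kmem_cache_create: Failed to create slab '%s'\n", name);

            return s;
    }
    EXPORT_SYMBOL(kmem_cache_create);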
@@ -2415,11 +2478,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2415 | /* Get cache's description obj. */ | 2478 | /* Get cache's description obj. */ |
2416 | cachep = kmem_cache_zalloc(&cache_cache, gfp); | 2479 | cachep = kmem_cache_zalloc(&cache_cache, gfp); |
2417 | if (!cachep) | 2480 | if (!cachep) |
2418 | goto oops; | 2481 | return NULL; |
2419 | 2482 | ||
2420 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | 2483 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; |
2484 | cachep->object_size = size; | ||
2485 | cachep->align = align; | ||
2421 | #if DEBUG | 2486 | #if DEBUG |
2422 | cachep->obj_size = size; | ||
2423 | 2487 | ||
2424 | /* | 2488 | /* |
2425 | * Both debugging options require word-alignment which is calculated | 2489 | * Both debugging options require word-alignment which is calculated |
@@ -2442,7 +2506,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2442 | } | 2506 | } |
2443 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 2507 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
2444 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size | 2508 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size |
2445 | && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { | 2509 | && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { |
2446 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); | 2510 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); |
2447 | size = PAGE_SIZE; | 2511 | size = PAGE_SIZE; |
2448 | } | 2512 | } |
@@ -2471,8 +2535,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2471 | printk(KERN_ERR | 2535 | printk(KERN_ERR |
2472 | "kmem_cache_create: couldn't create cache %s.\n", name); | 2536 | "kmem_cache_create: couldn't create cache %s.\n", name); |
2473 | kmem_cache_free(&cache_cache, cachep); | 2537 | kmem_cache_free(&cache_cache, cachep); |
2474 | cachep = NULL; | 2538 | return NULL; |
2475 | goto oops; | ||
2476 | } | 2539 | } |
2477 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) | 2540 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) |
2478 | + sizeof(struct slab), align); | 2541 | + sizeof(struct slab), align); |
@@ -2508,10 +2571,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2508 | cachep->colour = left_over / cachep->colour_off; | 2571 | cachep->colour = left_over / cachep->colour_off; |
2509 | cachep->slab_size = slab_size; | 2572 | cachep->slab_size = slab_size; |
2510 | cachep->flags = flags; | 2573 | cachep->flags = flags; |
2511 | cachep->gfpflags = 0; | 2574 | cachep->allocflags = 0; |
2512 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) | 2575 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) |
2513 | cachep->gfpflags |= GFP_DMA; | 2576 | cachep->allocflags |= GFP_DMA; |
2514 | cachep->buffer_size = size; | 2577 | cachep->size = size; |
2515 | cachep->reciprocal_buffer_size = reciprocal_value(size); | 2578 | cachep->reciprocal_buffer_size = reciprocal_value(size); |
2516 | 2579 | ||
2517 | if (flags & CFLGS_OFF_SLAB) { | 2580 | if (flags & CFLGS_OFF_SLAB) { |
@@ -2530,8 +2593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2530 | 2593 | ||
2531 | if (setup_cpu_cache(cachep, gfp)) { | 2594 | if (setup_cpu_cache(cachep, gfp)) { |
2532 | __kmem_cache_destroy(cachep); | 2595 | __kmem_cache_destroy(cachep); |
2533 | cachep = NULL; | 2596 | return NULL; |
2534 | goto oops; | ||
2535 | } | 2597 | } |
2536 | 2598 | ||
2537 | if (flags & SLAB_DEBUG_OBJECTS) { | 2599 | if (flags & SLAB_DEBUG_OBJECTS) { |
@@ -2545,18 +2607,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2545 | } | 2607 | } |
2546 | 2608 | ||
2547 | /* cache setup completed, link it into the list */ | 2609 | /* cache setup completed, link it into the list */ |
2548 | list_add(&cachep->next, &cache_chain); | 2610 | list_add(&cachep->list, &slab_caches); |
2549 | oops: | ||
2550 | if (!cachep && (flags & SLAB_PANIC)) | ||
2551 | panic("kmem_cache_create(): failed to create slab `%s'\n", | ||
2552 | name); | ||
2553 | if (slab_is_available()) { | ||
2554 | mutex_unlock(&cache_chain_mutex); | ||
2555 | put_online_cpus(); | ||
2556 | } | ||
2557 | return cachep; | 2611 | return cachep; |
2558 | } | 2612 | } |
2559 | EXPORT_SYMBOL(kmem_cache_create); | ||
2560 | 2613 | ||
2561 | #if DEBUG | 2614 | #if DEBUG |
2562 | static void check_irq_off(void) | 2615 | static void check_irq_off(void) |
@@ -2671,7 +2724,7 @@ out: | |||
2671 | return nr_freed; | 2724 | return nr_freed; |
2672 | } | 2725 | } |
2673 | 2726 | ||
2674 | /* Called with cache_chain_mutex held to protect against cpu hotplug */ | 2727 | /* Called with slab_mutex held to protect against cpu hotplug */ |
2675 | static int __cache_shrink(struct kmem_cache *cachep) | 2728 | static int __cache_shrink(struct kmem_cache *cachep) |
2676 | { | 2729 | { |
2677 | int ret = 0, i = 0; | 2730 | int ret = 0, i = 0; |
@@ -2706,9 +2759,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
2706 | BUG_ON(!cachep || in_interrupt()); | 2759 | BUG_ON(!cachep || in_interrupt()); |
2707 | 2760 | ||
2708 | get_online_cpus(); | 2761 | get_online_cpus(); |
2709 | mutex_lock(&cache_chain_mutex); | 2762 | mutex_lock(&slab_mutex); |
2710 | ret = __cache_shrink(cachep); | 2763 | ret = __cache_shrink(cachep); |
2711 | mutex_unlock(&cache_chain_mutex); | 2764 | mutex_unlock(&slab_mutex); |
2712 | put_online_cpus(); | 2765 | put_online_cpus(); |
2713 | return ret; | 2766 | return ret; |
2714 | } | 2767 | } |
@@ -2736,15 +2789,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2736 | 2789 | ||
2737 | /* Find the cache in the chain of caches. */ | 2790 | /* Find the cache in the chain of caches. */ |
2738 | get_online_cpus(); | 2791 | get_online_cpus(); |
2739 | mutex_lock(&cache_chain_mutex); | 2792 | mutex_lock(&slab_mutex); |
2740 | /* | 2793 | /* |
2741 | * the chain is never empty, cache_cache is never destroyed | 2794 | * the chain is never empty, cache_cache is never destroyed |
2742 | */ | 2795 | */ |
2743 | list_del(&cachep->next); | 2796 | list_del(&cachep->list); |
2744 | if (__cache_shrink(cachep)) { | 2797 | if (__cache_shrink(cachep)) { |
2745 | slab_error(cachep, "Can't free all objects"); | 2798 | slab_error(cachep, "Can't free all objects"); |
2746 | list_add(&cachep->next, &cache_chain); | 2799 | list_add(&cachep->list, &slab_caches); |
2747 | mutex_unlock(&cache_chain_mutex); | 2800 | mutex_unlock(&slab_mutex); |
2748 | put_online_cpus(); | 2801 | put_online_cpus(); |
2749 | return; | 2802 | return; |
2750 | } | 2803 | } |
@@ -2753,7 +2806,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2753 | rcu_barrier(); | 2806 | rcu_barrier(); |
2754 | 2807 | ||
2755 | __kmem_cache_destroy(cachep); | 2808 | __kmem_cache_destroy(cachep); |
2756 | mutex_unlock(&cache_chain_mutex); | 2809 | mutex_unlock(&slab_mutex); |
2757 | put_online_cpus(); | 2810 | put_online_cpus(); |
2758 | } | 2811 | } |
2759 | EXPORT_SYMBOL(kmem_cache_destroy); | 2812 | EXPORT_SYMBOL(kmem_cache_destroy); |
@@ -2840,10 +2893,10 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2840 | slab_error(cachep, "constructor overwrote the" | 2893 | slab_error(cachep, "constructor overwrote the" |
2841 | " start of an object"); | 2894 | " start of an object"); |
2842 | } | 2895 | } |
2843 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && | 2896 | if ((cachep->size % PAGE_SIZE) == 0 && |
2844 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) | 2897 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) |
2845 | kernel_map_pages(virt_to_page(objp), | 2898 | kernel_map_pages(virt_to_page(objp), |
2846 | cachep->buffer_size / PAGE_SIZE, 0); | 2899 | cachep->size / PAGE_SIZE, 0); |
2847 | #else | 2900 | #else |
2848 | if (cachep->ctor) | 2901 | if (cachep->ctor) |
2849 | cachep->ctor(objp); | 2902 | cachep->ctor(objp); |
@@ -2857,9 +2910,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | |||
2857 | { | 2910 | { |
2858 | if (CONFIG_ZONE_DMA_FLAG) { | 2911 | if (CONFIG_ZONE_DMA_FLAG) { |
2859 | if (flags & GFP_DMA) | 2912 | if (flags & GFP_DMA) |
2860 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); | 2913 | BUG_ON(!(cachep->allocflags & GFP_DMA)); |
2861 | else | 2914 | else |
2862 | BUG_ON(cachep->gfpflags & GFP_DMA); | 2915 | BUG_ON(cachep->allocflags & GFP_DMA); |
2863 | } | 2916 | } |
2864 | } | 2917 | } |
2865 | 2918 | ||
@@ -2918,8 +2971,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | |||
2918 | nr_pages <<= cache->gfporder; | 2971 | nr_pages <<= cache->gfporder; |
2919 | 2972 | ||
2920 | do { | 2973 | do { |
2921 | page_set_cache(page, cache); | 2974 | page->slab_cache = cache; |
2922 | page_set_slab(page, slab); | 2975 | page->slab_page = slab; |
2923 | page++; | 2976 | page++; |
2924 | } while (--nr_pages); | 2977 | } while (--nr_pages); |
2925 | } | 2978 | } |
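Storing page->slab_cache and page->slab_page directly (instead of going through page_set_cache()/page_set_slab()) reflects dedicated back-pointer fields in struct page, so mapping any object address back to its cache or slab descriptor becomes virt_to_head_page() plus a load, as the virt_to_head_page(objp)->slab_page users later in this diff show. A two-helper sketch of that reverse lookup; the helper names here are illustrative only.

    /* Illustrative inverse of slab_map_pages(): object address -> metadata */
    static inline struct kmem_cache *obj_to_cache(const void *objp)
    {
            return virt_to_head_page(objp)->slab_cache;
    }

    static inline struct slab *obj_to_slab(const void *objp)
    {
            return virt_to_head_page(objp)->slab_page;
    }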
@@ -3057,7 +3110,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3057 | kfree_debugcheck(objp); | 3110 | kfree_debugcheck(objp); |
3058 | page = virt_to_head_page(objp); | 3111 | page = virt_to_head_page(objp); |
3059 | 3112 | ||
3060 | slabp = page_get_slab(page); | 3113 | slabp = page->slab_page; |
3061 | 3114 | ||
3062 | if (cachep->flags & SLAB_RED_ZONE) { | 3115 | if (cachep->flags & SLAB_RED_ZONE) { |
3063 | verify_redzone_free(cachep, objp); | 3116 | verify_redzone_free(cachep, objp); |
@@ -3077,10 +3130,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3077 | #endif | 3130 | #endif |
3078 | if (cachep->flags & SLAB_POISON) { | 3131 | if (cachep->flags & SLAB_POISON) { |
3079 | #ifdef CONFIG_DEBUG_PAGEALLOC | 3132 | #ifdef CONFIG_DEBUG_PAGEALLOC |
3080 | if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { | 3133 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
3081 | store_stackinfo(cachep, objp, (unsigned long)caller); | 3134 | store_stackinfo(cachep, objp, (unsigned long)caller); |
3082 | kernel_map_pages(virt_to_page(objp), | 3135 | kernel_map_pages(virt_to_page(objp), |
3083 | cachep->buffer_size / PAGE_SIZE, 0); | 3136 | cachep->size / PAGE_SIZE, 0); |
3084 | } else { | 3137 | } else { |
3085 | poison_obj(cachep, objp, POISON_FREE); | 3138 | poison_obj(cachep, objp, POISON_FREE); |
3086 | } | 3139 | } |
@@ -3120,16 +3173,19 @@ bad: | |||
3120 | #define check_slabp(x,y) do { } while(0) | 3173 | #define check_slabp(x,y) do { } while(0) |
3121 | #endif | 3174 | #endif |
3122 | 3175 | ||
3123 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | 3176 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, |
3177 | bool force_refill) | ||
3124 | { | 3178 | { |
3125 | int batchcount; | 3179 | int batchcount; |
3126 | struct kmem_list3 *l3; | 3180 | struct kmem_list3 *l3; |
3127 | struct array_cache *ac; | 3181 | struct array_cache *ac; |
3128 | int node; | 3182 | int node; |
3129 | 3183 | ||
3130 | retry: | ||
3131 | check_irq_off(); | 3184 | check_irq_off(); |
3132 | node = numa_mem_id(); | 3185 | node = numa_mem_id(); |
3186 | if (unlikely(force_refill)) | ||
3187 | goto force_grow; | ||
3188 | retry: | ||
3133 | ac = cpu_cache_get(cachep); | 3189 | ac = cpu_cache_get(cachep); |
3134 | batchcount = ac->batchcount; | 3190 | batchcount = ac->batchcount; |
3135 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 3191 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
@@ -3179,8 +3235,8 @@ retry: | |||
3179 | STATS_INC_ACTIVE(cachep); | 3235 | STATS_INC_ACTIVE(cachep); |
3180 | STATS_SET_HIGH(cachep); | 3236 | STATS_SET_HIGH(cachep); |
3181 | 3237 | ||
3182 | ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, | 3238 | ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, |
3183 | node); | 3239 | node)); |
3184 | } | 3240 | } |
3185 | check_slabp(cachep, slabp); | 3241 | check_slabp(cachep, slabp); |
3186 | 3242 | ||
@@ -3199,18 +3255,22 @@ alloc_done: | |||
3199 | 3255 | ||
3200 | if (unlikely(!ac->avail)) { | 3256 | if (unlikely(!ac->avail)) { |
3201 | int x; | 3257 | int x; |
3258 | force_grow: | ||
3202 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); | 3259 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); |
3203 | 3260 | ||
3204 | /* cache_grow can reenable interrupts, then ac could change. */ | 3261 | /* cache_grow can reenable interrupts, then ac could change. */ |
3205 | ac = cpu_cache_get(cachep); | 3262 | ac = cpu_cache_get(cachep); |
3206 | if (!x && ac->avail == 0) /* no objects in sight? abort */ | 3263 | |
3264 | /* no objects in sight? abort */ | ||
3265 | if (!x && (ac->avail == 0 || force_refill)) | ||
3207 | return NULL; | 3266 | return NULL; |
3208 | 3267 | ||
3209 | if (!ac->avail) /* objects refilled by interrupt? */ | 3268 | if (!ac->avail) /* objects refilled by interrupt? */ |
3210 | goto retry; | 3269 | goto retry; |
3211 | } | 3270 | } |
3212 | ac->touched = 1; | 3271 | ac->touched = 1; |
3213 | return ac->entry[--ac->avail]; | 3272 | |
3273 | return ac_get_obj(cachep, ac, flags, force_refill); | ||
3214 | } | 3274 | } |
3215 | 3275 | ||
3216 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | 3276 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
@@ -3230,9 +3290,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3230 | return objp; | 3290 | return objp; |
3231 | if (cachep->flags & SLAB_POISON) { | 3291 | if (cachep->flags & SLAB_POISON) { |
3232 | #ifdef CONFIG_DEBUG_PAGEALLOC | 3292 | #ifdef CONFIG_DEBUG_PAGEALLOC |
3233 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) | 3293 | if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) |
3234 | kernel_map_pages(virt_to_page(objp), | 3294 | kernel_map_pages(virt_to_page(objp), |
3235 | cachep->buffer_size / PAGE_SIZE, 1); | 3295 | cachep->size / PAGE_SIZE, 1); |
3236 | else | 3296 | else |
3237 | check_poison_obj(cachep, objp); | 3297 | check_poison_obj(cachep, objp); |
3238 | #else | 3298 | #else |
@@ -3261,8 +3321,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3261 | struct slab *slabp; | 3321 | struct slab *slabp; |
3262 | unsigned objnr; | 3322 | unsigned objnr; |
3263 | 3323 | ||
3264 | slabp = page_get_slab(virt_to_head_page(objp)); | 3324 | slabp = virt_to_head_page(objp)->slab_page; |
3265 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | 3325 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->size; |
3266 | slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; | 3326 | slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; |
3267 | } | 3327 | } |
3268 | #endif | 3328 | #endif |
@@ -3285,30 +3345,42 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | |||
3285 | if (cachep == &cache_cache) | 3345 | if (cachep == &cache_cache) |
3286 | return false; | 3346 | return false; |
3287 | 3347 | ||
3288 | return should_failslab(obj_size(cachep), flags, cachep->flags); | 3348 | return should_failslab(cachep->object_size, flags, cachep->flags); |
3289 | } | 3349 | } |
3290 | 3350 | ||
3291 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3351 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3292 | { | 3352 | { |
3293 | void *objp; | 3353 | void *objp; |
3294 | struct array_cache *ac; | 3354 | struct array_cache *ac; |
3355 | bool force_refill = false; | ||
3295 | 3356 | ||
3296 | check_irq_off(); | 3357 | check_irq_off(); |
3297 | 3358 | ||
3298 | ac = cpu_cache_get(cachep); | 3359 | ac = cpu_cache_get(cachep); |
3299 | if (likely(ac->avail)) { | 3360 | if (likely(ac->avail)) { |
3300 | STATS_INC_ALLOCHIT(cachep); | ||
3301 | ac->touched = 1; | 3361 | ac->touched = 1; |
3302 | objp = ac->entry[--ac->avail]; | 3362 | objp = ac_get_obj(cachep, ac, flags, false); |
3303 | } else { | 3363 | |
3304 | STATS_INC_ALLOCMISS(cachep); | ||
3305 | objp = cache_alloc_refill(cachep, flags); | ||
3306 | /* | 3364 | /* |
3307 | * the 'ac' may be updated by cache_alloc_refill(), | 3365 | * Allow for the possibility all avail objects are not allowed |
3308 | * and kmemleak_erase() requires its correct value. | 3366 | * by the current flags |
3309 | */ | 3367 | */ |
3310 | ac = cpu_cache_get(cachep); | 3368 | if (objp) { |
3369 | STATS_INC_ALLOCHIT(cachep); | ||
3370 | goto out; | ||
3371 | } | ||
3372 | force_refill = true; | ||
3311 | } | 3373 | } |
3374 | |||
3375 | STATS_INC_ALLOCMISS(cachep); | ||
3376 | objp = cache_alloc_refill(cachep, flags, force_refill); | ||
3377 | /* | ||
3378 | * the 'ac' may be updated by cache_alloc_refill(), | ||
3379 | * and kmemleak_erase() requires its correct value. | ||
3380 | */ | ||
3381 | ac = cpu_cache_get(cachep); | ||
3382 | |||
3383 | out: | ||
3312 | /* | 3384 | /* |
3313 | * To avoid a false negative, if an object that is in one of the | 3385 | * To avoid a false negative, if an object that is in one of the |
3314 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 3386 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
@@ -3336,7 +3408,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3336 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3408 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3337 | nid_alloc = cpuset_slab_spread_node(); | 3409 | nid_alloc = cpuset_slab_spread_node(); |
3338 | else if (current->mempolicy) | 3410 | else if (current->mempolicy) |
3339 | nid_alloc = slab_node(current->mempolicy); | 3411 | nid_alloc = slab_node(); |
3340 | if (nid_alloc != nid_here) | 3412 | if (nid_alloc != nid_here) |
3341 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3413 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3342 | return NULL; | 3414 | return NULL; |
@@ -3368,7 +3440,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3368 | 3440 | ||
3369 | retry_cpuset: | 3441 | retry_cpuset: |
3370 | cpuset_mems_cookie = get_mems_allowed(); | 3442 | cpuset_mems_cookie = get_mems_allowed(); |
3371 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 3443 | zonelist = node_zonelist(slab_node(), flags); |
3372 | 3444 | ||
3373 | retry: | 3445 | retry: |
3374 | /* | 3446 | /* |
@@ -3545,14 +3617,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3545 | out: | 3617 | out: |
3546 | local_irq_restore(save_flags); | 3618 | local_irq_restore(save_flags); |
3547 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | 3619 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); |
3548 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, | 3620 | kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, |
3549 | flags); | 3621 | flags); |
3550 | 3622 | ||
3551 | if (likely(ptr)) | 3623 | if (likely(ptr)) |
3552 | kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); | 3624 | kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); |
3553 | 3625 | ||
3554 | if (unlikely((flags & __GFP_ZERO) && ptr)) | 3626 | if (unlikely((flags & __GFP_ZERO) && ptr)) |
3555 | memset(ptr, 0, obj_size(cachep)); | 3627 | memset(ptr, 0, cachep->object_size); |
3556 | 3628 | ||
3557 | return ptr; | 3629 | return ptr; |
3558 | } | 3630 | } |
@@ -3607,15 +3679,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
3607 | objp = __do_cache_alloc(cachep, flags); | 3679 | objp = __do_cache_alloc(cachep, flags); |
3608 | local_irq_restore(save_flags); | 3680 | local_irq_restore(save_flags); |
3609 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); | 3681 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); |
3610 | kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, | 3682 | kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, |
3611 | flags); | 3683 | flags); |
3612 | prefetchw(objp); | 3684 | prefetchw(objp); |
3613 | 3685 | ||
3614 | if (likely(objp)) | 3686 | if (likely(objp)) |
3615 | kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); | 3687 | kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); |
3616 | 3688 | ||
3617 | if (unlikely((flags & __GFP_ZERO) && objp)) | 3689 | if (unlikely((flags & __GFP_ZERO) && objp)) |
3618 | memset(objp, 0, obj_size(cachep)); | 3690 | memset(objp, 0, cachep->object_size); |
3619 | 3691 | ||
3620 | return objp; | 3692 | return objp; |
3621 | } | 3693 | } |
@@ -3630,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | |||
3630 | struct kmem_list3 *l3; | 3702 | struct kmem_list3 *l3; |
3631 | 3703 | ||
3632 | for (i = 0; i < nr_objects; i++) { | 3704 | for (i = 0; i < nr_objects; i++) { |
3633 | void *objp = objpp[i]; | 3705 | void *objp; |
3634 | struct slab *slabp; | 3706 | struct slab *slabp; |
3635 | 3707 | ||
3708 | clear_obj_pfmemalloc(&objpp[i]); | ||
3709 | objp = objpp[i]; | ||
3710 | |||
3636 | slabp = virt_to_slab(objp); | 3711 | slabp = virt_to_slab(objp); |
3637 | l3 = cachep->nodelists[node]; | 3712 | l3 = cachep->nodelists[node]; |
3638 | list_del(&slabp->list); | 3713 | list_del(&slabp->list); |
@@ -3731,7 +3806,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3731 | kmemleak_free_recursive(objp, cachep->flags); | 3806 | kmemleak_free_recursive(objp, cachep->flags); |
3732 | objp = cache_free_debugcheck(cachep, objp, caller); | 3807 | objp = cache_free_debugcheck(cachep, objp, caller); |
3733 | 3808 | ||
3734 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | 3809 | kmemcheck_slab_free(cachep, objp, cachep->object_size); |
3735 | 3810 | ||
3736 | /* | 3811 | /* |
3737 | * Skip calling cache_free_alien() when the platform is not numa. | 3812 | * Skip calling cache_free_alien() when the platform is not numa. |
@@ -3750,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3750 | cache_flusharray(cachep, ac); | 3825 | cache_flusharray(cachep, ac); |
3751 | } | 3826 | } |
3752 | 3827 | ||
3753 | ac->entry[ac->avail++] = objp; | 3828 | ac_put_obj(cachep, ac, objp); |
3754 | } | 3829 | } |
3755 | 3830 | ||
3756 | /** | 3831 | /** |
@@ -3766,7 +3841,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3766 | void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3841 | void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); |
3767 | 3842 | ||
3768 | trace_kmem_cache_alloc(_RET_IP_, ret, | 3843 | trace_kmem_cache_alloc(_RET_IP_, ret, |
3769 | obj_size(cachep), cachep->buffer_size, flags); | 3844 | cachep->object_size, cachep->size, flags); |
3770 | 3845 | ||
3771 | return ret; | 3846 | return ret; |
3772 | } | 3847 | } |
@@ -3794,7 +3869,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3794 | __builtin_return_address(0)); | 3869 | __builtin_return_address(0)); |
3795 | 3870 | ||
3796 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 3871 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
3797 | obj_size(cachep), cachep->buffer_size, | 3872 | cachep->object_size, cachep->size, |
3798 | flags, nodeid); | 3873 | flags, nodeid); |
3799 | 3874 | ||
3800 | return ret; | 3875 | return ret; |
@@ -3876,7 +3951,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3876 | ret = __cache_alloc(cachep, flags, caller); | 3951 | ret = __cache_alloc(cachep, flags, caller); |
3877 | 3952 | ||
3878 | trace_kmalloc((unsigned long) caller, ret, | 3953 | trace_kmalloc((unsigned long) caller, ret, |
3879 | size, cachep->buffer_size, flags); | 3954 | size, cachep->size, flags); |
3880 | 3955 | ||
3881 | return ret; | 3956 | return ret; |
3882 | } | 3957 | } |
@@ -3916,9 +3991,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3916 | unsigned long flags; | 3991 | unsigned long flags; |
3917 | 3992 | ||
3918 | local_irq_save(flags); | 3993 | local_irq_save(flags); |
3919 | debug_check_no_locks_freed(objp, obj_size(cachep)); | 3994 | debug_check_no_locks_freed(objp, cachep->object_size); |
3920 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) | 3995 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) |
3921 | debug_check_no_obj_freed(objp, obj_size(cachep)); | 3996 | debug_check_no_obj_freed(objp, cachep->object_size); |
3922 | __cache_free(cachep, objp, __builtin_return_address(0)); | 3997 | __cache_free(cachep, objp, __builtin_return_address(0)); |
3923 | local_irq_restore(flags); | 3998 | local_irq_restore(flags); |
3924 | 3999 | ||
@@ -3947,8 +4022,9 @@ void kfree(const void *objp) | |||
3947 | local_irq_save(flags); | 4022 | local_irq_save(flags); |
3948 | kfree_debugcheck(objp); | 4023 | kfree_debugcheck(objp); |
3949 | c = virt_to_cache(objp); | 4024 | c = virt_to_cache(objp); |
3950 | debug_check_no_locks_freed(objp, obj_size(c)); | 4025 | debug_check_no_locks_freed(objp, c->object_size); |
3951 | debug_check_no_obj_freed(objp, obj_size(c)); | 4026 | |
4027 | debug_check_no_obj_freed(objp, c->object_size); | ||
3952 | __cache_free(c, (void *)objp, __builtin_return_address(0)); | 4028 | __cache_free(c, (void *)objp, __builtin_return_address(0)); |
3953 | local_irq_restore(flags); | 4029 | local_irq_restore(flags); |
3954 | } | 4030 | } |
@@ -3956,7 +4032,7 @@ EXPORT_SYMBOL(kfree); | |||
3956 | 4032 | ||
3957 | unsigned int kmem_cache_size(struct kmem_cache *cachep) | 4033 | unsigned int kmem_cache_size(struct kmem_cache *cachep) |
3958 | { | 4034 | { |
3959 | return obj_size(cachep); | 4035 | return cachep->object_size; |
3960 | } | 4036 | } |
3961 | EXPORT_SYMBOL(kmem_cache_size); | 4037 | EXPORT_SYMBOL(kmem_cache_size); |
3962 | 4038 | ||
@@ -4030,7 +4106,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) | |||
4030 | return 0; | 4106 | return 0; |
4031 | 4107 | ||
4032 | fail: | 4108 | fail: |
4033 | if (!cachep->next.next) { | 4109 | if (!cachep->list.next) { |
4034 | /* Cache is not active yet. Roll back what we did */ | 4110 | /* Cache is not active yet. Roll back what we did */ |
4035 | node--; | 4111 | node--; |
4036 | while (node >= 0) { | 4112 | while (node >= 0) { |
@@ -4065,7 +4141,7 @@ static void do_ccupdate_local(void *info) | |||
4065 | new->new[smp_processor_id()] = old; | 4141 | new->new[smp_processor_id()] = old; |
4066 | } | 4142 | } |
4067 | 4143 | ||
4068 | /* Always called with the cache_chain_mutex held */ | 4144 | /* Always called with the slab_mutex held */ |
4069 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 4145 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
4070 | int batchcount, int shared, gfp_t gfp) | 4146 | int batchcount, int shared, gfp_t gfp) |
4071 | { | 4147 | { |
@@ -4109,7 +4185,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
4109 | return alloc_kmemlist(cachep, gfp); | 4185 | return alloc_kmemlist(cachep, gfp); |
4110 | } | 4186 | } |
4111 | 4187 | ||
4112 | /* Called with cache_chain_mutex held always */ | 4188 | /* Called with slab_mutex held always */ |
4113 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | 4189 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
4114 | { | 4190 | { |
4115 | int err; | 4191 | int err; |
@@ -4124,13 +4200,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4124 | * The numbers are guessed, we should auto-tune as described by | 4200 | * The numbers are guessed, we should auto-tune as described by |
4125 | * Bonwick. | 4201 | * Bonwick. |
4126 | */ | 4202 | */ |
4127 | if (cachep->buffer_size > 131072) | 4203 | if (cachep->size > 131072) |
4128 | limit = 1; | 4204 | limit = 1; |
4129 | else if (cachep->buffer_size > PAGE_SIZE) | 4205 | else if (cachep->size > PAGE_SIZE) |
4130 | limit = 8; | 4206 | limit = 8; |
4131 | else if (cachep->buffer_size > 1024) | 4207 | else if (cachep->size > 1024) |
4132 | limit = 24; | 4208 | limit = 24; |
4133 | else if (cachep->buffer_size > 256) | 4209 | else if (cachep->size > 256) |
4134 | limit = 54; | 4210 | limit = 54; |
4135 | else | 4211 | else |
4136 | limit = 120; | 4212 | limit = 120; |
@@ -4145,7 +4221,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4145 | * to a larger limit. Thus disabled by default. | 4221 | * to a larger limit. Thus disabled by default. |
4146 | */ | 4222 | */ |
4147 | shared = 0; | 4223 | shared = 0; |
4148 | if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) | 4224 | if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) |
4149 | shared = 8; | 4225 | shared = 8; |
4150 | 4226 | ||
4151 | #if DEBUG | 4227 | #if DEBUG |
@@ -4211,11 +4287,11 @@ static void cache_reap(struct work_struct *w) | |||
4211 | int node = numa_mem_id(); | 4287 | int node = numa_mem_id(); |
4212 | struct delayed_work *work = to_delayed_work(w); | 4288 | struct delayed_work *work = to_delayed_work(w); |
4213 | 4289 | ||
4214 | if (!mutex_trylock(&cache_chain_mutex)) | 4290 | if (!mutex_trylock(&slab_mutex)) |
4215 | /* Give up. Setup the next iteration. */ | 4291 | /* Give up. Setup the next iteration. */ |
4216 | goto out; | 4292 | goto out; |
4217 | 4293 | ||
4218 | list_for_each_entry(searchp, &cache_chain, next) { | 4294 | list_for_each_entry(searchp, &slab_caches, list) { |
4219 | check_irq_on(); | 4295 | check_irq_on(); |
4220 | 4296 | ||
4221 | /* | 4297 | /* |
@@ -4253,7 +4329,7 @@ next: | |||
4253 | cond_resched(); | 4329 | cond_resched(); |
4254 | } | 4330 | } |
4255 | check_irq_on(); | 4331 | check_irq_on(); |
4256 | mutex_unlock(&cache_chain_mutex); | 4332 | mutex_unlock(&slab_mutex); |
4257 | next_reap_node(); | 4333 | next_reap_node(); |
4258 | out: | 4334 | out: |
4259 | /* Set up the next iteration */ | 4335 | /* Set up the next iteration */ |
@@ -4289,26 +4365,26 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
4289 | { | 4365 | { |
4290 | loff_t n = *pos; | 4366 | loff_t n = *pos; |
4291 | 4367 | ||
4292 | mutex_lock(&cache_chain_mutex); | 4368 | mutex_lock(&slab_mutex); |
4293 | if (!n) | 4369 | if (!n) |
4294 | print_slabinfo_header(m); | 4370 | print_slabinfo_header(m); |
4295 | 4371 | ||
4296 | return seq_list_start(&cache_chain, *pos); | 4372 | return seq_list_start(&slab_caches, *pos); |
4297 | } | 4373 | } |
4298 | 4374 | ||
4299 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | 4375 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) |
4300 | { | 4376 | { |
4301 | return seq_list_next(p, &cache_chain, pos); | 4377 | return seq_list_next(p, &slab_caches, pos); |
4302 | } | 4378 | } |
4303 | 4379 | ||
4304 | static void s_stop(struct seq_file *m, void *p) | 4380 | static void s_stop(struct seq_file *m, void *p) |
4305 | { | 4381 | { |
4306 | mutex_unlock(&cache_chain_mutex); | 4382 | mutex_unlock(&slab_mutex); |
4307 | } | 4383 | } |
4308 | 4384 | ||
4309 | static int s_show(struct seq_file *m, void *p) | 4385 | static int s_show(struct seq_file *m, void *p) |
4310 | { | 4386 | { |
4311 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); | 4387 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); |
4312 | struct slab *slabp; | 4388 | struct slab *slabp; |
4313 | unsigned long active_objs; | 4389 | unsigned long active_objs; |
4314 | unsigned long num_objs; | 4390 | unsigned long num_objs; |
@@ -4364,7 +4440,7 @@ static int s_show(struct seq_file *m, void *p) | |||
4364 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); | 4440 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); |
4365 | 4441 | ||
4366 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | 4442 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", |
4367 | name, active_objs, num_objs, cachep->buffer_size, | 4443 | name, active_objs, num_objs, cachep->size, |
4368 | cachep->num, (1 << cachep->gfporder)); | 4444 | cachep->num, (1 << cachep->gfporder)); |
4369 | seq_printf(m, " : tunables %4u %4u %4u", | 4445 | seq_printf(m, " : tunables %4u %4u %4u", |
4370 | cachep->limit, cachep->batchcount, cachep->shared); | 4446 | cachep->limit, cachep->batchcount, cachep->shared); |
@@ -4454,9 +4530,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4454 | return -EINVAL; | 4530 | return -EINVAL; |
4455 | 4531 | ||
4456 | /* Find the cache in the chain of caches. */ | 4532 | /* Find the cache in the chain of caches. */ |
4457 | mutex_lock(&cache_chain_mutex); | 4533 | mutex_lock(&slab_mutex); |
4458 | res = -EINVAL; | 4534 | res = -EINVAL; |
4459 | list_for_each_entry(cachep, &cache_chain, next) { | 4535 | list_for_each_entry(cachep, &slab_caches, list) { |
4460 | if (!strcmp(cachep->name, kbuf)) { | 4536 | if (!strcmp(cachep->name, kbuf)) { |
4461 | if (limit < 1 || batchcount < 1 || | 4537 | if (limit < 1 || batchcount < 1 || |
4462 | batchcount > limit || shared < 0) { | 4538 | batchcount > limit || shared < 0) { |
@@ -4469,7 +4545,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4469 | break; | 4545 | break; |
4470 | } | 4546 | } |
4471 | } | 4547 | } |
4472 | mutex_unlock(&cache_chain_mutex); | 4548 | mutex_unlock(&slab_mutex); |
4473 | if (res >= 0) | 4549 | if (res >= 0) |
4474 | res = count; | 4550 | res = count; |
4475 | return res; | 4551 | return res; |
@@ -4492,8 +4568,8 @@ static const struct file_operations proc_slabinfo_operations = { | |||
4492 | 4568 | ||
4493 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4569 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
4494 | { | 4570 | { |
4495 | mutex_lock(&cache_chain_mutex); | 4571 | mutex_lock(&slab_mutex); |
4496 | return seq_list_start(&cache_chain, *pos); | 4572 | return seq_list_start(&slab_caches, *pos); |
4497 | } | 4573 | } |
4498 | 4574 | ||
4499 | static inline int add_caller(unsigned long *n, unsigned long v) | 4575 | static inline int add_caller(unsigned long *n, unsigned long v) |
@@ -4532,7 +4608,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) | |||
4532 | int i; | 4608 | int i; |
4533 | if (n[0] == n[1]) | 4609 | if (n[0] == n[1]) |
4534 | return; | 4610 | return; |
4535 | for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { | 4611 | for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { |
4536 | if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) | 4612 | if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) |
4537 | continue; | 4613 | continue; |
4538 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) | 4614 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) |
@@ -4558,7 +4634,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) | |||
4558 | 4634 | ||
4559 | static int leaks_show(struct seq_file *m, void *p) | 4635 | static int leaks_show(struct seq_file *m, void *p) |
4560 | { | 4636 | { |
4561 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); | 4637 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); |
4562 | struct slab *slabp; | 4638 | struct slab *slabp; |
4563 | struct kmem_list3 *l3; | 4639 | struct kmem_list3 *l3; |
4564 | const char *name; | 4640 | const char *name; |
@@ -4592,17 +4668,17 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4592 | name = cachep->name; | 4668 | name = cachep->name; |
4593 | if (n[0] == n[1]) { | 4669 | if (n[0] == n[1]) { |
4594 | /* Increase the buffer size */ | 4670 | /* Increase the buffer size */ |
4595 | mutex_unlock(&cache_chain_mutex); | 4671 | mutex_unlock(&slab_mutex); |
4596 | m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); | 4672 | m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); |
4597 | if (!m->private) { | 4673 | if (!m->private) { |
4598 | /* Too bad, we are really out */ | 4674 | /* Too bad, we are really out */ |
4599 | m->private = n; | 4675 | m->private = n; |
4600 | mutex_lock(&cache_chain_mutex); | 4676 | mutex_lock(&slab_mutex); |
4601 | return -ENOMEM; | 4677 | return -ENOMEM; |
4602 | } | 4678 | } |
4603 | *(unsigned long *)m->private = n[0] * 2; | 4679 | *(unsigned long *)m->private = n[0] * 2; |
4604 | kfree(n); | 4680 | kfree(n); |
4605 | mutex_lock(&cache_chain_mutex); | 4681 | mutex_lock(&slab_mutex); |
4606 | /* Now make sure this entry will be retried */ | 4682 | /* Now make sure this entry will be retried */ |
4607 | m->count = m->size; | 4683 | m->count = m->size; |
4608 | return 0; | 4684 | return 0; |
@@ -4677,6 +4753,6 @@ size_t ksize(const void *objp) | |||
4677 | if (unlikely(objp == ZERO_SIZE_PTR)) | 4753 | if (unlikely(objp == ZERO_SIZE_PTR)) |
4678 | return 0; | 4754 | return 0; |
4679 | 4755 | ||
4680 | return obj_size(virt_to_cache(objp)); | 4756 | return virt_to_cache(objp)->object_size; |
4681 | } | 4757 | } |
4682 | EXPORT_SYMBOL(ksize); | 4758 | EXPORT_SYMBOL(ksize); |
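The slab.c hunks above switch kmem_cache_size() and ksize() from the old obj_size() accessor to the common object_size field; callers of the reporting helpers are unaffected. As a hedged illustration of what those helpers expose (the demo function below is hypothetical and not part of the patch), a caller sees the usable size of an allocation like this:

	#include <linux/kernel.h>
	#include <linux/slab.h>

	/* Illustrative only: kmalloc() may round the request up, and
	 * ksize() reports the usable size of the resulting object,
	 * which is what the object_size bookkeeping above feeds. */
	static void __maybe_unused ksize_demo(void)
	{
		char *buf = kmalloc(100, GFP_KERNEL);

		if (!buf)
			return;
		/* Often >= 100, e.g. 128 with power-of-two kmalloc caches. */
		pr_info("usable size: %zu\n", ksize(buf));
		kfree(buf);
	}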
diff --git a/mm/slab.h b/mm/slab.h new file mode 100644 index 000000000000..db7848caaa25 --- /dev/null +++ b/mm/slab.h | |||
@@ -0,0 +1,33 @@ | |||
1 | #ifndef MM_SLAB_H | ||
2 | #define MM_SLAB_H | ||
3 | /* | ||
4 | * Internal slab definitions | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * State of the slab allocator. | ||
9 | * | ||
10 | * This is used to describe the states of the allocator during bootup. | ||
11 | * Allocators use this to gradually bootstrap themselves. Most allocators | ||
12 | * have the problem that the structures used for managing slab caches are | ||
13 | * allocated from slab caches themselves. | ||
14 | */ | ||
15 | enum slab_state { | ||
16 | DOWN, /* No slab functionality yet */ | ||
17 | PARTIAL, /* SLUB: kmem_cache_node available */ | ||
18 | PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ | ||
19 | PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */ | ||
20 | UP, /* Slab caches usable but not all extras yet */ | ||
21 | FULL /* Everything is working */ | ||
22 | }; | ||
23 | |||
24 | extern enum slab_state slab_state; | ||
25 | |||
26 | /* The slab cache mutex protects the management structures during changes */ | ||
27 | extern struct mutex slab_mutex; | ||
28 | extern struct list_head slab_caches; | ||
29 | |||
30 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | ||
31 | size_t align, unsigned long flags, void (*ctor)(void *)); | ||
32 | |||
33 | #endif | ||
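The new header replaces the per-allocator bootstrap flags (SLOB's slob_ready, SLUB's private enum removed further down) with one shared slab_state. A minimal userspace sketch of the staged gate follows; it assumes nothing beyond what the enum itself states and is a model, not kernel code:

	#include <stdio.h>

	/* Illustrative model of the staged bootstrap gate; the names
	 * mirror the enum above but this is not kernel code. */
	enum slab_state { DOWN, PARTIAL, PARTIAL_ARRAYCACHE, PARTIAL_L3, UP, FULL };

	static enum slab_state slab_state = DOWN;

	static int slab_is_available(void)
	{
		return slab_state >= UP;
	}

	int main(void)
	{
		/* Early boot: callers must fall back to the boot allocator. */
		printf("available: %d\n", slab_is_available());	/* 0 */
		slab_state = UP;	/* kmem_cache_init() reached */
		printf("available: %d\n", slab_is_available());	/* 1 */
		slab_state = FULL;	/* kmem_cache_init_late() */
		return 0;
	}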
diff --git a/mm/slab_common.c b/mm/slab_common.c new file mode 100644 index 000000000000..aa3ca5bb01b5 --- /dev/null +++ b/mm/slab_common.c | |||
@@ -0,0 +1,120 @@ | |||
1 | /* | ||
2 | * Slab allocator functions that are independent of the allocator strategy | ||
3 | * | ||
4 | * (C) 2012 Christoph Lameter <cl@linux.com> | ||
5 | */ | ||
6 | #include <linux/slab.h> | ||
7 | |||
8 | #include <linux/mm.h> | ||
9 | #include <linux/poison.h> | ||
10 | #include <linux/interrupt.h> | ||
11 | #include <linux/memory.h> | ||
12 | #include <linux/compiler.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/uaccess.h> | ||
16 | #include <asm/cacheflush.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | #include <asm/page.h> | ||
19 | |||
20 | #include "slab.h" | ||
21 | |||
22 | enum slab_state slab_state; | ||
23 | LIST_HEAD(slab_caches); | ||
24 | DEFINE_MUTEX(slab_mutex); | ||
25 | |||
26 | /* | ||
27 | * kmem_cache_create - Create a cache. | ||
28 | * @name: A string which is used in /proc/slabinfo to identify this cache. | ||
29 | * @size: The size of objects to be created in this cache. | ||
30 | * @align: The required alignment for the objects. | ||
31 | * @flags: SLAB flags | ||
32 | * @ctor: A constructor for the objects. | ||
33 | * | ||
34 | * Returns a ptr to the cache on success, NULL on failure. | ||
35 | * Cannot be called within an interrupt, but can be interrupted. | ||
36 | * The @ctor is run when new pages are allocated by the cache. | ||
37 | * | ||
38 | * The flags are | ||
39 | * | ||
40 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | ||
41 | * to catch references to uninitialised memory. | ||
42 | * | ||
43 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check | ||
44 | * for buffer overruns. | ||
45 | * | ||
46 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware | ||
47 | * cacheline. This can be beneficial if you're counting cycles as closely | ||
48 | * as davem. | ||
49 | */ | ||
50 | |||
51 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, | ||
52 | unsigned long flags, void (*ctor)(void *)) | ||
53 | { | ||
54 | struct kmem_cache *s = NULL; | ||
55 | |||
56 | #ifdef CONFIG_DEBUG_VM | ||
57 | if (!name || in_interrupt() || size < sizeof(void *) || | ||
58 | size > KMALLOC_MAX_SIZE) { | ||
59 | printk(KERN_ERR "kmem_cache_create(%s) integrity check" | ||
60 | " failed\n", name); | ||
61 | goto out; | ||
62 | } | ||
63 | #endif | ||
64 | |||
65 | get_online_cpus(); | ||
66 | mutex_lock(&slab_mutex); | ||
67 | |||
68 | #ifdef CONFIG_DEBUG_VM | ||
69 | list_for_each_entry(s, &slab_caches, list) { | ||
70 | char tmp; | ||
71 | int res; | ||
72 | |||
73 | /* | ||
74 | * This happens when the module gets unloaded and doesn't | ||
75 | * destroy its slab cache and no-one else reuses the vmalloc | ||
76 | * area of the module. Print a warning. | ||
77 | */ | ||
78 | res = probe_kernel_address(s->name, tmp); | ||
79 | if (res) { | ||
80 | printk(KERN_ERR | ||
81 | "Slab cache with size %d has lost its name\n", | ||
82 | s->object_size); | ||
83 | continue; | ||
84 | } | ||
85 | |||
86 | if (!strcmp(s->name, name)) { | ||
87 | printk(KERN_ERR "kmem_cache_create(%s): Cache name" | ||
88 | " already exists.\n", | ||
89 | name); | ||
90 | dump_stack(); | ||
91 | s = NULL; | ||
92 | goto oops; | ||
93 | } | ||
94 | } | ||
95 | |||
96 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
97 | #endif | ||
98 | |||
99 | s = __kmem_cache_create(name, size, align, flags, ctor); | ||
100 | |||
101 | #ifdef CONFIG_DEBUG_VM | ||
102 | oops: | ||
103 | #endif | ||
104 | mutex_unlock(&slab_mutex); | ||
105 | put_online_cpus(); | ||
106 | |||
107 | #ifdef CONFIG_DEBUG_VM | ||
108 | out: | ||
109 | #endif | ||
110 | if (!s && (flags & SLAB_PANIC)) | ||
111 | panic("kmem_cache_create: Failed to create slab '%s'\n", name); | ||
112 | |||
113 | return s; | ||
114 | } | ||
115 | EXPORT_SYMBOL(kmem_cache_create); | ||
116 | |||
117 | int slab_is_available(void) | ||
118 | { | ||
119 | return slab_state >= UP; | ||
120 | } | ||
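With kmem_cache_create() now living in slab_common.c, the allocator-specific files only need to provide __kmem_cache_create(). The public API seen by callers is unchanged; a hedged sketch of typical use from a hypothetical module (struct foo, "foo_cache", foo_init and foo_exit are invented names for illustration):

	#include <linux/errno.h>
	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/slab.h>

	/* Hypothetical object type and cache, for illustration only. */
	struct foo { int a, b; };
	static struct kmem_cache *foo_cachep;

	static int __init foo_init(void)
	{
		struct foo *f;

		foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
					       0, SLAB_HWCACHE_ALIGN, NULL);
		if (!foo_cachep)
			return -ENOMEM;

		/* Objects then come from kmem_cache_alloc()/kmem_cache_free(). */
		f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
		if (f)
			kmem_cache_free(foo_cachep, f);
		return 0;
	}

	static void __exit foo_exit(void)
	{
		kmem_cache_destroy(foo_cachep);
	}

	module_init(foo_init);
	module_exit(foo_exit);
	MODULE_LICENSE("GPL");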
@@ -59,6 +59,8 @@ | |||
59 | 59 | ||
60 | #include <linux/kernel.h> | 60 | #include <linux/kernel.h> |
61 | #include <linux/slab.h> | 61 | #include <linux/slab.h> |
62 | #include "slab.h" | ||
63 | |||
62 | #include <linux/mm.h> | 64 | #include <linux/mm.h> |
63 | #include <linux/swap.h> /* struct reclaim_state */ | 65 | #include <linux/swap.h> /* struct reclaim_state */ |
64 | #include <linux/cache.h> | 66 | #include <linux/cache.h> |
@@ -92,36 +94,6 @@ struct slob_block { | |||
92 | typedef struct slob_block slob_t; | 94 | typedef struct slob_block slob_t; |
93 | 95 | ||
94 | /* | 96 | /* |
95 | * We use struct page fields to manage some slob allocation aspects, | ||
96 | * however to avoid the horrible mess in include/linux/mm_types.h, we'll | ||
97 | * just define our own struct page type variant here. | ||
98 | */ | ||
99 | struct slob_page { | ||
100 | union { | ||
101 | struct { | ||
102 | unsigned long flags; /* mandatory */ | ||
103 | atomic_t _count; /* mandatory */ | ||
104 | slobidx_t units; /* free units left in page */ | ||
105 | unsigned long pad[2]; | ||
106 | slob_t *free; /* first free slob_t in page */ | ||
107 | struct list_head list; /* linked list of free pages */ | ||
108 | }; | ||
109 | struct page page; | ||
110 | }; | ||
111 | }; | ||
112 | static inline void struct_slob_page_wrong_size(void) | ||
113 | { BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } | ||
114 | |||
115 | /* | ||
116 | * free_slob_page: call before a slob_page is returned to the page allocator. | ||
117 | */ | ||
118 | static inline void free_slob_page(struct slob_page *sp) | ||
119 | { | ||
120 | reset_page_mapcount(&sp->page); | ||
121 | sp->page.mapping = NULL; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * All partially free slob pages go on these lists. | 97 | * All partially free slob pages go on these lists. |
126 | */ | 98 | */ |
127 | #define SLOB_BREAK1 256 | 99 | #define SLOB_BREAK1 256 |
@@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium); | |||
131 | static LIST_HEAD(free_slob_large); | 103 | static LIST_HEAD(free_slob_large); |
132 | 104 | ||
133 | /* | 105 | /* |
134 | * is_slob_page: True for all slob pages (false for bigblock pages) | ||
135 | */ | ||
136 | static inline int is_slob_page(struct slob_page *sp) | ||
137 | { | ||
138 | return PageSlab((struct page *)sp); | ||
139 | } | ||
140 | |||
141 | static inline void set_slob_page(struct slob_page *sp) | ||
142 | { | ||
143 | __SetPageSlab((struct page *)sp); | ||
144 | } | ||
145 | |||
146 | static inline void clear_slob_page(struct slob_page *sp) | ||
147 | { | ||
148 | __ClearPageSlab((struct page *)sp); | ||
149 | } | ||
150 | |||
151 | static inline struct slob_page *slob_page(const void *addr) | ||
152 | { | ||
153 | return (struct slob_page *)virt_to_page(addr); | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * slob_page_free: true for pages on free_slob_pages list. | 106 | * slob_page_free: true for pages on free_slob_pages list. |
158 | */ | 107 | */ |
159 | static inline int slob_page_free(struct slob_page *sp) | 108 | static inline int slob_page_free(struct page *sp) |
160 | { | 109 | { |
161 | return PageSlobFree((struct page *)sp); | 110 | return PageSlobFree(sp); |
162 | } | 111 | } |
163 | 112 | ||
164 | static void set_slob_page_free(struct slob_page *sp, struct list_head *list) | 113 | static void set_slob_page_free(struct page *sp, struct list_head *list) |
165 | { | 114 | { |
166 | list_add(&sp->list, list); | 115 | list_add(&sp->list, list); |
167 | __SetPageSlobFree((struct page *)sp); | 116 | __SetPageSlobFree(sp); |
168 | } | 117 | } |
169 | 118 | ||
170 | static inline void clear_slob_page_free(struct slob_page *sp) | 119 | static inline void clear_slob_page_free(struct page *sp) |
171 | { | 120 | { |
172 | list_del(&sp->list); | 121 | list_del(&sp->list); |
173 | __ClearPageSlobFree((struct page *)sp); | 122 | __ClearPageSlobFree(sp); |
174 | } | 123 | } |
175 | 124 | ||
176 | #define SLOB_UNIT sizeof(slob_t) | 125 | #define SLOB_UNIT sizeof(slob_t) |
@@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order) | |||
267 | /* | 216 | /* |
268 | * Allocate a slob block within a given slob_page sp. | 217 | * Allocate a slob block within a given slob_page sp. |
269 | */ | 218 | */ |
270 | static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | 219 | static void *slob_page_alloc(struct page *sp, size_t size, int align) |
271 | { | 220 | { |
272 | slob_t *prev, *cur, *aligned = NULL; | 221 | slob_t *prev, *cur, *aligned = NULL; |
273 | int delta = 0, units = SLOB_UNITS(size); | 222 | int delta = 0, units = SLOB_UNITS(size); |
274 | 223 | ||
275 | for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { | 224 | for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { |
276 | slobidx_t avail = slob_units(cur); | 225 | slobidx_t avail = slob_units(cur); |
277 | 226 | ||
278 | if (align) { | 227 | if (align) { |
@@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | |||
296 | if (prev) | 245 | if (prev) |
297 | set_slob(prev, slob_units(prev), next); | 246 | set_slob(prev, slob_units(prev), next); |
298 | else | 247 | else |
299 | sp->free = next; | 248 | sp->freelist = next; |
300 | } else { /* fragment */ | 249 | } else { /* fragment */ |
301 | if (prev) | 250 | if (prev) |
302 | set_slob(prev, slob_units(prev), cur + units); | 251 | set_slob(prev, slob_units(prev), cur + units); |
303 | else | 252 | else |
304 | sp->free = cur + units; | 253 | sp->freelist = cur + units; |
305 | set_slob(cur + units, avail - units, next); | 254 | set_slob(cur + units, avail - units, next); |
306 | } | 255 | } |
307 | 256 | ||
@@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | |||
320 | */ | 269 | */ |
321 | static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | 270 | static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) |
322 | { | 271 | { |
323 | struct slob_page *sp; | 272 | struct page *sp; |
324 | struct list_head *prev; | 273 | struct list_head *prev; |
325 | struct list_head *slob_list; | 274 | struct list_head *slob_list; |
326 | slob_t *b = NULL; | 275 | slob_t *b = NULL; |
@@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
341 | * If there's a node specification, search for a partial | 290 | * If there's a node specification, search for a partial |
342 | * page with a matching node id in the freelist. | 291 | * page with a matching node id in the freelist. |
343 | */ | 292 | */ |
344 | if (node != -1 && page_to_nid(&sp->page) != node) | 293 | if (node != -1 && page_to_nid(sp) != node) |
345 | continue; | 294 | continue; |
346 | #endif | 295 | #endif |
347 | /* Enough room on this page? */ | 296 | /* Enough room on this page? */ |
@@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
369 | b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); | 318 | b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); |
370 | if (!b) | 319 | if (!b) |
371 | return NULL; | 320 | return NULL; |
372 | sp = slob_page(b); | 321 | sp = virt_to_page(b); |
373 | set_slob_page(sp); | 322 | __SetPageSlab(sp); |
374 | 323 | ||
375 | spin_lock_irqsave(&slob_lock, flags); | 324 | spin_lock_irqsave(&slob_lock, flags); |
376 | sp->units = SLOB_UNITS(PAGE_SIZE); | 325 | sp->units = SLOB_UNITS(PAGE_SIZE); |
377 | sp->free = b; | 326 | sp->freelist = b; |
378 | INIT_LIST_HEAD(&sp->list); | 327 | INIT_LIST_HEAD(&sp->list); |
379 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); | 328 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); |
380 | set_slob_page_free(sp, slob_list); | 329 | set_slob_page_free(sp, slob_list); |
@@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
392 | */ | 341 | */ |
393 | static void slob_free(void *block, int size) | 342 | static void slob_free(void *block, int size) |
394 | { | 343 | { |
395 | struct slob_page *sp; | 344 | struct page *sp; |
396 | slob_t *prev, *next, *b = (slob_t *)block; | 345 | slob_t *prev, *next, *b = (slob_t *)block; |
397 | slobidx_t units; | 346 | slobidx_t units; |
398 | unsigned long flags; | 347 | unsigned long flags; |
@@ -402,7 +351,7 @@ static void slob_free(void *block, int size) | |||
402 | return; | 351 | return; |
403 | BUG_ON(!size); | 352 | BUG_ON(!size); |
404 | 353 | ||
405 | sp = slob_page(block); | 354 | sp = virt_to_page(block); |
406 | units = SLOB_UNITS(size); | 355 | units = SLOB_UNITS(size); |
407 | 356 | ||
408 | spin_lock_irqsave(&slob_lock, flags); | 357 | spin_lock_irqsave(&slob_lock, flags); |
@@ -412,8 +361,8 @@ static void slob_free(void *block, int size) | |||
412 | if (slob_page_free(sp)) | 361 | if (slob_page_free(sp)) |
413 | clear_slob_page_free(sp); | 362 | clear_slob_page_free(sp); |
414 | spin_unlock_irqrestore(&slob_lock, flags); | 363 | spin_unlock_irqrestore(&slob_lock, flags); |
415 | clear_slob_page(sp); | 364 | __ClearPageSlab(sp); |
416 | free_slob_page(sp); | 365 | reset_page_mapcount(sp); |
417 | slob_free_pages(b, 0); | 366 | slob_free_pages(b, 0); |
418 | return; | 367 | return; |
419 | } | 368 | } |
@@ -421,7 +370,7 @@ static void slob_free(void *block, int size) | |||
421 | if (!slob_page_free(sp)) { | 370 | if (!slob_page_free(sp)) { |
422 | /* This slob page is about to become partially free. Easy! */ | 371 | /* This slob page is about to become partially free. Easy! */ |
423 | sp->units = units; | 372 | sp->units = units; |
424 | sp->free = b; | 373 | sp->freelist = b; |
425 | set_slob(b, units, | 374 | set_slob(b, units, |
426 | (void *)((unsigned long)(b + | 375 | (void *)((unsigned long)(b + |
427 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); | 376 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); |
@@ -441,15 +390,15 @@ static void slob_free(void *block, int size) | |||
441 | */ | 390 | */ |
442 | sp->units += units; | 391 | sp->units += units; |
443 | 392 | ||
444 | if (b < sp->free) { | 393 | if (b < (slob_t *)sp->freelist) { |
445 | if (b + units == sp->free) { | 394 | if (b + units == sp->freelist) { |
446 | units += slob_units(sp->free); | 395 | units += slob_units(sp->freelist); |
447 | sp->free = slob_next(sp->free); | 396 | sp->freelist = slob_next(sp->freelist); |
448 | } | 397 | } |
449 | set_slob(b, units, sp->free); | 398 | set_slob(b, units, sp->freelist); |
450 | sp->free = b; | 399 | sp->freelist = b; |
451 | } else { | 400 | } else { |
452 | prev = sp->free; | 401 | prev = sp->freelist; |
453 | next = slob_next(prev); | 402 | next = slob_next(prev); |
454 | while (b > next) { | 403 | while (b > next) { |
455 | prev = next; | 404 | prev = next; |
@@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node); | |||
522 | 471 | ||
523 | void kfree(const void *block) | 472 | void kfree(const void *block) |
524 | { | 473 | { |
525 | struct slob_page *sp; | 474 | struct page *sp; |
526 | 475 | ||
527 | trace_kfree(_RET_IP_, block); | 476 | trace_kfree(_RET_IP_, block); |
528 | 477 | ||
@@ -530,43 +479,36 @@ void kfree(const void *block) | |||
530 | return; | 479 | return; |
531 | kmemleak_free(block); | 480 | kmemleak_free(block); |
532 | 481 | ||
533 | sp = slob_page(block); | 482 | sp = virt_to_page(block); |
534 | if (is_slob_page(sp)) { | 483 | if (PageSlab(sp)) { |
535 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 484 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
536 | unsigned int *m = (unsigned int *)(block - align); | 485 | unsigned int *m = (unsigned int *)(block - align); |
537 | slob_free(m, *m + align); | 486 | slob_free(m, *m + align); |
538 | } else | 487 | } else |
539 | put_page(&sp->page); | 488 | put_page(sp); |
540 | } | 489 | } |
541 | EXPORT_SYMBOL(kfree); | 490 | EXPORT_SYMBOL(kfree); |
542 | 491 | ||
543 | /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ | 492 | /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ |
544 | size_t ksize(const void *block) | 493 | size_t ksize(const void *block) |
545 | { | 494 | { |
546 | struct slob_page *sp; | 495 | struct page *sp; |
547 | 496 | ||
548 | BUG_ON(!block); | 497 | BUG_ON(!block); |
549 | if (unlikely(block == ZERO_SIZE_PTR)) | 498 | if (unlikely(block == ZERO_SIZE_PTR)) |
550 | return 0; | 499 | return 0; |
551 | 500 | ||
552 | sp = slob_page(block); | 501 | sp = virt_to_page(block); |
553 | if (is_slob_page(sp)) { | 502 | if (PageSlab(sp)) { |
554 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 503 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
555 | unsigned int *m = (unsigned int *)(block - align); | 504 | unsigned int *m = (unsigned int *)(block - align); |
556 | return SLOB_UNITS(*m) * SLOB_UNIT; | 505 | return SLOB_UNITS(*m) * SLOB_UNIT; |
557 | } else | 506 | } else |
558 | return sp->page.private; | 507 | return sp->private; |
559 | } | 508 | } |
560 | EXPORT_SYMBOL(ksize); | 509 | EXPORT_SYMBOL(ksize); |
561 | 510 | ||
562 | struct kmem_cache { | 511 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, |
563 | unsigned int size, align; | ||
564 | unsigned long flags; | ||
565 | const char *name; | ||
566 | void (*ctor)(void *); | ||
567 | }; | ||
568 | |||
569 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | ||
570 | size_t align, unsigned long flags, void (*ctor)(void *)) | 512 | size_t align, unsigned long flags, void (*ctor)(void *)) |
571 | { | 513 | { |
572 | struct kmem_cache *c; | 514 | struct kmem_cache *c; |
@@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
589 | c->align = ARCH_SLAB_MINALIGN; | 531 | c->align = ARCH_SLAB_MINALIGN; |
590 | if (c->align < align) | 532 | if (c->align < align) |
591 | c->align = align; | 533 | c->align = align; |
592 | } else if (flags & SLAB_PANIC) | ||
593 | panic("Cannot create slab cache %s\n", name); | ||
594 | 534 | ||
595 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); | 535 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); |
536 | c->refcount = 1; | ||
537 | } | ||
596 | return c; | 538 | return c; |
597 | } | 539 | } |
598 | EXPORT_SYMBOL(kmem_cache_create); | ||
599 | 540 | ||
600 | void kmem_cache_destroy(struct kmem_cache *c) | 541 | void kmem_cache_destroy(struct kmem_cache *c) |
601 | { | 542 | { |
@@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d) | |||
678 | } | 619 | } |
679 | EXPORT_SYMBOL(kmem_cache_shrink); | 620 | EXPORT_SYMBOL(kmem_cache_shrink); |
680 | 621 | ||
681 | static unsigned int slob_ready __read_mostly; | ||
682 | |||
683 | int slab_is_available(void) | ||
684 | { | ||
685 | return slob_ready; | ||
686 | } | ||
687 | |||
688 | void __init kmem_cache_init(void) | 622 | void __init kmem_cache_init(void) |
689 | { | 623 | { |
690 | slob_ready = 1; | 624 | slab_state = UP; |
691 | } | 625 | } |
692 | 626 | ||
693 | void __init kmem_cache_init_late(void) | 627 | void __init kmem_cache_init_late(void) |
694 | { | 628 | { |
695 | /* Nothing to do */ | 629 | slab_state = FULL; |
696 | } | 630 | } |
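slob's kfree() and ksize() above recover the allocation size from a word stored just before the pointer kmalloc() returned (at the minalign offset). A self-contained sketch of that size-prefix trick using plain malloc(); it skips slob's rounding to SLOB_UNIT granularity and the page-flag checks, so it is a model rather than the real path:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define ALIGN_HDR sizeof(size_t)	/* stand-in for the minalign header */

	/* Allocate size bytes, recording the request just before the
	 * returned pointer, the way slob's kmalloc path records *m. */
	static void *prefix_alloc(size_t size)
	{
		unsigned char *m = malloc(ALIGN_HDR + size);

		if (!m)
			return NULL;
		memcpy(m, &size, sizeof(size));
		return m + ALIGN_HDR;
	}

	static size_t prefix_size(const void *p)	/* ksize() analogue */
	{
		size_t size;

		memcpy(&size, (const unsigned char *)p - ALIGN_HDR, sizeof(size));
		return size;
	}

	static void prefix_free(void *p)		/* kfree() analogue */
	{
		free((unsigned char *)p - ALIGN_HDR);
	}

	int main(void)
	{
		void *p = prefix_alloc(100);

		if (p) {
			printf("%zu\n", prefix_size(p));	/* prints 100 */
			prefix_free(p);
		}
		return 0;
	}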
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/bitops.h> | 17 | #include <linux/bitops.h> |
18 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
19 | #include "slab.h" | ||
19 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
20 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
21 | #include <linux/kmemcheck.h> | 22 | #include <linux/kmemcheck.h> |
@@ -33,15 +34,17 @@ | |||
33 | 34 | ||
34 | #include <trace/events/kmem.h> | 35 | #include <trace/events/kmem.h> |
35 | 36 | ||
37 | #include "internal.h" | ||
38 | |||
36 | /* | 39 | /* |
37 | * Lock order: | 40 | * Lock order: |
38 | * 1. slub_lock (Global Semaphore) | 41 | * 1. slab_mutex (Global Mutex) |
39 | * 2. node->list_lock | 42 | * 2. node->list_lock |
40 | * 3. slab_lock(page) (Only on some arches and for debugging) | 43 | * 3. slab_lock(page) (Only on some arches and for debugging) |
41 | * | 44 | * |
42 | * slub_lock | 45 | * slab_mutex |
43 | * | 46 | * |
44 | * The role of the slub_lock is to protect the list of all the slabs | 47 | * The role of the slab_mutex is to protect the list of all the slabs |
45 | * and to synchronize major metadata changes to slab cache structures. | 48 | * and to synchronize major metadata changes to slab cache structures. |
46 | * | 49 | * |
47 | * The slab_lock is only used for debugging and on arches that do not | 50 | * The slab_lock is only used for debugging and on arches that do not |
@@ -182,17 +185,6 @@ static int kmem_size = sizeof(struct kmem_cache); | |||
182 | static struct notifier_block slab_notifier; | 185 | static struct notifier_block slab_notifier; |
183 | #endif | 186 | #endif |
184 | 187 | ||
185 | static enum { | ||
186 | DOWN, /* No slab functionality available */ | ||
187 | PARTIAL, /* Kmem_cache_node works */ | ||
188 | UP, /* Everything works but does not show up in sysfs */ | ||
189 | SYSFS /* Sysfs up */ | ||
190 | } slab_state = DOWN; | ||
191 | |||
192 | /* A list of all slab caches on the system */ | ||
193 | static DECLARE_RWSEM(slub_lock); | ||
194 | static LIST_HEAD(slab_caches); | ||
195 | |||
196 | /* | 188 | /* |
197 | * Tracking user of a slab. | 189 | * Tracking user of a slab. |
198 | */ | 190 | */ |
@@ -237,11 +229,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) | |||
237 | * Core slab cache functions | 229 | * Core slab cache functions |
238 | *******************************************************************/ | 230 | *******************************************************************/ |
239 | 231 | ||
240 | int slab_is_available(void) | ||
241 | { | ||
242 | return slab_state >= UP; | ||
243 | } | ||
244 | |||
245 | static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | 232 | static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) |
246 | { | 233 | { |
247 | return s->node[node]; | 234 | return s->node[node]; |
@@ -311,7 +298,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) | |||
311 | * and whatever may come after it. | 298 | * and whatever may come after it. |
312 | */ | 299 | */ |
313 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) | 300 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) |
314 | return s->objsize; | 301 | return s->object_size; |
315 | 302 | ||
316 | #endif | 303 | #endif |
317 | /* | 304 | /* |
@@ -609,11 +596,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
609 | if (p > addr + 16) | 596 | if (p > addr + 16) |
610 | print_section("Bytes b4 ", p - 16, 16); | 597 | print_section("Bytes b4 ", p - 16, 16); |
611 | 598 | ||
612 | print_section("Object ", p, min_t(unsigned long, s->objsize, | 599 | print_section("Object ", p, min_t(unsigned long, s->object_size, |
613 | PAGE_SIZE)); | 600 | PAGE_SIZE)); |
614 | if (s->flags & SLAB_RED_ZONE) | 601 | if (s->flags & SLAB_RED_ZONE) |
615 | print_section("Redzone ", p + s->objsize, | 602 | print_section("Redzone ", p + s->object_size, |
616 | s->inuse - s->objsize); | 603 | s->inuse - s->object_size); |
617 | 604 | ||
618 | if (s->offset) | 605 | if (s->offset) |
619 | off = s->offset + sizeof(void *); | 606 | off = s->offset + sizeof(void *); |
@@ -655,12 +642,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) | |||
655 | u8 *p = object; | 642 | u8 *p = object; |
656 | 643 | ||
657 | if (s->flags & __OBJECT_POISON) { | 644 | if (s->flags & __OBJECT_POISON) { |
658 | memset(p, POISON_FREE, s->objsize - 1); | 645 | memset(p, POISON_FREE, s->object_size - 1); |
659 | p[s->objsize - 1] = POISON_END; | 646 | p[s->object_size - 1] = POISON_END; |
660 | } | 647 | } |
661 | 648 | ||
662 | if (s->flags & SLAB_RED_ZONE) | 649 | if (s->flags & SLAB_RED_ZONE) |
663 | memset(p + s->objsize, val, s->inuse - s->objsize); | 650 | memset(p + s->object_size, val, s->inuse - s->object_size); |
664 | } | 651 | } |
665 | 652 | ||
666 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, | 653 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, |
@@ -705,10 +692,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
705 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is | 692 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is |
706 | * 0xa5 (POISON_END) | 693 | * 0xa5 (POISON_END) |
707 | * | 694 | * |
708 | * object + s->objsize | 695 | * object + s->object_size |
709 | * Padding to reach word boundary. This is also used for Redzoning. | 696 | * Padding to reach word boundary. This is also used for Redzoning. |
710 | * Padding is extended by another word if Redzoning is enabled and | 697 | * Padding is extended by another word if Redzoning is enabled and |
711 | * objsize == inuse. | 698 | * object_size == inuse. |
712 | * | 699 | * |
713 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with | 700 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with |
714 | * 0xcc (RED_ACTIVE) for objects in use. | 701 | * 0xcc (RED_ACTIVE) for objects in use. |
@@ -727,7 +714,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
727 | * object + s->size | 714 | * object + s->size |
728 | * Nothing is used beyond s->size. | 715 | * Nothing is used beyond s->size. |
729 | * | 716 | * |
730 | * If slabcaches are merged then the objsize and inuse boundaries are mostly | 717 | * If slabcaches are merged then the object_size and inuse boundaries are mostly |
731 | * ignored. And therefore no slab options that rely on these boundaries | 718 | * ignored. And therefore no slab options that rely on these boundaries |
732 | * may be used with merged slabcaches. | 719 | * may be used with merged slabcaches. |
733 | */ | 720 | */ |
@@ -787,25 +774,25 @@ static int check_object(struct kmem_cache *s, struct page *page, | |||
787 | void *object, u8 val) | 774 | void *object, u8 val) |
788 | { | 775 | { |
789 | u8 *p = object; | 776 | u8 *p = object; |
790 | u8 *endobject = object + s->objsize; | 777 | u8 *endobject = object + s->object_size; |
791 | 778 | ||
792 | if (s->flags & SLAB_RED_ZONE) { | 779 | if (s->flags & SLAB_RED_ZONE) { |
793 | if (!check_bytes_and_report(s, page, object, "Redzone", | 780 | if (!check_bytes_and_report(s, page, object, "Redzone", |
794 | endobject, val, s->inuse - s->objsize)) | 781 | endobject, val, s->inuse - s->object_size)) |
795 | return 0; | 782 | return 0; |
796 | } else { | 783 | } else { |
797 | if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { | 784 | if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { |
798 | check_bytes_and_report(s, page, p, "Alignment padding", | 785 | check_bytes_and_report(s, page, p, "Alignment padding", |
799 | endobject, POISON_INUSE, s->inuse - s->objsize); | 786 | endobject, POISON_INUSE, s->inuse - s->object_size); |
800 | } | 787 | } |
801 | } | 788 | } |
802 | 789 | ||
803 | if (s->flags & SLAB_POISON) { | 790 | if (s->flags & SLAB_POISON) { |
804 | if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && | 791 | if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && |
805 | (!check_bytes_and_report(s, page, p, "Poison", p, | 792 | (!check_bytes_and_report(s, page, p, "Poison", p, |
806 | POISON_FREE, s->objsize - 1) || | 793 | POISON_FREE, s->object_size - 1) || |
807 | !check_bytes_and_report(s, page, p, "Poison", | 794 | !check_bytes_and_report(s, page, p, "Poison", |
808 | p + s->objsize - 1, POISON_END, 1))) | 795 | p + s->object_size - 1, POISON_END, 1))) |
809 | return 0; | 796 | return 0; |
810 | /* | 797 | /* |
811 | * check_pad_bytes cleans up on its own. | 798 | * check_pad_bytes cleans up on its own. |
@@ -926,7 +913,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, | |||
926 | page->freelist); | 913 | page->freelist); |
927 | 914 | ||
928 | if (!alloc) | 915 | if (!alloc) |
929 | print_section("Object ", (void *)object, s->objsize); | 916 | print_section("Object ", (void *)object, s->object_size); |
930 | 917 | ||
931 | dump_stack(); | 918 | dump_stack(); |
932 | } | 919 | } |
@@ -942,14 +929,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) | |||
942 | lockdep_trace_alloc(flags); | 929 | lockdep_trace_alloc(flags); |
943 | might_sleep_if(flags & __GFP_WAIT); | 930 | might_sleep_if(flags & __GFP_WAIT); |
944 | 931 | ||
945 | return should_failslab(s->objsize, flags, s->flags); | 932 | return should_failslab(s->object_size, flags, s->flags); |
946 | } | 933 | } |
947 | 934 | ||
948 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) | 935 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) |
949 | { | 936 | { |
950 | flags &= gfp_allowed_mask; | 937 | flags &= gfp_allowed_mask; |
951 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 938 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
952 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); | 939 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); |
953 | } | 940 | } |
954 | 941 | ||
955 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 942 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
@@ -966,13 +953,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
966 | unsigned long flags; | 953 | unsigned long flags; |
967 | 954 | ||
968 | local_irq_save(flags); | 955 | local_irq_save(flags); |
969 | kmemcheck_slab_free(s, x, s->objsize); | 956 | kmemcheck_slab_free(s, x, s->object_size); |
970 | debug_check_no_locks_freed(x, s->objsize); | 957 | debug_check_no_locks_freed(x, s->object_size); |
971 | local_irq_restore(flags); | 958 | local_irq_restore(flags); |
972 | } | 959 | } |
973 | #endif | 960 | #endif |
974 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 961 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
975 | debug_check_no_obj_freed(x, s->objsize); | 962 | debug_check_no_obj_freed(x, s->object_size); |
976 | } | 963 | } |
977 | 964 | ||
978 | /* | 965 | /* |
@@ -1207,7 +1194,7 @@ out: | |||
1207 | 1194 | ||
1208 | __setup("slub_debug", setup_slub_debug); | 1195 | __setup("slub_debug", setup_slub_debug); |
1209 | 1196 | ||
1210 | static unsigned long kmem_cache_flags(unsigned long objsize, | 1197 | static unsigned long kmem_cache_flags(unsigned long object_size, |
1211 | unsigned long flags, const char *name, | 1198 | unsigned long flags, const char *name, |
1212 | void (*ctor)(void *)) | 1199 | void (*ctor)(void *)) |
1213 | { | 1200 | { |
@@ -1237,7 +1224,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
1237 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, | 1224 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1238 | struct page *page) {} | 1225 | struct page *page) {} |
1239 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | 1226 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} |
1240 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1227 | static inline unsigned long kmem_cache_flags(unsigned long object_size, |
1241 | unsigned long flags, const char *name, | 1228 | unsigned long flags, const char *name, |
1242 | void (*ctor)(void *)) | 1229 | void (*ctor)(void *)) |
1243 | { | 1230 | { |
@@ -1314,13 +1301,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1314 | stat(s, ORDER_FALLBACK); | 1301 | stat(s, ORDER_FALLBACK); |
1315 | } | 1302 | } |
1316 | 1303 | ||
1317 | if (flags & __GFP_WAIT) | 1304 | if (kmemcheck_enabled && page |
1318 | local_irq_disable(); | ||
1319 | |||
1320 | if (!page) | ||
1321 | return NULL; | ||
1322 | |||
1323 | if (kmemcheck_enabled | ||
1324 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | 1305 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1325 | int pages = 1 << oo_order(oo); | 1306 | int pages = 1 << oo_order(oo); |
1326 | 1307 | ||
@@ -1336,6 +1317,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1336 | kmemcheck_mark_unallocated_pages(page, pages); | 1317 | kmemcheck_mark_unallocated_pages(page, pages); |
1337 | } | 1318 | } |
1338 | 1319 | ||
1320 | if (flags & __GFP_WAIT) | ||
1321 | local_irq_disable(); | ||
1322 | if (!page) | ||
1323 | return NULL; | ||
1324 | |||
1339 | page->objects = oo_objects(oo); | 1325 | page->objects = oo_objects(oo); |
1340 | mod_zone_page_state(page_zone(page), | 1326 | mod_zone_page_state(page_zone(page), |
1341 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1327 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
@@ -1370,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1370 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1356 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1371 | page->slab = s; | 1357 | page->slab = s; |
1372 | __SetPageSlab(page); | 1358 | __SetPageSlab(page); |
1359 | if (page->pfmemalloc) | ||
1360 | SetPageSlabPfmemalloc(page); | ||
1373 | 1361 | ||
1374 | start = page_address(page); | 1362 | start = page_address(page); |
1375 | 1363 | ||
@@ -1413,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1413 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1401 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
1414 | -pages); | 1402 | -pages); |
1415 | 1403 | ||
1404 | __ClearPageSlabPfmemalloc(page); | ||
1416 | __ClearPageSlab(page); | 1405 | __ClearPageSlab(page); |
1417 | reset_page_mapcount(page); | 1406 | reset_page_mapcount(page); |
1418 | if (current->reclaim_state) | 1407 | if (current->reclaim_state) |
@@ -1490,12 +1479,12 @@ static inline void remove_partial(struct kmem_cache_node *n, | |||
1490 | } | 1479 | } |
1491 | 1480 | ||
1492 | /* | 1481 | /* |
1493 | * Lock slab, remove from the partial list and put the object into the | 1482 | * Remove slab from the partial list, freeze it and |
1494 | * per cpu freelist. | 1483 | * return the pointer to the freelist. |
1495 | * | 1484 | * |
1496 | * Returns a list of objects or NULL if it fails. | 1485 | * Returns a list of objects or NULL if it fails. |
1497 | * | 1486 | * |
1498 | * Must hold list_lock. | 1487 | * Must hold list_lock since we modify the partial list. |
1499 | */ | 1488 | */ |
1500 | static inline void *acquire_slab(struct kmem_cache *s, | 1489 | static inline void *acquire_slab(struct kmem_cache *s, |
1501 | struct kmem_cache_node *n, struct page *page, | 1490 | struct kmem_cache_node *n, struct page *page, |
@@ -1510,26 +1499,27 @@ static inline void *acquire_slab(struct kmem_cache *s, | |||
1510 | * The old freelist is the list of objects for the | 1499 | * The old freelist is the list of objects for the |
1511 | * per cpu allocation list. | 1500 | * per cpu allocation list. |
1512 | */ | 1501 | */ |
1513 | do { | 1502 | freelist = page->freelist; |
1514 | freelist = page->freelist; | 1503 | counters = page->counters; |
1515 | counters = page->counters; | 1504 | new.counters = counters; |
1516 | new.counters = counters; | 1505 | if (mode) { |
1517 | if (mode) { | 1506 | new.inuse = page->objects; |
1518 | new.inuse = page->objects; | 1507 | new.freelist = NULL; |
1519 | new.freelist = NULL; | 1508 | } else { |
1520 | } else { | 1509 | new.freelist = freelist; |
1521 | new.freelist = freelist; | 1510 | } |
1522 | } | ||
1523 | 1511 | ||
1524 | VM_BUG_ON(new.frozen); | 1512 | VM_BUG_ON(new.frozen); |
1525 | new.frozen = 1; | 1513 | new.frozen = 1; |
1526 | 1514 | ||
1527 | } while (!__cmpxchg_double_slab(s, page, | 1515 | if (!__cmpxchg_double_slab(s, page, |
1528 | freelist, counters, | 1516 | freelist, counters, |
1529 | new.freelist, new.counters, | 1517 | new.freelist, new.counters, |
1530 | "lock and freeze")); | 1518 | "acquire_slab")) |
1519 | return NULL; | ||
1531 | 1520 | ||
1532 | remove_partial(n, page); | 1521 | remove_partial(n, page); |
1522 | WARN_ON(!freelist); | ||
1533 | return freelist; | 1523 | return freelist; |
1534 | } | 1524 | } |
1535 | 1525 | ||
@@ -1563,7 +1553,6 @@ static void *get_partial_node(struct kmem_cache *s, | |||
1563 | 1553 | ||
1564 | if (!object) { | 1554 | if (!object) { |
1565 | c->page = page; | 1555 | c->page = page; |
1566 | c->node = page_to_nid(page); | ||
1567 | stat(s, ALLOC_FROM_PARTIAL); | 1556 | stat(s, ALLOC_FROM_PARTIAL); |
1568 | object = t; | 1557 | object = t; |
1569 | available = page->objects - page->inuse; | 1558 | available = page->objects - page->inuse; |
@@ -1617,7 +1606,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1617 | 1606 | ||
1618 | do { | 1607 | do { |
1619 | cpuset_mems_cookie = get_mems_allowed(); | 1608 | cpuset_mems_cookie = get_mems_allowed(); |
1620 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 1609 | zonelist = node_zonelist(slab_node(), flags); |
1621 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1610 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1622 | struct kmem_cache_node *n; | 1611 | struct kmem_cache_node *n; |
1623 | 1612 | ||
@@ -1731,14 +1720,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s) | |||
1731 | /* | 1720 | /* |
1732 | * Remove the cpu slab | 1721 | * Remove the cpu slab |
1733 | */ | 1722 | */ |
1734 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1723 | static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist) |
1735 | { | 1724 | { |
1736 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; | 1725 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; |
1737 | struct page *page = c->page; | ||
1738 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1726 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1739 | int lock = 0; | 1727 | int lock = 0; |
1740 | enum slab_modes l = M_NONE, m = M_NONE; | 1728 | enum slab_modes l = M_NONE, m = M_NONE; |
1741 | void *freelist; | ||
1742 | void *nextfree; | 1729 | void *nextfree; |
1743 | int tail = DEACTIVATE_TO_HEAD; | 1730 | int tail = DEACTIVATE_TO_HEAD; |
1744 | struct page new; | 1731 | struct page new; |
@@ -1749,11 +1736,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1749 | tail = DEACTIVATE_TO_TAIL; | 1736 | tail = DEACTIVATE_TO_TAIL; |
1750 | } | 1737 | } |
1751 | 1738 | ||
1752 | c->tid = next_tid(c->tid); | ||
1753 | c->page = NULL; | ||
1754 | freelist = c->freelist; | ||
1755 | c->freelist = NULL; | ||
1756 | |||
1757 | /* | 1739 | /* |
1758 | * Stage one: Free all available per cpu objects back | 1740 | * Stage one: Free all available per cpu objects back |
1759 | * to the page freelist while it is still frozen. Leave the | 1741 | * to the page freelist while it is still frozen. Leave the |
@@ -1879,21 +1861,31 @@ redo: | |||
1879 | } | 1861 | } |
1880 | } | 1862 | } |
1881 | 1863 | ||
1882 | /* Unfreeze all the cpu partial slabs */ | 1864 | /* |
1865 | * Unfreeze all the cpu partial slabs. | ||
1866 | * | ||
1867 | * This function must be called with interrupts disabled. | ||
1868 | */ | ||
1883 | static void unfreeze_partials(struct kmem_cache *s) | 1869 | static void unfreeze_partials(struct kmem_cache *s) |
1884 | { | 1870 | { |
1885 | struct kmem_cache_node *n = NULL; | 1871 | struct kmem_cache_node *n = NULL, *n2 = NULL; |
1886 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); | 1872 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); |
1887 | struct page *page, *discard_page = NULL; | 1873 | struct page *page, *discard_page = NULL; |
1888 | 1874 | ||
1889 | while ((page = c->partial)) { | 1875 | while ((page = c->partial)) { |
1890 | enum slab_modes { M_PARTIAL, M_FREE }; | ||
1891 | enum slab_modes l, m; | ||
1892 | struct page new; | 1876 | struct page new; |
1893 | struct page old; | 1877 | struct page old; |
1894 | 1878 | ||
1895 | c->partial = page->next; | 1879 | c->partial = page->next; |
1896 | l = M_FREE; | 1880 | |
1881 | n2 = get_node(s, page_to_nid(page)); | ||
1882 | if (n != n2) { | ||
1883 | if (n) | ||
1884 | spin_unlock(&n->list_lock); | ||
1885 | |||
1886 | n = n2; | ||
1887 | spin_lock(&n->list_lock); | ||
1888 | } | ||
1897 | 1889 | ||
1898 | do { | 1890 | do { |
1899 | 1891 | ||
@@ -1906,43 +1898,17 @@ static void unfreeze_partials(struct kmem_cache *s) | |||
1906 | 1898 | ||
1907 | new.frozen = 0; | 1899 | new.frozen = 0; |
1908 | 1900 | ||
1909 | if (!new.inuse && (!n || n->nr_partial > s->min_partial)) | 1901 | } while (!__cmpxchg_double_slab(s, page, |
1910 | m = M_FREE; | ||
1911 | else { | ||
1912 | struct kmem_cache_node *n2 = get_node(s, | ||
1913 | page_to_nid(page)); | ||
1914 | |||
1915 | m = M_PARTIAL; | ||
1916 | if (n != n2) { | ||
1917 | if (n) | ||
1918 | spin_unlock(&n->list_lock); | ||
1919 | |||
1920 | n = n2; | ||
1921 | spin_lock(&n->list_lock); | ||
1922 | } | ||
1923 | } | ||
1924 | |||
1925 | if (l != m) { | ||
1926 | if (l == M_PARTIAL) { | ||
1927 | remove_partial(n, page); | ||
1928 | stat(s, FREE_REMOVE_PARTIAL); | ||
1929 | } else { | ||
1930 | add_partial(n, page, | ||
1931 | DEACTIVATE_TO_TAIL); | ||
1932 | stat(s, FREE_ADD_PARTIAL); | ||
1933 | } | ||
1934 | |||
1935 | l = m; | ||
1936 | } | ||
1937 | |||
1938 | } while (!cmpxchg_double_slab(s, page, | ||
1939 | old.freelist, old.counters, | 1902 | old.freelist, old.counters, |
1940 | new.freelist, new.counters, | 1903 | new.freelist, new.counters, |
1941 | "unfreezing slab")); | 1904 | "unfreezing slab")); |
1942 | 1905 | ||
1943 | if (m == M_FREE) { | 1906 | if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { |
1944 | page->next = discard_page; | 1907 | page->next = discard_page; |
1945 | discard_page = page; | 1908 | discard_page = page; |
1909 | } else { | ||
1910 | add_partial(n, page, DEACTIVATE_TO_TAIL); | ||
1911 | stat(s, FREE_ADD_PARTIAL); | ||
1946 | } | 1912 | } |
1947 | } | 1913 | } |
1948 | 1914 | ||
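The rewritten unfreeze_partials() takes a node's list_lock only when the node of the next page differs from the one already locked, so consecutive pages from the same node are handled under a single lock acquisition. A hedged userspace sketch of that lock-batching pattern (items, node ids and locks are invented; build with -pthread):

	#include <pthread.h>
	#include <stdio.h>

	#define NR_NODES 2

	/* Invented stand-ins for pages and per-node list_locks. */
	struct item { int node; struct item *next; };
	static pthread_mutex_t node_lock[NR_NODES] = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
	};

	/* Walk a list, re-locking only when the node id changes, the
	 * way unfreeze_partials() swaps n/n2 above. */
	static void drain(struct item *head)
	{
		int locked = -1;

		for (struct item *it = head; it; it = it->next) {
			if (it->node != locked) {
				if (locked >= 0)
					pthread_mutex_unlock(&node_lock[locked]);
				locked = it->node;
				pthread_mutex_lock(&node_lock[locked]);
			}
			printf("item on node %d handled under its lock\n", it->node);
		}
		if (locked >= 0)
			pthread_mutex_unlock(&node_lock[locked]);
	}

	int main(void)
	{
		struct item c = { 1, NULL }, b = { 0, &c }, a = { 0, &b };

		drain(&a);	/* node 0 locked once for a and b, then node 1 for c */
		return 0;
	}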
@@ -2011,7 +1977,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
2011 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1977 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
2012 | { | 1978 | { |
2013 | stat(s, CPUSLAB_FLUSH); | 1979 | stat(s, CPUSLAB_FLUSH); |
2014 | deactivate_slab(s, c); | 1980 | deactivate_slab(s, c->page, c->freelist); |
1981 | |||
1982 | c->tid = next_tid(c->tid); | ||
1983 | c->page = NULL; | ||
1984 | c->freelist = NULL; | ||
2015 | } | 1985 | } |
2016 | 1986 | ||
2017 | /* | 1987 | /* |
@@ -2055,10 +2025,10 @@ static void flush_all(struct kmem_cache *s) | |||
2055 | * Check if the objects in a per cpu structure fit numa | 2025 | * Check if the objects in a per cpu structure fit numa |
2056 | * locality expectations. | 2026 | * locality expectations. |
2057 | */ | 2027 | */ |
2058 | static inline int node_match(struct kmem_cache_cpu *c, int node) | 2028 | static inline int node_match(struct page *page, int node) |
2059 | { | 2029 | { |
2060 | #ifdef CONFIG_NUMA | 2030 | #ifdef CONFIG_NUMA |
2061 | if (node != NUMA_NO_NODE && c->node != node) | 2031 | if (node != NUMA_NO_NODE && page_to_nid(page) != node) |
2062 | return 0; | 2032 | return 0; |
2063 | #endif | 2033 | #endif |
2064 | return 1; | 2034 | return 1; |
@@ -2101,10 +2071,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2101 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 2071 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
2102 | nid, gfpflags); | 2072 | nid, gfpflags); |
2103 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " | 2073 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " |
2104 | "default order: %d, min order: %d\n", s->name, s->objsize, | 2074 | "default order: %d, min order: %d\n", s->name, s->object_size, |
2105 | s->size, oo_order(s->oo), oo_order(s->min)); | 2075 | s->size, oo_order(s->oo), oo_order(s->min)); |
2106 | 2076 | ||
2107 | if (oo_order(s->min) > get_order(s->objsize)) | 2077 | if (oo_order(s->min) > get_order(s->object_size)) |
2108 | printk(KERN_WARNING " %s debugging increased min order, use " | 2078 | printk(KERN_WARNING " %s debugging increased min order, use " |
2109 | "slub_debug=O to disable.\n", s->name); | 2079 | "slub_debug=O to disable.\n", s->name); |
2110 | 2080 | ||
@@ -2130,10 +2100,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2130 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | 2100 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, |
2131 | int node, struct kmem_cache_cpu **pc) | 2101 | int node, struct kmem_cache_cpu **pc) |
2132 | { | 2102 | { |
2133 | void *object; | 2103 | void *freelist; |
2134 | struct kmem_cache_cpu *c; | 2104 | struct kmem_cache_cpu *c = *pc; |
2135 | struct page *page = new_slab(s, flags, node); | 2105 | struct page *page; |
2136 | 2106 | ||
2107 | freelist = get_partial(s, flags, node, c); | ||
2108 | |||
2109 | if (freelist) | ||
2110 | return freelist; | ||
2111 | |||
2112 | page = new_slab(s, flags, node); | ||
2137 | if (page) { | 2113 | if (page) { |
2138 | c = __this_cpu_ptr(s->cpu_slab); | 2114 | c = __this_cpu_ptr(s->cpu_slab); |
2139 | if (c->page) | 2115 | if (c->page) |
@@ -2143,17 +2119,24 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2143 | * No other reference to the page yet so we can | 2119 | * No other reference to the page yet so we can |
2144 | * muck around with it freely without cmpxchg | 2120 | * muck around with it freely without cmpxchg |
2145 | */ | 2121 | */ |
2146 | object = page->freelist; | 2122 | freelist = page->freelist; |
2147 | page->freelist = NULL; | 2123 | page->freelist = NULL; |
2148 | 2124 | ||
2149 | stat(s, ALLOC_SLAB); | 2125 | stat(s, ALLOC_SLAB); |
2150 | c->node = page_to_nid(page); | ||
2151 | c->page = page; | 2126 | c->page = page; |
2152 | *pc = c; | 2127 | *pc = c; |
2153 | } else | 2128 | } else |
2154 | object = NULL; | 2129 | freelist = NULL; |
2155 | 2130 | ||
2156 | return object; | 2131 | return freelist; |
2132 | } | ||
2133 | |||
2134 | static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) | ||
2135 | { | ||
2136 | if (unlikely(PageSlabPfmemalloc(page))) | ||
2137 | return gfp_pfmemalloc_allowed(gfpflags); | ||
2138 | |||
2139 | return true; | ||
2157 | } | 2140 | } |
2158 | 2141 | ||
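The pfmemalloc_match() helper added in the hunk above gates the per-cpu fast path: a slab page that came from the emergency reserves (PageSlabPfmemalloc) may only serve allocations that are themselves entitled to those reserves. A standalone sketch of that decision, with illustrative names that are not part of this patch:

	/*
	 * Illustrative only -- mirrors the intent of pfmemalloc_match().
	 * A slab page taken from the memory reserves may only satisfy
	 * callers that are themselves allowed to dip into the reserves;
	 * any other slab page is always acceptable.
	 */
	static bool may_use_slab_page(bool page_from_reserves,
				      bool caller_may_use_reserves)
	{
		if (page_from_reserves)
			return caller_may_use_reserves;
		return true;
	}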
2159 | /* | 2142 | /* |
@@ -2163,6 +2146,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2163 | * The page is still frozen if the return value is not NULL. | 2146 | * The page is still frozen if the return value is not NULL. |
2164 | * | 2147 | * |
2165 | * If this function returns NULL then the page has been unfrozen. | 2148 | * If this function returns NULL then the page has been unfrozen. |
2149 | * | ||
2150 | * This function must be called with interrupt disabled. | ||
2166 | */ | 2151 | */ |
2167 | static inline void *get_freelist(struct kmem_cache *s, struct page *page) | 2152 | static inline void *get_freelist(struct kmem_cache *s, struct page *page) |
2168 | { | 2153 | { |
@@ -2173,13 +2158,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) | |||
2173 | do { | 2158 | do { |
2174 | freelist = page->freelist; | 2159 | freelist = page->freelist; |
2175 | counters = page->counters; | 2160 | counters = page->counters; |
2161 | |||
2176 | new.counters = counters; | 2162 | new.counters = counters; |
2177 | VM_BUG_ON(!new.frozen); | 2163 | VM_BUG_ON(!new.frozen); |
2178 | 2164 | ||
2179 | new.inuse = page->objects; | 2165 | new.inuse = page->objects; |
2180 | new.frozen = freelist != NULL; | 2166 | new.frozen = freelist != NULL; |
2181 | 2167 | ||
2182 | } while (!cmpxchg_double_slab(s, page, | 2168 | } while (!__cmpxchg_double_slab(s, page, |
2183 | freelist, counters, | 2169 | freelist, counters, |
2184 | NULL, new.counters, | 2170 | NULL, new.counters, |
2185 | "get_freelist")); | 2171 | "get_freelist")); |
@@ -2206,7 +2192,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) | |||
2206 | static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | 2192 | static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, |
2207 | unsigned long addr, struct kmem_cache_cpu *c) | 2193 | unsigned long addr, struct kmem_cache_cpu *c) |
2208 | { | 2194 | { |
2209 | void **object; | 2195 | void *freelist; |
2196 | struct page *page; | ||
2210 | unsigned long flags; | 2197 | unsigned long flags; |
2211 | 2198 | ||
2212 | local_irq_save(flags); | 2199 | local_irq_save(flags); |
@@ -2219,25 +2206,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
2219 | c = this_cpu_ptr(s->cpu_slab); | 2206 | c = this_cpu_ptr(s->cpu_slab); |
2220 | #endif | 2207 | #endif |
2221 | 2208 | ||
2222 | if (!c->page) | 2209 | page = c->page; |
2210 | if (!page) | ||
2223 | goto new_slab; | 2211 | goto new_slab; |
2224 | redo: | 2212 | redo: |
2225 | if (unlikely(!node_match(c, node))) { | 2213 | |
2214 | if (unlikely(!node_match(page, node))) { | ||
2226 | stat(s, ALLOC_NODE_MISMATCH); | 2215 | stat(s, ALLOC_NODE_MISMATCH); |
2227 | deactivate_slab(s, c); | 2216 | deactivate_slab(s, page, c->freelist); |
2217 | c->page = NULL; | ||
2218 | c->freelist = NULL; | ||
2219 | goto new_slab; | ||
2220 | } | ||
2221 | |||
2222 | /* | ||
2223 | * By rights, we should be searching for a slab page that was | ||
2224 | * PFMEMALLOC but right now, we are losing the pfmemalloc | ||
2225 | * information when the page leaves the per-cpu allocator | ||
2226 | */ | ||
2227 | if (unlikely(!pfmemalloc_match(page, gfpflags))) { | ||
2228 | deactivate_slab(s, page, c->freelist); | ||
2229 | c->page = NULL; | ||
2230 | c->freelist = NULL; | ||
2228 | goto new_slab; | 2231 | goto new_slab; |
2229 | } | 2232 | } |
2230 | 2233 | ||
2231 | /* must check again c->freelist in case of cpu migration or IRQ */ | 2234 | /* must check again c->freelist in case of cpu migration or IRQ */ |
2232 | object = c->freelist; | 2235 | freelist = c->freelist; |
2233 | if (object) | 2236 | if (freelist) |
2234 | goto load_freelist; | 2237 | goto load_freelist; |
2235 | 2238 | ||
2236 | stat(s, ALLOC_SLOWPATH); | 2239 | stat(s, ALLOC_SLOWPATH); |
2237 | 2240 | ||
2238 | object = get_freelist(s, c->page); | 2241 | freelist = get_freelist(s, page); |
2239 | 2242 | ||
2240 | if (!object) { | 2243 | if (!freelist) { |
2241 | c->page = NULL; | 2244 | c->page = NULL; |
2242 | stat(s, DEACTIVATE_BYPASS); | 2245 | stat(s, DEACTIVATE_BYPASS); |
2243 | goto new_slab; | 2246 | goto new_slab; |
@@ -2246,50 +2249,50 @@ redo: | |||
2246 | stat(s, ALLOC_REFILL); | 2249 | stat(s, ALLOC_REFILL); |
2247 | 2250 | ||
2248 | load_freelist: | 2251 | load_freelist: |
2249 | c->freelist = get_freepointer(s, object); | 2252 | /* |
2253 | * freelist is pointing to the list of objects to be used. | ||
2254 | * page is pointing to the page from which the objects are obtained. | ||
2255 | * That page must be frozen for per cpu allocations to work. | ||
2256 | */ | ||
2257 | VM_BUG_ON(!c->page->frozen); | ||
2258 | c->freelist = get_freepointer(s, freelist); | ||
2250 | c->tid = next_tid(c->tid); | 2259 | c->tid = next_tid(c->tid); |
2251 | local_irq_restore(flags); | 2260 | local_irq_restore(flags); |
2252 | return object; | 2261 | return freelist; |
2253 | 2262 | ||
2254 | new_slab: | 2263 | new_slab: |
2255 | 2264 | ||
2256 | if (c->partial) { | 2265 | if (c->partial) { |
2257 | c->page = c->partial; | 2266 | page = c->page = c->partial; |
2258 | c->partial = c->page->next; | 2267 | c->partial = page->next; |
2259 | c->node = page_to_nid(c->page); | ||
2260 | stat(s, CPU_PARTIAL_ALLOC); | 2268 | stat(s, CPU_PARTIAL_ALLOC); |
2261 | c->freelist = NULL; | 2269 | c->freelist = NULL; |
2262 | goto redo; | 2270 | goto redo; |
2263 | } | 2271 | } |
2264 | 2272 | ||
2265 | /* Then do expensive stuff like retrieving pages from the partial lists */ | 2273 | freelist = new_slab_objects(s, gfpflags, node, &c); |
2266 | object = get_partial(s, gfpflags, node, c); | ||
2267 | |||
2268 | if (unlikely(!object)) { | ||
2269 | 2274 | ||
2270 | object = new_slab_objects(s, gfpflags, node, &c); | 2275 | if (unlikely(!freelist)) { |
2276 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | ||
2277 | slab_out_of_memory(s, gfpflags, node); | ||
2271 | 2278 | ||
2272 | if (unlikely(!object)) { | 2279 | local_irq_restore(flags); |
2273 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 2280 | return NULL; |
2274 | slab_out_of_memory(s, gfpflags, node); | ||
2275 | |||
2276 | local_irq_restore(flags); | ||
2277 | return NULL; | ||
2278 | } | ||
2279 | } | 2281 | } |
2280 | 2282 | ||
2281 | if (likely(!kmem_cache_debug(s))) | 2283 | page = c->page; |
2284 | if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) | ||
2282 | goto load_freelist; | 2285 | goto load_freelist; |
2283 | 2286 | ||
2284 | /* Only entered in the debug case */ | 2287 | /* Only entered in the debug case */ |
2285 | if (!alloc_debug_processing(s, c->page, object, addr)) | 2288 | if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) |
2286 | goto new_slab; /* Slab failed checks. Next slab needed */ | 2289 | goto new_slab; /* Slab failed checks. Next slab needed */ |
2287 | 2290 | ||
2288 | c->freelist = get_freepointer(s, object); | 2291 | deactivate_slab(s, page, get_freepointer(s, freelist)); |
2289 | deactivate_slab(s, c); | 2292 | c->page = NULL; |
2290 | c->node = NUMA_NO_NODE; | 2293 | c->freelist = NULL; |
2291 | local_irq_restore(flags); | 2294 | local_irq_restore(flags); |
2292 | return object; | 2295 | return freelist; |
2293 | } | 2296 | } |
2294 | 2297 | ||
2295 | /* | 2298 | /* |
@@ -2307,6 +2310,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
2307 | { | 2310 | { |
2308 | void **object; | 2311 | void **object; |
2309 | struct kmem_cache_cpu *c; | 2312 | struct kmem_cache_cpu *c; |
2313 | struct page *page; | ||
2310 | unsigned long tid; | 2314 | unsigned long tid; |
2311 | 2315 | ||
2312 | if (slab_pre_alloc_hook(s, gfpflags)) | 2316 | if (slab_pre_alloc_hook(s, gfpflags)) |
@@ -2332,8 +2336,8 @@ redo: | |||
2332 | barrier(); | 2336 | barrier(); |
2333 | 2337 | ||
2334 | object = c->freelist; | 2338 | object = c->freelist; |
2335 | if (unlikely(!object || !node_match(c, node))) | 2339 | page = c->page; |
2336 | 2340 | if (unlikely(!object || !node_match(page, node))) | |
2337 | object = __slab_alloc(s, gfpflags, node, addr, c); | 2341 | object = __slab_alloc(s, gfpflags, node, addr, c); |
2338 | 2342 | ||
2339 | else { | 2343 | else { |
@@ -2364,7 +2368,7 @@ redo: | |||
2364 | } | 2368 | } |
2365 | 2369 | ||
2366 | if (unlikely(gfpflags & __GFP_ZERO) && object) | 2370 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
2367 | memset(object, 0, s->objsize); | 2371 | memset(object, 0, s->object_size); |
2368 | 2372 | ||
2369 | slab_post_alloc_hook(s, gfpflags, object); | 2373 | slab_post_alloc_hook(s, gfpflags, object); |
2370 | 2374 | ||
@@ -2375,7 +2379,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
2375 | { | 2379 | { |
2376 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 2380 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); |
2377 | 2381 | ||
2378 | trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); | 2382 | trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); |
2379 | 2383 | ||
2380 | return ret; | 2384 | return ret; |
2381 | } | 2385 | } |
@@ -2405,7 +2409,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
2405 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); | 2409 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); |
2406 | 2410 | ||
2407 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 2411 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
2408 | s->objsize, s->size, gfpflags, node); | 2412 | s->object_size, s->size, gfpflags, node); |
2409 | 2413 | ||
2410 | return ret; | 2414 | return ret; |
2411 | } | 2415 | } |
@@ -2900,7 +2904,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) | |||
2900 | static int calculate_sizes(struct kmem_cache *s, int forced_order) | 2904 | static int calculate_sizes(struct kmem_cache *s, int forced_order) |
2901 | { | 2905 | { |
2902 | unsigned long flags = s->flags; | 2906 | unsigned long flags = s->flags; |
2903 | unsigned long size = s->objsize; | 2907 | unsigned long size = s->object_size; |
2904 | unsigned long align = s->align; | 2908 | unsigned long align = s->align; |
2905 | int order; | 2909 | int order; |
2906 | 2910 | ||
@@ -2929,7 +2933,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2929 | * end of the object and the free pointer. If not then add an | 2933 | * end of the object and the free pointer. If not then add an |
2930 | * additional word to have some bytes to store Redzone information. | 2934 | * additional word to have some bytes to store Redzone information. |
2931 | */ | 2935 | */ |
2932 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) | 2936 | if ((flags & SLAB_RED_ZONE) && size == s->object_size) |
2933 | size += sizeof(void *); | 2937 | size += sizeof(void *); |
2934 | #endif | 2938 | #endif |
2935 | 2939 | ||
@@ -2977,7 +2981,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2977 | * user specified and the dynamic determination of cache line size | 2981 | * user specified and the dynamic determination of cache line size |
2978 | * on bootup. | 2982 | * on bootup. |
2979 | */ | 2983 | */ |
2980 | align = calculate_alignment(flags, align, s->objsize); | 2984 | align = calculate_alignment(flags, align, s->object_size); |
2981 | s->align = align; | 2985 | s->align = align; |
2982 | 2986 | ||
2983 | /* | 2987 | /* |
@@ -3025,7 +3029,7 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3025 | memset(s, 0, kmem_size); | 3029 | memset(s, 0, kmem_size); |
3026 | s->name = name; | 3030 | s->name = name; |
3027 | s->ctor = ctor; | 3031 | s->ctor = ctor; |
3028 | s->objsize = size; | 3032 | s->object_size = size; |
3029 | s->align = align; | 3033 | s->align = align; |
3030 | s->flags = kmem_cache_flags(size, flags, name, ctor); | 3034 | s->flags = kmem_cache_flags(size, flags, name, ctor); |
3031 | s->reserved = 0; | 3035 | s->reserved = 0; |
@@ -3040,7 +3044,7 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3040 | * Disable debugging flags that store metadata if the min slab | 3044 | * Disable debugging flags that store metadata if the min slab |
3041 | * order increased. | 3045 | * order increased. |
3042 | */ | 3046 | */ |
3043 | if (get_order(s->size) > get_order(s->objsize)) { | 3047 | if (get_order(s->size) > get_order(s->object_size)) { |
3044 | s->flags &= ~DEBUG_METADATA_FLAGS; | 3048 | s->flags &= ~DEBUG_METADATA_FLAGS; |
3045 | s->offset = 0; | 3049 | s->offset = 0; |
3046 | if (!calculate_sizes(s, -1)) | 3050 | if (!calculate_sizes(s, -1)) |
@@ -3114,7 +3118,7 @@ error: | |||
3114 | */ | 3118 | */ |
3115 | unsigned int kmem_cache_size(struct kmem_cache *s) | 3119 | unsigned int kmem_cache_size(struct kmem_cache *s) |
3116 | { | 3120 | { |
3117 | return s->objsize; | 3121 | return s->object_size; |
3118 | } | 3122 | } |
3119 | EXPORT_SYMBOL(kmem_cache_size); | 3123 | EXPORT_SYMBOL(kmem_cache_size); |
3120 | 3124 | ||
@@ -3192,11 +3196,11 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
3192 | */ | 3196 | */ |
3193 | void kmem_cache_destroy(struct kmem_cache *s) | 3197 | void kmem_cache_destroy(struct kmem_cache *s) |
3194 | { | 3198 | { |
3195 | down_write(&slub_lock); | 3199 | mutex_lock(&slab_mutex); |
3196 | s->refcount--; | 3200 | s->refcount--; |
3197 | if (!s->refcount) { | 3201 | if (!s->refcount) { |
3198 | list_del(&s->list); | 3202 | list_del(&s->list); |
3199 | up_write(&slub_lock); | 3203 | mutex_unlock(&slab_mutex); |
3200 | if (kmem_cache_close(s)) { | 3204 | if (kmem_cache_close(s)) { |
3201 | printk(KERN_ERR "SLUB %s: %s called for cache that " | 3205 | printk(KERN_ERR "SLUB %s: %s called for cache that " |
3202 | "still has objects.\n", s->name, __func__); | 3206 | "still has objects.\n", s->name, __func__); |
@@ -3206,7 +3210,7 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
3206 | rcu_barrier(); | 3210 | rcu_barrier(); |
3207 | sysfs_slab_remove(s); | 3211 | sysfs_slab_remove(s); |
3208 | } else | 3212 | } else |
3209 | up_write(&slub_lock); | 3213 | mutex_unlock(&slab_mutex); |
3210 | } | 3214 | } |
3211 | EXPORT_SYMBOL(kmem_cache_destroy); | 3215 | EXPORT_SYMBOL(kmem_cache_destroy); |
3212 | 3216 | ||
@@ -3268,7 +3272,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, | |||
3268 | 3272 | ||
3269 | /* | 3273 | /* |
3270 | * This function is called with IRQs disabled during early-boot on | 3274 | * This function is called with IRQs disabled during early-boot on |
3271 | * single CPU so there's no need to take slub_lock here. | 3275 | * single CPU so there's no need to take slab_mutex here. |
3272 | */ | 3276 | */ |
3273 | if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, | 3277 | if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, |
3274 | flags, NULL)) | 3278 | flags, NULL)) |
@@ -3553,10 +3557,10 @@ static int slab_mem_going_offline_callback(void *arg) | |||
3553 | { | 3557 | { |
3554 | struct kmem_cache *s; | 3558 | struct kmem_cache *s; |
3555 | 3559 | ||
3556 | down_read(&slub_lock); | 3560 | mutex_lock(&slab_mutex); |
3557 | list_for_each_entry(s, &slab_caches, list) | 3561 | list_for_each_entry(s, &slab_caches, list) |
3558 | kmem_cache_shrink(s); | 3562 | kmem_cache_shrink(s); |
3559 | up_read(&slub_lock); | 3563 | mutex_unlock(&slab_mutex); |
3560 | 3564 | ||
3561 | return 0; | 3565 | return 0; |
3562 | } | 3566 | } |
@@ -3577,7 +3581,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3577 | if (offline_node < 0) | 3581 | if (offline_node < 0) |
3578 | return; | 3582 | return; |
3579 | 3583 | ||
3580 | down_read(&slub_lock); | 3584 | mutex_lock(&slab_mutex); |
3581 | list_for_each_entry(s, &slab_caches, list) { | 3585 | list_for_each_entry(s, &slab_caches, list) { |
3582 | n = get_node(s, offline_node); | 3586 | n = get_node(s, offline_node); |
3583 | if (n) { | 3587 | if (n) { |
@@ -3593,7 +3597,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3593 | kmem_cache_free(kmem_cache_node, n); | 3597 | kmem_cache_free(kmem_cache_node, n); |
3594 | } | 3598 | } |
3595 | } | 3599 | } |
3596 | up_read(&slub_lock); | 3600 | mutex_unlock(&slab_mutex); |
3597 | } | 3601 | } |
3598 | 3602 | ||
3599 | static int slab_mem_going_online_callback(void *arg) | 3603 | static int slab_mem_going_online_callback(void *arg) |
@@ -3616,7 +3620,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
3616 | * allocate a kmem_cache_node structure in order to bring the node | 3620 | * allocate a kmem_cache_node structure in order to bring the node |
3617 | * online. | 3621 | * online. |
3618 | */ | 3622 | */ |
3619 | down_read(&slub_lock); | 3623 | mutex_lock(&slab_mutex); |
3620 | list_for_each_entry(s, &slab_caches, list) { | 3624 | list_for_each_entry(s, &slab_caches, list) { |
3621 | /* | 3625 | /* |
3622 | * XXX: kmem_cache_alloc_node will fallback to other nodes | 3626 | * XXX: kmem_cache_alloc_node will fallback to other nodes |
@@ -3632,7 +3636,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
3632 | s->node[nid] = n; | 3636 | s->node[nid] = n; |
3633 | } | 3637 | } |
3634 | out: | 3638 | out: |
3635 | up_read(&slub_lock); | 3639 | mutex_unlock(&slab_mutex); |
3636 | return ret; | 3640 | return ret; |
3637 | } | 3641 | } |
3638 | 3642 | ||
@@ -3843,11 +3847,11 @@ void __init kmem_cache_init(void) | |||
3843 | 3847 | ||
3844 | if (s && s->size) { | 3848 | if (s && s->size) { |
3845 | char *name = kasprintf(GFP_NOWAIT, | 3849 | char *name = kasprintf(GFP_NOWAIT, |
3846 | "dma-kmalloc-%d", s->objsize); | 3850 | "dma-kmalloc-%d", s->object_size); |
3847 | 3851 | ||
3848 | BUG_ON(!name); | 3852 | BUG_ON(!name); |
3849 | kmalloc_dma_caches[i] = create_kmalloc_cache(name, | 3853 | kmalloc_dma_caches[i] = create_kmalloc_cache(name, |
3850 | s->objsize, SLAB_CACHE_DMA); | 3854 | s->object_size, SLAB_CACHE_DMA); |
3851 | } | 3855 | } |
3852 | } | 3856 | } |
3853 | #endif | 3857 | #endif |
@@ -3924,16 +3928,12 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3924 | return NULL; | 3928 | return NULL; |
3925 | } | 3929 | } |
3926 | 3930 | ||
3927 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 3931 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, |
3928 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3932 | size_t align, unsigned long flags, void (*ctor)(void *)) |
3929 | { | 3933 | { |
3930 | struct kmem_cache *s; | 3934 | struct kmem_cache *s; |
3931 | char *n; | 3935 | char *n; |
3932 | 3936 | ||
3933 | if (WARN_ON(!name)) | ||
3934 | return NULL; | ||
3935 | |||
3936 | down_write(&slub_lock); | ||
3937 | s = find_mergeable(size, align, flags, name, ctor); | 3937 | s = find_mergeable(size, align, flags, name, ctor); |
3938 | if (s) { | 3938 | if (s) { |
3939 | s->refcount++; | 3939 | s->refcount++; |
@@ -3941,49 +3941,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
3941 | * Adjust the object sizes so that we clear | 3941 | * Adjust the object sizes so that we clear |
3942 | * the complete object on kzalloc. | 3942 | * the complete object on kzalloc. |
3943 | */ | 3943 | */ |
3944 | s->objsize = max(s->objsize, (int)size); | 3944 | s->object_size = max(s->object_size, (int)size); |
3945 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3945 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
3946 | 3946 | ||
3947 | if (sysfs_slab_alias(s, name)) { | 3947 | if (sysfs_slab_alias(s, name)) { |
3948 | s->refcount--; | 3948 | s->refcount--; |
3949 | goto err; | 3949 | return NULL; |
3950 | } | 3950 | } |
3951 | up_write(&slub_lock); | ||
3952 | return s; | 3951 | return s; |
3953 | } | 3952 | } |
3954 | 3953 | ||
3955 | n = kstrdup(name, GFP_KERNEL); | 3954 | n = kstrdup(name, GFP_KERNEL); |
3956 | if (!n) | 3955 | if (!n) |
3957 | goto err; | 3956 | return NULL; |
3958 | 3957 | ||
3959 | s = kmalloc(kmem_size, GFP_KERNEL); | 3958 | s = kmalloc(kmem_size, GFP_KERNEL); |
3960 | if (s) { | 3959 | if (s) { |
3961 | if (kmem_cache_open(s, n, | 3960 | if (kmem_cache_open(s, n, |
3962 | size, align, flags, ctor)) { | 3961 | size, align, flags, ctor)) { |
3962 | int r; | ||
3963 | |||
3963 | list_add(&s->list, &slab_caches); | 3964 | list_add(&s->list, &slab_caches); |
3964 | up_write(&slub_lock); | 3965 | mutex_unlock(&slab_mutex); |
3965 | if (sysfs_slab_add(s)) { | 3966 | r = sysfs_slab_add(s); |
3966 | down_write(&slub_lock); | 3967 | mutex_lock(&slab_mutex); |
3967 | list_del(&s->list); | 3968 | |
3968 | kfree(n); | 3969 | if (!r) |
3969 | kfree(s); | 3970 | return s; |
3970 | goto err; | 3971 | |
3971 | } | 3972 | list_del(&s->list); |
3972 | return s; | 3973 | kmem_cache_close(s); |
3973 | } | 3974 | } |
3974 | kfree(s); | 3975 | kfree(s); |
3975 | } | 3976 | } |
3976 | kfree(n); | 3977 | kfree(n); |
3977 | err: | 3978 | return NULL; |
3978 | up_write(&slub_lock); | ||
3979 | |||
3980 | if (flags & SLAB_PANIC) | ||
3981 | panic("Cannot create slabcache %s\n", name); | ||
3982 | else | ||
3983 | s = NULL; | ||
3984 | return s; | ||
3985 | } | 3979 | } |
3986 | EXPORT_SYMBOL(kmem_cache_create); | ||
3987 | 3980 | ||
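The rename to __kmem_cache_create() together with the removal of the slub_lock handling, the WARN_ON(!name) check, the SLAB_PANIC fallback and the EXPORT_SYMBOL suggests those duties now live in a shared front end that calls into the allocator with slab_mutex already held. A rough sketch of such a wrapper (an assumption about the common code, which is not shown in this diff):

	/*
	 * Sketch of the presumed common wrapper: argument checking,
	 * locking and SLAB_PANIC handling move out of the per-allocator
	 * code.  Not taken from this patch.
	 */
	struct kmem_cache *kmem_cache_create(const char *name, size_t size,
			size_t align, unsigned long flags, void (*ctor)(void *))
	{
		struct kmem_cache *s = NULL;

		if (WARN_ON(!name))
			goto out;

		mutex_lock(&slab_mutex);
		s = __kmem_cache_create(name, size, align, flags, ctor);
		mutex_unlock(&slab_mutex);
	out:
		if (!s && (flags & SLAB_PANIC))
			panic("kmem_cache_create: Failed to create slab '%s'\n", name);
		return s;
	}
	EXPORT_SYMBOL(kmem_cache_create);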
3988 | #ifdef CONFIG_SMP | 3981 | #ifdef CONFIG_SMP |
3989 | /* | 3982 | /* |
@@ -4002,13 +3995,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
4002 | case CPU_UP_CANCELED_FROZEN: | 3995 | case CPU_UP_CANCELED_FROZEN: |
4003 | case CPU_DEAD: | 3996 | case CPU_DEAD: |
4004 | case CPU_DEAD_FROZEN: | 3997 | case CPU_DEAD_FROZEN: |
4005 | down_read(&slub_lock); | 3998 | mutex_lock(&slab_mutex); |
4006 | list_for_each_entry(s, &slab_caches, list) { | 3999 | list_for_each_entry(s, &slab_caches, list) { |
4007 | local_irq_save(flags); | 4000 | local_irq_save(flags); |
4008 | __flush_cpu_slab(s, cpu); | 4001 | __flush_cpu_slab(s, cpu); |
4009 | local_irq_restore(flags); | 4002 | local_irq_restore(flags); |
4010 | } | 4003 | } |
4011 | up_read(&slub_lock); | 4004 | mutex_unlock(&slab_mutex); |
4012 | break; | 4005 | break; |
4013 | default: | 4006 | default: |
4014 | break; | 4007 | break; |
@@ -4500,30 +4493,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4500 | 4493 | ||
4501 | for_each_possible_cpu(cpu) { | 4494 | for_each_possible_cpu(cpu) { |
4502 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); | 4495 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
4503 | int node = ACCESS_ONCE(c->node); | 4496 | int node; |
4504 | struct page *page; | 4497 | struct page *page; |
4505 | 4498 | ||
4506 | if (node < 0) | ||
4507 | continue; | ||
4508 | page = ACCESS_ONCE(c->page); | 4499 | page = ACCESS_ONCE(c->page); |
4509 | if (page) { | 4500 | if (!page) |
4510 | if (flags & SO_TOTAL) | 4501 | continue; |
4511 | x = page->objects; | ||
4512 | else if (flags & SO_OBJECTS) | ||
4513 | x = page->inuse; | ||
4514 | else | ||
4515 | x = 1; | ||
4516 | 4502 | ||
4517 | total += x; | 4503 | node = page_to_nid(page); |
4518 | nodes[node] += x; | 4504 | if (flags & SO_TOTAL) |
4519 | } | 4505 | x = page->objects; |
4520 | page = c->partial; | 4506 | else if (flags & SO_OBJECTS) |
4507 | x = page->inuse; | ||
4508 | else | ||
4509 | x = 1; | ||
4521 | 4510 | ||
4511 | total += x; | ||
4512 | nodes[node] += x; | ||
4513 | |||
4514 | page = ACCESS_ONCE(c->partial); | ||
4522 | if (page) { | 4515 | if (page) { |
4523 | x = page->pobjects; | 4516 | x = page->pobjects; |
4524 | total += x; | 4517 | total += x; |
4525 | nodes[node] += x; | 4518 | nodes[node] += x; |
4526 | } | 4519 | } |
4520 | |||
4527 | per_cpu[node]++; | 4521 | per_cpu[node]++; |
4528 | } | 4522 | } |
4529 | } | 4523 | } |
@@ -4623,7 +4617,7 @@ SLAB_ATTR_RO(align); | |||
4623 | 4617 | ||
4624 | static ssize_t object_size_show(struct kmem_cache *s, char *buf) | 4618 | static ssize_t object_size_show(struct kmem_cache *s, char *buf) |
4625 | { | 4619 | { |
4626 | return sprintf(buf, "%d\n", s->objsize); | 4620 | return sprintf(buf, "%d\n", s->object_size); |
4627 | } | 4621 | } |
4628 | SLAB_ATTR_RO(object_size); | 4622 | SLAB_ATTR_RO(object_size); |
4629 | 4623 | ||
@@ -5286,7 +5280,7 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5286 | const char *name; | 5280 | const char *name; |
5287 | int unmergeable; | 5281 | int unmergeable; |
5288 | 5282 | ||
5289 | if (slab_state < SYSFS) | 5283 | if (slab_state < FULL) |
5290 | /* Defer until later */ | 5284 | /* Defer until later */ |
5291 | return 0; | 5285 | return 0; |
5292 | 5286 | ||
@@ -5331,7 +5325,7 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5331 | 5325 | ||
5332 | static void sysfs_slab_remove(struct kmem_cache *s) | 5326 | static void sysfs_slab_remove(struct kmem_cache *s) |
5333 | { | 5327 | { |
5334 | if (slab_state < SYSFS) | 5328 | if (slab_state < FULL) |
5335 | /* | 5329 | /* |
5336 | * Sysfs has not been setup yet so no need to remove the | 5330 | * Sysfs has not been setup yet so no need to remove the |
5337 | * cache from sysfs. | 5331 | * cache from sysfs. |
@@ -5359,7 +5353,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) | |||
5359 | { | 5353 | { |
5360 | struct saved_alias *al; | 5354 | struct saved_alias *al; |
5361 | 5355 | ||
5362 | if (slab_state == SYSFS) { | 5356 | if (slab_state == FULL) { |
5363 | /* | 5357 | /* |
5364 | * If we have a leftover link then remove it. | 5358 | * If we have a leftover link then remove it. |
5365 | */ | 5359 | */ |
@@ -5383,16 +5377,16 @@ static int __init slab_sysfs_init(void) | |||
5383 | struct kmem_cache *s; | 5377 | struct kmem_cache *s; |
5384 | int err; | 5378 | int err; |
5385 | 5379 | ||
5386 | down_write(&slub_lock); | 5380 | mutex_lock(&slab_mutex); |
5387 | 5381 | ||
5388 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); | 5382 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); |
5389 | if (!slab_kset) { | 5383 | if (!slab_kset) { |
5390 | up_write(&slub_lock); | 5384 | mutex_unlock(&slab_mutex); |
5391 | printk(KERN_ERR "Cannot register slab subsystem.\n"); | 5385 | printk(KERN_ERR "Cannot register slab subsystem.\n"); |
5392 | return -ENOSYS; | 5386 | return -ENOSYS; |
5393 | } | 5387 | } |
5394 | 5388 | ||
5395 | slab_state = SYSFS; | 5389 | slab_state = FULL; |
5396 | 5390 | ||
5397 | list_for_each_entry(s, &slab_caches, list) { | 5391 | list_for_each_entry(s, &slab_caches, list) { |
5398 | err = sysfs_slab_add(s); | 5392 | err = sysfs_slab_add(s); |
@@ -5408,11 +5402,11 @@ static int __init slab_sysfs_init(void) | |||
5408 | err = sysfs_slab_alias(al->s, al->name); | 5402 | err = sysfs_slab_alias(al->s, al->name); |
5409 | if (err) | 5403 | if (err) |
5410 | printk(KERN_ERR "SLUB: Unable to add boot slab alias" | 5404 | printk(KERN_ERR "SLUB: Unable to add boot slab alias" |
5411 | " %s to sysfs\n", s->name); | 5405 | " %s to sysfs\n", al->name); |
5412 | kfree(al); | 5406 | kfree(al); |
5413 | } | 5407 | } |
5414 | 5408 | ||
5415 | up_write(&slub_lock); | 5409 | mutex_unlock(&slab_mutex); |
5416 | resiliency_test(); | 5410 | resiliency_test(); |
5417 | return 0; | 5411 | return 0; |
5418 | } | 5412 | } |
@@ -5427,7 +5421,7 @@ __initcall(slab_sysfs_init); | |||
5427 | static void print_slabinfo_header(struct seq_file *m) | 5421 | static void print_slabinfo_header(struct seq_file *m) |
5428 | { | 5422 | { |
5429 | seq_puts(m, "slabinfo - version: 2.1\n"); | 5423 | seq_puts(m, "slabinfo - version: 2.1\n"); |
5430 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " | 5424 | seq_puts(m, "# name <active_objs> <num_objs> <object_size> " |
5431 | "<objperslab> <pagesperslab>"); | 5425 | "<objperslab> <pagesperslab>"); |
5432 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | 5426 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); |
5433 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | 5427 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); |
@@ -5438,7 +5432,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
5438 | { | 5432 | { |
5439 | loff_t n = *pos; | 5433 | loff_t n = *pos; |
5440 | 5434 | ||
5441 | down_read(&slub_lock); | 5435 | mutex_lock(&slab_mutex); |
5442 | if (!n) | 5436 | if (!n) |
5443 | print_slabinfo_header(m); | 5437 | print_slabinfo_header(m); |
5444 | 5438 | ||
@@ -5452,7 +5446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) | |||
5452 | 5446 | ||
5453 | static void s_stop(struct seq_file *m, void *p) | 5447 | static void s_stop(struct seq_file *m, void *p) |
5454 | { | 5448 | { |
5455 | up_read(&slub_lock); | 5449 | mutex_unlock(&slab_mutex); |
5456 | } | 5450 | } |
5457 | 5451 | ||
5458 | static int s_show(struct seq_file *m, void *p) | 5452 | static int s_show(struct seq_file *m, void *p) |
diff --git a/mm/sparse.c b/mm/sparse.c index 6a4bf9160e85..fac95f2888f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
65 | 65 | ||
66 | if (slab_is_available()) { | 66 | if (slab_is_available()) { |
67 | if (node_state(nid, N_HIGH_MEMORY)) | 67 | if (node_state(nid, N_HIGH_MEMORY)) |
68 | section = kmalloc_node(array_size, GFP_KERNEL, nid); | 68 | section = kzalloc_node(array_size, GFP_KERNEL, nid); |
69 | else | 69 | else |
70 | section = kmalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
71 | } else | 71 | } else { |
72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); |
73 | 73 | } | |
74 | if (section) | ||
75 | memset(section, 0, array_size); | ||
76 | 74 | ||
77 | return section; | 75 | return section; |
78 | } | 76 | } |
79 | 77 | ||
80 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) | 78 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) |
81 | { | 79 | { |
82 | static DEFINE_SPINLOCK(index_init_lock); | ||
83 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 80 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
84 | struct mem_section *section; | 81 | struct mem_section *section; |
85 | int ret = 0; | 82 | int ret = 0; |
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) | |||
90 | section = sparse_index_alloc(nid); | 87 | section = sparse_index_alloc(nid); |
91 | if (!section) | 88 | if (!section) |
92 | return -ENOMEM; | 89 | return -ENOMEM; |
93 | /* | ||
94 | * This lock keeps two different sections from | ||
95 | * reallocating for the same index | ||
96 | */ | ||
97 | spin_lock(&index_init_lock); | ||
98 | |||
99 | if (mem_section[root]) { | ||
100 | ret = -EEXIST; | ||
101 | goto out; | ||
102 | } | ||
103 | 90 | ||
104 | mem_section[root] = section; | 91 | mem_section[root] = section; |
105 | out: | 92 | |
106 | spin_unlock(&index_init_lock); | ||
107 | return ret; | 93 | return ret; |
108 | } | 94 | } |
109 | #else /* !SPARSEMEM_EXTREME */ | 95 | #else /* !SPARSEMEM_EXTREME */ |
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms) | |||
132 | break; | 118 | break; |
133 | } | 119 | } |
134 | 120 | ||
121 | VM_BUG_ON(root_nr == NR_SECTION_ROOTS); | ||
122 | |||
135 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | 123 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); |
136 | } | 124 | } |
137 | 125 | ||
@@ -275,8 +263,9 @@ static unsigned long * __init | |||
275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 263 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
276 | unsigned long size) | 264 | unsigned long size) |
277 | { | 265 | { |
278 | pg_data_t *host_pgdat; | 266 | unsigned long goal, limit; |
279 | unsigned long goal; | 267 | unsigned long *p; |
268 | int nid; | ||
280 | /* | 269 | /* |
281 | * A page may contain usemaps for other sections preventing the | 270 | * A page may contain usemaps for other sections preventing the |
282 | * page being freed and making a section unremovable while | 271 | * page being freed and making a section unremovable while |
@@ -287,10 +276,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
287 | * from the same section as the pgdat where possible to avoid | 276 | * from the same section as the pgdat where possible to avoid |
288 | * this problem. | 277 | * this problem. |
289 | */ | 278 | */ |
290 | goal = __pa(pgdat) & PAGE_SECTION_MASK; | 279 | goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); |
291 | host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); | 280 | limit = goal + (1UL << PA_SECTION_SHIFT); |
292 | return __alloc_bootmem_node_nopanic(host_pgdat, size, | 281 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); |
293 | SMP_CACHE_BYTES, goal); | 282 | again: |
283 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | ||
284 | SMP_CACHE_BYTES, goal, limit); | ||
285 | if (!p && limit) { | ||
286 | limit = 0; | ||
287 | goto again; | ||
288 | } | ||
289 | return p; | ||
294 | } | 290 | } |
295 | 291 | ||
296 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 292 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -485,6 +481,9 @@ void __init sparse_init(void) | |||
485 | struct page **map_map; | 481 | struct page **map_map; |
486 | #endif | 482 | #endif |
487 | 483 | ||
484 | /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ | ||
485 | set_pageblock_order(); | ||
486 | |||
488 | /* | 487 | /* |
489 | * map is using big page (aka 2M in x86 64 bit) | 488 | * map is using big page (aka 2M in x86 64 bit) |
490 | * usemap is less one page (aka 24 bytes) | 489 | * usemap is less one page (aka 24 bytes) |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages) | |||
236 | } | 236 | } |
237 | EXPORT_SYMBOL(put_pages_list); | 237 | EXPORT_SYMBOL(put_pages_list); |
238 | 238 | ||
239 | /* | ||
240 | * get_kernel_pages() - pin kernel pages in memory | ||
241 | * @kiov: An array of struct kvec structures | ||
242 | * @nr_segs: number of segments to pin | ||
243 | * @write: pinning for read/write, currently ignored | ||
244 | * @pages: array that receives pointers to the pages pinned. | ||
245 | * Should be at least nr_segs long. | ||
246 | * | ||
247 | * Returns number of pages pinned. This may be fewer than the number | ||
248 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
249 | * were pinned, returns -errno. Each page returned must be released | ||
250 | * with a put_page() call when it is finished with. | ||
251 | */ | ||
252 | int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, | ||
253 | struct page **pages) | ||
254 | { | ||
255 | int seg; | ||
256 | |||
257 | for (seg = 0; seg < nr_segs; seg++) { | ||
258 | if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) | ||
259 | return seg; | ||
260 | |||
261 | pages[seg] = kmap_to_page(kiov[seg].iov_base); | ||
262 | page_cache_get(pages[seg]); | ||
263 | } | ||
264 | |||
265 | return seg; | ||
266 | } | ||
267 | EXPORT_SYMBOL_GPL(get_kernel_pages); | ||
268 | |||
269 | /* | ||
270 | * get_kernel_page() - pin a kernel page in memory | ||
271 | * @start: starting kernel address | ||
272 | * @write: pinning for read/write, currently ignored | ||
273 | * @pages: array that receives pointer to the page pinned. | ||
274 | * Must be at least nr_segs long. | ||
275 | * | ||
276 | * Returns 1 if page is pinned. If the page was not pinned, returns | ||
277 | * -errno. The page returned must be released with a put_page() call | ||
278 | * when it is finished with. | ||
279 | */ | ||
280 | int get_kernel_page(unsigned long start, int write, struct page **pages) | ||
281 | { | ||
282 | const struct kvec kiov = { | ||
283 | .iov_base = (void *)start, | ||
284 | .iov_len = PAGE_SIZE | ||
285 | }; | ||
286 | |||
287 | return get_kernel_pages(&kiov, 1, write, pages); | ||
288 | } | ||
289 | EXPORT_SYMBOL_GPL(get_kernel_page); | ||
290 | |||
239 | static void pagevec_lru_move_fn(struct pagevec *pvec, | 291 | static void pagevec_lru_move_fn(struct pagevec *pvec, |
240 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), | 292 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), |
241 | void *arg) | 293 | void *arg) |
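The new get_kernel_pages()/get_kernel_page() helpers above pin the pages backing a kernel virtual buffer so they can be handled as struct page references; each kvec segment must be exactly one page long. A hypothetical caller, not part of this patch:

	/*
	 * Hypothetical caller (not in this patch): pin the single page
	 * backing a page-sized kernel buffer.  The page must later be
	 * released with put_page() by the caller.
	 */
	static int pin_kernel_buffer_page(void *buf, struct page **page)
	{
		struct kvec kiov = {
			.iov_base = buf,
			.iov_len  = PAGE_SIZE,	/* required: exactly one page */
		};

		if (get_kernel_pages(&kiov, 1, 0, page) != 1)
			return -EFAULT;

		return 0;
	}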
diff --git a/mm/swap_state.c b/mm/swap_state.c index 4c5ff7f284d9..0cb36fb1f61c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/blkdev.h> | ||
17 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
18 | #include <linux/migrate.h> | 19 | #include <linux/migrate.h> |
19 | #include <linux/page_cgroup.h> | 20 | #include <linux/page_cgroup.h> |
@@ -26,7 +27,7 @@ | |||
26 | */ | 27 | */ |
27 | static const struct address_space_operations swap_aops = { | 28 | static const struct address_space_operations swap_aops = { |
28 | .writepage = swap_writepage, | 29 | .writepage = swap_writepage, |
29 | .set_page_dirty = __set_page_dirty_no_writeback, | 30 | .set_page_dirty = swap_set_page_dirty, |
30 | .migratepage = migrate_page, | 31 | .migratepage = migrate_page, |
31 | }; | 32 | }; |
32 | 33 | ||
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
376 | unsigned long offset = swp_offset(entry); | 377 | unsigned long offset = swp_offset(entry); |
377 | unsigned long start_offset, end_offset; | 378 | unsigned long start_offset, end_offset; |
378 | unsigned long mask = (1UL << page_cluster) - 1; | 379 | unsigned long mask = (1UL << page_cluster) - 1; |
380 | struct blk_plug plug; | ||
379 | 381 | ||
380 | /* Read a page_cluster sized and aligned cluster around offset. */ | 382 | /* Read a page_cluster sized and aligned cluster around offset. */ |
381 | start_offset = offset & ~mask; | 383 | start_offset = offset & ~mask; |
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
383 | if (!start_offset) /* First page is swap header. */ | 385 | if (!start_offset) /* First page is swap header. */ |
384 | start_offset++; | 386 | start_offset++; |
385 | 387 | ||
388 | blk_start_plug(&plug); | ||
386 | for (offset = start_offset; offset <= end_offset ; offset++) { | 389 | for (offset = start_offset; offset <= end_offset ; offset++) { |
387 | /* Ok, do the async read-ahead now */ | 390 | /* Ok, do the async read-ahead now */ |
388 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | 391 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), |
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
391 | continue; | 394 | continue; |
392 | page_cache_release(page); | 395 | page_cache_release(page); |
393 | } | 396 | } |
397 | blk_finish_plug(&plug); | ||
398 | |||
394 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 399 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
395 | return read_swap_cache_async(entry, gfp_mask, vma, addr); | 400 | return read_swap_cache_async(entry, gfp_mask, vma, addr); |
396 | } | 401 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 457b10baef59..14e254c768fc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -31,6 +31,9 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/poll.h> | 32 | #include <linux/poll.h> |
33 | #include <linux/oom.h> | 33 | #include <linux/oom.h> |
34 | #include <linux/frontswap.h> | ||
35 | #include <linux/swapfile.h> | ||
36 | #include <linux/export.h> | ||
34 | 37 | ||
35 | #include <asm/pgtable.h> | 38 | #include <asm/pgtable.h> |
36 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -42,7 +45,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | |||
42 | static void free_swap_count_continuations(struct swap_info_struct *); | 45 | static void free_swap_count_continuations(struct swap_info_struct *); |
43 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | 46 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); |
44 | 47 | ||
45 | static DEFINE_SPINLOCK(swap_lock); | 48 | DEFINE_SPINLOCK(swap_lock); |
46 | static unsigned int nr_swapfiles; | 49 | static unsigned int nr_swapfiles; |
47 | long nr_swap_pages; | 50 | long nr_swap_pages; |
48 | long total_swap_pages; | 51 | long total_swap_pages; |
@@ -53,9 +56,9 @@ static const char Unused_file[] = "Unused swap file entry "; | |||
53 | static const char Bad_offset[] = "Bad swap offset entry "; | 56 | static const char Bad_offset[] = "Bad swap offset entry "; |
54 | static const char Unused_offset[] = "Unused swap offset entry "; | 57 | static const char Unused_offset[] = "Unused swap offset entry "; |
55 | 58 | ||
56 | static struct swap_list_t swap_list = {-1, -1}; | 59 | struct swap_list_t swap_list = {-1, -1}; |
57 | 60 | ||
58 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 61 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
59 | 62 | ||
60 | static DEFINE_MUTEX(swapon_mutex); | 63 | static DEFINE_MUTEX(swapon_mutex); |
61 | 64 | ||
@@ -546,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
546 | 549 | ||
547 | /* free if no reference */ | 550 | /* free if no reference */ |
548 | if (!usage) { | 551 | if (!usage) { |
549 | struct gendisk *disk = p->bdev->bd_disk; | ||
550 | if (offset < p->lowest_bit) | 552 | if (offset < p->lowest_bit) |
551 | p->lowest_bit = offset; | 553 | p->lowest_bit = offset; |
552 | if (offset > p->highest_bit) | 554 | if (offset > p->highest_bit) |
@@ -556,9 +558,13 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
556 | swap_list.next = p->type; | 558 | swap_list.next = p->type; |
557 | nr_swap_pages++; | 559 | nr_swap_pages++; |
558 | p->inuse_pages--; | 560 | p->inuse_pages--; |
559 | if ((p->flags & SWP_BLKDEV) && | 561 | frontswap_invalidate_page(p->type, offset); |
560 | disk->fops->swap_slot_free_notify) | 562 | if (p->flags & SWP_BLKDEV) { |
561 | disk->fops->swap_slot_free_notify(p->bdev, offset); | 563 | struct gendisk *disk = p->bdev->bd_disk; |
564 | if (disk->fops->swap_slot_free_notify) | ||
565 | disk->fops->swap_slot_free_notify(p->bdev, | ||
566 | offset); | ||
567 | } | ||
562 | } | 568 | } |
563 | 569 | ||
564 | return usage; | 570 | return usage; |
@@ -829,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
829 | 835 | ||
830 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 836 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
831 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 837 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
832 | if (ret > 0) | 838 | mem_cgroup_cancel_charge_swapin(memcg); |
833 | mem_cgroup_cancel_charge_swapin(memcg); | ||
834 | ret = 0; | 839 | ret = 0; |
835 | goto out; | 840 | goto out; |
836 | } | 841 | } |
@@ -985,11 +990,12 @@ static int unuse_mm(struct mm_struct *mm, | |||
985 | } | 990 | } |
986 | 991 | ||
987 | /* | 992 | /* |
988 | * Scan swap_map from current position to next entry still in use. | 993 | * Scan swap_map (or frontswap_map if frontswap parameter is true) |
994 | * from current position to next entry still in use. | ||
989 | * Recycle to start on reaching the end, returning 0 when empty. | 995 | * Recycle to start on reaching the end, returning 0 when empty. |
990 | */ | 996 | */ |
991 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, | 997 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
992 | unsigned int prev) | 998 | unsigned int prev, bool frontswap) |
993 | { | 999 | { |
994 | unsigned int max = si->max; | 1000 | unsigned int max = si->max; |
995 | unsigned int i = prev; | 1001 | unsigned int i = prev; |
@@ -1015,6 +1021,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1015 | prev = 0; | 1021 | prev = 0; |
1016 | i = 1; | 1022 | i = 1; |
1017 | } | 1023 | } |
1024 | if (frontswap) { | ||
1025 | if (frontswap_test(si, i)) | ||
1026 | break; | ||
1027 | else | ||
1028 | continue; | ||
1029 | } | ||
1018 | count = si->swap_map[i]; | 1030 | count = si->swap_map[i]; |
1019 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1031 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1020 | break; | 1032 | break; |
@@ -1026,8 +1038,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1026 | * We completely avoid races by reading each swap page in advance, | 1038 | * We completely avoid races by reading each swap page in advance, |
1027 | * and then search for the process using it. All the necessary | 1039 | * and then search for the process using it. All the necessary |
1028 | * page table adjustments can then be made atomically. | 1040 | * page table adjustments can then be made atomically. |
1041 | * | ||
1042 | * if the boolean frontswap is true, only unuse pages_to_unuse pages; | ||
1043 | * pages_to_unuse==0 means all pages; ignored if frontswap is false | ||
1029 | */ | 1044 | */ |
1030 | static int try_to_unuse(unsigned int type) | 1045 | int try_to_unuse(unsigned int type, bool frontswap, |
1046 | unsigned long pages_to_unuse) | ||
1031 | { | 1047 | { |
1032 | struct swap_info_struct *si = swap_info[type]; | 1048 | struct swap_info_struct *si = swap_info[type]; |
1033 | struct mm_struct *start_mm; | 1049 | struct mm_struct *start_mm; |
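With the two new parameters documented in the hunk above, a frontswap-aware caller can request a bounded, frontswap-only unuse pass instead of draining the whole swap device. A minimal illustrative wrapper (the function name is an assumption):

	/*
	 * Illustrative wrapper, not from this patch: force at most 'nr'
	 * frontswap-backed pages of swap device 'type' back into memory.
	 * frontswap == true restricts the scan to pages whose frontswap
	 * bit is set; nr == 0 would mean "all such pages".
	 */
	static int shrink_frontswap_area(unsigned int type, unsigned long nr)
	{
		return try_to_unuse(type, true, nr);
	}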
@@ -1060,7 +1076,7 @@ static int try_to_unuse(unsigned int type) | |||
1060 | * one pass through swap_map is enough, but not necessarily: | 1076 | * one pass through swap_map is enough, but not necessarily: |
1061 | * there are races when an instance of an entry might be missed. | 1077 | * there are races when an instance of an entry might be missed. |
1062 | */ | 1078 | */ |
1063 | while ((i = find_next_to_unuse(si, i)) != 0) { | 1079 | while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { |
1064 | if (signal_pending(current)) { | 1080 | if (signal_pending(current)) { |
1065 | retval = -EINTR; | 1081 | retval = -EINTR; |
1066 | break; | 1082 | break; |
@@ -1227,6 +1243,10 @@ static int try_to_unuse(unsigned int type) | |||
1227 | * interactive performance. | 1243 | * interactive performance. |
1228 | */ | 1244 | */ |
1229 | cond_resched(); | 1245 | cond_resched(); |
1246 | if (frontswap && pages_to_unuse > 0) { | ||
1247 | if (!--pages_to_unuse) | ||
1248 | break; | ||
1249 | } | ||
1230 | } | 1250 | } |
1231 | 1251 | ||
1232 | mmput(start_mm); | 1252 | mmput(start_mm); |
@@ -1310,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1310 | list_del(&se->list); | 1330 | list_del(&se->list); |
1311 | kfree(se); | 1331 | kfree(se); |
1312 | } | 1332 | } |
1333 | |||
1334 | if (sis->flags & SWP_FILE) { | ||
1335 | struct file *swap_file = sis->swap_file; | ||
1336 | struct address_space *mapping = swap_file->f_mapping; | ||
1337 | |||
1338 | sis->flags &= ~SWP_FILE; | ||
1339 | mapping->a_ops->swap_deactivate(swap_file); | ||
1340 | } | ||
1313 | } | 1341 | } |
1314 | 1342 | ||
1315 | /* | 1343 | /* |
@@ -1318,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1318 | * | 1346 | * |
1319 | * This function rather assumes that it is called in ascending page order. | 1347 | * This function rather assumes that it is called in ascending page order. |
1320 | */ | 1348 | */ |
1321 | static int | 1349 | int |
1322 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | 1350 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, |
1323 | unsigned long nr_pages, sector_t start_block) | 1351 | unsigned long nr_pages, sector_t start_block) |
1324 | { | 1352 | { |
@@ -1391,102 +1419,33 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1391 | */ | 1419 | */ |
1392 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | 1420 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) |
1393 | { | 1421 | { |
1394 | struct inode *inode; | 1422 | struct file *swap_file = sis->swap_file; |
1395 | unsigned blocks_per_page; | 1423 | struct address_space *mapping = swap_file->f_mapping; |
1396 | unsigned long page_no; | 1424 | struct inode *inode = mapping->host; |
1397 | unsigned blkbits; | ||
1398 | sector_t probe_block; | ||
1399 | sector_t last_block; | ||
1400 | sector_t lowest_block = -1; | ||
1401 | sector_t highest_block = 0; | ||
1402 | int nr_extents = 0; | ||
1403 | int ret; | 1425 | int ret; |
1404 | 1426 | ||
1405 | inode = sis->swap_file->f_mapping->host; | ||
1406 | if (S_ISBLK(inode->i_mode)) { | 1427 | if (S_ISBLK(inode->i_mode)) { |
1407 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1428 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1408 | *span = sis->pages; | 1429 | *span = sis->pages; |
1409 | goto out; | 1430 | return ret; |
1410 | } | 1431 | } |
1411 | 1432 | ||
1412 | blkbits = inode->i_blkbits; | 1433 | if (mapping->a_ops->swap_activate) { |
1413 | blocks_per_page = PAGE_SIZE >> blkbits; | 1434 | ret = mapping->a_ops->swap_activate(sis, swap_file, span); |
1414 | 1435 | if (!ret) { | |
1415 | /* | 1436 | sis->flags |= SWP_FILE; |
1416 | * Map all the blocks into the extent list. This code doesn't try | 1437 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1417 | * to be very smart. | 1438 | *span = sis->pages; |
1418 | */ | ||
1419 | probe_block = 0; | ||
1420 | page_no = 0; | ||
1421 | last_block = i_size_read(inode) >> blkbits; | ||
1422 | while ((probe_block + blocks_per_page) <= last_block && | ||
1423 | page_no < sis->max) { | ||
1424 | unsigned block_in_page; | ||
1425 | sector_t first_block; | ||
1426 | |||
1427 | first_block = bmap(inode, probe_block); | ||
1428 | if (first_block == 0) | ||
1429 | goto bad_bmap; | ||
1430 | |||
1431 | /* | ||
1432 | * It must be PAGE_SIZE aligned on-disk | ||
1433 | */ | ||
1434 | if (first_block & (blocks_per_page - 1)) { | ||
1435 | probe_block++; | ||
1436 | goto reprobe; | ||
1437 | } | ||
1438 | |||
1439 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
1440 | block_in_page++) { | ||
1441 | sector_t block; | ||
1442 | |||
1443 | block = bmap(inode, probe_block + block_in_page); | ||
1444 | if (block == 0) | ||
1445 | goto bad_bmap; | ||
1446 | if (block != first_block + block_in_page) { | ||
1447 | /* Discontiguity */ | ||
1448 | probe_block++; | ||
1449 | goto reprobe; | ||
1450 | } | ||
1451 | } | ||
1452 | |||
1453 | first_block >>= (PAGE_SHIFT - blkbits); | ||
1454 | if (page_no) { /* exclude the header page */ | ||
1455 | if (first_block < lowest_block) | ||
1456 | lowest_block = first_block; | ||
1457 | if (first_block > highest_block) | ||
1458 | highest_block = first_block; | ||
1459 | } | 1439 | } |
1440 | return ret; | ||
1441 | } | ||
1460 | 1442 | ||
1461 | /* | 1443 | return generic_swapfile_activate(sis, swap_file, span); |
1462 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
1463 | */ | ||
1464 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
1465 | if (ret < 0) | ||
1466 | goto out; | ||
1467 | nr_extents += ret; | ||
1468 | page_no++; | ||
1469 | probe_block += blocks_per_page; | ||
1470 | reprobe: | ||
1471 | continue; | ||
1472 | } | ||
1473 | ret = nr_extents; | ||
1474 | *span = 1 + highest_block - lowest_block; | ||
1475 | if (page_no == 0) | ||
1476 | page_no = 1; /* force Empty message */ | ||
1477 | sis->max = page_no; | ||
1478 | sis->pages = page_no - 1; | ||
1479 | sis->highest_bit = page_no - 1; | ||
1480 | out: | ||
1481 | return ret; | ||
1482 | bad_bmap: | ||
1483 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
1484 | ret = -EINVAL; | ||
1485 | goto out; | ||
1486 | } | 1444 | } |
1487 | 1445 | ||
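setup_swap_extents() now defers to the filesystem's swap_activate() address_space operation when one exists; on success the swap file is flagged SWP_FILE and covered by a single extent. A bare-bones sketch of such a hook following the call signature above (hypothetical filesystem; real implementations such as swap-over-NFS do considerably more setup):

	/*
	 * Hypothetical a_ops->swap_activate() implementation.  Returning 0
	 * tells setup_swap_extents() to set SWP_FILE and map the whole
	 * file as one extent; a real filesystem would also pin whatever it
	 * needs for swap I/O here and undo that in ->swap_deactivate().
	 */
	static int myfs_swap_activate(struct swap_info_struct *sis,
				      struct file *swap_file, sector_t *span)
	{
		struct inode *inode = swap_file->f_mapping->host;

		/* refuse files too small for the advertised number of pages */
		if (i_size_read(inode) < (loff_t)sis->max << PAGE_SHIFT)
			return -EINVAL;

		return 0;
	}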
1488 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1446 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
1489 | unsigned char *swap_map) | 1447 | unsigned char *swap_map, |
1448 | unsigned long *frontswap_map) | ||
1490 | { | 1449 | { |
1491 | int i, prev; | 1450 | int i, prev; |
1492 | 1451 | ||
@@ -1496,6 +1455,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1496 | else | 1455 | else |
1497 | p->prio = --least_priority; | 1456 | p->prio = --least_priority; |
1498 | p->swap_map = swap_map; | 1457 | p->swap_map = swap_map; |
1458 | frontswap_map_set(p, frontswap_map); | ||
1499 | p->flags |= SWP_WRITEOK; | 1459 | p->flags |= SWP_WRITEOK; |
1500 | nr_swap_pages += p->pages; | 1460 | nr_swap_pages += p->pages; |
1501 | total_swap_pages += p->pages; | 1461 | total_swap_pages += p->pages; |
@@ -1512,6 +1472,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1512 | swap_list.head = swap_list.next = p->type; | 1472 | swap_list.head = swap_list.next = p->type; |
1513 | else | 1473 | else |
1514 | swap_info[prev]->next = p->type; | 1474 | swap_info[prev]->next = p->type; |
1475 | frontswap_init(p->type); | ||
1515 | spin_unlock(&swap_lock); | 1476 | spin_unlock(&swap_lock); |
1516 | } | 1477 | } |
1517 | 1478 | ||
@@ -1585,7 +1546,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1585 | spin_unlock(&swap_lock); | 1546 | spin_unlock(&swap_lock); |
1586 | 1547 | ||
1587 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1548 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
1588 | err = try_to_unuse(type); | 1549 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
1589 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); | 1550 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); |
1590 | 1551 | ||
1591 | if (err) { | 1552 | if (err) { |
@@ -1596,7 +1557,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1596 | * sys_swapoff for this swap_info_struct at this point. | 1557 | * sys_swapoff for this swap_info_struct at this point. |
1597 | */ | 1558 | */ |
1598 | /* re-insert swap space back into swap_list */ | 1559 | /* re-insert swap space back into swap_list */ |
1599 | enable_swap_info(p, p->prio, p->swap_map); | 1560 | enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
1600 | goto out_dput; | 1561 | goto out_dput; |
1601 | } | 1562 | } |
1602 | 1563 | ||
@@ -1622,9 +1583,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1622 | swap_map = p->swap_map; | 1583 | swap_map = p->swap_map; |
1623 | p->swap_map = NULL; | 1584 | p->swap_map = NULL; |
1624 | p->flags = 0; | 1585 | p->flags = 0; |
1586 | frontswap_invalidate_area(type); | ||
1625 | spin_unlock(&swap_lock); | 1587 | spin_unlock(&swap_lock); |
1626 | mutex_unlock(&swapon_mutex); | 1588 | mutex_unlock(&swapon_mutex); |
1627 | vfree(swap_map); | 1589 | vfree(swap_map); |
1590 | vfree(frontswap_map_get(p)); | ||
1628 | /* Destroy swap account informatin */ | 1591 | /* Destroy swap account informatin */ |
1629 | swap_cgroup_swapoff(type); | 1592 | swap_cgroup_swapoff(type); |
1630 | 1593 | ||
@@ -1893,24 +1856,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1893 | 1856 | ||
1894 | /* | 1857 | /* |
1895 | * Find out how many pages are allowed for a single swap | 1858 | * Find out how many pages are allowed for a single swap |
1896 | * device. There are three limiting factors: 1) the number | 1859 | * device. There are two limiting factors: 1) the number |
1897 | * of bits for the swap offset in the swp_entry_t type, and | 1860 | * of bits for the swap offset in the swp_entry_t type, and |
1898 | * 2) the number of bits in the swap pte as defined by the | 1861 | * 2) the number of bits in the swap pte as defined by the |
1899 | * the different architectures, and 3) the number of free bits | 1862 | * different architectures. In order to find the |
1900 | * in an exceptional radix_tree entry. In order to find the | ||
1901 | * largest possible bit mask, a swap entry with swap type 0 | 1863 | * largest possible bit mask, a swap entry with swap type 0 |
1902 | * and swap offset ~0UL is created, encoded to a swap pte, | 1864 | * and swap offset ~0UL is created, encoded to a swap pte, |
1903 | * decoded to a swp_entry_t again, and finally the swap | 1865 | * decoded to a swp_entry_t again, and finally the swap |
1904 | * offset is extracted. This will mask all the bits from | 1866 | * offset is extracted. This will mask all the bits from |
1905 | * the initial ~0UL mask that can't be encoded in either | 1867 | * the initial ~0UL mask that can't be encoded in either |
1906 | * the swp_entry_t or the architecture definition of a | 1868 | * the swp_entry_t or the architecture definition of a |
1907 | * swap pte. Then the same is done for a radix_tree entry. | 1869 | * swap pte. |
1908 | */ | 1870 | */ |
1909 | maxpages = swp_offset(pte_to_swp_entry( | 1871 | maxpages = swp_offset(pte_to_swp_entry( |
1910 | swp_entry_to_pte(swp_entry(0, ~0UL)))); | 1872 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1911 | maxpages = swp_offset(radix_to_swp_entry( | ||
1912 | swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; | ||
1913 | |||
1914 | if (maxpages > swap_header->info.last_page) { | 1873 | if (maxpages > swap_header->info.last_page) { |
1915 | maxpages = swap_header->info.last_page + 1; | 1874 | maxpages = swap_header->info.last_page + 1; |
1916 | /* p->max is an unsigned int: don't overflow it */ | 1875 | /* p->max is an unsigned int: don't overflow it */ |
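With the radix-tree clamp gone, the hunk above sizes a swap device purely by round-tripping an all-ones offset through the architecture's swap pte encoding. The stand-alone sketch below models that trick with an invented 50-bit offset field (the real width is per-architecture, so treat the numbers as assumptions): whatever bits of ~0UL survive the encode/decode are exactly the bits a swap offset may use, and adding one turns the largest offset into a page count.

/* User-space model of the maxpages round trip in read_swap_header().
 * TOY_SWP_OFFSET_BITS is an assumed width; real kernels derive it from
 * the arch's swap pte layout.  Assumes a 64-bit unsigned long. */
#include <stdio.h>

#define TOY_SWP_OFFSET_BITS 50

/* Pack (type, offset) the way an arch swap pte might: only
 * TOY_SWP_OFFSET_BITS of the offset fit, so the round trip masks the
 * excess bits off - the same effect pte_to_swp_entry(swp_entry_to_pte())
 * has in read_swap_header(). */
static unsigned long toy_swp_entry_to_pte(unsigned long type, unsigned long offset)
{
	return (type << TOY_SWP_OFFSET_BITS) |
	       (offset & ((1UL << TOY_SWP_OFFSET_BITS) - 1));
}

static unsigned long toy_pte_to_swp_offset(unsigned long pte)
{
	return pte & ((1UL << TOY_SWP_OFFSET_BITS) - 1);
}

int main(void)
{
	/* push an all-ones offset through the encoding and keep what survives */
	unsigned long maxpages =
		toy_pte_to_swp_offset(toy_swp_entry_to_pte(0, ~0UL)) + 1;

	printf("largest representable offset + 1 = %lu pages\n", maxpages);
	return 0;
}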
@@ -1988,6 +1947,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1988 | sector_t span; | 1947 | sector_t span; |
1989 | unsigned long maxpages; | 1948 | unsigned long maxpages; |
1990 | unsigned char *swap_map = NULL; | 1949 | unsigned char *swap_map = NULL; |
1950 | unsigned long *frontswap_map = NULL; | ||
1991 | struct page *page = NULL; | 1951 | struct page *page = NULL; |
1992 | struct inode *inode = NULL; | 1952 | struct inode *inode = NULL; |
1993 | 1953 | ||
@@ -2071,6 +2031,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2071 | error = nr_extents; | 2031 | error = nr_extents; |
2072 | goto bad_swap; | 2032 | goto bad_swap; |
2073 | } | 2033 | } |
2034 | /* frontswap enabled? set up bit-per-page map for frontswap */ | ||
2035 | if (frontswap_enabled) | ||
2036 | frontswap_map = vzalloc(maxpages / sizeof(long)); | ||
2074 | 2037 | ||
2075 | if (p->bdev) { | 2038 | if (p->bdev) { |
2076 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2039 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
@@ -2086,14 +2049,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2086 | if (swap_flags & SWAP_FLAG_PREFER) | 2049 | if (swap_flags & SWAP_FLAG_PREFER) |
2087 | prio = | 2050 | prio = |
2088 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 2051 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
2089 | enable_swap_info(p, prio, swap_map); | 2052 | enable_swap_info(p, prio, swap_map, frontswap_map); |
2090 | 2053 | ||
2091 | printk(KERN_INFO "Adding %uk swap on %s. " | 2054 | printk(KERN_INFO "Adding %uk swap on %s. " |
2092 | "Priority:%d extents:%d across:%lluk %s%s\n", | 2055 | "Priority:%d extents:%d across:%lluk %s%s%s\n", |
2093 | p->pages<<(PAGE_SHIFT-10), name, p->prio, | 2056 | p->pages<<(PAGE_SHIFT-10), name, p->prio, |
2094 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2057 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
2095 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2058 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
2096 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | 2059 | (p->flags & SWP_DISCARDABLE) ? "D" : "", |
2060 | (frontswap_map) ? "FS" : ""); | ||
2097 | 2061 | ||
2098 | mutex_unlock(&swapon_mutex); | 2062 | mutex_unlock(&swapon_mutex); |
2099 | atomic_inc(&proc_poll_event); | 2063 | atomic_inc(&proc_poll_event); |
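On the swapon side, the only frontswap-specific work visible here is allocating a zeroed bit-per-page map when frontswap is enabled and handing it to enable_swap_info(), which in turn calls frontswap_init() for the new area. The user-space sketch below illustrates the bookkeeping such a map provides, one bit per swap offset; the map_* helper names are invented for the sketch, and in-kernel code would use the standard set_bit()/test_bit() bitmap operations on the vzalloc'd map instead.

/* User-space sketch of a bit-per-page map: one bit per swap offset,
 * set while that page is held outside the swap device. */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static unsigned long *map_alloc(unsigned long maxpages)
{
	unsigned long nlongs = (maxpages + BITS_PER_LONG - 1) / BITS_PER_LONG;
	return calloc(nlongs, sizeof(unsigned long));	/* zeroed, like vzalloc() */
}

static void map_set(unsigned long *map, unsigned long offset)
{
	map[offset / BITS_PER_LONG] |= 1UL << (offset % BITS_PER_LONG);
}

static int map_test(const unsigned long *map, unsigned long offset)
{
	return (map[offset / BITS_PER_LONG] >> (offset % BITS_PER_LONG)) & 1;
}

int main(void)
{
	unsigned long maxpages = 1 << 20;	/* pretend 4GB of swap with 4K pages */
	unsigned long *frontswap_map = map_alloc(maxpages);

	if (!frontswap_map)
		return 1;
	map_set(frontswap_map, 42);	/* offset 42 is now cached elsewhere */
	printf("offset 42 cached? %d\n", map_test(frontswap_map, 42));
	printf("offset 43 cached? %d\n", map_test(frontswap_map, 43));
	free(frontswap_map);
	return 0;
}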
@@ -2261,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry) | |||
2261 | return __swap_duplicate(entry, SWAP_HAS_CACHE); | 2225 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
2262 | } | 2226 | } |
2263 | 2227 | ||
2228 | struct swap_info_struct *page_swap_info(struct page *page) | ||
2229 | { | ||
2230 | swp_entry_t swap = { .val = page_private(page) }; | ||
2231 | BUG_ON(!PageSwapCache(page)); | ||
2232 | return swap_info[swp_type(swap)]; | ||
2233 | } | ||
2234 | |||
2235 | /* | ||
2236 | * out-of-line __page_file_ methods to avoid include hell. | ||
2237 | */ | ||
2238 | struct address_space *__page_file_mapping(struct page *page) | ||
2239 | { | ||
2240 | VM_BUG_ON(!PageSwapCache(page)); | ||
2241 | return page_swap_info(page)->swap_file->f_mapping; | ||
2242 | } | ||
2243 | EXPORT_SYMBOL_GPL(__page_file_mapping); | ||
2244 | |||
2245 | pgoff_t __page_file_index(struct page *page) | ||
2246 | { | ||
2247 | swp_entry_t swap = { .val = page_private(page) }; | ||
2248 | VM_BUG_ON(!PageSwapCache(page)); | ||
2249 | return swp_offset(swap); | ||
2250 | } | ||
2251 | EXPORT_SYMBOL_GPL(__page_file_index); | ||
2252 | |||
2264 | /* | 2253 | /* |
2265 | * add_swap_count_continuation - called when a swap count is duplicated | 2254 | * add_swap_count_continuation - called when a swap count is duplicated |
2266 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | 2255 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's |
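The new page_swap_info() and __page_file_* helpers give swap I/O paths a way to find the backing swap file and the page's offset in it straight from the swap-cache entry stored in page_private(), without dragging swapfile internals into more headers. The stand-alone sketch below models that lookup; the bit split, the structure layouts, and all toy_* names are assumptions made for illustration only.

/* User-space sketch of what page_swap_info()/__page_file_index() recover:
 * a swap-cache page stores a packed swp_entry in its private field; the
 * type part selects the swap area, the offset part is the page's index
 * within that area. */
#include <stdio.h>

#define TOY_SWP_TYPE_SHIFT 58	/* assumed: top bits hold the swap type */

struct toy_swap_info { const char *swap_file; };
struct toy_page      { unsigned long private; };	/* packed swp_entry */

static struct toy_swap_info toy_swap_info_tbl[] = {
	{ "/dev/sda2" },
	{ "/swapfile" },
};

static unsigned long toy_swp_type(unsigned long val)
{
	return val >> TOY_SWP_TYPE_SHIFT;
}

static unsigned long toy_swp_offset(unsigned long val)
{
	return val & ((1UL << TOY_SWP_TYPE_SHIFT) - 1);
}

static struct toy_swap_info *toy_page_swap_info(struct toy_page *page)
{
	return &toy_swap_info_tbl[toy_swp_type(page->private)];
}

static unsigned long toy_page_file_index(struct toy_page *page)
{
	return toy_swp_offset(page->private);
}

int main(void)
{
	/* a page swapped to entry (type = 1, offset = 123) */
	struct toy_page page = { .private = (1UL << TOY_SWP_TYPE_SHIFT) | 123 };

	printf("backed by %s at offset %lu\n",
	       toy_page_swap_info(&page)->swap_file,
	       toy_page_file_index(&page));
	return 0;
}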
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2aad49981b57..2bb90b1d241c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -413,11 +413,11 @@ nocache: | |||
413 | if (addr + size - 1 < addr) | 413 | if (addr + size - 1 < addr) |
414 | goto overflow; | 414 | goto overflow; |
415 | 415 | ||
416 | n = rb_next(&first->rb_node); | 416 | if (list_is_last(&first->list, &vmap_area_list)) |
417 | if (n) | ||
418 | first = rb_entry(n, struct vmap_area, rb_node); | ||
419 | else | ||
420 | goto found; | 417 | goto found; |
418 | |||
419 | first = list_entry(first->list.next, | ||
420 | struct vmap_area, list); | ||
421 | } | 421 | } |
422 | 422 | ||
423 | found: | 423 | found: |
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
904 | 904 | ||
905 | BUG_ON(size & ~PAGE_MASK); | 905 | BUG_ON(size & ~PAGE_MASK); |
906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
907 | if (WARN_ON(size == 0)) { | ||
908 | /* | ||
909 | * Allocating 0 bytes isn't what caller wants since | ||
910 | * get_order(0) returns funny result. Just warn and terminate | ||
911 | * early. | ||
912 | */ | ||
913 | return NULL; | ||
914 | } | ||
907 | order = get_order(size); | 915 | order = get_order(size); |
908 | 916 | ||
909 | again: | 917 | again: |
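The zero-size check added to vb_alloc() exists because get_order() is only meaningful for a non-zero size. The user-space approximation below (loosely modelled on the generic get_order(); the exact kernel implementation is architecture and version dependent) shows the failure mode: the internal size-- underflows, so the computed order is enormous rather than zero.

/* Approximation of get_order() to show why size == 0 must be rejected. */
#include <stdio.h>

#define TOY_PAGE_SHIFT 12	/* assume 4 KiB pages */

static int toy_get_order(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> TOY_PAGE_SHIFT;	/* underflows when size == 0 */
	while (size) {
		size >>= 1;
		order++;
	}
	return order;
}

int main(void)
{
	printf("get_order(4096)  = %d\n", toy_get_order(4096));		/* 0 */
	printf("get_order(16384) = %d\n", toy_get_order(16384));	/* 2 */
	printf("get_order(0)     = %d\n", toy_get_order(0));		/* huge */
	return 0;
}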
@@ -1280,7 +1288,7 @@ DEFINE_RWLOCK(vmlist_lock); | |||
1280 | struct vm_struct *vmlist; | 1288 | struct vm_struct *vmlist; |
1281 | 1289 | ||
1282 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1290 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1283 | unsigned long flags, void *caller) | 1291 | unsigned long flags, const void *caller) |
1284 | { | 1292 | { |
1285 | vm->flags = flags; | 1293 | vm->flags = flags; |
1286 | vm->addr = (void *)va->va_start; | 1294 | vm->addr = (void *)va->va_start; |
@@ -1306,7 +1314,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm) | |||
1306 | } | 1314 | } |
1307 | 1315 | ||
1308 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1316 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1309 | unsigned long flags, void *caller) | 1317 | unsigned long flags, const void *caller) |
1310 | { | 1318 | { |
1311 | setup_vmalloc_vm(vm, va, flags, caller); | 1319 | setup_vmalloc_vm(vm, va, flags, caller); |
1312 | insert_vmalloc_vmlist(vm); | 1320 | insert_vmalloc_vmlist(vm); |
@@ -1314,7 +1322,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1314 | 1322 | ||
1315 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1323 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1316 | unsigned long align, unsigned long flags, unsigned long start, | 1324 | unsigned long align, unsigned long flags, unsigned long start, |
1317 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 1325 | unsigned long end, int node, gfp_t gfp_mask, const void *caller) |
1318 | { | 1326 | { |
1319 | struct vmap_area *va; | 1327 | struct vmap_area *va; |
1320 | struct vm_struct *area; | 1328 | struct vm_struct *area; |
@@ -1375,7 +1383,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area); | |||
1375 | 1383 | ||
1376 | struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | 1384 | struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, |
1377 | unsigned long start, unsigned long end, | 1385 | unsigned long start, unsigned long end, |
1378 | void *caller) | 1386 | const void *caller) |
1379 | { | 1387 | { |
1380 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, | 1388 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, |
1381 | caller); | 1389 | caller); |
@@ -1397,13 +1405,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | |||
1397 | } | 1405 | } |
1398 | 1406 | ||
1399 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | 1407 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
1400 | void *caller) | 1408 | const void *caller) |
1401 | { | 1409 | { |
1402 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | 1410 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1403 | -1, GFP_KERNEL, caller); | 1411 | -1, GFP_KERNEL, caller); |
1404 | } | 1412 | } |
1405 | 1413 | ||
1406 | static struct vm_struct *find_vm_area(const void *addr) | 1414 | /** |
1415 | * find_vm_area - find a continuous kernel virtual area | ||
1416 | * @addr: base address | ||
1417 | * | ||
1418 | * Search for the kernel VM area starting at @addr, and return it. | ||
1419 | * It is up to the caller to do all required locking to keep the returned | ||
1420 | * pointer valid. | ||
1421 | */ | ||
1422 | struct vm_struct *find_vm_area(const void *addr) | ||
1407 | { | 1423 | { |
1408 | struct vmap_area *va; | 1424 | struct vmap_area *va; |
1409 | 1425 | ||
@@ -1568,9 +1584,9 @@ EXPORT_SYMBOL(vmap); | |||
1568 | 1584 | ||
1569 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1585 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1570 | gfp_t gfp_mask, pgprot_t prot, | 1586 | gfp_t gfp_mask, pgprot_t prot, |
1571 | int node, void *caller); | 1587 | int node, const void *caller); |
1572 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1588 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1573 | pgprot_t prot, int node, void *caller) | 1589 | pgprot_t prot, int node, const void *caller) |
1574 | { | 1590 | { |
1575 | const int order = 0; | 1591 | const int order = 0; |
1576 | struct page **pages; | 1592 | struct page **pages; |
@@ -1643,7 +1659,7 @@ fail: | |||
1643 | */ | 1659 | */ |
1644 | void *__vmalloc_node_range(unsigned long size, unsigned long align, | 1660 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
1645 | unsigned long start, unsigned long end, gfp_t gfp_mask, | 1661 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
1646 | pgprot_t prot, int node, void *caller) | 1662 | pgprot_t prot, int node, const void *caller) |
1647 | { | 1663 | { |
1648 | struct vm_struct *area; | 1664 | struct vm_struct *area; |
1649 | void *addr; | 1665 | void *addr; |
@@ -1699,7 +1715,7 @@ fail: | |||
1699 | */ | 1715 | */ |
1700 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1716 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1701 | gfp_t gfp_mask, pgprot_t prot, | 1717 | gfp_t gfp_mask, pgprot_t prot, |
1702 | int node, void *caller) | 1718 | int node, const void *caller) |
1703 | { | 1719 | { |
1704 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, | 1720 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, |
1705 | gfp_mask, prot, node, caller); | 1721 | gfp_mask, prot, node, caller); |
@@ -1975,9 +1991,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) | |||
1975 | * IOREMAP area is treated as memory hole and no copy is done. | 1991 | * IOREMAP area is treated as memory hole and no copy is done. |
1976 | * | 1992 | * |
1977 | * If [addr...addr+count) doesn't includes any intersects with alive | 1993 | * If [addr...addr+count) doesn't includes any intersects with alive |
1978 | * vm_struct area, returns 0. | 1994 | * vm_struct area, returns 0. @buf should be kernel's buffer. |
1979 | * @buf should be kernel's buffer. Because this function uses KM_USER0, | ||
1980 | * the caller should guarantee KM_USER0 is not used. | ||
1981 | * | 1995 | * |
1982 | * Note: In usual ops, vread() is never necessary because the caller | 1996 | * Note: In usual ops, vread() is never necessary because the caller |
1983 | * should know vmalloc() area is valid and can use memcpy(). | 1997 | * should know vmalloc() area is valid and can use memcpy(). |
@@ -2051,9 +2065,7 @@ finished: | |||
2051 | * IOREMAP area is treated as memory hole and no copy is done. | 2065 | * IOREMAP area is treated as memory hole and no copy is done. |
2052 | * | 2066 | * |
2053 | * If [addr...addr+count) doesn't includes any intersects with alive | 2067 | * If [addr...addr+count) doesn't includes any intersects with alive |
2054 | * vm_struct area, returns 0. | 2068 | * vm_struct area, returns 0. @buf should be kernel's buffer. |
2055 | * @buf should be kernel's buffer. Because this function uses KM_USER0, | ||
2056 | * the caller should guarantee KM_USER0 is not used. | ||
2057 | * | 2069 | * |
2058 | * Note: In usual ops, vwrite() is never necessary because the caller | 2070 | * Note: In usual ops, vwrite() is never necessary because the caller |
2059 | * should know vmalloc() area is valid and can use memcpy(). | 2071 | * should know vmalloc() area is valid and can use memcpy(). |
diff --git a/mm/vmscan.c b/mm/vmscan.c index eeb3bc9d1d36..8d01243d9560 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */ | |||
133 | static LIST_HEAD(shrinker_list); | 133 | static LIST_HEAD(shrinker_list); |
134 | static DECLARE_RWSEM(shrinker_rwsem); | 134 | static DECLARE_RWSEM(shrinker_rwsem); |
135 | 135 | ||
136 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 136 | #ifdef CONFIG_MEMCG |
137 | static bool global_reclaim(struct scan_control *sc) | 137 | static bool global_reclaim(struct scan_control *sc) |
138 | { | 138 | { |
139 | return !sc->target_mem_cgroup; | 139 | return !sc->target_mem_cgroup; |
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
687 | 687 | ||
688 | cond_resched(); | 688 | cond_resched(); |
689 | 689 | ||
690 | mem_cgroup_uncharge_start(); | ||
690 | while (!list_empty(page_list)) { | 691 | while (!list_empty(page_list)) { |
691 | enum page_references references; | 692 | enum page_references references; |
692 | struct address_space *mapping; | 693 | struct address_space *mapping; |
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
720 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 721 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
721 | 722 | ||
722 | if (PageWriteback(page)) { | 723 | if (PageWriteback(page)) { |
723 | nr_writeback++; | 724 | /* |
724 | unlock_page(page); | 725 | * memcg doesn't have any dirty pages throttling so we |
725 | goto keep; | 726 | * could easily OOM just because too many pages are in |
727 | * writeback and there is nothing else to reclaim. | ||
728 | * | ||
729 | * Check __GFP_IO, certainly because a loop driver | ||
730 | * thread might enter reclaim, and deadlock if it waits | ||
731 | * on a page for which it is needed to do the write | ||
732 | * (loop masks off __GFP_IO|__GFP_FS for this reason); | ||
733 | * but more thought would probably show more reasons. | ||
734 | * | ||
735 | * Don't require __GFP_FS, since we're not going into | ||
736 | * the FS, just waiting on its writeback completion. | ||
737 | * Worryingly, ext4, gfs2 and xfs allocate pages with | ||
738 | * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so | ||
739 | * testing may_enter_fs here is liable to OOM on them. | ||
740 | */ | ||
741 | if (global_reclaim(sc) || | ||
742 | !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { | ||
743 | /* | ||
744 | * This is slightly racy - end_page_writeback() | ||
745 | * might have just cleared PageReclaim, then | ||
746 | * setting PageReclaim here ends up interpreted | ||
747 | * as PageReadahead - but that does not matter | ||
748 | * enough to care. What we do want is for this | ||
749 | * page to have PageReclaim set next time memcg | ||
750 | * reclaim reaches the tests above, so it will | ||
751 | * then wait_on_page_writeback() to avoid OOM; | ||
752 | * and it's also appropriate in global reclaim. | ||
753 | */ | ||
754 | SetPageReclaim(page); | ||
755 | nr_writeback++; | ||
756 | goto keep_locked; | ||
757 | } | ||
758 | wait_on_page_writeback(page); | ||
726 | } | 759 | } |
727 | 760 | ||
728 | references = page_check_references(page, sc); | 761 | references = page_check_references(page, sc); |
@@ -921,6 +954,7 @@ keep: | |||
921 | 954 | ||
922 | list_splice(&ret_pages, page_list); | 955 | list_splice(&ret_pages, page_list); |
923 | count_vm_events(PGACTIVATE, pgactivate); | 956 | count_vm_events(PGACTIVATE, pgactivate); |
957 | mem_cgroup_uncharge_end(); | ||
924 | *ret_nr_dirty += nr_dirty; | 958 | *ret_nr_dirty += nr_dirty; |
925 | *ret_nr_writeback += nr_writeback; | 959 | *ret_nr_writeback += nr_writeback; |
926 | return nr_reclaimed; | 960 | return nr_reclaimed; |
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc) | |||
1567 | * by looking at the fraction of the pages scanned we did rotate back | 1601 | * by looking at the fraction of the pages scanned we did rotate back |
1568 | * onto the active list instead of evict. | 1602 | * onto the active list instead of evict. |
1569 | * | 1603 | * |
1570 | * nr[0] = anon pages to scan; nr[1] = file pages to scan | 1604 | * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan |
1605 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan | ||
1571 | */ | 1606 | */ |
1572 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | 1607 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, |
1573 | unsigned long *nr) | 1608 | unsigned long *nr) |
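The corrected comment reflects that get_scan_count() fills one slot per LRU list rather than a two-entry anon/file split. The short sketch below shows the index-to-list correspondence; the enum is reproduced from memory of that era's include/linux/mmzone.h, so treat it as illustrative rather than authoritative.

/* Sketch: get_scan_count()'s nr[] is indexed by enum lru_list. */
#include <stdio.h>

enum lru_list {
	LRU_INACTIVE_ANON,	/* nr[0] */
	LRU_ACTIVE_ANON,	/* nr[1] */
	LRU_INACTIVE_FILE,	/* nr[2] */
	LRU_ACTIVE_FILE,	/* nr[3] */
	LRU_UNEVICTABLE,	/* never filled in by get_scan_count() */
	NR_LRU_LISTS
};

int main(void)
{
	unsigned long nr[NR_LRU_LISTS] = { 0 };

	nr[LRU_INACTIVE_ANON] = 32;	/* anon inactive pages to scan */
	nr[LRU_ACTIVE_FILE]   = 128;	/* file active pages to scan */
	printf("nr[%d] = %lu, nr[%d] = %lu\n",
	       LRU_INACTIVE_ANON, nr[LRU_INACTIVE_ANON],
	       LRU_ACTIVE_FILE, nr[LRU_ACTIVE_FILE]);
	return 0;
}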
@@ -2111,6 +2146,83 @@ out: | |||
2111 | return 0; | 2146 | return 0; |
2112 | } | 2147 | } |
2113 | 2148 | ||
2149 | static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | ||
2150 | { | ||
2151 | struct zone *zone; | ||
2152 | unsigned long pfmemalloc_reserve = 0; | ||
2153 | unsigned long free_pages = 0; | ||
2154 | int i; | ||
2155 | bool wmark_ok; | ||
2156 | |||
2157 | for (i = 0; i <= ZONE_NORMAL; i++) { | ||
2158 | zone = &pgdat->node_zones[i]; | ||
2159 | pfmemalloc_reserve += min_wmark_pages(zone); | ||
2160 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | ||
2161 | } | ||
2162 | |||
2163 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | ||
2164 | |||
2165 | /* kswapd must be awake if processes are being throttled */ | ||
2166 | if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { | ||
2167 | pgdat->classzone_idx = min(pgdat->classzone_idx, | ||
2168 | (enum zone_type)ZONE_NORMAL); | ||
2169 | wake_up_interruptible(&pgdat->kswapd_wait); | ||
2170 | } | ||
2171 | |||
2172 | return wmark_ok; | ||
2173 | } | ||
2174 | |||
2175 | /* | ||
2176 | * Throttle direct reclaimers if backing storage is backed by the network | ||
2177 | * and the PFMEMALLOC reserve for the preferred node is getting dangerously | ||
2178 | * depleted. kswapd will continue to make progress and wake the processes | ||
2179 | * when the low watermark is reached | ||
2180 | */ | ||
2181 | static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | ||
2182 | nodemask_t *nodemask) | ||
2183 | { | ||
2184 | struct zone *zone; | ||
2185 | int high_zoneidx = gfp_zone(gfp_mask); | ||
2186 | pg_data_t *pgdat; | ||
2187 | |||
2188 | /* | ||
2189 | * Kernel threads should not be throttled as they may be indirectly | ||
2190 | * responsible for cleaning pages necessary for reclaim to make forward | ||
2191 | * progress. kjournald for example may enter direct reclaim while | ||
2192 | * committing a transaction where throttling it could force other | ||
2193 | * processes to block on log_wait_commit(). | ||
2194 | */ | ||
2195 | if (current->flags & PF_KTHREAD) | ||
2196 | return; | ||
2197 | |||
2198 | /* Check if the pfmemalloc reserves are ok */ | ||
2199 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | ||
2200 | pgdat = zone->zone_pgdat; | ||
2201 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2202 | return; | ||
2203 | |||
2204 | /* Account for the throttling */ | ||
2205 | count_vm_event(PGSCAN_DIRECT_THROTTLE); | ||
2206 | |||
2207 | /* | ||
2208 | * If the caller cannot enter the filesystem, it's possible that it | ||
2209 | * is due to the caller holding an FS lock or performing a journal | ||
2210 | * transaction in the case of a filesystem like ext[3|4]. In this case, | ||
2211 | * it is not safe to block on pfmemalloc_wait as kswapd could be | ||
2212 | * blocked waiting on the same lock. Instead, throttle for up to a | ||
2213 | * second before continuing. | ||
2214 | */ | ||
2215 | if (!(gfp_mask & __GFP_FS)) { | ||
2216 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | ||
2217 | pfmemalloc_watermark_ok(pgdat), HZ); | ||
2218 | return; | ||
2219 | } | ||
2220 | |||
2221 | /* Throttle until kswapd wakes the process */ | ||
2222 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | ||
2223 | pfmemalloc_watermark_ok(pgdat)); | ||
2224 | } | ||
2225 | |||
2114 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 2226 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
2115 | gfp_t gfp_mask, nodemask_t *nodemask) | 2227 | gfp_t gfp_mask, nodemask_t *nodemask) |
2116 | { | 2228 | { |
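Two things in the hunk above are worth pulling out: pfmemalloc_watermark_ok() compares the node's free pages against half of the summed min watermarks of the zones up to ZONE_NORMAL, and throttle_direct_reclaim() then chooses between returning immediately, a timed wait, or a killable wait. The user-space model below reproduces only the watermark arithmetic, with made-up zone values, to show how the 50% threshold behaves.

/* User-space model of the pfmemalloc_watermark_ok() arithmetic. */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	unsigned long min_wmark_pages;
	unsigned long nr_free_pages;
};

static bool toy_pfmemalloc_watermark_ok(const struct toy_zone *zones, int nzones)
{
	unsigned long pfmemalloc_reserve = 0, free_pages = 0;
	int i;

	for (i = 0; i < nzones; i++) {	/* i.e. zone indices 0..ZONE_NORMAL */
		pfmemalloc_reserve += zones[i].min_wmark_pages;
		free_pages += zones[i].nr_free_pages;
	}
	return free_pages > pfmemalloc_reserve / 2;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ .min_wmark_pages = 128,  .nr_free_pages = 40 },	/* lowmem-ish */
		{ .min_wmark_pages = 4096, .nr_free_pages = 1500 },	/* normal-ish */
	};

	/* reserve = 4224, half = 2112, free = 1540 -> not ok, prints 0 */
	printf("watermark ok: %d\n", toy_pfmemalloc_watermark_ok(zones, 2));
	return 0;
}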
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2130 | .gfp_mask = sc.gfp_mask, | 2242 | .gfp_mask = sc.gfp_mask, |
2131 | }; | 2243 | }; |
2132 | 2244 | ||
2245 | throttle_direct_reclaim(gfp_mask, zonelist, nodemask); | ||
2246 | |||
2247 | /* | ||
2248 | * Do not enter reclaim if fatal signal is pending. 1 is returned so | ||
2249 | * that the page allocator does not consider triggering OOM | ||
2250 | */ | ||
2251 | if (fatal_signal_pending(current)) | ||
2252 | return 1; | ||
2253 | |||
2133 | trace_mm_vmscan_direct_reclaim_begin(order, | 2254 | trace_mm_vmscan_direct_reclaim_begin(order, |
2134 | sc.may_writepage, | 2255 | sc.may_writepage, |
2135 | gfp_mask); | 2256 | gfp_mask); |
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2141 | return nr_reclaimed; | 2262 | return nr_reclaimed; |
2142 | } | 2263 | } |
2143 | 2264 | ||
2144 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2265 | #ifdef CONFIG_MEMCG |
2145 | 2266 | ||
2146 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | 2267 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, |
2147 | gfp_t gfp_mask, bool noswap, | 2268 | gfp_t gfp_mask, bool noswap, |
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2274 | return balanced_pages >= (present_pages >> 2); | 2395 | return balanced_pages >= (present_pages >> 2); |
2275 | } | 2396 | } |
2276 | 2397 | ||
2277 | /* is kswapd sleeping prematurely? */ | 2398 | /* |
2278 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | 2399 | * Prepare kswapd for sleeping. This verifies that there are no processes |
2400 | * waiting in throttle_direct_reclaim() and that watermarks have been met. | ||
2401 | * | ||
2402 | * Returns true if kswapd is ready to sleep | ||
2403 | */ | ||
2404 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | ||
2279 | int classzone_idx) | 2405 | int classzone_idx) |
2280 | { | 2406 | { |
2281 | int i; | 2407 | int i; |
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2284 | 2410 | ||
2285 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2411 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2286 | if (remaining) | 2412 | if (remaining) |
2287 | return true; | 2413 | return false; |
2414 | |||
2415 | /* | ||
2416 | * There is a potential race between when kswapd checks its watermarks | ||
2417 | * and a process gets throttled. There is also a potential race if | ||
2418 | * processes get throttled, kswapd wakes, a large process exits, thereby | ||
2419 | * balancing the zones and causing kswapd to miss a wakeup. If kswapd | ||
2420 | * is going to sleep, no process should be sleeping on pfmemalloc_wait | ||
2421 | * so wake them now if necessary. If the reserves are depleted again, | ||
2422 | * processes will wake kswapd and get throttled again. | ||
2423 | */ | ||
2424 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) { | ||
2425 | wake_up(&pgdat->pfmemalloc_wait); | ||
2426 | return false; | ||
2427 | } | ||
2288 | 2428 | ||
2289 | /* Check the watermark levels */ | 2429 | /* Check the watermark levels */ |
2290 | for (i = 0; i <= classzone_idx; i++) { | 2430 | for (i = 0; i <= classzone_idx; i++) { |
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2317 | * must be balanced | 2457 | * must be balanced |
2318 | */ | 2458 | */ |
2319 | if (order) | 2459 | if (order) |
2320 | return !pgdat_balanced(pgdat, balanced, classzone_idx); | 2460 | return pgdat_balanced(pgdat, balanced, classzone_idx); |
2321 | else | 2461 | else |
2322 | return !all_zones_ok; | 2462 | return all_zones_ok; |
2323 | } | 2463 | } |
2324 | 2464 | ||
2325 | /* | 2465 | /* |
@@ -2537,7 +2677,7 @@ loop_again: | |||
2537 | * consider it to be no longer congested. It's | 2677 | * consider it to be no longer congested. It's |
2538 | * possible there are dirty pages backed by | 2678 | * possible there are dirty pages backed by |
2539 | * congested BDIs but as pressure is relieved, | 2679 | * congested BDIs but as pressure is relieved, |
2540 | * spectulatively avoid congestion waits | 2680 | * speculatively avoid congestion waits |
2541 | */ | 2681 | */ |
2542 | zone_clear_flag(zone, ZONE_CONGESTED); | 2682 | zone_clear_flag(zone, ZONE_CONGESTED); |
2543 | if (i <= *classzone_idx) | 2683 | if (i <= *classzone_idx) |
@@ -2545,6 +2685,16 @@ loop_again: | |||
2545 | } | 2685 | } |
2546 | 2686 | ||
2547 | } | 2687 | } |
2688 | |||
2689 | /* | ||
2690 | * If the low watermark is met there is no need for processes | ||
2691 | * to be throttled on pfmemalloc_wait as they should now be | ||
2692 | * able to safely make forward progress. Wake them. | ||
2693 | */ | ||
2694 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | ||
2695 | pfmemalloc_watermark_ok(pgdat)) | ||
2696 | wake_up(&pgdat->pfmemalloc_wait); | ||
2697 | |||
2548 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) | 2698 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2549 | break; /* kswapd: all done */ | 2699 | break; /* kswapd: all done */ |
2550 | /* | 2700 | /* |
@@ -2646,7 +2796,7 @@ out: | |||
2646 | } | 2796 | } |
2647 | 2797 | ||
2648 | /* | 2798 | /* |
2649 | * Return the order we were reclaiming at so sleeping_prematurely() | 2799 | * Return the order we were reclaiming at so prepare_kswapd_sleep() |
2650 | * makes a decision on the order we were last reclaiming at. However, | 2800 | * makes a decision on the order we were last reclaiming at. However, |
2651 | * if another caller entered the allocator slow path while kswapd | 2801 | * if another caller entered the allocator slow path while kswapd |
2652 | * was awake, order will remain at the higher level | 2802 | * was awake, order will remain at the higher level |
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2666 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2816 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2667 | 2817 | ||
2668 | /* Try to sleep for a short interval */ | 2818 | /* Try to sleep for a short interval */ |
2669 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2819 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2670 | remaining = schedule_timeout(HZ/10); | 2820 | remaining = schedule_timeout(HZ/10); |
2671 | finish_wait(&pgdat->kswapd_wait, &wait); | 2821 | finish_wait(&pgdat->kswapd_wait, &wait); |
2672 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2822 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2676 | * After a short sleep, check if it was a premature sleep. If not, then | 2826 | * After a short sleep, check if it was a premature sleep. If not, then |
2677 | * go fully to sleep until explicitly woken up. | 2827 | * go fully to sleep until explicitly woken up. |
2678 | */ | 2828 | */ |
2679 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2829 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2680 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2830 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
2681 | 2831 | ||
2682 | /* | 2832 | /* |
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2688 | * them before going back to sleep. | 2838 | * them before going back to sleep. |
2689 | */ | 2839 | */ |
2690 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2840 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
2691 | schedule(); | 2841 | |
2842 | if (!kthread_should_stop()) | ||
2843 | schedule(); | ||
2844 | |||
2692 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | 2845 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); |
2693 | } else { | 2846 | } else { |
2694 | if (remaining) | 2847 | if (remaining) |
@@ -2955,14 +3108,17 @@ int kswapd_run(int nid) | |||
2955 | } | 3108 | } |
2956 | 3109 | ||
2957 | /* | 3110 | /* |
2958 | * Called by memory hotplug when all memory in a node is offlined. | 3111 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
3112 | * hold lock_memory_hotplug(). | ||
2959 | */ | 3113 | */ |
2960 | void kswapd_stop(int nid) | 3114 | void kswapd_stop(int nid) |
2961 | { | 3115 | { |
2962 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | 3116 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; |
2963 | 3117 | ||
2964 | if (kswapd) | 3118 | if (kswapd) { |
2965 | kthread_stop(kswapd); | 3119 | kthread_stop(kswapd); |
3120 | NODE_DATA(nid)->kswapd = NULL; | ||
3121 | } | ||
2966 | } | 3122 | } |
2967 | 3123 | ||
2968 | static int __init kswapd_init(void) | 3124 | static int __init kswapd_init(void) |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1bbbbd9776ad..df7a6748231d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = { | |||
745 | TEXTS_FOR_ZONES("pgsteal_direct") | 745 | TEXTS_FOR_ZONES("pgsteal_direct") |
746 | TEXTS_FOR_ZONES("pgscan_kswapd") | 746 | TEXTS_FOR_ZONES("pgscan_kswapd") |
747 | TEXTS_FOR_ZONES("pgscan_direct") | 747 | TEXTS_FOR_ZONES("pgscan_direct") |
748 | "pgscan_direct_throttle", | ||
748 | 749 | ||
749 | #ifdef CONFIG_NUMA | 750 | #ifdef CONFIG_NUMA |
750 | "zone_reclaim_failed", | 751 | "zone_reclaim_failed", |
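With the matching vmstat_text entry in place, the throttling added above becomes observable from user space as a pgscan_direct_throttle line in /proc/vmstat (assuming the running kernel carries these changes). A minimal reader:

/* Print the pgscan_direct_throttle counter, if the kernel exposes it.
 * Purely illustrative; 'grep pgscan_direct_throttle /proc/vmstat' does
 * the same job. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "pgscan_direct_throttle", 22))
			fputs(line, stdout);
	fclose(f);
	return 0;
}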