diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-15 19:39:15 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-15 19:39:15 -0400 |
| commit | eea3a00264cf243a28e4331566ce67b86059339d (patch) | |
| tree | 487f16389e0dfa32e9caa7604d1274a7dcda8f04 /mm | |
| parent | e7c82412433a8039616c7314533a0a1c025d99bf (diff) | |
| parent | e693d73c20ffdb06840c9378f367bad849ac0d5d (diff) | |
Merge branch 'akpm' (patches from Andrew)
Merge second patchbomb from Andrew Morton:
- the rest of MM
- various misc bits
- add ability to run /sbin/reboot at reboot time
- printk/vsprintf changes
- fiddle with seq_printf() return value
* akpm: (114 commits)
parisc: remove use of seq_printf return value
lru_cache: remove use of seq_printf return value
tracing: remove use of seq_printf return value
cgroup: remove use of seq_printf return value
proc: remove use of seq_printf return value
s390: remove use of seq_printf return value
cris fasttimer: remove use of seq_printf return value
cris: remove use of seq_printf return value
openrisc: remove use of seq_printf return value
ARM: plat-pxa: remove use of seq_printf return value
nios2: cpuinfo: remove use of seq_printf return value
microblaze: mb: remove use of seq_printf return value
ipc: remove use of seq_printf return value
rtc: remove use of seq_printf return value
power: wakeup: remove use of seq_printf return value
x86: mtrr: if: remove use of seq_printf return value
linux/bitmap.h: improve BITMAP_{LAST,FIRST}_WORD_MASK
MAINTAINERS: CREDITS: remove Stefano Brivio from B43
.mailmap: add Ricardo Ribalda
CREDITS: add Ricardo Ribalda Delgado
...
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/cma.c | 5 | ||||
| -rw-r--r-- | mm/cma_debug.c | 41 | ||||
| -rw-r--r-- | mm/compaction.c | 60 | ||||
| -rw-r--r-- | mm/gup.c | 4 | ||||
| -rw-r--r-- | mm/huge_memory.c | 86 | ||||
| -rw-r--r-- | mm/hugetlb.c | 234 | ||||
| -rw-r--r-- | mm/internal.h | 4 | ||||
| -rw-r--r-- | mm/kasan/kasan.c | 13 | ||||
| -rw-r--r-- | mm/ksm.c | 10 | ||||
| -rw-r--r-- | mm/memblock.c | 18 | ||||
| -rw-r--r-- | mm/memcontrol.c | 47 | ||||
| -rw-r--r-- | mm/memory-failure.c | 122 | ||||
| -rw-r--r-- | mm/memory.c | 56 | ||||
| -rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
| -rw-r--r-- | mm/mempool.c | 117 | ||||
| -rw-r--r-- | mm/migrate.c | 3 | ||||
| -rw-r--r-- | mm/mmap.c | 21 | ||||
| -rw-r--r-- | mm/mremap.c | 25 | ||||
| -rw-r--r-- | mm/oom_kill.c | 2 | ||||
| -rw-r--r-- | mm/page-writeback.c | 3 | ||||
| -rw-r--r-- | mm/page_alloc.c | 6 | ||||
| -rw-r--r-- | mm/rmap.c | 6 | ||||
| -rw-r--r-- | mm/slub.c | 4 | ||||
| -rw-r--r-- | mm/swap.c | 34 | ||||
| -rw-r--r-- | mm/swap_state.c | 2 | ||||
| -rw-r--r-- | mm/swapfile.c | 2 | ||||
| -rw-r--r-- | mm/truncate.c | 2 | ||||
| -rw-r--r-- | mm/util.c | 41 | ||||
| -rw-r--r-- | mm/vmalloc.c | 95 | ||||
| -rw-r--r-- | mm/zsmalloc.c | 971 |
30 files changed, 1453 insertions, 583 deletions
| @@ -23,6 +23,7 @@ | |||
| 23 | # define DEBUG | 23 | # define DEBUG |
| 24 | #endif | 24 | #endif |
| 25 | #endif | 25 | #endif |
| 26 | #define CREATE_TRACE_POINTS | ||
| 26 | 27 | ||
| 27 | #include <linux/memblock.h> | 28 | #include <linux/memblock.h> |
| 28 | #include <linux/err.h> | 29 | #include <linux/err.h> |
| @@ -34,6 +35,7 @@ | |||
| 34 | #include <linux/cma.h> | 35 | #include <linux/cma.h> |
| 35 | #include <linux/highmem.h> | 36 | #include <linux/highmem.h> |
| 36 | #include <linux/io.h> | 37 | #include <linux/io.h> |
| 38 | #include <trace/events/cma.h> | ||
| 37 | 39 | ||
| 38 | #include "cma.h" | 40 | #include "cma.h" |
| 39 | 41 | ||
| @@ -414,6 +416,8 @@ struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) | |||
| 414 | start = bitmap_no + mask + 1; | 416 | start = bitmap_no + mask + 1; |
| 415 | } | 417 | } |
| 416 | 418 | ||
| 419 | trace_cma_alloc(page ? pfn : -1UL, page, count, align); | ||
| 420 | |||
| 417 | pr_debug("%s(): returned %p\n", __func__, page); | 421 | pr_debug("%s(): returned %p\n", __func__, page); |
| 418 | return page; | 422 | return page; |
| 419 | } | 423 | } |
| @@ -446,6 +450,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) | |||
| 446 | 450 | ||
| 447 | free_contig_range(pfn, count); | 451 | free_contig_range(pfn, count); |
| 448 | cma_clear_bitmap(cma, pfn, count); | 452 | cma_clear_bitmap(cma, pfn, count); |
| 453 | trace_cma_release(pfn, pages, count); | ||
| 449 | 454 | ||
| 450 | return true; | 455 | return true; |
| 451 | } | 456 | } |
diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 0b377536ccde..7621ee34daa0 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c | |||
| @@ -30,9 +30,44 @@ static int cma_debugfs_get(void *data, u64 *val) | |||
| 30 | 30 | ||
| 31 | return 0; | 31 | return 0; |
| 32 | } | 32 | } |
| 33 | |||
| 34 | DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); | 33 | DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); |
| 35 | 34 | ||
| 35 | static int cma_used_get(void *data, u64 *val) | ||
| 36 | { | ||
| 37 | struct cma *cma = data; | ||
| 38 | unsigned long used; | ||
| 39 | |||
| 40 | mutex_lock(&cma->lock); | ||
| 41 | /* pages counter is smaller than sizeof(int) */ | ||
| 42 | used = bitmap_weight(cma->bitmap, (int)cma->count); | ||
| 43 | mutex_unlock(&cma->lock); | ||
| 44 | *val = (u64)used << cma->order_per_bit; | ||
| 45 | |||
| 46 | return 0; | ||
| 47 | } | ||
| 48 | DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); | ||
| 49 | |||
| 50 | static int cma_maxchunk_get(void *data, u64 *val) | ||
| 51 | { | ||
| 52 | struct cma *cma = data; | ||
| 53 | unsigned long maxchunk = 0; | ||
| 54 | unsigned long start, end = 0; | ||
| 55 | |||
| 56 | mutex_lock(&cma->lock); | ||
| 57 | for (;;) { | ||
| 58 | start = find_next_zero_bit(cma->bitmap, cma->count, end); | ||
| 59 | if (start >= cma->count) | ||
| 60 | break; | ||
| 61 | end = find_next_bit(cma->bitmap, cma->count, start); | ||
| 62 | maxchunk = max(end - start, maxchunk); | ||
| 63 | } | ||
| 64 | mutex_unlock(&cma->lock); | ||
| 65 | *val = (u64)maxchunk << cma->order_per_bit; | ||
| 66 | |||
| 67 | return 0; | ||
| 68 | } | ||
| 69 | DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); | ||
| 70 | |||
| 36 | static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) | 71 | static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) |
| 37 | { | 72 | { |
| 38 | spin_lock(&cma->mem_head_lock); | 73 | spin_lock(&cma->mem_head_lock); |
| @@ -91,7 +126,6 @@ static int cma_free_write(void *data, u64 val) | |||
| 91 | 126 | ||
| 92 | return cma_free_mem(cma, pages); | 127 | return cma_free_mem(cma, pages); |
| 93 | } | 128 | } |
| 94 | |||
| 95 | DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); | 129 | DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); |
| 96 | 130 | ||
| 97 | static int cma_alloc_mem(struct cma *cma, int count) | 131 | static int cma_alloc_mem(struct cma *cma, int count) |
| @@ -124,7 +158,6 @@ static int cma_alloc_write(void *data, u64 val) | |||
| 124 | 158 | ||
| 125 | return cma_alloc_mem(cma, pages); | 159 | return cma_alloc_mem(cma, pages); |
| 126 | } | 160 | } |
| 127 | |||
| 128 | DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); | 161 | DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); |
| 129 | 162 | ||
| 130 | static void cma_debugfs_add_one(struct cma *cma, int idx) | 163 | static void cma_debugfs_add_one(struct cma *cma, int idx) |
| @@ -149,6 +182,8 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) | |||
| 149 | &cma->count, &cma_debugfs_fops); | 182 | &cma->count, &cma_debugfs_fops); |
| 150 | debugfs_create_file("order_per_bit", S_IRUGO, tmp, | 183 | debugfs_create_file("order_per_bit", S_IRUGO, tmp, |
| 151 | &cma->order_per_bit, &cma_debugfs_fops); | 184 | &cma->order_per_bit, &cma_debugfs_fops); |
| 185 | debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops); | ||
| 186 | debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops); | ||
| 152 | 187 | ||
| 153 | u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); | 188 | u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); |
| 154 | debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); | 189 | debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); |
diff --git a/mm/compaction.c b/mm/compaction.c index a18201a8124e..018f08da99a2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
| @@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc) | |||
| 391 | return false; | 391 | return false; |
| 392 | } | 392 | } |
| 393 | 393 | ||
| 394 | /* Returns true if the page is within a block suitable for migration to */ | ||
| 395 | static bool suitable_migration_target(struct page *page) | ||
| 396 | { | ||
| 397 | /* If the page is a large free page, then disallow migration */ | ||
| 398 | if (PageBuddy(page)) { | ||
| 399 | /* | ||
| 400 | * We are checking page_order without zone->lock taken. But | ||
| 401 | * the only small danger is that we skip a potentially suitable | ||
| 402 | * pageblock, so it's not worth to check order for valid range. | ||
| 403 | */ | ||
| 404 | if (page_order_unsafe(page) >= pageblock_order) | ||
| 405 | return false; | ||
| 406 | } | ||
| 407 | |||
| 408 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
| 409 | if (migrate_async_suitable(get_pageblock_migratetype(page))) | ||
| 410 | return true; | ||
| 411 | |||
| 412 | /* Otherwise skip the block */ | ||
| 413 | return false; | ||
| 414 | } | ||
| 415 | |||
| 416 | /* | 394 | /* |
| 417 | * Isolate free pages onto a private freelist. If @strict is true, will abort | 395 | * Isolate free pages onto a private freelist. If @strict is true, will abort |
| 418 | * returning 0 on any invalid PFNs or non-free pages inside of the pageblock | 396 | * returning 0 on any invalid PFNs or non-free pages inside of the pageblock |
| @@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, | |||
| 896 | 874 | ||
| 897 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 875 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
| 898 | #ifdef CONFIG_COMPACTION | 876 | #ifdef CONFIG_COMPACTION |
| 877 | |||
| 878 | /* Returns true if the page is within a block suitable for migration to */ | ||
| 879 | static bool suitable_migration_target(struct page *page) | ||
| 880 | { | ||
| 881 | /* If the page is a large free page, then disallow migration */ | ||
| 882 | if (PageBuddy(page)) { | ||
| 883 | /* | ||
| 884 | * We are checking page_order without zone->lock taken. But | ||
| 885 | * the only small danger is that we skip a potentially suitable | ||
| 886 | * pageblock, so it's not worth to check order for valid range. | ||
| 887 | */ | ||
| 888 | if (page_order_unsafe(page) >= pageblock_order) | ||
| 889 | return false; | ||
| 890 | } | ||
| 891 | |||
| 892 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
| 893 | if (migrate_async_suitable(get_pageblock_migratetype(page))) | ||
| 894 | return true; | ||
| 895 | |||
| 896 | /* Otherwise skip the block */ | ||
| 897 | return false; | ||
| 898 | } | ||
| 899 | |||
| 899 | /* | 900 | /* |
| 900 | * Based on information in the current compact_control, find blocks | 901 | * Based on information in the current compact_control, find blocks |
| 901 | * suitable for isolating free pages from and then isolate them. | 902 | * suitable for isolating free pages from and then isolate them. |
| @@ -1047,6 +1048,12 @@ typedef enum { | |||
| 1047 | } isolate_migrate_t; | 1048 | } isolate_migrate_t; |
| 1048 | 1049 | ||
| 1049 | /* | 1050 | /* |
| 1051 | * Allow userspace to control policy on scanning the unevictable LRU for | ||
| 1052 | * compactable pages. | ||
| 1053 | */ | ||
| 1054 | int sysctl_compact_unevictable_allowed __read_mostly = 1; | ||
| 1055 | |||
| 1056 | /* | ||
| 1050 | * Isolate all pages that can be migrated from the first suitable block, | 1057 | * Isolate all pages that can be migrated from the first suitable block, |
| 1051 | * starting at the block pointed to by the migrate scanner pfn within | 1058 | * starting at the block pointed to by the migrate scanner pfn within |
| 1052 | * compact_control. | 1059 | * compact_control. |
| @@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
| 1057 | unsigned long low_pfn, end_pfn; | 1064 | unsigned long low_pfn, end_pfn; |
| 1058 | struct page *page; | 1065 | struct page *page; |
| 1059 | const isolate_mode_t isolate_mode = | 1066 | const isolate_mode_t isolate_mode = |
| 1067 | (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | | ||
| 1060 | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); | 1068 | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); |
| 1061 | 1069 | ||
| 1062 | /* | 1070 | /* |
| @@ -1598,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
| 1598 | INIT_LIST_HEAD(&cc->freepages); | 1606 | INIT_LIST_HEAD(&cc->freepages); |
| 1599 | INIT_LIST_HEAD(&cc->migratepages); | 1607 | INIT_LIST_HEAD(&cc->migratepages); |
| 1600 | 1608 | ||
| 1609 | /* | ||
| 1610 | * When called via /proc/sys/vm/compact_memory | ||
| 1611 | * this makes sure we compact the whole zone regardless of | ||
| 1612 | * cached scanner positions. | ||
| 1613 | */ | ||
| 1614 | if (cc->order == -1) | ||
| 1615 | __reset_isolation_suitable(zone); | ||
| 1616 | |||
| 1601 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) | 1617 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) |
| 1602 | compact_zone(zone, cc); | 1618 | compact_zone(zone, cc); |
| 1603 | 1619 | ||
| @@ -1019,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
| 1019 | * | 1019 | * |
| 1020 | * for an example see gup_get_pte in arch/x86/mm/gup.c | 1020 | * for an example see gup_get_pte in arch/x86/mm/gup.c |
| 1021 | */ | 1021 | */ |
| 1022 | pte_t pte = ACCESS_ONCE(*ptep); | 1022 | pte_t pte = READ_ONCE(*ptep); |
| 1023 | struct page *page; | 1023 | struct page *page; |
| 1024 | 1024 | ||
| 1025 | /* | 1025 | /* |
| @@ -1309,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
| 1309 | local_irq_save(flags); | 1309 | local_irq_save(flags); |
| 1310 | pgdp = pgd_offset(mm, addr); | 1310 | pgdp = pgd_offset(mm, addr); |
| 1311 | do { | 1311 | do { |
| 1312 | pgd_t pgd = ACCESS_ONCE(*pgdp); | 1312 | pgd_t pgd = READ_ONCE(*pgdp); |
| 1313 | 1313 | ||
| 1314 | next = pgd_addr_end(addr, end); | 1314 | next = pgd_addr_end(addr, end); |
| 1315 | if (pgd_none(pgd)) | 1315 | if (pgd_none(pgd)) |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3afb5cbe1312..078832cf3636 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | |||
| 67 | 67 | ||
| 68 | static int khugepaged(void *none); | 68 | static int khugepaged(void *none); |
| 69 | static int khugepaged_slab_init(void); | 69 | static int khugepaged_slab_init(void); |
| 70 | static void khugepaged_slab_exit(void); | ||
| 70 | 71 | ||
| 71 | #define MM_SLOTS_HASH_BITS 10 | 72 | #define MM_SLOTS_HASH_BITS 10 |
| 72 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | 73 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); |
| @@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void) | |||
| 109 | int nr_zones = 0; | 110 | int nr_zones = 0; |
| 110 | unsigned long recommended_min; | 111 | unsigned long recommended_min; |
| 111 | 112 | ||
| 112 | if (!khugepaged_enabled()) | ||
| 113 | return 0; | ||
| 114 | |||
| 115 | for_each_populated_zone(zone) | 113 | for_each_populated_zone(zone) |
| 116 | nr_zones++; | 114 | nr_zones++; |
| 117 | 115 | ||
| @@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void) | |||
| 143 | setup_per_zone_wmarks(); | 141 | setup_per_zone_wmarks(); |
| 144 | return 0; | 142 | return 0; |
| 145 | } | 143 | } |
| 146 | late_initcall(set_recommended_min_free_kbytes); | ||
| 147 | 144 | ||
| 148 | static int start_khugepaged(void) | 145 | static int start_stop_khugepaged(void) |
| 149 | { | 146 | { |
| 150 | int err = 0; | 147 | int err = 0; |
| 151 | if (khugepaged_enabled()) { | 148 | if (khugepaged_enabled()) { |
| @@ -156,6 +153,7 @@ static int start_khugepaged(void) | |||
| 156 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); | 153 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); |
| 157 | err = PTR_ERR(khugepaged_thread); | 154 | err = PTR_ERR(khugepaged_thread); |
| 158 | khugepaged_thread = NULL; | 155 | khugepaged_thread = NULL; |
| 156 | goto fail; | ||
| 159 | } | 157 | } |
| 160 | 158 | ||
| 161 | if (!list_empty(&khugepaged_scan.mm_head)) | 159 | if (!list_empty(&khugepaged_scan.mm_head)) |
| @@ -166,7 +164,7 @@ static int start_khugepaged(void) | |||
| 166 | kthread_stop(khugepaged_thread); | 164 | kthread_stop(khugepaged_thread); |
| 167 | khugepaged_thread = NULL; | 165 | khugepaged_thread = NULL; |
| 168 | } | 166 | } |
| 169 | 167 | fail: | |
| 170 | return err; | 168 | return err; |
| 171 | } | 169 | } |
| 172 | 170 | ||
| @@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void) | |||
| 183 | struct page *zero_page; | 181 | struct page *zero_page; |
| 184 | retry: | 182 | retry: |
| 185 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) | 183 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) |
| 186 | return ACCESS_ONCE(huge_zero_page); | 184 | return READ_ONCE(huge_zero_page); |
| 187 | 185 | ||
| 188 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | 186 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, |
| 189 | HPAGE_PMD_ORDER); | 187 | HPAGE_PMD_ORDER); |
| @@ -202,7 +200,7 @@ retry: | |||
| 202 | /* We take additional reference here. It will be put back by shrinker */ | 200 | /* We take additional reference here. It will be put back by shrinker */ |
| 203 | atomic_set(&huge_zero_refcount, 2); | 201 | atomic_set(&huge_zero_refcount, 2); |
| 204 | preempt_enable(); | 202 | preempt_enable(); |
| 205 | return ACCESS_ONCE(huge_zero_page); | 203 | return READ_ONCE(huge_zero_page); |
| 206 | } | 204 | } |
| 207 | 205 | ||
| 208 | static void put_huge_zero_page(void) | 206 | static void put_huge_zero_page(void) |
| @@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj, | |||
| 300 | int err; | 298 | int err; |
| 301 | 299 | ||
| 302 | mutex_lock(&khugepaged_mutex); | 300 | mutex_lock(&khugepaged_mutex); |
| 303 | err = start_khugepaged(); | 301 | err = start_stop_khugepaged(); |
| 304 | mutex_unlock(&khugepaged_mutex); | 302 | mutex_unlock(&khugepaged_mutex); |
| 305 | 303 | ||
| 306 | if (err) | 304 | if (err) |
| @@ -634,27 +632,38 @@ static int __init hugepage_init(void) | |||
| 634 | 632 | ||
| 635 | err = hugepage_init_sysfs(&hugepage_kobj); | 633 | err = hugepage_init_sysfs(&hugepage_kobj); |
| 636 | if (err) | 634 | if (err) |
| 637 | return err; | 635 | goto err_sysfs; |
| 638 | 636 | ||
| 639 | err = khugepaged_slab_init(); | 637 | err = khugepaged_slab_init(); |
| 640 | if (err) | 638 | if (err) |
| 641 | goto out; | 639 | goto err_slab; |
| 642 | 640 | ||
| 643 | register_shrinker(&huge_zero_page_shrinker); | 641 | err = register_shrinker(&huge_zero_page_shrinker); |
| 642 | if (err) | ||
| 643 | goto err_hzp_shrinker; | ||
| 644 | 644 | ||
| 645 | /* | 645 | /* |
| 646 | * By default disable transparent hugepages on smaller systems, | 646 | * By default disable transparent hugepages on smaller systems, |
| 647 | * where the extra memory used could hurt more than TLB overhead | 647 | * where the extra memory used could hurt more than TLB overhead |
| 648 | * is likely to save. The admin can still enable it through /sys. | 648 | * is likely to save. The admin can still enable it through /sys. |
| 649 | */ | 649 | */ |
| 650 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) | 650 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { |
| 651 | transparent_hugepage_flags = 0; | 651 | transparent_hugepage_flags = 0; |
| 652 | return 0; | ||
| 653 | } | ||
| 652 | 654 | ||
| 653 | start_khugepaged(); | 655 | err = start_stop_khugepaged(); |
| 656 | if (err) | ||
| 657 | goto err_khugepaged; | ||
| 654 | 658 | ||
| 655 | return 0; | 659 | return 0; |
| 656 | out: | 660 | err_khugepaged: |
| 661 | unregister_shrinker(&huge_zero_page_shrinker); | ||
| 662 | err_hzp_shrinker: | ||
| 663 | khugepaged_slab_exit(); | ||
| 664 | err_slab: | ||
| 657 | hugepage_exit_sysfs(hugepage_kobj); | 665 | hugepage_exit_sysfs(hugepage_kobj); |
| 666 | err_sysfs: | ||
| 658 | return err; | 667 | return err; |
| 659 | } | 668 | } |
| 660 | subsys_initcall(hugepage_init); | 669 | subsys_initcall(hugepage_init); |
| @@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) | |||
| 708 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 717 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, |
| 709 | struct vm_area_struct *vma, | 718 | struct vm_area_struct *vma, |
| 710 | unsigned long haddr, pmd_t *pmd, | 719 | unsigned long haddr, pmd_t *pmd, |
| 711 | struct page *page) | 720 | struct page *page, gfp_t gfp) |
| 712 | { | 721 | { |
| 713 | struct mem_cgroup *memcg; | 722 | struct mem_cgroup *memcg; |
| 714 | pgtable_t pgtable; | 723 | pgtable_t pgtable; |
| @@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
| 716 | 725 | ||
| 717 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 726 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
| 718 | 727 | ||
| 719 | if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) | 728 | if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) |
| 720 | return VM_FAULT_OOM; | 729 | return VM_FAULT_OOM; |
| 721 | 730 | ||
| 722 | pgtable = pte_alloc_one(mm, haddr); | 731 | pgtable = pte_alloc_one(mm, haddr); |
| @@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 822 | count_vm_event(THP_FAULT_FALLBACK); | 831 | count_vm_event(THP_FAULT_FALLBACK); |
| 823 | return VM_FAULT_FALLBACK; | 832 | return VM_FAULT_FALLBACK; |
| 824 | } | 833 | } |
| 825 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { | 834 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { |
| 826 | put_page(page); | 835 | put_page(page); |
| 827 | count_vm_event(THP_FAULT_FALLBACK); | 836 | count_vm_event(THP_FAULT_FALLBACK); |
| 828 | return VM_FAULT_FALLBACK; | 837 | return VM_FAULT_FALLBACK; |
| @@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1080 | unsigned long haddr; | 1089 | unsigned long haddr; |
| 1081 | unsigned long mmun_start; /* For mmu_notifiers */ | 1090 | unsigned long mmun_start; /* For mmu_notifiers */ |
| 1082 | unsigned long mmun_end; /* For mmu_notifiers */ | 1091 | unsigned long mmun_end; /* For mmu_notifiers */ |
| 1092 | gfp_t huge_gfp; /* for allocation and charge */ | ||
| 1083 | 1093 | ||
| 1084 | ptl = pmd_lockptr(mm, pmd); | 1094 | ptl = pmd_lockptr(mm, pmd); |
| 1085 | VM_BUG_ON_VMA(!vma->anon_vma, vma); | 1095 | VM_BUG_ON_VMA(!vma->anon_vma, vma); |
| @@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1106 | alloc: | 1116 | alloc: |
| 1107 | if (transparent_hugepage_enabled(vma) && | 1117 | if (transparent_hugepage_enabled(vma) && |
| 1108 | !transparent_hugepage_debug_cow()) { | 1118 | !transparent_hugepage_debug_cow()) { |
| 1109 | gfp_t gfp; | 1119 | huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
| 1110 | 1120 | new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); | |
| 1111 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); | ||
| 1112 | new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); | ||
| 1113 | } else | 1121 | } else |
| 1114 | new_page = NULL; | 1122 | new_page = NULL; |
| 1115 | 1123 | ||
| @@ -1130,8 +1138,7 @@ alloc: | |||
| 1130 | goto out; | 1138 | goto out; |
| 1131 | } | 1139 | } |
| 1132 | 1140 | ||
| 1133 | if (unlikely(mem_cgroup_try_charge(new_page, mm, | 1141 | if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { |
| 1134 | GFP_TRANSHUGE, &memcg))) { | ||
| 1135 | put_page(new_page); | 1142 | put_page(new_page); |
| 1136 | if (page) { | 1143 | if (page) { |
| 1137 | split_huge_page(page); | 1144 | split_huge_page(page); |
| @@ -1976,6 +1983,11 @@ static int __init khugepaged_slab_init(void) | |||
| 1976 | return 0; | 1983 | return 0; |
| 1977 | } | 1984 | } |
| 1978 | 1985 | ||
| 1986 | static void __init khugepaged_slab_exit(void) | ||
| 1987 | { | ||
| 1988 | kmem_cache_destroy(mm_slot_cache); | ||
| 1989 | } | ||
| 1990 | |||
| 1979 | static inline struct mm_slot *alloc_mm_slot(void) | 1991 | static inline struct mm_slot *alloc_mm_slot(void) |
| 1980 | { | 1992 | { |
| 1981 | if (!mm_slot_cache) /* initialization failed */ | 1993 | if (!mm_slot_cache) /* initialization failed */ |
| @@ -2323,19 +2335,13 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | |||
| 2323 | return true; | 2335 | return true; |
| 2324 | } | 2336 | } |
| 2325 | 2337 | ||
| 2326 | static struct page | 2338 | static struct page * |
| 2327 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | 2339 | khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, |
| 2328 | struct vm_area_struct *vma, unsigned long address, | 2340 | struct vm_area_struct *vma, unsigned long address, |
| 2329 | int node) | 2341 | int node) |
| 2330 | { | 2342 | { |
| 2331 | gfp_t flags; | ||
| 2332 | |||
| 2333 | VM_BUG_ON_PAGE(*hpage, *hpage); | 2343 | VM_BUG_ON_PAGE(*hpage, *hpage); |
| 2334 | 2344 | ||
| 2335 | /* Only allocate from the target node */ | ||
| 2336 | flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | | ||
| 2337 | __GFP_THISNODE; | ||
| 2338 | |||
| 2339 | /* | 2345 | /* |
| 2340 | * Before allocating the hugepage, release the mmap_sem read lock. | 2346 | * Before allocating the hugepage, release the mmap_sem read lock. |
| 2341 | * The allocation can take potentially a long time if it involves | 2347 | * The allocation can take potentially a long time if it involves |
| @@ -2344,7 +2350,7 @@ static struct page | |||
| 2344 | */ | 2350 | */ |
| 2345 | up_read(&mm->mmap_sem); | 2351 | up_read(&mm->mmap_sem); |
| 2346 | 2352 | ||
| 2347 | *hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER); | 2353 | *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); |
| 2348 | if (unlikely(!*hpage)) { | 2354 | if (unlikely(!*hpage)) { |
| 2349 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 2355 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
| 2350 | *hpage = ERR_PTR(-ENOMEM); | 2356 | *hpage = ERR_PTR(-ENOMEM); |
| @@ -2397,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | |||
| 2397 | return true; | 2403 | return true; |
| 2398 | } | 2404 | } |
| 2399 | 2405 | ||
| 2400 | static struct page | 2406 | static struct page * |
| 2401 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | 2407 | khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, |
| 2402 | struct vm_area_struct *vma, unsigned long address, | 2408 | struct vm_area_struct *vma, unsigned long address, |
| 2403 | int node) | 2409 | int node) |
| 2404 | { | 2410 | { |
| 2405 | up_read(&mm->mmap_sem); | 2411 | up_read(&mm->mmap_sem); |
| 2406 | VM_BUG_ON(!*hpage); | 2412 | VM_BUG_ON(!*hpage); |
| 2413 | |||
| 2407 | return *hpage; | 2414 | return *hpage; |
| 2408 | } | 2415 | } |
| 2409 | #endif | 2416 | #endif |
| @@ -2438,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
| 2438 | struct mem_cgroup *memcg; | 2445 | struct mem_cgroup *memcg; |
| 2439 | unsigned long mmun_start; /* For mmu_notifiers */ | 2446 | unsigned long mmun_start; /* For mmu_notifiers */ |
| 2440 | unsigned long mmun_end; /* For mmu_notifiers */ | 2447 | unsigned long mmun_end; /* For mmu_notifiers */ |
| 2448 | gfp_t gfp; | ||
| 2441 | 2449 | ||
| 2442 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2450 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
| 2443 | 2451 | ||
| 2452 | /* Only allocate from the target node */ | ||
| 2453 | gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | | ||
| 2454 | __GFP_THISNODE; | ||
| 2455 | |||
| 2444 | /* release the mmap_sem read lock. */ | 2456 | /* release the mmap_sem read lock. */ |
| 2445 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); | 2457 | new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); |
| 2446 | if (!new_page) | 2458 | if (!new_page) |
| 2447 | return; | 2459 | return; |
| 2448 | 2460 | ||
| 2449 | if (unlikely(mem_cgroup_try_charge(new_page, mm, | 2461 | if (unlikely(mem_cgroup_try_charge(new_page, mm, |
| 2450 | GFP_TRANSHUGE, &memcg))) | 2462 | gfp, &memcg))) |
| 2451 | return; | 2463 | return; |
| 2452 | 2464 | ||
| 2453 | /* | 2465 | /* |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8874c8ad55aa..271e4432734c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock); | |||
| 61 | static int num_fault_mutexes; | 61 | static int num_fault_mutexes; |
| 62 | static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; | 62 | static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; |
| 63 | 63 | ||
| 64 | /* Forward declaration */ | ||
| 65 | static int hugetlb_acct_memory(struct hstate *h, long delta); | ||
| 66 | |||
| 64 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | 67 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) |
| 65 | { | 68 | { |
| 66 | bool free = (spool->count == 0) && (spool->used_hpages == 0); | 69 | bool free = (spool->count == 0) && (spool->used_hpages == 0); |
| @@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | |||
| 68 | spin_unlock(&spool->lock); | 71 | spin_unlock(&spool->lock); |
| 69 | 72 | ||
| 70 | /* If no pages are used, and no other handles to the subpool | 73 | /* If no pages are used, and no other handles to the subpool |
| 71 | * remain, free the subpool the subpool remain */ | 74 | * remain, give up any reservations mased on minimum size and |
| 72 | if (free) | 75 | * free the subpool */ |
| 76 | if (free) { | ||
| 77 | if (spool->min_hpages != -1) | ||
| 78 | hugetlb_acct_memory(spool->hstate, | ||
| 79 | -spool->min_hpages); | ||
| 73 | kfree(spool); | 80 | kfree(spool); |
| 81 | } | ||
| 74 | } | 82 | } |
| 75 | 83 | ||
| 76 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) | 84 | struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, |
| 85 | long min_hpages) | ||
| 77 | { | 86 | { |
| 78 | struct hugepage_subpool *spool; | 87 | struct hugepage_subpool *spool; |
| 79 | 88 | ||
| 80 | spool = kmalloc(sizeof(*spool), GFP_KERNEL); | 89 | spool = kzalloc(sizeof(*spool), GFP_KERNEL); |
| 81 | if (!spool) | 90 | if (!spool) |
| 82 | return NULL; | 91 | return NULL; |
| 83 | 92 | ||
| 84 | spin_lock_init(&spool->lock); | 93 | spin_lock_init(&spool->lock); |
| 85 | spool->count = 1; | 94 | spool->count = 1; |
| 86 | spool->max_hpages = nr_blocks; | 95 | spool->max_hpages = max_hpages; |
| 87 | spool->used_hpages = 0; | 96 | spool->hstate = h; |
| 97 | spool->min_hpages = min_hpages; | ||
| 98 | |||
| 99 | if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { | ||
| 100 | kfree(spool); | ||
| 101 | return NULL; | ||
| 102 | } | ||
| 103 | spool->rsv_hpages = min_hpages; | ||
| 88 | 104 | ||
| 89 | return spool; | 105 | return spool; |
| 90 | } | 106 | } |
| @@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) | |||
| 97 | unlock_or_release_subpool(spool); | 113 | unlock_or_release_subpool(spool); |
| 98 | } | 114 | } |
| 99 | 115 | ||
| 100 | static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, | 116 | /* |
| 117 | * Subpool accounting for allocating and reserving pages. | ||
| 118 | * Return -ENOMEM if there are not enough resources to satisfy the | ||
| 119 | * request. Otherwise, return the number of pages by which the | ||
| 120 | * global pools must be adjusted (upward). The returned value may | ||
| 121 | * only be different than the passed value (delta) in the case where | ||
| 122 | * a subpool minimum size must be maintained. | ||
| 123 | */ | ||
| 124 | static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, | ||
| 101 | long delta) | 125 | long delta) |
| 102 | { | 126 | { |
| 103 | int ret = 0; | 127 | long ret = delta; |
| 104 | 128 | ||
| 105 | if (!spool) | 129 | if (!spool) |
| 106 | return 0; | 130 | return ret; |
| 107 | 131 | ||
| 108 | spin_lock(&spool->lock); | 132 | spin_lock(&spool->lock); |
| 109 | if ((spool->used_hpages + delta) <= spool->max_hpages) { | 133 | |
| 110 | spool->used_hpages += delta; | 134 | if (spool->max_hpages != -1) { /* maximum size accounting */ |
| 111 | } else { | 135 | if ((spool->used_hpages + delta) <= spool->max_hpages) |
| 112 | ret = -ENOMEM; | 136 | spool->used_hpages += delta; |
| 137 | else { | ||
| 138 | ret = -ENOMEM; | ||
| 139 | goto unlock_ret; | ||
| 140 | } | ||
| 141 | } | ||
| 142 | |||
| 143 | if (spool->min_hpages != -1) { /* minimum size accounting */ | ||
| 144 | if (delta > spool->rsv_hpages) { | ||
| 145 | /* | ||
| 146 | * Asking for more reserves than those already taken on | ||
| 147 | * behalf of subpool. Return difference. | ||
| 148 | */ | ||
| 149 | ret = delta - spool->rsv_hpages; | ||
| 150 | spool->rsv_hpages = 0; | ||
| 151 | } else { | ||
| 152 | ret = 0; /* reserves already accounted for */ | ||
| 153 | spool->rsv_hpages -= delta; | ||
| 154 | } | ||
| 113 | } | 155 | } |
| 114 | spin_unlock(&spool->lock); | ||
| 115 | 156 | ||
| 157 | unlock_ret: | ||
| 158 | spin_unlock(&spool->lock); | ||
| 116 | return ret; | 159 | return ret; |
| 117 | } | 160 | } |
| 118 | 161 | ||
| 119 | static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, | 162 | /* |
| 163 | * Subpool accounting for freeing and unreserving pages. | ||
| 164 | * Return the number of global page reservations that must be dropped. | ||
| 165 | * The return value may only be different than the passed value (delta) | ||
| 166 | * in the case where a subpool minimum size must be maintained. | ||
| 167 | */ | ||
| 168 | static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, | ||
| 120 | long delta) | 169 | long delta) |
| 121 | { | 170 | { |
| 171 | long ret = delta; | ||
| 172 | |||
| 122 | if (!spool) | 173 | if (!spool) |
| 123 | return; | 174 | return delta; |
| 124 | 175 | ||
| 125 | spin_lock(&spool->lock); | 176 | spin_lock(&spool->lock); |
| 126 | spool->used_hpages -= delta; | 177 | |
| 127 | /* If hugetlbfs_put_super couldn't free spool due to | 178 | if (spool->max_hpages != -1) /* maximum size accounting */ |
| 128 | * an outstanding quota reference, free it now. */ | 179 | spool->used_hpages -= delta; |
| 180 | |||
| 181 | if (spool->min_hpages != -1) { /* minimum size accounting */ | ||
| 182 | if (spool->rsv_hpages + delta <= spool->min_hpages) | ||
| 183 | ret = 0; | ||
| 184 | else | ||
| 185 | ret = spool->rsv_hpages + delta - spool->min_hpages; | ||
| 186 | |||
| 187 | spool->rsv_hpages += delta; | ||
| 188 | if (spool->rsv_hpages > spool->min_hpages) | ||
| 189 | spool->rsv_hpages = spool->min_hpages; | ||
| 190 | } | ||
| 191 | |||
| 192 | /* | ||
| 193 | * If hugetlbfs_put_super couldn't free spool due to an outstanding | ||
| 194 | * quota reference, free it now. | ||
| 195 | */ | ||
| 129 | unlock_or_release_subpool(spool); | 196 | unlock_or_release_subpool(spool); |
| 197 | |||
| 198 | return ret; | ||
| 130 | } | 199 | } |
| 131 | 200 | ||
| 132 | static inline struct hugepage_subpool *subpool_inode(struct inode *inode) | 201 | static inline struct hugepage_subpool *subpool_inode(struct inode *inode) |
| @@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size) | |||
| 855 | return NULL; | 924 | return NULL; |
| 856 | } | 925 | } |
| 857 | 926 | ||
| 927 | /* | ||
| 928 | * Test to determine whether the hugepage is "active/in-use" (i.e. being linked | ||
| 929 | * to hstate->hugepage_activelist.) | ||
| 930 | * | ||
| 931 | * This function can be called for tail pages, but never returns true for them. | ||
| 932 | */ | ||
| 933 | bool page_huge_active(struct page *page) | ||
| 934 | { | ||
| 935 | VM_BUG_ON_PAGE(!PageHuge(page), page); | ||
| 936 | return PageHead(page) && PagePrivate(&page[1]); | ||
| 937 | } | ||
| 938 | |||
| 939 | /* never called for tail page */ | ||
| 940 | static void set_page_huge_active(struct page *page) | ||
| 941 | { | ||
| 942 | VM_BUG_ON_PAGE(!PageHeadHuge(page), page); | ||
| 943 | SetPagePrivate(&page[1]); | ||
| 944 | } | ||
| 945 | |||
| 946 | static void clear_page_huge_active(struct page *page) | ||
| 947 | { | ||
| 948 | VM_BUG_ON_PAGE(!PageHeadHuge(page), page); | ||
| 949 | ClearPagePrivate(&page[1]); | ||
| 950 | } | ||
| 951 | |||
| 858 | void free_huge_page(struct page *page) | 952 | void free_huge_page(struct page *page) |
| 859 | { | 953 | { |
| 860 | /* | 954 | /* |
| @@ -874,7 +968,16 @@ void free_huge_page(struct page *page) | |||
| 874 | restore_reserve = PagePrivate(page); | 968 | restore_reserve = PagePrivate(page); |
| 875 | ClearPagePrivate(page); | 969 | ClearPagePrivate(page); |
| 876 | 970 | ||
| 971 | /* | ||
| 972 | * A return code of zero implies that the subpool will be under its | ||
| 973 | * minimum size if the reservation is not restored after page is free. | ||
| 974 | * Therefore, force restore_reserve operation. | ||
| 975 | */ | ||
| 976 | if (hugepage_subpool_put_pages(spool, 1) == 0) | ||
| 977 | restore_reserve = true; | ||
| 978 | |||
| 877 | spin_lock(&hugetlb_lock); | 979 | spin_lock(&hugetlb_lock); |
| 980 | clear_page_huge_active(page); | ||
| 878 | hugetlb_cgroup_uncharge_page(hstate_index(h), | 981 | hugetlb_cgroup_uncharge_page(hstate_index(h), |
| 879 | pages_per_huge_page(h), page); | 982 | pages_per_huge_page(h), page); |
| 880 | if (restore_reserve) | 983 | if (restore_reserve) |
| @@ -891,7 +994,6 @@ void free_huge_page(struct page *page) | |||
| 891 | enqueue_huge_page(h, page); | 994 | enqueue_huge_page(h, page); |
| 892 | } | 995 | } |
| 893 | spin_unlock(&hugetlb_lock); | 996 | spin_unlock(&hugetlb_lock); |
| 894 | hugepage_subpool_put_pages(spool, 1); | ||
| 895 | } | 997 | } |
| 896 | 998 | ||
| 897 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 999 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
| @@ -1386,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 1386 | if (chg < 0) | 1488 | if (chg < 0) |
| 1387 | return ERR_PTR(-ENOMEM); | 1489 | return ERR_PTR(-ENOMEM); |
| 1388 | if (chg || avoid_reserve) | 1490 | if (chg || avoid_reserve) |
| 1389 | if (hugepage_subpool_get_pages(spool, 1)) | 1491 | if (hugepage_subpool_get_pages(spool, 1) < 0) |
| 1390 | return ERR_PTR(-ENOSPC); | 1492 | return ERR_PTR(-ENOSPC); |
| 1391 | 1493 | ||
| 1392 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | 1494 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); |
| @@ -2454,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
| 2454 | struct resv_map *resv = vma_resv_map(vma); | 2556 | struct resv_map *resv = vma_resv_map(vma); |
| 2455 | struct hugepage_subpool *spool = subpool_vma(vma); | 2557 | struct hugepage_subpool *spool = subpool_vma(vma); |
| 2456 | unsigned long reserve, start, end; | 2558 | unsigned long reserve, start, end; |
| 2559 | long gbl_reserve; | ||
| 2457 | 2560 | ||
| 2458 | if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | 2561 | if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
| 2459 | return; | 2562 | return; |
| @@ -2466,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
| 2466 | kref_put(&resv->refs, resv_map_release); | 2569 | kref_put(&resv->refs, resv_map_release); |
| 2467 | 2570 | ||
| 2468 | if (reserve) { | 2571 | if (reserve) { |
| 2469 | hugetlb_acct_memory(h, -reserve); | 2572 | /* |
| 2470 | hugepage_subpool_put_pages(spool, reserve); | 2573 | * Decrement reserve counts. The global reserve count may be |
| 2574 | * adjusted if the subpool has a minimum size. | ||
| 2575 | */ | ||
| 2576 | gbl_reserve = hugepage_subpool_put_pages(spool, reserve); | ||
| 2577 | hugetlb_acct_memory(h, -gbl_reserve); | ||
| 2471 | } | 2578 | } |
| 2472 | } | 2579 | } |
| 2473 | 2580 | ||
| @@ -2891,6 +2998,7 @@ retry_avoidcopy: | |||
| 2891 | copy_user_huge_page(new_page, old_page, address, vma, | 2998 | copy_user_huge_page(new_page, old_page, address, vma, |
| 2892 | pages_per_huge_page(h)); | 2999 | pages_per_huge_page(h)); |
| 2893 | __SetPageUptodate(new_page); | 3000 | __SetPageUptodate(new_page); |
| 3001 | set_page_huge_active(new_page); | ||
| 2894 | 3002 | ||
| 2895 | mmun_start = address & huge_page_mask(h); | 3003 | mmun_start = address & huge_page_mask(h); |
| 2896 | mmun_end = mmun_start + huge_page_size(h); | 3004 | mmun_end = mmun_start + huge_page_size(h); |
| @@ -3003,6 +3111,7 @@ retry: | |||
| 3003 | } | 3111 | } |
| 3004 | clear_huge_page(page, address, pages_per_huge_page(h)); | 3112 | clear_huge_page(page, address, pages_per_huge_page(h)); |
| 3005 | __SetPageUptodate(page); | 3113 | __SetPageUptodate(page); |
| 3114 | set_page_huge_active(page); | ||
| 3006 | 3115 | ||
| 3007 | if (vma->vm_flags & VM_MAYSHARE) { | 3116 | if (vma->vm_flags & VM_MAYSHARE) { |
| 3008 | int err; | 3117 | int err; |
| @@ -3447,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
| 3447 | struct hstate *h = hstate_inode(inode); | 3556 | struct hstate *h = hstate_inode(inode); |
| 3448 | struct hugepage_subpool *spool = subpool_inode(inode); | 3557 | struct hugepage_subpool *spool = subpool_inode(inode); |
| 3449 | struct resv_map *resv_map; | 3558 | struct resv_map *resv_map; |
| 3559 | long gbl_reserve; | ||
| 3450 | 3560 | ||
| 3451 | /* | 3561 | /* |
| 3452 | * Only apply hugepage reservation if asked. At fault time, an | 3562 | * Only apply hugepage reservation if asked. At fault time, an |
| @@ -3483,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
| 3483 | goto out_err; | 3593 | goto out_err; |
| 3484 | } | 3594 | } |
| 3485 | 3595 | ||
| 3486 | /* There must be enough pages in the subpool for the mapping */ | 3596 | /* |
| 3487 | if (hugepage_subpool_get_pages(spool, chg)) { | 3597 | * There must be enough pages in the subpool for the mapping. If |
| 3598 | * the subpool has a minimum size, there may be some global | ||
| 3599 | * reservations already in place (gbl_reserve). | ||
| 3600 | */ | ||
| 3601 | gbl_reserve = hugepage_subpool_get_pages(spool, chg); | ||
| 3602 | if (gbl_reserve < 0) { | ||
| 3488 | ret = -ENOSPC; | 3603 | ret = -ENOSPC; |
| 3489 | goto out_err; | 3604 | goto out_err; |
| 3490 | } | 3605 | } |
| @@ -3493,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
| 3493 | * Check enough hugepages are available for the reservation. | 3608 | * Check enough hugepages are available for the reservation. |
| 3494 | * Hand the pages back to the subpool if there are not | 3609 | * Hand the pages back to the subpool if there are not |
| 3495 | */ | 3610 | */ |
| 3496 | ret = hugetlb_acct_memory(h, chg); | 3611 | ret = hugetlb_acct_memory(h, gbl_reserve); |
| 3497 | if (ret < 0) { | 3612 | if (ret < 0) { |
| 3498 | hugepage_subpool_put_pages(spool, chg); | 3613 | /* put back original number of pages, chg */ |
| 3614 | (void)hugepage_subpool_put_pages(spool, chg); | ||
| 3499 | goto out_err; | 3615 | goto out_err; |
| 3500 | } | 3616 | } |
| 3501 | 3617 | ||
| @@ -3525,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
| 3525 | struct resv_map *resv_map = inode_resv_map(inode); | 3641 | struct resv_map *resv_map = inode_resv_map(inode); |
| 3526 | long chg = 0; | 3642 | long chg = 0; |
| 3527 | struct hugepage_subpool *spool = subpool_inode(inode); | 3643 | struct hugepage_subpool *spool = subpool_inode(inode); |
| 3644 | long gbl_reserve; | ||
| 3528 | 3645 | ||
| 3529 | if (resv_map) | 3646 | if (resv_map) |
| 3530 | chg = region_truncate(resv_map, offset); | 3647 | chg = region_truncate(resv_map, offset); |
| @@ -3532,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
| 3532 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); | 3649 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); |
| 3533 | spin_unlock(&inode->i_lock); | 3650 | spin_unlock(&inode->i_lock); |
| 3534 | 3651 | ||
| 3535 | hugepage_subpool_put_pages(spool, (chg - freed)); | 3652 | /* |
| 3536 | hugetlb_acct_memory(h, -(chg - freed)); | 3653 | * If the subpool has a minimum size, the number of global |
| 3654 | * reservations to be released may be adjusted. | ||
| 3655 | */ | ||
| 3656 | gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); | ||
| 3657 | hugetlb_acct_memory(h, -gbl_reserve); | ||
| 3537 | } | 3658 | } |
| 3538 | 3659 | ||
| 3539 | #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE | 3660 | #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE |
| @@ -3775,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
| 3775 | 3896 | ||
| 3776 | #ifdef CONFIG_MEMORY_FAILURE | 3897 | #ifdef CONFIG_MEMORY_FAILURE |
| 3777 | 3898 | ||
| 3778 | /* Should be called in hugetlb_lock */ | ||
| 3779 | static int is_hugepage_on_freelist(struct page *hpage) | ||
| 3780 | { | ||
| 3781 | struct page *page; | ||
| 3782 | struct page *tmp; | ||
| 3783 | struct hstate *h = page_hstate(hpage); | ||
| 3784 | int nid = page_to_nid(hpage); | ||
| 3785 | |||
| 3786 | list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) | ||
| 3787 | if (page == hpage) | ||
| 3788 | return 1; | ||
| 3789 | return 0; | ||
| 3790 | } | ||
| 3791 | |||
| 3792 | /* | 3899 | /* |
| 3793 | * This function is called from memory failure code. | 3900 | * This function is called from memory failure code. |
| 3794 | * Assume the caller holds page lock of the head page. | 3901 | * Assume the caller holds page lock of the head page. |
| @@ -3800,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
| 3800 | int ret = -EBUSY; | 3907 | int ret = -EBUSY; |
| 3801 | 3908 | ||
| 3802 | spin_lock(&hugetlb_lock); | 3909 | spin_lock(&hugetlb_lock); |
| 3803 | if (is_hugepage_on_freelist(hpage)) { | 3910 | /* |
| 3911 | * Just checking !page_huge_active is not enough, because that could be | ||
| 3912 | * an isolated/hwpoisoned hugepage (which have >0 refcount). | ||
| 3913 | */ | ||
| 3914 | if (!page_huge_active(hpage) && !page_count(hpage)) { | ||
| 3804 | /* | 3915 | /* |
| 3805 | * Hwpoisoned hugepage isn't linked to activelist or freelist, | 3916 | * Hwpoisoned hugepage isn't linked to activelist or freelist, |
| 3806 | * but dangling hpage->lru can trigger list-debug warnings | 3917 | * but dangling hpage->lru can trigger list-debug warnings |
| @@ -3820,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
| 3820 | 3931 | ||
| 3821 | bool isolate_huge_page(struct page *page, struct list_head *list) | 3932 | bool isolate_huge_page(struct page *page, struct list_head *list) |
| 3822 | { | 3933 | { |
| 3934 | bool ret = true; | ||
| 3935 | |||
| 3823 | VM_BUG_ON_PAGE(!PageHead(page), page); | 3936 | VM_BUG_ON_PAGE(!PageHead(page), page); |
| 3824 | if (!get_page_unless_zero(page)) | ||
| 3825 | return false; | ||
| 3826 | spin_lock(&hugetlb_lock); | 3937 | spin_lock(&hugetlb_lock); |
| 3938 | if (!page_huge_active(page) || !get_page_unless_zero(page)) { | ||
| 3939 | ret = false; | ||
| 3940 | goto unlock; | ||
| 3941 | } | ||
| 3942 | clear_page_huge_active(page); | ||
| 3827 | list_move_tail(&page->lru, list); | 3943 | list_move_tail(&page->lru, list); |
| 3944 | unlock: | ||
| 3828 | spin_unlock(&hugetlb_lock); | 3945 | spin_unlock(&hugetlb_lock); |
| 3829 | return true; | 3946 | return ret; |
| 3830 | } | 3947 | } |
| 3831 | 3948 | ||
| 3832 | void putback_active_hugepage(struct page *page) | 3949 | void putback_active_hugepage(struct page *page) |
| 3833 | { | 3950 | { |
| 3834 | VM_BUG_ON_PAGE(!PageHead(page), page); | 3951 | VM_BUG_ON_PAGE(!PageHead(page), page); |
| 3835 | spin_lock(&hugetlb_lock); | 3952 | spin_lock(&hugetlb_lock); |
| 3953 | set_page_huge_active(page); | ||
| 3836 | list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); | 3954 | list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); |
| 3837 | spin_unlock(&hugetlb_lock); | 3955 | spin_unlock(&hugetlb_lock); |
| 3838 | put_page(page); | 3956 | put_page(page); |
| 3839 | } | 3957 | } |
| 3840 | |||
| 3841 | bool is_hugepage_active(struct page *page) | ||
| 3842 | { | ||
| 3843 | VM_BUG_ON_PAGE(!PageHuge(page), page); | ||
| 3844 | /* | ||
| 3845 | * This function can be called for a tail page because the caller, | ||
| 3846 | * scan_movable_pages, scans through a given pfn-range which typically | ||
| 3847 | * covers one memory block. In systems using gigantic hugepage (1GB | ||
| 3848 | * for x86_64,) a hugepage is larger than a memory block, and we don't | ||
| 3849 | * support migrating such large hugepages for now, so return false | ||
| 3850 | * when called for tail pages. | ||
| 3851 | */ | ||
| 3852 | if (PageTail(page)) | ||
| 3853 | return false; | ||
| 3854 | /* | ||
| 3855 | * Refcount of a hwpoisoned hugepages is 1, but they are not active, | ||
| 3856 | * so we should return false for them. | ||
| 3857 | */ | ||
| 3858 | if (unlikely(PageHWPoison(page))) | ||
| 3859 | return false; | ||
| 3860 | return page_count(page) > 0; | ||
| 3861 | } | ||
diff --git a/mm/internal.h b/mm/internal.h index edaab69a9c35..a25e359a4039 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -224,13 +224,13 @@ static inline unsigned long page_order(struct page *page) | |||
| 224 | * PageBuddy() should be checked first by the caller to minimize race window, | 224 | * PageBuddy() should be checked first by the caller to minimize race window, |
| 225 | * and invalid values must be handled gracefully. | 225 | * and invalid values must be handled gracefully. |
| 226 | * | 226 | * |
| 227 | * ACCESS_ONCE is used so that if the caller assigns the result into a local | 227 | * READ_ONCE is used so that if the caller assigns the result into a local |
| 228 | * variable and e.g. tests it for valid range before using, the compiler cannot | 228 | * variable and e.g. tests it for valid range before using, the compiler cannot |
| 229 | * decide to remove the variable and inline the page_private(page) multiple | 229 | * decide to remove the variable and inline the page_private(page) multiple |
| 230 | * times, potentially observing different values in the tests and the actual | 230 | * times, potentially observing different values in the tests and the actual |
| 231 | * use of the result. | 231 | * use of the result. |
| 232 | */ | 232 | */ |
| 233 | #define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) | 233 | #define page_order_unsafe(page) READ_ONCE(page_private(page)) |
| 234 | 234 | ||
| 235 | static inline bool is_cow_mapping(vm_flags_t flags) | 235 | static inline bool is_cow_mapping(vm_flags_t flags) |
| 236 | { | 236 | { |
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 936d81661c47..6c513a63ea84 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c | |||
| @@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size) | |||
| 389 | kasan_kmalloc(page->slab_cache, object, size); | 389 | kasan_kmalloc(page->slab_cache, object, size); |
| 390 | } | 390 | } |
| 391 | 391 | ||
| 392 | void kasan_kfree(void *ptr) | ||
| 393 | { | ||
| 394 | struct page *page; | ||
| 395 | |||
| 396 | page = virt_to_head_page(ptr); | ||
| 397 | |||
| 398 | if (unlikely(!PageSlab(page))) | ||
| 399 | kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), | ||
| 400 | KASAN_FREE_PAGE); | ||
| 401 | else | ||
| 402 | kasan_slab_free(page->slab_cache, ptr); | ||
| 403 | } | ||
| 404 | |||
| 392 | void kasan_kfree_large(const void *ptr) | 405 | void kasan_kfree_large(const void *ptr) |
| 393 | { | 406 | { |
| 394 | struct page *page = virt_to_page(ptr); | 407 | struct page *page = virt_to_page(ptr); |
| @@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) | |||
| 542 | expected_mapping = (void *)stable_node + | 542 | expected_mapping = (void *)stable_node + |
| 543 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | 543 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); |
| 544 | again: | 544 | again: |
| 545 | kpfn = ACCESS_ONCE(stable_node->kpfn); | 545 | kpfn = READ_ONCE(stable_node->kpfn); |
| 546 | page = pfn_to_page(kpfn); | 546 | page = pfn_to_page(kpfn); |
| 547 | 547 | ||
| 548 | /* | 548 | /* |
| @@ -551,7 +551,7 @@ again: | |||
| 551 | * but on Alpha we need to be more careful. | 551 | * but on Alpha we need to be more careful. |
| 552 | */ | 552 | */ |
| 553 | smp_read_barrier_depends(); | 553 | smp_read_barrier_depends(); |
| 554 | if (ACCESS_ONCE(page->mapping) != expected_mapping) | 554 | if (READ_ONCE(page->mapping) != expected_mapping) |
| 555 | goto stale; | 555 | goto stale; |
| 556 | 556 | ||
| 557 | /* | 557 | /* |
| @@ -577,14 +577,14 @@ again: | |||
| 577 | cpu_relax(); | 577 | cpu_relax(); |
| 578 | } | 578 | } |
| 579 | 579 | ||
| 580 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | 580 | if (READ_ONCE(page->mapping) != expected_mapping) { |
| 581 | put_page(page); | 581 | put_page(page); |
| 582 | goto stale; | 582 | goto stale; |
| 583 | } | 583 | } |
| 584 | 584 | ||
| 585 | if (lock_it) { | 585 | if (lock_it) { |
| 586 | lock_page(page); | 586 | lock_page(page); |
| 587 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | 587 | if (READ_ONCE(page->mapping) != expected_mapping) { |
| 588 | unlock_page(page); | 588 | unlock_page(page); |
| 589 | put_page(page); | 589 | put_page(page); |
| 590 | goto stale; | 590 | goto stale; |
| @@ -600,7 +600,7 @@ stale: | |||
| 600 | * before checking whether node->kpfn has been changed. | 600 | * before checking whether node->kpfn has been changed. |
| 601 | */ | 601 | */ |
| 602 | smp_rmb(); | 602 | smp_rmb(); |
| 603 | if (ACCESS_ONCE(stable_node->kpfn) != kpfn) | 603 | if (READ_ONCE(stable_node->kpfn) != kpfn) |
| 604 | goto again; | 604 | goto again; |
| 605 | remove_node_from_stable_tree(stable_node); | 605 | remove_node_from_stable_tree(stable_node); |
| 606 | return NULL; | 606 | return NULL; |
diff --git a/mm/memblock.c b/mm/memblock.c index 3f37a0bca5d5..9318b567ed79 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
| @@ -580,10 +580,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, | |||
| 580 | return memblock_add_range(&memblock.memory, base, size, nid, 0); | 580 | return memblock_add_range(&memblock.memory, base, size, nid, 0); |
| 581 | } | 581 | } |
| 582 | 582 | ||
| 583 | static int __init_memblock memblock_add_region(phys_addr_t base, | ||
| 584 | phys_addr_t size, | ||
| 585 | int nid, | ||
| 586 | unsigned long flags) | ||
| 587 | { | ||
| 588 | struct memblock_type *_rgn = &memblock.memory; | ||
| 589 | |||
| 590 | memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", | ||
| 591 | (unsigned long long)base, | ||
| 592 | (unsigned long long)base + size - 1, | ||
| 593 | flags, (void *)_RET_IP_); | ||
| 594 | |||
| 595 | return memblock_add_range(_rgn, base, size, nid, flags); | ||
| 596 | } | ||
| 597 | |||
| 583 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 598 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
| 584 | { | 599 | { |
| 585 | return memblock_add_range(&memblock.memory, base, size, | 600 | return memblock_add_region(base, size, MAX_NUMNODES, 0); |
| 586 | MAX_NUMNODES, 0); | ||
| 587 | } | 601 | } |
| 588 | 602 | ||
| 589 | /** | 603 | /** |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3f09b2dda5f..14c2f2017e37 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -259,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | |||
| 259 | * page cache and RSS per cgroup. We would eventually like to provide | 259 | * page cache and RSS per cgroup. We would eventually like to provide |
| 260 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 260 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
| 261 | * to help the administrator determine what knobs to tune. | 261 | * to help the administrator determine what knobs to tune. |
| 262 | * | ||
| 263 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | ||
| 264 | * we hit the water mark. May be even add a low water mark, such that | ||
| 265 | * no reclaim occurs from a cgroup at it's low water mark, this is | ||
| 266 | * a feature that will be implemented much later in the future. | ||
| 267 | */ | 262 | */ |
| 268 | struct mem_cgroup { | 263 | struct mem_cgroup { |
| 269 | struct cgroup_subsys_state css; | 264 | struct cgroup_subsys_state css; |
| @@ -460,6 +455,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) | |||
| 460 | return memcg->css.id; | 455 | return memcg->css.id; |
| 461 | } | 456 | } |
| 462 | 457 | ||
| 458 | /* | ||
| 459 | * A helper function to get mem_cgroup from ID. must be called under | ||
| 460 | * rcu_read_lock(). The caller is responsible for calling | ||
| 461 | * css_tryget_online() if the mem_cgroup is used for charging. (dropping | ||
| 462 | * refcnt from swap can be called against removed memcg.) | ||
| 463 | */ | ||
| 463 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) | 464 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) |
| 464 | { | 465 | { |
| 465 | struct cgroup_subsys_state *css; | 466 | struct cgroup_subsys_state *css; |
| @@ -673,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | |||
| 673 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) | 674 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) |
| 674 | { | 675 | { |
| 675 | unsigned long nr_pages = page_counter_read(&memcg->memory); | 676 | unsigned long nr_pages = page_counter_read(&memcg->memory); |
| 676 | unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); | 677 | unsigned long soft_limit = READ_ONCE(memcg->soft_limit); |
| 677 | unsigned long excess = 0; | 678 | unsigned long excess = 0; |
| 678 | 679 | ||
| 679 | if (nr_pages > soft_limit) | 680 | if (nr_pages > soft_limit) |
| @@ -1041,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
| 1041 | goto out_unlock; | 1042 | goto out_unlock; |
| 1042 | 1043 | ||
| 1043 | do { | 1044 | do { |
| 1044 | pos = ACCESS_ONCE(iter->position); | 1045 | pos = READ_ONCE(iter->position); |
| 1045 | /* | 1046 | /* |
| 1046 | * A racing update may change the position and | 1047 | * A racing update may change the position and |
| 1047 | * put the last reference, hence css_tryget(), | 1048 | * put the last reference, hence css_tryget(), |
| @@ -1358,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) | |||
| 1358 | unsigned long limit; | 1359 | unsigned long limit; |
| 1359 | 1360 | ||
| 1360 | count = page_counter_read(&memcg->memory); | 1361 | count = page_counter_read(&memcg->memory); |
| 1361 | limit = ACCESS_ONCE(memcg->memory.limit); | 1362 | limit = READ_ONCE(memcg->memory.limit); |
| 1362 | if (count < limit) | 1363 | if (count < limit) |
| 1363 | margin = limit - count; | 1364 | margin = limit - count; |
| 1364 | 1365 | ||
| 1365 | if (do_swap_account) { | 1366 | if (do_swap_account) { |
| 1366 | count = page_counter_read(&memcg->memsw); | 1367 | count = page_counter_read(&memcg->memsw); |
| 1367 | limit = ACCESS_ONCE(memcg->memsw.limit); | 1368 | limit = READ_ONCE(memcg->memsw.limit); |
| 1368 | if (count <= limit) | 1369 | if (count <= limit) |
| 1369 | margin = min(margin, limit - count); | 1370 | margin = min(margin, limit - count); |
| 1370 | } | 1371 | } |
| @@ -2349,20 +2350,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | |||
| 2349 | } | 2350 | } |
| 2350 | 2351 | ||
| 2351 | /* | 2352 | /* |
| 2352 | * A helper function to get mem_cgroup from ID. must be called under | ||
| 2353 | * rcu_read_lock(). The caller is responsible for calling | ||
| 2354 | * css_tryget_online() if the mem_cgroup is used for charging. (dropping | ||
| 2355 | * refcnt from swap can be called against removed memcg.) | ||
| 2356 | */ | ||
| 2357 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | ||
| 2358 | { | ||
| 2359 | /* ID 0 is unused ID */ | ||
| 2360 | if (!id) | ||
| 2361 | return NULL; | ||
| 2362 | return mem_cgroup_from_id(id); | ||
| 2363 | } | ||
| 2364 | |||
| 2365 | /* | ||
| 2366 | * try_get_mem_cgroup_from_page - look up page's memcg association | 2353 | * try_get_mem_cgroup_from_page - look up page's memcg association |
| 2367 | * @page: the page | 2354 | * @page: the page |
| 2368 | * | 2355 | * |
| @@ -2388,7 +2375,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
| 2388 | ent.val = page_private(page); | 2375 | ent.val = page_private(page); |
| 2389 | id = lookup_swap_cgroup_id(ent); | 2376 | id = lookup_swap_cgroup_id(ent); |
| 2390 | rcu_read_lock(); | 2377 | rcu_read_lock(); |
| 2391 | memcg = mem_cgroup_lookup(id); | 2378 | memcg = mem_cgroup_from_id(id); |
| 2392 | if (memcg && !css_tryget_online(&memcg->css)) | 2379 | if (memcg && !css_tryget_online(&memcg->css)) |
| 2393 | memcg = NULL; | 2380 | memcg = NULL; |
| 2394 | rcu_read_unlock(); | 2381 | rcu_read_unlock(); |
| @@ -2650,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
| 2650 | return cachep; | 2637 | return cachep; |
| 2651 | 2638 | ||
| 2652 | memcg = get_mem_cgroup_from_mm(current->mm); | 2639 | memcg = get_mem_cgroup_from_mm(current->mm); |
| 2653 | kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); | 2640 | kmemcg_id = READ_ONCE(memcg->kmemcg_id); |
| 2654 | if (kmemcg_id < 0) | 2641 | if (kmemcg_id < 0) |
| 2655 | goto out; | 2642 | goto out; |
| 2656 | 2643 | ||
| @@ -5020,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
| 5020 | * tunable will only affect upcoming migrations, not the current one. | 5007 | * tunable will only affect upcoming migrations, not the current one. |
| 5021 | * So we need to save it, and keep it going. | 5008 | * So we need to save it, and keep it going. |
| 5022 | */ | 5009 | */ |
| 5023 | move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); | 5010 | move_flags = READ_ONCE(memcg->move_charge_at_immigrate); |
| 5024 | if (move_flags) { | 5011 | if (move_flags) { |
| 5025 | struct mm_struct *mm; | 5012 | struct mm_struct *mm; |
| 5026 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 5013 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
| @@ -5254,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, | |||
| 5254 | static int memory_low_show(struct seq_file *m, void *v) | 5241 | static int memory_low_show(struct seq_file *m, void *v) |
| 5255 | { | 5242 | { |
| 5256 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5243 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5257 | unsigned long low = ACCESS_ONCE(memcg->low); | 5244 | unsigned long low = READ_ONCE(memcg->low); |
| 5258 | 5245 | ||
| 5259 | if (low == PAGE_COUNTER_MAX) | 5246 | if (low == PAGE_COUNTER_MAX) |
| 5260 | seq_puts(m, "max\n"); | 5247 | seq_puts(m, "max\n"); |
| @@ -5284,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, | |||
| 5284 | static int memory_high_show(struct seq_file *m, void *v) | 5271 | static int memory_high_show(struct seq_file *m, void *v) |
| 5285 | { | 5272 | { |
| 5286 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5273 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5287 | unsigned long high = ACCESS_ONCE(memcg->high); | 5274 | unsigned long high = READ_ONCE(memcg->high); |
| 5288 | 5275 | ||
| 5289 | if (high == PAGE_COUNTER_MAX) | 5276 | if (high == PAGE_COUNTER_MAX) |
| 5290 | seq_puts(m, "max\n"); | 5277 | seq_puts(m, "max\n"); |
| @@ -5314,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, | |||
| 5314 | static int memory_max_show(struct seq_file *m, void *v) | 5301 | static int memory_max_show(struct seq_file *m, void *v) |
| 5315 | { | 5302 | { |
| 5316 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5303 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5317 | unsigned long max = ACCESS_ONCE(memcg->memory.limit); | 5304 | unsigned long max = READ_ONCE(memcg->memory.limit); |
| 5318 | 5305 | ||
| 5319 | if (max == PAGE_COUNTER_MAX) | 5306 | if (max == PAGE_COUNTER_MAX) |
| 5320 | seq_puts(m, "max\n"); | 5307 | seq_puts(m, "max\n"); |
| @@ -5869,7 +5856,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) | |||
| 5869 | 5856 | ||
| 5870 | id = swap_cgroup_record(entry, 0); | 5857 | id = swap_cgroup_record(entry, 0); |
| 5871 | rcu_read_lock(); | 5858 | rcu_read_lock(); |
| 5872 | memcg = mem_cgroup_lookup(id); | 5859 | memcg = mem_cgroup_from_id(id); |
| 5873 | if (memcg) { | 5860 | if (memcg) { |
| 5874 | if (!mem_cgroup_is_root(memcg)) | 5861 | if (!mem_cgroup_is_root(memcg)) |
| 5875 | page_counter_uncharge(&memcg->memsw, 1); | 5862 | page_counter_uncharge(&memcg->memsw, 1); |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d487f8dc6d39..d9359b770cd9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -521,6 +521,52 @@ static const char *action_name[] = { | |||
| 521 | [RECOVERED] = "Recovered", | 521 | [RECOVERED] = "Recovered", |
| 522 | }; | 522 | }; |
| 523 | 523 | ||
| 524 | enum action_page_type { | ||
| 525 | MSG_KERNEL, | ||
| 526 | MSG_KERNEL_HIGH_ORDER, | ||
| 527 | MSG_SLAB, | ||
| 528 | MSG_DIFFERENT_COMPOUND, | ||
| 529 | MSG_POISONED_HUGE, | ||
| 530 | MSG_HUGE, | ||
| 531 | MSG_FREE_HUGE, | ||
| 532 | MSG_UNMAP_FAILED, | ||
| 533 | MSG_DIRTY_SWAPCACHE, | ||
| 534 | MSG_CLEAN_SWAPCACHE, | ||
| 535 | MSG_DIRTY_MLOCKED_LRU, | ||
| 536 | MSG_CLEAN_MLOCKED_LRU, | ||
| 537 | MSG_DIRTY_UNEVICTABLE_LRU, | ||
| 538 | MSG_CLEAN_UNEVICTABLE_LRU, | ||
| 539 | MSG_DIRTY_LRU, | ||
| 540 | MSG_CLEAN_LRU, | ||
| 541 | MSG_TRUNCATED_LRU, | ||
| 542 | MSG_BUDDY, | ||
| 543 | MSG_BUDDY_2ND, | ||
| 544 | MSG_UNKNOWN, | ||
| 545 | }; | ||
| 546 | |||
| 547 | static const char * const action_page_types[] = { | ||
| 548 | [MSG_KERNEL] = "reserved kernel page", | ||
| 549 | [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", | ||
| 550 | [MSG_SLAB] = "kernel slab page", | ||
| 551 | [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", | ||
| 552 | [MSG_POISONED_HUGE] = "huge page already hardware poisoned", | ||
| 553 | [MSG_HUGE] = "huge page", | ||
| 554 | [MSG_FREE_HUGE] = "free huge page", | ||
| 555 | [MSG_UNMAP_FAILED] = "unmapping failed page", | ||
| 556 | [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", | ||
| 557 | [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", | ||
| 558 | [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", | ||
| 559 | [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", | ||
| 560 | [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", | ||
| 561 | [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", | ||
| 562 | [MSG_DIRTY_LRU] = "dirty LRU page", | ||
| 563 | [MSG_CLEAN_LRU] = "clean LRU page", | ||
| 564 | [MSG_TRUNCATED_LRU] = "already truncated LRU page", | ||
| 565 | [MSG_BUDDY] = "free buddy page", | ||
| 566 | [MSG_BUDDY_2ND] = "free buddy page (2nd try)", | ||
| 567 | [MSG_UNKNOWN] = "unknown page", | ||
| 568 | }; | ||
| 569 | |||
| 524 | /* | 570 | /* |
| 525 | * XXX: It is possible that a page is isolated from LRU cache, | 571 | * XXX: It is possible that a page is isolated from LRU cache, |
| 526 | * and then kept in swap cache or failed to remove from page cache. | 572 | * and then kept in swap cache or failed to remove from page cache. |
| @@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
| 777 | static struct page_state { | 823 | static struct page_state { |
| 778 | unsigned long mask; | 824 | unsigned long mask; |
| 779 | unsigned long res; | 825 | unsigned long res; |
| 780 | char *msg; | 826 | enum action_page_type type; |
| 781 | int (*action)(struct page *p, unsigned long pfn); | 827 | int (*action)(struct page *p, unsigned long pfn); |
| 782 | } error_states[] = { | 828 | } error_states[] = { |
| 783 | { reserved, reserved, "reserved kernel", me_kernel }, | 829 | { reserved, reserved, MSG_KERNEL, me_kernel }, |
| 784 | /* | 830 | /* |
| 785 | * free pages are specially detected outside this table: | 831 | * free pages are specially detected outside this table: |
| 786 | * PG_buddy pages only make a small fraction of all free pages. | 832 | * PG_buddy pages only make a small fraction of all free pages. |
| @@ -791,31 +837,31 @@ static struct page_state { | |||
| 791 | * currently unused objects without touching them. But just | 837 | * currently unused objects without touching them. But just |
| 792 | * treat it as standard kernel for now. | 838 | * treat it as standard kernel for now. |
| 793 | */ | 839 | */ |
| 794 | { slab, slab, "kernel slab", me_kernel }, | 840 | { slab, slab, MSG_SLAB, me_kernel }, |
| 795 | 841 | ||
| 796 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | 842 | #ifdef CONFIG_PAGEFLAGS_EXTENDED |
| 797 | { head, head, "huge", me_huge_page }, | 843 | { head, head, MSG_HUGE, me_huge_page }, |
| 798 | { tail, tail, "huge", me_huge_page }, | 844 | { tail, tail, MSG_HUGE, me_huge_page }, |
| 799 | #else | 845 | #else |
| 800 | { compound, compound, "huge", me_huge_page }, | 846 | { compound, compound, MSG_HUGE, me_huge_page }, |
| 801 | #endif | 847 | #endif |
| 802 | 848 | ||
| 803 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, | 849 | { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, |
| 804 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, | 850 | { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, |
| 805 | 851 | ||
| 806 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, | 852 | { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, |
| 807 | { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, | 853 | { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, |
| 808 | 854 | ||
| 809 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | 855 | { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, |
| 810 | { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, | 856 | { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, |
| 811 | 857 | ||
| 812 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, | 858 | { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, |
| 813 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 859 | { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, |
| 814 | 860 | ||
| 815 | /* | 861 | /* |
| 816 | * Catchall entry: must be at end. | 862 | * Catchall entry: must be at end. |
| 817 | */ | 863 | */ |
| 818 | { 0, 0, "unknown page state", me_unknown }, | 864 | { 0, 0, MSG_UNKNOWN, me_unknown }, |
| 819 | }; | 865 | }; |
| 820 | 866 | ||
| 821 | #undef dirty | 867 | #undef dirty |
| @@ -835,10 +881,10 @@ static struct page_state { | |||
| 835 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of | 881 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of |
| 836 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). | 882 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). |
| 837 | */ | 883 | */ |
| 838 | static void action_result(unsigned long pfn, char *msg, int result) | 884 | static void action_result(unsigned long pfn, enum action_page_type type, int result) |
| 839 | { | 885 | { |
| 840 | pr_err("MCE %#lx: %s page recovery: %s\n", | 886 | pr_err("MCE %#lx: recovery action for %s: %s\n", |
| 841 | pfn, msg, action_name[result]); | 887 | pfn, action_page_types[type], action_name[result]); |
| 842 | } | 888 | } |
| 843 | 889 | ||
| 844 | static int page_action(struct page_state *ps, struct page *p, | 890 | static int page_action(struct page_state *ps, struct page *p, |
| @@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p, | |||
| 854 | count--; | 900 | count--; |
| 855 | if (count != 0) { | 901 | if (count != 0) { |
| 856 | printk(KERN_ERR | 902 | printk(KERN_ERR |
| 857 | "MCE %#lx: %s page still referenced by %d users\n", | 903 | "MCE %#lx: %s still referenced by %d users\n", |
| 858 | pfn, ps->msg, count); | 904 | pfn, action_page_types[ps->type], count); |
| 859 | result = FAILED; | 905 | result = FAILED; |
| 860 | } | 906 | } |
| 861 | action_result(pfn, ps->msg, result); | 907 | action_result(pfn, ps->type, result); |
| 862 | 908 | ||
| 863 | /* Could do more checks here if page looks ok */ | 909 | /* Could do more checks here if page looks ok */ |
| 864 | /* | 910 | /* |
| @@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1106 | if (!(flags & MF_COUNT_INCREASED) && | 1152 | if (!(flags & MF_COUNT_INCREASED) && |
| 1107 | !get_page_unless_zero(hpage)) { | 1153 | !get_page_unless_zero(hpage)) { |
| 1108 | if (is_free_buddy_page(p)) { | 1154 | if (is_free_buddy_page(p)) { |
| 1109 | action_result(pfn, "free buddy", DELAYED); | 1155 | action_result(pfn, MSG_BUDDY, DELAYED); |
| 1110 | return 0; | 1156 | return 0; |
| 1111 | } else if (PageHuge(hpage)) { | 1157 | } else if (PageHuge(hpage)) { |
| 1112 | /* | 1158 | /* |
| @@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1123 | } | 1169 | } |
| 1124 | set_page_hwpoison_huge_page(hpage); | 1170 | set_page_hwpoison_huge_page(hpage); |
| 1125 | res = dequeue_hwpoisoned_huge_page(hpage); | 1171 | res = dequeue_hwpoisoned_huge_page(hpage); |
| 1126 | action_result(pfn, "free huge", | 1172 | action_result(pfn, MSG_FREE_HUGE, |
| 1127 | res ? IGNORED : DELAYED); | 1173 | res ? IGNORED : DELAYED); |
| 1128 | unlock_page(hpage); | 1174 | unlock_page(hpage); |
| 1129 | return res; | 1175 | return res; |
| 1130 | } else { | 1176 | } else { |
| 1131 | action_result(pfn, "high order kernel", IGNORED); | 1177 | action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); |
| 1132 | return -EBUSY; | 1178 | return -EBUSY; |
| 1133 | } | 1179 | } |
| 1134 | } | 1180 | } |
| @@ -1150,9 +1196,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1150 | */ | 1196 | */ |
| 1151 | if (is_free_buddy_page(p)) { | 1197 | if (is_free_buddy_page(p)) { |
| 1152 | if (flags & MF_COUNT_INCREASED) | 1198 | if (flags & MF_COUNT_INCREASED) |
| 1153 | action_result(pfn, "free buddy", DELAYED); | 1199 | action_result(pfn, MSG_BUDDY, DELAYED); |
| 1154 | else | 1200 | else |
| 1155 | action_result(pfn, "free buddy, 2nd try", DELAYED); | 1201 | action_result(pfn, MSG_BUDDY_2ND, |
| 1202 | DELAYED); | ||
| 1156 | return 0; | 1203 | return 0; |
| 1157 | } | 1204 | } |
| 1158 | } | 1205 | } |
| @@ -1165,7 +1212,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1165 | * If this happens just bail out. | 1212 | * If this happens just bail out. |
| 1166 | */ | 1213 | */ |
| 1167 | if (compound_head(p) != hpage) { | 1214 | if (compound_head(p) != hpage) { |
| 1168 | action_result(pfn, "different compound page after locking", IGNORED); | 1215 | action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); |
| 1169 | res = -EBUSY; | 1216 | res = -EBUSY; |
| 1170 | goto out; | 1217 | goto out; |
| 1171 | } | 1218 | } |
| @@ -1205,8 +1252,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1205 | * on the head page to show that the hugepage is hwpoisoned | 1252 | * on the head page to show that the hugepage is hwpoisoned |
| 1206 | */ | 1253 | */ |
| 1207 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { | 1254 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { |
| 1208 | action_result(pfn, "hugepage already hardware poisoned", | 1255 | action_result(pfn, MSG_POISONED_HUGE, IGNORED); |
| 1209 | IGNORED); | ||
| 1210 | unlock_page(hpage); | 1256 | unlock_page(hpage); |
| 1211 | put_page(hpage); | 1257 | put_page(hpage); |
| 1212 | return 0; | 1258 | return 0; |
| @@ -1235,7 +1281,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1235 | */ | 1281 | */ |
| 1236 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) | 1282 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) |
| 1237 | != SWAP_SUCCESS) { | 1283 | != SWAP_SUCCESS) { |
| 1238 | action_result(pfn, "unmapping failed", IGNORED); | 1284 | action_result(pfn, MSG_UNMAP_FAILED, IGNORED); |
| 1239 | res = -EBUSY; | 1285 | res = -EBUSY; |
| 1240 | goto out; | 1286 | goto out; |
| 1241 | } | 1287 | } |
| @@ -1244,7 +1290,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1244 | * Torn down by someone else? | 1290 | * Torn down by someone else? |
| 1245 | */ | 1291 | */ |
| 1246 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { | 1292 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
| 1247 | action_result(pfn, "already truncated LRU", IGNORED); | 1293 | action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); |
| 1248 | res = -EBUSY; | 1294 | res = -EBUSY; |
| 1249 | goto out; | 1295 | goto out; |
| 1250 | } | 1296 | } |
| @@ -1540,8 +1586,18 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
| 1540 | } | 1586 | } |
| 1541 | unlock_page(hpage); | 1587 | unlock_page(hpage); |
| 1542 | 1588 | ||
| 1543 | /* Keep page count to indicate a given hugepage is isolated. */ | 1589 | ret = isolate_huge_page(hpage, &pagelist); |
| 1544 | list_move(&hpage->lru, &pagelist); | 1590 | if (ret) { |
| 1591 | /* | ||
| 1592 | * get_any_page() and isolate_huge_page() takes a refcount each, | ||
| 1593 | * so need to drop one here. | ||
| 1594 | */ | ||
| 1595 | put_page(hpage); | ||
| 1596 | } else { | ||
| 1597 | pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); | ||
| 1598 | return -EBUSY; | ||
| 1599 | } | ||
| 1600 | |||
| 1545 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, | 1601 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
| 1546 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1602 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
| 1547 | if (ret) { | 1603 | if (ret) { |
diff --git a/mm/memory.c b/mm/memory.c index ac20b2a6a0c3..22e037e3364e 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
| 690 | /* | 690 | /* |
| 691 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y | 691 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y |
| 692 | */ | 692 | */ |
| 693 | if (vma->vm_ops) | 693 | pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", |
| 694 | printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", | 694 | vma->vm_file, |
| 695 | vma->vm_ops->fault); | 695 | vma->vm_ops ? vma->vm_ops->fault : NULL, |
| 696 | if (vma->vm_file) | 696 | vma->vm_file ? vma->vm_file->f_op->mmap : NULL, |
| 697 | printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", | 697 | mapping ? mapping->a_ops->readpage : NULL); |
| 698 | vma->vm_file->f_op->mmap); | ||
| 699 | dump_stack(); | 698 | dump_stack(); |
| 700 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 699 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
| 701 | } | 700 | } |
| @@ -2181,6 +2180,42 @@ oom: | |||
| 2181 | return VM_FAULT_OOM; | 2180 | return VM_FAULT_OOM; |
| 2182 | } | 2181 | } |
| 2183 | 2182 | ||
| 2183 | /* | ||
| 2184 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | ||
| 2185 | * mapping | ||
| 2186 | */ | ||
| 2187 | static int wp_pfn_shared(struct mm_struct *mm, | ||
| 2188 | struct vm_area_struct *vma, unsigned long address, | ||
| 2189 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | ||
| 2190 | pmd_t *pmd) | ||
| 2191 | { | ||
| 2192 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | ||
| 2193 | struct vm_fault vmf = { | ||
| 2194 | .page = NULL, | ||
| 2195 | .pgoff = linear_page_index(vma, address), | ||
| 2196 | .virtual_address = (void __user *)(address & PAGE_MASK), | ||
| 2197 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | ||
| 2198 | }; | ||
| 2199 | int ret; | ||
| 2200 | |||
| 2201 | pte_unmap_unlock(page_table, ptl); | ||
| 2202 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | ||
| 2203 | if (ret & VM_FAULT_ERROR) | ||
| 2204 | return ret; | ||
| 2205 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 2206 | /* | ||
| 2207 | * We might have raced with another page fault while we | ||
| 2208 | * released the pte_offset_map_lock. | ||
| 2209 | */ | ||
| 2210 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2211 | pte_unmap_unlock(page_table, ptl); | ||
| 2212 | return 0; | ||
| 2213 | } | ||
| 2214 | } | ||
| 2215 | return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, | ||
| 2216 | NULL, 0, 0); | ||
| 2217 | } | ||
| 2218 | |||
| 2184 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | 2219 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2185 | unsigned long address, pte_t *page_table, | 2220 | unsigned long address, pte_t *page_table, |
| 2186 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, | 2221 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, |
| @@ -2259,13 +2294,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2259 | * VM_PFNMAP VMA. | 2294 | * VM_PFNMAP VMA. |
| 2260 | * | 2295 | * |
| 2261 | * We should not cow pages in a shared writeable mapping. | 2296 | * We should not cow pages in a shared writeable mapping. |
| 2262 | * Just mark the pages writable as we can't do any dirty | 2297 | * Just mark the pages writable and/or call ops->pfn_mkwrite. |
| 2263 | * accounting on raw pfn maps. | ||
| 2264 | */ | 2298 | */ |
| 2265 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2299 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
| 2266 | (VM_WRITE|VM_SHARED)) | 2300 | (VM_WRITE|VM_SHARED)) |
| 2267 | return wp_page_reuse(mm, vma, address, page_table, ptl, | 2301 | return wp_pfn_shared(mm, vma, address, page_table, ptl, |
| 2268 | orig_pte, old_page, 0, 0); | 2302 | orig_pte, pmd); |
| 2269 | 2303 | ||
| 2270 | pte_unmap_unlock(page_table, ptl); | 2304 | pte_unmap_unlock(page_table, ptl); |
| 2271 | return wp_page_copy(mm, vma, address, page_table, pmd, | 2305 | return wp_page_copy(mm, vma, address, page_table, pmd, |
| @@ -2845,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | |||
| 2845 | struct vm_fault vmf; | 2879 | struct vm_fault vmf; |
| 2846 | int off; | 2880 | int off; |
| 2847 | 2881 | ||
| 2848 | nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 2882 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
| 2849 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 2883 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
| 2850 | 2884 | ||
| 2851 | start_addr = max(address & mask, vma->vm_start); | 2885 | start_addr = max(address & mask, vma->vm_start); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e2e8014fb755..457bde530cbe 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -1373,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) | |||
| 1373 | if (PageLRU(page)) | 1373 | if (PageLRU(page)) |
| 1374 | return pfn; | 1374 | return pfn; |
| 1375 | if (PageHuge(page)) { | 1375 | if (PageHuge(page)) { |
| 1376 | if (is_hugepage_active(page)) | 1376 | if (page_huge_active(page)) |
| 1377 | return pfn; | 1377 | return pfn; |
| 1378 | else | 1378 | else |
| 1379 | pfn = round_up(pfn + 1, | 1379 | pfn = round_up(pfn + 1, |
diff --git a/mm/mempool.c b/mm/mempool.c index 949970db2874..2cc08de8b1db 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
| @@ -6,26 +6,138 @@ | |||
| 6 | * extreme VM load. | 6 | * extreme VM load. |
| 7 | * | 7 | * |
| 8 | * started by Ingo Molnar, Copyright (C) 2001 | 8 | * started by Ingo Molnar, Copyright (C) 2001 |
| 9 | * debugging by David Rientjes, Copyright (C) 2015 | ||
| 9 | */ | 10 | */ |
| 10 | 11 | ||
| 11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 12 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
| 14 | #include <linux/highmem.h> | ||
| 15 | #include <linux/kasan.h> | ||
| 13 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
| 14 | #include <linux/export.h> | 17 | #include <linux/export.h> |
| 15 | #include <linux/mempool.h> | 18 | #include <linux/mempool.h> |
| 16 | #include <linux/blkdev.h> | 19 | #include <linux/blkdev.h> |
| 17 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
| 21 | #include "slab.h" | ||
| 22 | |||
| 23 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) | ||
| 24 | static void poison_error(mempool_t *pool, void *element, size_t size, | ||
| 25 | size_t byte) | ||
| 26 | { | ||
| 27 | const int nr = pool->curr_nr; | ||
| 28 | const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); | ||
| 29 | const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); | ||
| 30 | int i; | ||
| 31 | |||
| 32 | pr_err("BUG: mempool element poison mismatch\n"); | ||
| 33 | pr_err("Mempool %p size %zu\n", pool, size); | ||
| 34 | pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); | ||
| 35 | for (i = start; i < end; i++) | ||
| 36 | pr_cont("%x ", *(u8 *)(element + i)); | ||
| 37 | pr_cont("%s\n", end < size ? "..." : ""); | ||
| 38 | dump_stack(); | ||
| 39 | } | ||
| 40 | |||
| 41 | static void __check_element(mempool_t *pool, void *element, size_t size) | ||
| 42 | { | ||
| 43 | u8 *obj = element; | ||
| 44 | size_t i; | ||
| 45 | |||
| 46 | for (i = 0; i < size; i++) { | ||
| 47 | u8 exp = (i < size - 1) ? POISON_FREE : POISON_END; | ||
| 48 | |||
| 49 | if (obj[i] != exp) { | ||
| 50 | poison_error(pool, element, size, i); | ||
| 51 | return; | ||
| 52 | } | ||
| 53 | } | ||
| 54 | memset(obj, POISON_INUSE, size); | ||
| 55 | } | ||
| 56 | |||
| 57 | static void check_element(mempool_t *pool, void *element) | ||
| 58 | { | ||
| 59 | /* Mempools backed by slab allocator */ | ||
| 60 | if (pool->free == mempool_free_slab || pool->free == mempool_kfree) | ||
| 61 | __check_element(pool, element, ksize(element)); | ||
| 62 | |||
| 63 | /* Mempools backed by page allocator */ | ||
| 64 | if (pool->free == mempool_free_pages) { | ||
| 65 | int order = (int)(long)pool->pool_data; | ||
| 66 | void *addr = kmap_atomic((struct page *)element); | ||
| 67 | |||
| 68 | __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); | ||
| 69 | kunmap_atomic(addr); | ||
| 70 | } | ||
| 71 | } | ||
| 72 | |||
| 73 | static void __poison_element(void *element, size_t size) | ||
| 74 | { | ||
| 75 | u8 *obj = element; | ||
| 76 | |||
| 77 | memset(obj, POISON_FREE, size - 1); | ||
| 78 | obj[size - 1] = POISON_END; | ||
| 79 | } | ||
| 80 | |||
| 81 | static void poison_element(mempool_t *pool, void *element) | ||
| 82 | { | ||
| 83 | /* Mempools backed by slab allocator */ | ||
| 84 | if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) | ||
| 85 | __poison_element(element, ksize(element)); | ||
| 86 | |||
| 87 | /* Mempools backed by page allocator */ | ||
| 88 | if (pool->alloc == mempool_alloc_pages) { | ||
| 89 | int order = (int)(long)pool->pool_data; | ||
| 90 | void *addr = kmap_atomic((struct page *)element); | ||
| 91 | |||
| 92 | __poison_element(addr, 1UL << (PAGE_SHIFT + order)); | ||
| 93 | kunmap_atomic(addr); | ||
| 94 | } | ||
| 95 | } | ||
| 96 | #else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ | ||
| 97 | static inline void check_element(mempool_t *pool, void *element) | ||
| 98 | { | ||
| 99 | } | ||
| 100 | static inline void poison_element(mempool_t *pool, void *element) | ||
| 101 | { | ||
| 102 | } | ||
| 103 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ | ||
| 104 | |||
| 105 | static void kasan_poison_element(mempool_t *pool, void *element) | ||
| 106 | { | ||
| 107 | if (pool->alloc == mempool_alloc_slab) | ||
| 108 | kasan_slab_free(pool->pool_data, element); | ||
| 109 | if (pool->alloc == mempool_kmalloc) | ||
| 110 | kasan_kfree(element); | ||
| 111 | if (pool->alloc == mempool_alloc_pages) | ||
| 112 | kasan_free_pages(element, (unsigned long)pool->pool_data); | ||
| 113 | } | ||
| 114 | |||
| 115 | static void kasan_unpoison_element(mempool_t *pool, void *element) | ||
| 116 | { | ||
| 117 | if (pool->alloc == mempool_alloc_slab) | ||
| 118 | kasan_slab_alloc(pool->pool_data, element); | ||
| 119 | if (pool->alloc == mempool_kmalloc) | ||
| 120 | kasan_krealloc(element, (size_t)pool->pool_data); | ||
| 121 | if (pool->alloc == mempool_alloc_pages) | ||
| 122 | kasan_alloc_pages(element, (unsigned long)pool->pool_data); | ||
| 123 | } | ||
| 18 | 124 | ||
| 19 | static void add_element(mempool_t *pool, void *element) | 125 | static void add_element(mempool_t *pool, void *element) |
| 20 | { | 126 | { |
| 21 | BUG_ON(pool->curr_nr >= pool->min_nr); | 127 | BUG_ON(pool->curr_nr >= pool->min_nr); |
| 128 | poison_element(pool, element); | ||
| 129 | kasan_poison_element(pool, element); | ||
| 22 | pool->elements[pool->curr_nr++] = element; | 130 | pool->elements[pool->curr_nr++] = element; |
| 23 | } | 131 | } |
| 24 | 132 | ||
| 25 | static void *remove_element(mempool_t *pool) | 133 | static void *remove_element(mempool_t *pool) |
| 26 | { | 134 | { |
| 27 | BUG_ON(pool->curr_nr <= 0); | 135 | void *element = pool->elements[--pool->curr_nr]; |
| 28 | return pool->elements[--pool->curr_nr]; | 136 | |
| 137 | BUG_ON(pool->curr_nr < 0); | ||
| 138 | check_element(pool, element); | ||
| 139 | kasan_unpoison_element(pool, element); | ||
| 140 | return element; | ||
| 29 | } | 141 | } |
| 30 | 142 | ||
| 31 | /** | 143 | /** |
| @@ -334,6 +446,7 @@ EXPORT_SYMBOL(mempool_free); | |||
| 334 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) | 446 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) |
| 335 | { | 447 | { |
| 336 | struct kmem_cache *mem = pool_data; | 448 | struct kmem_cache *mem = pool_data; |
| 449 | VM_BUG_ON(mem->ctor); | ||
| 337 | return kmem_cache_alloc(mem, gfp_mask); | 450 | return kmem_cache_alloc(mem, gfp_mask); |
| 338 | } | 451 | } |
| 339 | EXPORT_SYMBOL(mempool_alloc_slab); | 452 | EXPORT_SYMBOL(mempool_alloc_slab); |
diff --git a/mm/migrate.c b/mm/migrate.c index a65ff72ab739..f53838fe3dfe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 537 | * Please do not reorder this without considering how mm/ksm.c's | 537 | * Please do not reorder this without considering how mm/ksm.c's |
| 538 | * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). | 538 | * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). |
| 539 | */ | 539 | */ |
| 540 | ClearPageSwapCache(page); | 540 | if (PageSwapCache(page)) |
| 541 | ClearPageSwapCache(page); | ||
| 541 | ClearPagePrivate(page); | 542 | ClearPagePrivate(page); |
| 542 | set_page_private(page, 0); | 543 | set_page_private(page, 0); |
| 543 | 544 | ||
| @@ -1133,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * | |||
| 1133 | * by another page fault trying to merge _that_. But that's ok: if it | 1133 | * by another page fault trying to merge _that_. But that's ok: if it |
| 1134 | * is being set up, that automatically means that it will be a singleton | 1134 | * is being set up, that automatically means that it will be a singleton |
| 1135 | * acceptable for merging, so we can do all of this optimistically. But | 1135 | * acceptable for merging, so we can do all of this optimistically. But |
| 1136 | * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. | 1136 | * we do that READ_ONCE() to make sure that we never re-load the pointer. |
| 1137 | * | 1137 | * |
| 1138 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only | 1138 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only |
| 1139 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid | 1139 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid |
| @@ -1147,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * | |||
| 1147 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) | 1147 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) |
| 1148 | { | 1148 | { |
| 1149 | if (anon_vma_compatible(a, b)) { | 1149 | if (anon_vma_compatible(a, b)) { |
| 1150 | struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); | 1150 | struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); |
| 1151 | 1151 | ||
| 1152 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) | 1152 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) |
| 1153 | return anon_vma; | 1153 | return anon_vma; |
| @@ -1551,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
| 1551 | 1551 | ||
| 1552 | /* Clear old maps */ | 1552 | /* Clear old maps */ |
| 1553 | error = -ENOMEM; | 1553 | error = -ENOMEM; |
| 1554 | munmap_back: | 1554 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
| 1555 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { | 1555 | &rb_parent)) { |
| 1556 | if (do_munmap(mm, addr, len)) | 1556 | if (do_munmap(mm, addr, len)) |
| 1557 | return -ENOMEM; | 1557 | return -ENOMEM; |
| 1558 | goto munmap_back; | ||
| 1559 | } | 1558 | } |
| 1560 | 1559 | ||
| 1561 | /* | 1560 | /* |
| @@ -1571,7 +1570,8 @@ munmap_back: | |||
| 1571 | /* | 1570 | /* |
| 1572 | * Can we just expand an old mapping? | 1571 | * Can we just expand an old mapping? |
| 1573 | */ | 1572 | */ |
| 1574 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); | 1573 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, |
| 1574 | NULL); | ||
| 1575 | if (vma) | 1575 | if (vma) |
| 1576 | goto out; | 1576 | goto out; |
| 1577 | 1577 | ||
| @@ -2100,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
| 2100 | actual_size = size; | 2100 | actual_size = size; |
| 2101 | if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) | 2101 | if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) |
| 2102 | actual_size -= PAGE_SIZE; | 2102 | actual_size -= PAGE_SIZE; |
| 2103 | if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) | 2103 | if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) |
| 2104 | return -ENOMEM; | 2104 | return -ENOMEM; |
| 2105 | 2105 | ||
| 2106 | /* mlock limit tests */ | 2106 | /* mlock limit tests */ |
| @@ -2108,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
| 2108 | unsigned long locked; | 2108 | unsigned long locked; |
| 2109 | unsigned long limit; | 2109 | unsigned long limit; |
| 2110 | locked = mm->locked_vm + grow; | 2110 | locked = mm->locked_vm + grow; |
| 2111 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); | 2111 | limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
| 2112 | limit >>= PAGE_SHIFT; | 2112 | limit >>= PAGE_SHIFT; |
| 2113 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 2113 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
| 2114 | return -ENOMEM; | 2114 | return -ENOMEM; |
| @@ -2739,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2739 | /* | 2739 | /* |
| 2740 | * Clear old maps. this also does some error checking for us | 2740 | * Clear old maps. this also does some error checking for us |
| 2741 | */ | 2741 | */ |
| 2742 | munmap_back: | 2742 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
| 2743 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { | 2743 | &rb_parent)) { |
| 2744 | if (do_munmap(mm, addr, len)) | 2744 | if (do_munmap(mm, addr, len)) |
| 2745 | return -ENOMEM; | 2745 | return -ENOMEM; |
| 2746 | goto munmap_back; | ||
| 2747 | } | 2746 | } |
| 2748 | 2747 | ||
| 2749 | /* Check against address space limits *after* clearing old maps... */ | 2748 | /* Check against address space limits *after* clearing old maps... */ |
diff --git a/mm/mremap.c b/mm/mremap.c index 2dc44b1cb1df..034e2d360652 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -345,25 +345,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
| 345 | struct vm_area_struct *vma = find_vma(mm, addr); | 345 | struct vm_area_struct *vma = find_vma(mm, addr); |
| 346 | 346 | ||
| 347 | if (!vma || vma->vm_start > addr) | 347 | if (!vma || vma->vm_start > addr) |
| 348 | goto Efault; | 348 | return ERR_PTR(-EFAULT); |
| 349 | 349 | ||
| 350 | if (is_vm_hugetlb_page(vma)) | 350 | if (is_vm_hugetlb_page(vma)) |
| 351 | goto Einval; | 351 | return ERR_PTR(-EINVAL); |
| 352 | 352 | ||
| 353 | /* We can't remap across vm area boundaries */ | 353 | /* We can't remap across vm area boundaries */ |
| 354 | if (old_len > vma->vm_end - addr) | 354 | if (old_len > vma->vm_end - addr) |
| 355 | goto Efault; | 355 | return ERR_PTR(-EFAULT); |
| 356 | 356 | ||
| 357 | /* Need to be careful about a growing mapping */ | 357 | /* Need to be careful about a growing mapping */ |
| 358 | if (new_len > old_len) { | 358 | if (new_len > old_len) { |
| 359 | unsigned long pgoff; | 359 | unsigned long pgoff; |
| 360 | 360 | ||
| 361 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) | 361 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) |
| 362 | goto Efault; | 362 | return ERR_PTR(-EFAULT); |
| 363 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; | 363 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; |
| 364 | pgoff += vma->vm_pgoff; | 364 | pgoff += vma->vm_pgoff; |
| 365 | if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) | 365 | if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) |
| 366 | goto Einval; | 366 | return ERR_PTR(-EINVAL); |
| 367 | } | 367 | } |
| 368 | 368 | ||
| 369 | if (vma->vm_flags & VM_LOCKED) { | 369 | if (vma->vm_flags & VM_LOCKED) { |
| @@ -372,29 +372,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
| 372 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 372 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 373 | locked += new_len - old_len; | 373 | locked += new_len - old_len; |
| 374 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 374 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
| 375 | goto Eagain; | 375 | return ERR_PTR(-EAGAIN); |
| 376 | } | 376 | } |
| 377 | 377 | ||
| 378 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) | 378 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) |
| 379 | goto Enomem; | 379 | return ERR_PTR(-ENOMEM); |
| 380 | 380 | ||
| 381 | if (vma->vm_flags & VM_ACCOUNT) { | 381 | if (vma->vm_flags & VM_ACCOUNT) { |
| 382 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; | 382 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; |
| 383 | if (security_vm_enough_memory_mm(mm, charged)) | 383 | if (security_vm_enough_memory_mm(mm, charged)) |
| 384 | goto Efault; | 384 | return ERR_PTR(-ENOMEM); |
| 385 | *p = charged; | 385 | *p = charged; |
| 386 | } | 386 | } |
| 387 | 387 | ||
| 388 | return vma; | 388 | return vma; |
| 389 | |||
| 390 | Efault: /* very odd choice for most of the cases, but... */ | ||
| 391 | return ERR_PTR(-EFAULT); | ||
| 392 | Einval: | ||
| 393 | return ERR_PTR(-EINVAL); | ||
| 394 | Enomem: | ||
| 395 | return ERR_PTR(-ENOMEM); | ||
| 396 | Eagain: | ||
| 397 | return ERR_PTR(-EAGAIN); | ||
| 398 | } | 389 | } |
| 399 | 390 | ||
| 400 | static unsigned long mremap_to(unsigned long addr, unsigned long old_len, | 391 | static unsigned long mremap_to(unsigned long addr, unsigned long old_len, |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 52628c819bf7..2b665da1b3c9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly; | |||
| 408 | static DECLARE_RWSEM(oom_sem); | 408 | static DECLARE_RWSEM(oom_sem); |
| 409 | 409 | ||
| 410 | /** | 410 | /** |
| 411 | * mark_tsk_oom_victim - marks the given taks as OOM victim. | 411 | * mark_tsk_oom_victim - marks the given task as OOM victim. |
| 412 | * @tsk: task to mark | 412 | * @tsk: task to mark |
| 413 | * | 413 | * |
| 414 | * Has to be called with oom_sem taken for read and never after | 414 | * Has to be called with oom_sem taken for read and never after |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0372411f38fc..5daf5568b9e1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -2228,7 +2228,8 @@ int set_page_dirty(struct page *page) | |||
| 2228 | * it will confuse readahead and make it restart the size rampup | 2228 | * it will confuse readahead and make it restart the size rampup |
| 2229 | * process. But it's a trivial problem. | 2229 | * process. But it's a trivial problem. |
| 2230 | */ | 2230 | */ |
| 2231 | ClearPageReclaim(page); | 2231 | if (PageReclaim(page)) |
| 2232 | ClearPageReclaim(page); | ||
| 2232 | #ifdef CONFIG_BLOCK | 2233 | #ifdef CONFIG_BLOCK |
| 2233 | if (!spd) | 2234 | if (!spd) |
| 2234 | spd = __set_page_dirty_buffers; | 2235 | spd = __set_page_dirty_buffers; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1b849500640c..ebffa0e4a9c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -1371,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
| 1371 | int to_drain, batch; | 1371 | int to_drain, batch; |
| 1372 | 1372 | ||
| 1373 | local_irq_save(flags); | 1373 | local_irq_save(flags); |
| 1374 | batch = ACCESS_ONCE(pcp->batch); | 1374 | batch = READ_ONCE(pcp->batch); |
| 1375 | to_drain = min(pcp->count, batch); | 1375 | to_drain = min(pcp->count, batch); |
| 1376 | if (to_drain > 0) { | 1376 | if (to_drain > 0) { |
| 1377 | free_pcppages_bulk(zone, to_drain, pcp); | 1377 | free_pcppages_bulk(zone, to_drain, pcp); |
| @@ -1570,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold) | |||
| 1570 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1570 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
| 1571 | pcp->count++; | 1571 | pcp->count++; |
| 1572 | if (pcp->count >= pcp->high) { | 1572 | if (pcp->count >= pcp->high) { |
| 1573 | unsigned long batch = ACCESS_ONCE(pcp->batch); | 1573 | unsigned long batch = READ_ONCE(pcp->batch); |
| 1574 | free_pcppages_bulk(zone, batch, pcp); | 1574 | free_pcppages_bulk(zone, batch, pcp); |
| 1575 | pcp->count -= batch; | 1575 | pcp->count -= batch; |
| 1576 | } | 1576 | } |
| @@ -6207,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, | |||
| 6207 | mask <<= (BITS_PER_LONG - bitidx - 1); | 6207 | mask <<= (BITS_PER_LONG - bitidx - 1); |
| 6208 | flags <<= (BITS_PER_LONG - bitidx - 1); | 6208 | flags <<= (BITS_PER_LONG - bitidx - 1); |
| 6209 | 6209 | ||
| 6210 | word = ACCESS_ONCE(bitmap[word_bitidx]); | 6210 | word = READ_ONCE(bitmap[word_bitidx]); |
| 6211 | for (;;) { | 6211 | for (;;) { |
| 6212 | old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); | 6212 | old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); |
| 6213 | if (word == old_word) | 6213 | if (word == old_word) |
| @@ -456,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) | |||
| 456 | unsigned long anon_mapping; | 456 | unsigned long anon_mapping; |
| 457 | 457 | ||
| 458 | rcu_read_lock(); | 458 | rcu_read_lock(); |
| 459 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | 459 | anon_mapping = (unsigned long)READ_ONCE(page->mapping); |
| 460 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | 460 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
| 461 | goto out; | 461 | goto out; |
| 462 | if (!page_mapped(page)) | 462 | if (!page_mapped(page)) |
| @@ -500,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) | |||
| 500 | unsigned long anon_mapping; | 500 | unsigned long anon_mapping; |
| 501 | 501 | ||
| 502 | rcu_read_lock(); | 502 | rcu_read_lock(); |
| 503 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | 503 | anon_mapping = (unsigned long)READ_ONCE(page->mapping); |
| 504 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | 504 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
| 505 | goto out; | 505 | goto out; |
| 506 | if (!page_mapped(page)) | 506 | if (!page_mapped(page)) |
| 507 | goto out; | 507 | goto out; |
| 508 | 508 | ||
| 509 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 509 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
| 510 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 510 | root_anon_vma = READ_ONCE(anon_vma->root); |
| 511 | if (down_read_trylock(&root_anon_vma->rwsem)) { | 511 | if (down_read_trylock(&root_anon_vma->rwsem)) { |
| 512 | /* | 512 | /* |
| 513 | * If the page is still mapped, then this anon_vma is still | 513 | * If the page is still mapped, then this anon_vma is still |
| @@ -4277,7 +4277,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
| 4277 | int node; | 4277 | int node; |
| 4278 | struct page *page; | 4278 | struct page *page; |
| 4279 | 4279 | ||
| 4280 | page = ACCESS_ONCE(c->page); | 4280 | page = READ_ONCE(c->page); |
| 4281 | if (!page) | 4281 | if (!page) |
| 4282 | continue; | 4282 | continue; |
| 4283 | 4283 | ||
| @@ -4292,7 +4292,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
| 4292 | total += x; | 4292 | total += x; |
| 4293 | nodes[node] += x; | 4293 | nodes[node] += x; |
| 4294 | 4294 | ||
| 4295 | page = ACCESS_ONCE(c->partial); | 4295 | page = READ_ONCE(c->partial); |
| 4296 | if (page) { | 4296 | if (page) { |
| 4297 | node = page_to_nid(page); | 4297 | node = page_to_nid(page); |
| 4298 | if (flags & SO_TOTAL) | 4298 | if (flags & SO_TOTAL) |
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
| 32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
| 33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
| 34 | #include <linux/hugetlb.h> | ||
| 34 | 35 | ||
| 35 | #include "internal.h" | 36 | #include "internal.h" |
| 36 | 37 | ||
| @@ -42,7 +43,7 @@ int page_cluster; | |||
| 42 | 43 | ||
| 43 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); | 44 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); |
| 44 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); | 45 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
| 45 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | 46 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); |
| 46 | 47 | ||
| 47 | /* | 48 | /* |
| 48 | * This path almost never happens for VM activity - pages are normally | 49 | * This path almost never happens for VM activity - pages are normally |
| @@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page) | |||
| 75 | { | 76 | { |
| 76 | compound_page_dtor *dtor; | 77 | compound_page_dtor *dtor; |
| 77 | 78 | ||
| 78 | __page_cache_release(page); | 79 | /* |
| 80 | * __page_cache_release() is supposed to be called for thp, not for | ||
| 81 | * hugetlb. This is because hugetlb page does never have PageLRU set | ||
| 82 | * (it's never listed to any LRU lists) and no memcg routines should | ||
| 83 | * be called for hugetlb (it has a separate hugetlb_cgroup.) | ||
| 84 | */ | ||
| 85 | if (!PageHuge(page)) | ||
| 86 | __page_cache_release(page); | ||
| 79 | dtor = get_compound_page_dtor(page); | 87 | dtor = get_compound_page_dtor(page); |
| 80 | (*dtor)(page); | 88 | (*dtor)(page); |
| 81 | } | 89 | } |
| @@ -743,7 +751,7 @@ void lru_cache_add_active_or_unevictable(struct page *page, | |||
| 743 | * be write it out by flusher threads as this is much more effective | 751 | * be write it out by flusher threads as this is much more effective |
| 744 | * than the single-page writeout from reclaim. | 752 | * than the single-page writeout from reclaim. |
| 745 | */ | 753 | */ |
| 746 | static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, | 754 | static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, |
| 747 | void *arg) | 755 | void *arg) |
| 748 | { | 756 | { |
| 749 | int lru, file; | 757 | int lru, file; |
| @@ -811,36 +819,36 @@ void lru_add_drain_cpu(int cpu) | |||
| 811 | local_irq_restore(flags); | 819 | local_irq_restore(flags); |
| 812 | } | 820 | } |
| 813 | 821 | ||
| 814 | pvec = &per_cpu(lru_deactivate_pvecs, cpu); | 822 | pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); |
| 815 | if (pagevec_count(pvec)) | 823 | if (pagevec_count(pvec)) |
| 816 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | 824 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
| 817 | 825 | ||
| 818 | activate_page_drain(cpu); | 826 | activate_page_drain(cpu); |
| 819 | } | 827 | } |
| 820 | 828 | ||
| 821 | /** | 829 | /** |
| 822 | * deactivate_page - forcefully deactivate a page | 830 | * deactivate_file_page - forcefully deactivate a file page |
| 823 | * @page: page to deactivate | 831 | * @page: page to deactivate |
| 824 | * | 832 | * |
| 825 | * This function hints the VM that @page is a good reclaim candidate, | 833 | * This function hints the VM that @page is a good reclaim candidate, |
| 826 | * for example if its invalidation fails due to the page being dirty | 834 | * for example if its invalidation fails due to the page being dirty |
| 827 | * or under writeback. | 835 | * or under writeback. |
| 828 | */ | 836 | */ |
| 829 | void deactivate_page(struct page *page) | 837 | void deactivate_file_page(struct page *page) |
| 830 | { | 838 | { |
| 831 | /* | 839 | /* |
| 832 | * In a workload with many unevictable page such as mprotect, unevictable | 840 | * In a workload with many unevictable page such as mprotect, |
| 833 | * page deactivation for accelerating reclaim is pointless. | 841 | * unevictable page deactivation for accelerating reclaim is pointless. |
| 834 | */ | 842 | */ |
| 835 | if (PageUnevictable(page)) | 843 | if (PageUnevictable(page)) |
| 836 | return; | 844 | return; |
| 837 | 845 | ||
| 838 | if (likely(get_page_unless_zero(page))) { | 846 | if (likely(get_page_unless_zero(page))) { |
| 839 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); | 847 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); |
| 840 | 848 | ||
| 841 | if (!pagevec_add(pvec, page)) | 849 | if (!pagevec_add(pvec, page)) |
| 842 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | 850 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
| 843 | put_cpu_var(lru_deactivate_pvecs); | 851 | put_cpu_var(lru_deactivate_file_pvecs); |
| 844 | } | 852 | } |
| 845 | } | 853 | } |
| 846 | 854 | ||
| @@ -872,7 +880,7 @@ void lru_add_drain_all(void) | |||
| 872 | 880 | ||
| 873 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || | 881 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || |
| 874 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || | 882 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || |
| 875 | pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || | 883 | pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || |
| 876 | need_activate_page_drain(cpu)) { | 884 | need_activate_page_drain(cpu)) { |
| 877 | INIT_WORK(work, lru_add_drain_per_cpu); | 885 | INIT_WORK(work, lru_add_drain_per_cpu); |
| 878 | schedule_work_on(cpu, work); | 886 | schedule_work_on(cpu, work); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 405923f77334..8bc8e66138da 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) | |||
| 390 | unsigned int pages, max_pages, last_ra; | 390 | unsigned int pages, max_pages, last_ra; |
| 391 | static atomic_t last_readahead_pages; | 391 | static atomic_t last_readahead_pages; |
| 392 | 392 | ||
| 393 | max_pages = 1 << ACCESS_ONCE(page_cluster); | 393 | max_pages = 1 << READ_ONCE(page_cluster); |
| 394 | if (max_pages <= 1) | 394 | if (max_pages <= 1) |
| 395 | return 1; | 395 | return 1; |
| 396 | 396 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 63f55ccb9b26..a7e72103f23b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 1312 | else | 1312 | else |
| 1313 | continue; | 1313 | continue; |
| 1314 | } | 1314 | } |
| 1315 | count = ACCESS_ONCE(si->swap_map[i]); | 1315 | count = READ_ONCE(si->swap_map[i]); |
| 1316 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1316 | if (count && swap_count(count) != SWAP_MAP_BAD) |
| 1317 | break; | 1317 | break; |
| 1318 | } | 1318 | } |
diff --git a/mm/truncate.c b/mm/truncate.c index 7a9d8a3cb143..66af9031fae8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -490,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
| 490 | * of interest and try to speed up its reclaim. | 490 | * of interest and try to speed up its reclaim. |
| 491 | */ | 491 | */ |
| 492 | if (!ret) | 492 | if (!ret) |
| 493 | deactivate_page(page); | 493 | deactivate_file_page(page); |
| 494 | count += ret; | 494 | count += ret; |
| 495 | } | 495 | } |
| 496 | pagevec_remove_exceptionals(&pvec); | 496 | pagevec_remove_exceptionals(&pvec); |
| @@ -325,9 +325,37 @@ void kvfree(const void *addr) | |||
| 325 | } | 325 | } |
| 326 | EXPORT_SYMBOL(kvfree); | 326 | EXPORT_SYMBOL(kvfree); |
| 327 | 327 | ||
| 328 | static inline void *__page_rmapping(struct page *page) | ||
| 329 | { | ||
| 330 | unsigned long mapping; | ||
| 331 | |||
| 332 | mapping = (unsigned long)page->mapping; | ||
| 333 | mapping &= ~PAGE_MAPPING_FLAGS; | ||
| 334 | |||
| 335 | return (void *)mapping; | ||
| 336 | } | ||
| 337 | |||
| 338 | /* Neutral page->mapping pointer to address_space or anon_vma or other */ | ||
| 339 | void *page_rmapping(struct page *page) | ||
| 340 | { | ||
| 341 | page = compound_head(page); | ||
| 342 | return __page_rmapping(page); | ||
| 343 | } | ||
| 344 | |||
| 345 | struct anon_vma *page_anon_vma(struct page *page) | ||
| 346 | { | ||
| 347 | unsigned long mapping; | ||
| 348 | |||
| 349 | page = compound_head(page); | ||
| 350 | mapping = (unsigned long)page->mapping; | ||
| 351 | if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | ||
| 352 | return NULL; | ||
| 353 | return __page_rmapping(page); | ||
| 354 | } | ||
| 355 | |||
| 328 | struct address_space *page_mapping(struct page *page) | 356 | struct address_space *page_mapping(struct page *page) |
| 329 | { | 357 | { |
| 330 | struct address_space *mapping = page->mapping; | 358 | unsigned long mapping; |
| 331 | 359 | ||
| 332 | /* This happens if someone calls flush_dcache_page on slab page */ | 360 | /* This happens if someone calls flush_dcache_page on slab page */ |
| 333 | if (unlikely(PageSlab(page))) | 361 | if (unlikely(PageSlab(page))) |
| @@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page) | |||
| 337 | swp_entry_t entry; | 365 | swp_entry_t entry; |
| 338 | 366 | ||
| 339 | entry.val = page_private(page); | 367 | entry.val = page_private(page); |
| 340 | mapping = swap_address_space(entry); | 368 | return swap_address_space(entry); |
| 341 | } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) | 369 | } |
| 342 | mapping = NULL; | 370 | |
| 343 | return mapping; | 371 | mapping = (unsigned long)page->mapping; |
| 372 | if (mapping & PAGE_MAPPING_FLAGS) | ||
| 373 | return NULL; | ||
| 374 | return page->mapping; | ||
| 344 | } | 375 | } |
| 345 | 376 | ||
| 346 | int overcommit_ratio_handler(struct ctl_table *table, int write, | 377 | int overcommit_ratio_handler(struct ctl_table *table, int write, |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a5bbdd3b5d67..2faaa2976447 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -765,7 +765,7 @@ struct vmap_block { | |||
| 765 | spinlock_t lock; | 765 | spinlock_t lock; |
| 766 | struct vmap_area *va; | 766 | struct vmap_area *va; |
| 767 | unsigned long free, dirty; | 767 | unsigned long free, dirty; |
| 768 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | 768 | unsigned long dirty_min, dirty_max; /*< dirty range */ |
| 769 | struct list_head free_list; | 769 | struct list_head free_list; |
| 770 | struct rcu_head rcu_head; | 770 | struct rcu_head rcu_head; |
| 771 | struct list_head purge; | 771 | struct list_head purge; |
| @@ -796,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr) | |||
| 796 | return addr; | 796 | return addr; |
| 797 | } | 797 | } |
| 798 | 798 | ||
| 799 | static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | 799 | static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) |
| 800 | { | ||
| 801 | unsigned long addr; | ||
| 802 | |||
| 803 | addr = va_start + (pages_off << PAGE_SHIFT); | ||
| 804 | BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); | ||
| 805 | return (void *)addr; | ||
| 806 | } | ||
| 807 | |||
| 808 | /** | ||
| 809 | * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this | ||
| 810 | * block. Of course pages number can't exceed VMAP_BBMAP_BITS | ||
| 811 | * @order: how many 2^order pages should be occupied in newly allocated block | ||
| 812 | * @gfp_mask: flags for the page level allocator | ||
| 813 | * | ||
| 814 | * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) | ||
| 815 | */ | ||
| 816 | static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) | ||
| 800 | { | 817 | { |
| 801 | struct vmap_block_queue *vbq; | 818 | struct vmap_block_queue *vbq; |
| 802 | struct vmap_block *vb; | 819 | struct vmap_block *vb; |
| 803 | struct vmap_area *va; | 820 | struct vmap_area *va; |
| 804 | unsigned long vb_idx; | 821 | unsigned long vb_idx; |
| 805 | int node, err; | 822 | int node, err; |
| 823 | void *vaddr; | ||
| 806 | 824 | ||
| 807 | node = numa_node_id(); | 825 | node = numa_node_id(); |
| 808 | 826 | ||
| @@ -826,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
| 826 | return ERR_PTR(err); | 844 | return ERR_PTR(err); |
| 827 | } | 845 | } |
| 828 | 846 | ||
| 847 | vaddr = vmap_block_vaddr(va->va_start, 0); | ||
| 829 | spin_lock_init(&vb->lock); | 848 | spin_lock_init(&vb->lock); |
| 830 | vb->va = va; | 849 | vb->va = va; |
| 831 | vb->free = VMAP_BBMAP_BITS; | 850 | /* At least something should be left free */ |
| 851 | BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); | ||
| 852 | vb->free = VMAP_BBMAP_BITS - (1UL << order); | ||
| 832 | vb->dirty = 0; | 853 | vb->dirty = 0; |
| 833 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); | 854 | vb->dirty_min = VMAP_BBMAP_BITS; |
| 855 | vb->dirty_max = 0; | ||
| 834 | INIT_LIST_HEAD(&vb->free_list); | 856 | INIT_LIST_HEAD(&vb->free_list); |
| 835 | 857 | ||
| 836 | vb_idx = addr_to_vb_idx(va->va_start); | 858 | vb_idx = addr_to_vb_idx(va->va_start); |
| @@ -842,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
| 842 | 864 | ||
| 843 | vbq = &get_cpu_var(vmap_block_queue); | 865 | vbq = &get_cpu_var(vmap_block_queue); |
| 844 | spin_lock(&vbq->lock); | 866 | spin_lock(&vbq->lock); |
| 845 | list_add_rcu(&vb->free_list, &vbq->free); | 867 | list_add_tail_rcu(&vb->free_list, &vbq->free); |
| 846 | spin_unlock(&vbq->lock); | 868 | spin_unlock(&vbq->lock); |
| 847 | put_cpu_var(vmap_block_queue); | 869 | put_cpu_var(vmap_block_queue); |
| 848 | 870 | ||
| 849 | return vb; | 871 | return vaddr; |
| 850 | } | 872 | } |
| 851 | 873 | ||
| 852 | static void free_vmap_block(struct vmap_block *vb) | 874 | static void free_vmap_block(struct vmap_block *vb) |
| @@ -881,7 +903,8 @@ static void purge_fragmented_blocks(int cpu) | |||
| 881 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { | 903 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { |
| 882 | vb->free = 0; /* prevent further allocs after releasing lock */ | 904 | vb->free = 0; /* prevent further allocs after releasing lock */ |
| 883 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ | 905 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ |
| 884 | bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); | 906 | vb->dirty_min = 0; |
| 907 | vb->dirty_max = VMAP_BBMAP_BITS; | ||
| 885 | spin_lock(&vbq->lock); | 908 | spin_lock(&vbq->lock); |
| 886 | list_del_rcu(&vb->free_list); | 909 | list_del_rcu(&vb->free_list); |
| 887 | spin_unlock(&vbq->lock); | 910 | spin_unlock(&vbq->lock); |
| @@ -910,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
| 910 | { | 933 | { |
| 911 | struct vmap_block_queue *vbq; | 934 | struct vmap_block_queue *vbq; |
| 912 | struct vmap_block *vb; | 935 | struct vmap_block *vb; |
| 913 | unsigned long addr = 0; | 936 | void *vaddr = NULL; |
| 914 | unsigned int order; | 937 | unsigned int order; |
| 915 | 938 | ||
| 916 | BUG_ON(size & ~PAGE_MASK); | 939 | BUG_ON(size & ~PAGE_MASK); |
| @@ -925,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
| 925 | } | 948 | } |
| 926 | order = get_order(size); | 949 | order = get_order(size); |
| 927 | 950 | ||
| 928 | again: | ||
| 929 | rcu_read_lock(); | 951 | rcu_read_lock(); |
| 930 | vbq = &get_cpu_var(vmap_block_queue); | 952 | vbq = &get_cpu_var(vmap_block_queue); |
| 931 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | 953 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { |
| 932 | int i; | 954 | unsigned long pages_off; |
| 933 | 955 | ||
| 934 | spin_lock(&vb->lock); | 956 | spin_lock(&vb->lock); |
| 935 | if (vb->free < 1UL << order) | 957 | if (vb->free < (1UL << order)) { |
| 936 | goto next; | 958 | spin_unlock(&vb->lock); |
| 959 | continue; | ||
| 960 | } | ||
| 937 | 961 | ||
| 938 | i = VMAP_BBMAP_BITS - vb->free; | 962 | pages_off = VMAP_BBMAP_BITS - vb->free; |
| 939 | addr = vb->va->va_start + (i << PAGE_SHIFT); | 963 | vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); |
| 940 | BUG_ON(addr_to_vb_idx(addr) != | ||
| 941 | addr_to_vb_idx(vb->va->va_start)); | ||
| 942 | vb->free -= 1UL << order; | 964 | vb->free -= 1UL << order; |
| 943 | if (vb->free == 0) { | 965 | if (vb->free == 0) { |
| 944 | spin_lock(&vbq->lock); | 966 | spin_lock(&vbq->lock); |
| 945 | list_del_rcu(&vb->free_list); | 967 | list_del_rcu(&vb->free_list); |
| 946 | spin_unlock(&vbq->lock); | 968 | spin_unlock(&vbq->lock); |
| 947 | } | 969 | } |
| 970 | |||
| 948 | spin_unlock(&vb->lock); | 971 | spin_unlock(&vb->lock); |
| 949 | break; | 972 | break; |
| 950 | next: | ||
| 951 | spin_unlock(&vb->lock); | ||
| 952 | } | 973 | } |
| 953 | 974 | ||
| 954 | put_cpu_var(vmap_block_queue); | 975 | put_cpu_var(vmap_block_queue); |
| 955 | rcu_read_unlock(); | 976 | rcu_read_unlock(); |
| 956 | 977 | ||
| 957 | if (!addr) { | 978 | /* Allocate new block if nothing was found */ |
| 958 | vb = new_vmap_block(gfp_mask); | 979 | if (!vaddr) |
| 959 | if (IS_ERR(vb)) | 980 | vaddr = new_vmap_block(order, gfp_mask); |
| 960 | return vb; | ||
| 961 | goto again; | ||
| 962 | } | ||
| 963 | 981 | ||
| 964 | return (void *)addr; | 982 | return vaddr; |
| 965 | } | 983 | } |
| 966 | 984 | ||
| 967 | static void vb_free(const void *addr, unsigned long size) | 985 | static void vb_free(const void *addr, unsigned long size) |
| @@ -979,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size) | |||
| 979 | order = get_order(size); | 997 | order = get_order(size); |
| 980 | 998 | ||
| 981 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); | 999 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); |
| 1000 | offset >>= PAGE_SHIFT; | ||
| 982 | 1001 | ||
| 983 | vb_idx = addr_to_vb_idx((unsigned long)addr); | 1002 | vb_idx = addr_to_vb_idx((unsigned long)addr); |
| 984 | rcu_read_lock(); | 1003 | rcu_read_lock(); |
| @@ -989,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size) | |||
| 989 | vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); | 1008 | vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); |
| 990 | 1009 | ||
| 991 | spin_lock(&vb->lock); | 1010 | spin_lock(&vb->lock); |
| 992 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); | 1011 | |
| 1012 | /* Expand dirty range */ | ||
| 1013 | vb->dirty_min = min(vb->dirty_min, offset); | ||
| 1014 | vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); | ||
| 993 | 1015 | ||
| 994 | vb->dirty += 1UL << order; | 1016 | vb->dirty += 1UL << order; |
| 995 | if (vb->dirty == VMAP_BBMAP_BITS) { | 1017 | if (vb->dirty == VMAP_BBMAP_BITS) { |
| @@ -1028,25 +1050,18 @@ void vm_unmap_aliases(void) | |||
| 1028 | 1050 | ||
| 1029 | rcu_read_lock(); | 1051 | rcu_read_lock(); |
| 1030 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | 1052 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { |
| 1031 | int i, j; | ||
| 1032 | |||
| 1033 | spin_lock(&vb->lock); | 1053 | spin_lock(&vb->lock); |
| 1034 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); | 1054 | if (vb->dirty) { |
| 1035 | if (i < VMAP_BBMAP_BITS) { | 1055 | unsigned long va_start = vb->va->va_start; |
| 1036 | unsigned long s, e; | 1056 | unsigned long s, e; |
| 1037 | 1057 | ||
| 1038 | j = find_last_bit(vb->dirty_map, | 1058 | s = va_start + (vb->dirty_min << PAGE_SHIFT); |
| 1039 | VMAP_BBMAP_BITS); | 1059 | e = va_start + (vb->dirty_max << PAGE_SHIFT); |
| 1040 | j = j + 1; /* need exclusive index */ | ||
| 1041 | 1060 | ||
| 1042 | s = vb->va->va_start + (i << PAGE_SHIFT); | 1061 | start = min(s, start); |
| 1043 | e = vb->va->va_start + (j << PAGE_SHIFT); | 1062 | end = max(e, end); |
| 1044 | flush = 1; | ||
| 1045 | 1063 | ||
| 1046 | if (s < start) | 1064 | flush = 1; |
| 1047 | start = s; | ||
| 1048 | if (e > end) | ||
| 1049 | end = e; | ||
| 1050 | } | 1065 | } |
| 1051 | spin_unlock(&vb->lock); | 1066 | spin_unlock(&vb->lock); |
| 1052 | } | 1067 | } |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0dec1fa5f656..08bd7a3d464a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
| @@ -12,35 +12,6 @@ | |||
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | /* | 14 | /* |
| 15 | * This allocator is designed for use with zram. Thus, the allocator is | ||
| 16 | * supposed to work well under low memory conditions. In particular, it | ||
| 17 | * never attempts higher order page allocation which is very likely to | ||
| 18 | * fail under memory pressure. On the other hand, if we just use single | ||
| 19 | * (0-order) pages, it would suffer from very high fragmentation -- | ||
| 20 | * any object of size PAGE_SIZE/2 or larger would occupy an entire page. | ||
| 21 | * This was one of the major issues with its predecessor (xvmalloc). | ||
| 22 | * | ||
| 23 | * To overcome these issues, zsmalloc allocates a bunch of 0-order pages | ||
| 24 | * and links them together using various 'struct page' fields. These linked | ||
| 25 | * pages act as a single higher-order page i.e. an object can span 0-order | ||
| 26 | * page boundaries. The code refers to these linked pages as a single entity | ||
| 27 | * called zspage. | ||
| 28 | * | ||
| 29 | * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE | ||
| 30 | * since this satisfies the requirements of all its current users (in the | ||
| 31 | * worst case, page is incompressible and is thus stored "as-is" i.e. in | ||
| 32 | * uncompressed form). For allocation requests larger than this size, failure | ||
| 33 | * is returned (see zs_malloc). | ||
| 34 | * | ||
| 35 | * Additionally, zs_malloc() does not return a dereferenceable pointer. | ||
| 36 | * Instead, it returns an opaque handle (unsigned long) which encodes actual | ||
| 37 | * location of the allocated object. The reason for this indirection is that | ||
| 38 | * zsmalloc does not keep zspages permanently mapped since that would cause | ||
| 39 | * issues on 32-bit systems where the VA region for kernel space mappings | ||
| 40 | * is very small. So, before using the allocating memory, the object has to | ||
| 41 | * be mapped using zs_map_object() to get a usable pointer and subsequently | ||
| 42 | * unmapped using zs_unmap_object(). | ||
| 43 | * | ||
| 44 | * Following is how we use various fields and flags of underlying | 15 | * Following is how we use various fields and flags of underlying |
| 45 | * struct page(s) to form a zspage. | 16 | * struct page(s) to form a zspage. |
| 46 | * | 17 | * |
| @@ -57,6 +28,8 @@ | |||
| 57 | * | 28 | * |
| 58 | * page->private (union with page->first_page): refers to the | 29 | * page->private (union with page->first_page): refers to the |
| 59 | * component page after the first page | 30 | * component page after the first page |
| 31 | * If the page is first_page for huge object, it stores handle. | ||
| 32 | * Look at size_class->huge. | ||
| 60 | * page->freelist: points to the first free object in zspage. | 33 | * page->freelist: points to the first free object in zspage. |
| 61 | * Free objects are linked together using in-place | 34 | * Free objects are linked together using in-place |
| 62 | * metadata. | 35 | * metadata. |
| @@ -78,6 +51,7 @@ | |||
| 78 | 51 | ||
| 79 | #include <linux/module.h> | 52 | #include <linux/module.h> |
| 80 | #include <linux/kernel.h> | 53 | #include <linux/kernel.h> |
| 54 | #include <linux/sched.h> | ||
| 81 | #include <linux/bitops.h> | 55 | #include <linux/bitops.h> |
| 82 | #include <linux/errno.h> | 56 | #include <linux/errno.h> |
| 83 | #include <linux/highmem.h> | 57 | #include <linux/highmem.h> |
| @@ -110,6 +84,8 @@ | |||
| 110 | #define ZS_MAX_ZSPAGE_ORDER 2 | 84 | #define ZS_MAX_ZSPAGE_ORDER 2 |
| 111 | #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) | 85 | #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) |
| 112 | 86 | ||
| 87 | #define ZS_HANDLE_SIZE (sizeof(unsigned long)) | ||
| 88 | |||
| 113 | /* | 89 | /* |
| 114 | * Object location (<PFN>, <obj_idx>) is encoded as | 90 | * Object location (<PFN>, <obj_idx>) is encoded as |
| 115 | * as single (unsigned long) handle value. | 91 | * as single (unsigned long) handle value. |
| @@ -133,13 +109,33 @@ | |||
| 133 | #endif | 109 | #endif |
| 134 | #endif | 110 | #endif |
| 135 | #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) | 111 | #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) |
| 136 | #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) | 112 | |
| 113 | /* | ||
| 114 | * Memory for allocating for handle keeps object position by | ||
| 115 | * encoding <page, obj_idx> and the encoded value has a room | ||
| 116 | * in least bit(ie, look at obj_to_location). | ||
| 117 | * We use the bit to synchronize between object access by | ||
| 118 | * user and migration. | ||
| 119 | */ | ||
| 120 | #define HANDLE_PIN_BIT 0 | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Head in allocated object should have OBJ_ALLOCATED_TAG | ||
| 124 | * to identify the object was allocated or not. | ||
| 125 | * It's okay to add the status bit in the least bit because | ||
| 126 | * header keeps handle which is 4byte-aligned address so we | ||
| 127 | * have room for two bit at least. | ||
| 128 | */ | ||
| 129 | #define OBJ_ALLOCATED_TAG 1 | ||
| 130 | #define OBJ_TAG_BITS 1 | ||
| 131 | #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) | ||
| 137 | #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) | 132 | #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) |
| 138 | 133 | ||
| 139 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) | 134 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) |
| 140 | /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ | 135 | /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ |
| 141 | #define ZS_MIN_ALLOC_SIZE \ | 136 | #define ZS_MIN_ALLOC_SIZE \ |
| 142 | MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) | 137 | MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) |
| 138 | /* each chunk includes extra space to keep handle */ | ||
| 143 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE | 139 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE |
| 144 | 140 | ||
| 145 | /* | 141 | /* |
| @@ -172,6 +168,8 @@ enum fullness_group { | |||
| 172 | enum zs_stat_type { | 168 | enum zs_stat_type { |
| 173 | OBJ_ALLOCATED, | 169 | OBJ_ALLOCATED, |
| 174 | OBJ_USED, | 170 | OBJ_USED, |
| 171 | CLASS_ALMOST_FULL, | ||
| 172 | CLASS_ALMOST_EMPTY, | ||
| 175 | NR_ZS_STAT_TYPE, | 173 | NR_ZS_STAT_TYPE, |
| 176 | }; | 174 | }; |
| 177 | 175 | ||
| @@ -216,6 +214,8 @@ struct size_class { | |||
| 216 | 214 | ||
| 217 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | 215 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ |
| 218 | int pages_per_zspage; | 216 | int pages_per_zspage; |
| 217 | /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ | ||
| 218 | bool huge; | ||
| 219 | 219 | ||
| 220 | #ifdef CONFIG_ZSMALLOC_STAT | 220 | #ifdef CONFIG_ZSMALLOC_STAT |
| 221 | struct zs_size_stat stats; | 221 | struct zs_size_stat stats; |
| @@ -233,14 +233,24 @@ struct size_class { | |||
| 233 | * This must be power of 2 and less than or equal to ZS_ALIGN | 233 | * This must be power of 2 and less than or equal to ZS_ALIGN |
| 234 | */ | 234 | */ |
| 235 | struct link_free { | 235 | struct link_free { |
| 236 | /* Handle of next free chunk (encodes <PFN, obj_idx>) */ | 236 | union { |
| 237 | void *next; | 237 | /* |
| 238 | * Position of next free chunk (encodes <PFN, obj_idx>) | ||
| 239 | * It's valid for non-allocated object | ||
| 240 | */ | ||
| 241 | void *next; | ||
| 242 | /* | ||
| 243 | * Handle of allocated object. | ||
| 244 | */ | ||
| 245 | unsigned long handle; | ||
| 246 | }; | ||
| 238 | }; | 247 | }; |
| 239 | 248 | ||
| 240 | struct zs_pool { | 249 | struct zs_pool { |
| 241 | char *name; | 250 | char *name; |
| 242 | 251 | ||
| 243 | struct size_class **size_class; | 252 | struct size_class **size_class; |
| 253 | struct kmem_cache *handle_cachep; | ||
| 244 | 254 | ||
| 245 | gfp_t flags; /* allocation flags used when growing pool */ | 255 | gfp_t flags; /* allocation flags used when growing pool */ |
| 246 | atomic_long_t pages_allocated; | 256 | atomic_long_t pages_allocated; |
| @@ -267,8 +277,37 @@ struct mapping_area { | |||
| 267 | #endif | 277 | #endif |
| 268 | char *vm_addr; /* address of kmap_atomic()'ed pages */ | 278 | char *vm_addr; /* address of kmap_atomic()'ed pages */ |
| 269 | enum zs_mapmode vm_mm; /* mapping mode */ | 279 | enum zs_mapmode vm_mm; /* mapping mode */ |
| 280 | bool huge; | ||
| 270 | }; | 281 | }; |
| 271 | 282 | ||
| 283 | static int create_handle_cache(struct zs_pool *pool) | ||
| 284 | { | ||
| 285 | pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, | ||
| 286 | 0, 0, NULL); | ||
| 287 | return pool->handle_cachep ? 0 : 1; | ||
| 288 | } | ||
| 289 | |||
| 290 | static void destroy_handle_cache(struct zs_pool *pool) | ||
| 291 | { | ||
| 292 | kmem_cache_destroy(pool->handle_cachep); | ||
| 293 | } | ||
| 294 | |||
| 295 | static unsigned long alloc_handle(struct zs_pool *pool) | ||
| 296 | { | ||
| 297 | return (unsigned long)kmem_cache_alloc(pool->handle_cachep, | ||
| 298 | pool->flags & ~__GFP_HIGHMEM); | ||
| 299 | } | ||
| 300 | |||
| 301 | static void free_handle(struct zs_pool *pool, unsigned long handle) | ||
| 302 | { | ||
| 303 | kmem_cache_free(pool->handle_cachep, (void *)handle); | ||
| 304 | } | ||
| 305 | |||
| 306 | static void record_obj(unsigned long handle, unsigned long obj) | ||
| 307 | { | ||
| 308 | *(unsigned long *)handle = obj; | ||
| 309 | } | ||
| 310 | |||
| 272 | /* zpool driver */ | 311 | /* zpool driver */ |
| 273 | 312 | ||
| 274 | #ifdef CONFIG_ZPOOL | 313 | #ifdef CONFIG_ZPOOL |
| @@ -346,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = { | |||
| 346 | MODULE_ALIAS("zpool-zsmalloc"); | 385 | MODULE_ALIAS("zpool-zsmalloc"); |
| 347 | #endif /* CONFIG_ZPOOL */ | 386 | #endif /* CONFIG_ZPOOL */ |
| 348 | 387 | ||
| 388 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) | ||
| 389 | { | ||
| 390 | return pages_per_zspage * PAGE_SIZE / size; | ||
| 391 | } | ||
| 392 | |||
| 349 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ | 393 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ |
| 350 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); | 394 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); |
| 351 | 395 | ||
| @@ -396,9 +440,182 @@ static int get_size_class_index(int size) | |||
| 396 | idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, | 440 | idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, |
| 397 | ZS_SIZE_CLASS_DELTA); | 441 | ZS_SIZE_CLASS_DELTA); |
| 398 | 442 | ||
| 399 | return idx; | 443 | return min(zs_size_classes - 1, idx); |
| 444 | } | ||
| 445 | |||
| 446 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 447 | |||
| 448 | static inline void zs_stat_inc(struct size_class *class, | ||
| 449 | enum zs_stat_type type, unsigned long cnt) | ||
| 450 | { | ||
| 451 | class->stats.objs[type] += cnt; | ||
| 452 | } | ||
| 453 | |||
| 454 | static inline void zs_stat_dec(struct size_class *class, | ||
| 455 | enum zs_stat_type type, unsigned long cnt) | ||
| 456 | { | ||
| 457 | class->stats.objs[type] -= cnt; | ||
| 458 | } | ||
| 459 | |||
| 460 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 461 | enum zs_stat_type type) | ||
| 462 | { | ||
| 463 | return class->stats.objs[type]; | ||
| 464 | } | ||
| 465 | |||
| 466 | static int __init zs_stat_init(void) | ||
| 467 | { | ||
| 468 | if (!debugfs_initialized()) | ||
| 469 | return -ENODEV; | ||
| 470 | |||
| 471 | zs_stat_root = debugfs_create_dir("zsmalloc", NULL); | ||
| 472 | if (!zs_stat_root) | ||
| 473 | return -ENOMEM; | ||
| 474 | |||
| 475 | return 0; | ||
| 476 | } | ||
| 477 | |||
| 478 | static void __exit zs_stat_exit(void) | ||
| 479 | { | ||
| 480 | debugfs_remove_recursive(zs_stat_root); | ||
| 481 | } | ||
| 482 | |||
| 483 | static int zs_stats_size_show(struct seq_file *s, void *v) | ||
| 484 | { | ||
| 485 | int i; | ||
| 486 | struct zs_pool *pool = s->private; | ||
| 487 | struct size_class *class; | ||
| 488 | int objs_per_zspage; | ||
| 489 | unsigned long class_almost_full, class_almost_empty; | ||
| 490 | unsigned long obj_allocated, obj_used, pages_used; | ||
| 491 | unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; | ||
| 492 | unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; | ||
| 493 | |||
| 494 | seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", | ||
| 495 | "class", "size", "almost_full", "almost_empty", | ||
| 496 | "obj_allocated", "obj_used", "pages_used", | ||
| 497 | "pages_per_zspage"); | ||
| 498 | |||
| 499 | for (i = 0; i < zs_size_classes; i++) { | ||
| 500 | class = pool->size_class[i]; | ||
| 501 | |||
| 502 | if (class->index != i) | ||
| 503 | continue; | ||
| 504 | |||
| 505 | spin_lock(&class->lock); | ||
| 506 | class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); | ||
| 507 | class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); | ||
| 508 | obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); | ||
| 509 | obj_used = zs_stat_get(class, OBJ_USED); | ||
| 510 | spin_unlock(&class->lock); | ||
| 511 | |||
| 512 | objs_per_zspage = get_maxobj_per_zspage(class->size, | ||
| 513 | class->pages_per_zspage); | ||
| 514 | pages_used = obj_allocated / objs_per_zspage * | ||
| 515 | class->pages_per_zspage; | ||
| 516 | |||
| 517 | seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", | ||
| 518 | i, class->size, class_almost_full, class_almost_empty, | ||
| 519 | obj_allocated, obj_used, pages_used, | ||
| 520 | class->pages_per_zspage); | ||
| 521 | |||
| 522 | total_class_almost_full += class_almost_full; | ||
| 523 | total_class_almost_empty += class_almost_empty; | ||
| 524 | total_objs += obj_allocated; | ||
| 525 | total_used_objs += obj_used; | ||
| 526 | total_pages += pages_used; | ||
| 527 | } | ||
| 528 | |||
| 529 | seq_puts(s, "\n"); | ||
| 530 | seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", | ||
| 531 | "Total", "", total_class_almost_full, | ||
| 532 | total_class_almost_empty, total_objs, | ||
| 533 | total_used_objs, total_pages); | ||
| 534 | |||
| 535 | return 0; | ||
| 536 | } | ||
| 537 | |||
| 538 | static int zs_stats_size_open(struct inode *inode, struct file *file) | ||
| 539 | { | ||
| 540 | return single_open(file, zs_stats_size_show, inode->i_private); | ||
| 541 | } | ||
| 542 | |||
| 543 | static const struct file_operations zs_stat_size_ops = { | ||
| 544 | .open = zs_stats_size_open, | ||
| 545 | .read = seq_read, | ||
| 546 | .llseek = seq_lseek, | ||
| 547 | .release = single_release, | ||
| 548 | }; | ||
| 549 | |||
| 550 | static int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 551 | { | ||
| 552 | struct dentry *entry; | ||
| 553 | |||
| 554 | if (!zs_stat_root) | ||
| 555 | return -ENODEV; | ||
| 556 | |||
| 557 | entry = debugfs_create_dir(name, zs_stat_root); | ||
| 558 | if (!entry) { | ||
| 559 | pr_warn("debugfs dir <%s> creation failed\n", name); | ||
| 560 | return -ENOMEM; | ||
| 561 | } | ||
| 562 | pool->stat_dentry = entry; | ||
| 563 | |||
| 564 | entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, | ||
| 565 | pool->stat_dentry, pool, &zs_stat_size_ops); | ||
| 566 | if (!entry) { | ||
| 567 | pr_warn("%s: debugfs file entry <%s> creation failed\n", | ||
| 568 | name, "classes"); | ||
| 569 | return -ENOMEM; | ||
| 570 | } | ||
| 571 | |||
| 572 | return 0; | ||
| 573 | } | ||
| 574 | |||
| 575 | static void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 576 | { | ||
| 577 | debugfs_remove_recursive(pool->stat_dentry); | ||
| 578 | } | ||
| 579 | |||
| 580 | #else /* CONFIG_ZSMALLOC_STAT */ | ||
| 581 | |||
| 582 | static inline void zs_stat_inc(struct size_class *class, | ||
| 583 | enum zs_stat_type type, unsigned long cnt) | ||
| 584 | { | ||
| 585 | } | ||
| 586 | |||
| 587 | static inline void zs_stat_dec(struct size_class *class, | ||
| 588 | enum zs_stat_type type, unsigned long cnt) | ||
| 589 | { | ||
| 590 | } | ||
| 591 | |||
| 592 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 593 | enum zs_stat_type type) | ||
| 594 | { | ||
| 595 | return 0; | ||
| 596 | } | ||
| 597 | |||
| 598 | static int __init zs_stat_init(void) | ||
| 599 | { | ||
| 600 | return 0; | ||
| 601 | } | ||
| 602 | |||
| 603 | static void __exit zs_stat_exit(void) | ||
| 604 | { | ||
| 605 | } | ||
| 606 | |||
| 607 | static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 608 | { | ||
| 609 | return 0; | ||
| 610 | } | ||
| 611 | |||
| 612 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 613 | { | ||
| 400 | } | 614 | } |
| 401 | 615 | ||
| 616 | #endif | ||
| 617 | |||
| 618 | |||
| 402 | /* | 619 | /* |
| 403 | * For each size class, zspages are divided into different groups | 620 | * For each size class, zspages are divided into different groups |
| 404 | * depending on how "full" they are. This was done so that we could | 621 | * depending on how "full" they are. This was done so that we could |
| @@ -419,7 +636,7 @@ static enum fullness_group get_fullness_group(struct page *page) | |||
| 419 | fg = ZS_EMPTY; | 636 | fg = ZS_EMPTY; |
| 420 | else if (inuse == max_objects) | 637 | else if (inuse == max_objects) |
| 421 | fg = ZS_FULL; | 638 | fg = ZS_FULL; |
| 422 | else if (inuse <= max_objects / fullness_threshold_frac) | 639 | else if (inuse <= 3 * max_objects / fullness_threshold_frac) |
| 423 | fg = ZS_ALMOST_EMPTY; | 640 | fg = ZS_ALMOST_EMPTY; |
| 424 | else | 641 | else |
| 425 | fg = ZS_ALMOST_FULL; | 642 | fg = ZS_ALMOST_FULL; |
| @@ -448,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class, | |||
| 448 | list_add_tail(&page->lru, &(*head)->lru); | 665 | list_add_tail(&page->lru, &(*head)->lru); |
| 449 | 666 | ||
| 450 | *head = page; | 667 | *head = page; |
| 668 | zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? | ||
| 669 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); | ||
| 451 | } | 670 | } |
| 452 | 671 | ||
| 453 | /* | 672 | /* |
| @@ -473,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class, | |||
| 473 | struct page, lru); | 692 | struct page, lru); |
| 474 | 693 | ||
| 475 | list_del_init(&page->lru); | 694 | list_del_init(&page->lru); |
| 695 | zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? | ||
| 696 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); | ||
| 476 | } | 697 | } |
| 477 | 698 | ||
| 478 | /* | 699 | /* |
| @@ -484,11 +705,10 @@ static void remove_zspage(struct page *page, struct size_class *class, | |||
| 484 | * page from the freelist of the old fullness group to that of the new | 705 | * page from the freelist of the old fullness group to that of the new |
| 485 | * fullness group. | 706 | * fullness group. |
| 486 | */ | 707 | */ |
| 487 | static enum fullness_group fix_fullness_group(struct zs_pool *pool, | 708 | static enum fullness_group fix_fullness_group(struct size_class *class, |
| 488 | struct page *page) | 709 | struct page *page) |
| 489 | { | 710 | { |
| 490 | int class_idx; | 711 | int class_idx; |
| 491 | struct size_class *class; | ||
| 492 | enum fullness_group currfg, newfg; | 712 | enum fullness_group currfg, newfg; |
| 493 | 713 | ||
| 494 | BUG_ON(!is_first_page(page)); | 714 | BUG_ON(!is_first_page(page)); |
| @@ -498,7 +718,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, | |||
| 498 | if (newfg == currfg) | 718 | if (newfg == currfg) |
| 499 | goto out; | 719 | goto out; |
| 500 | 720 | ||
| 501 | class = pool->size_class[class_idx]; | ||
| 502 | remove_zspage(page, class, currfg); | 721 | remove_zspage(page, class, currfg); |
| 503 | insert_zspage(page, class, newfg); | 722 | insert_zspage(page, class, newfg); |
| 504 | set_zspage_mapping(page, class_idx, newfg); | 723 | set_zspage_mapping(page, class_idx, newfg); |
| @@ -512,7 +731,8 @@ out: | |||
| 512 | * to form a zspage for each size class. This is important | 731 | * to form a zspage for each size class. This is important |
| 513 | * to reduce wastage due to unusable space left at end of | 732 | * to reduce wastage due to unusable space left at end of |
| 514 | * each zspage which is given as: | 733 | * each zspage which is given as: |
| 515 | * wastage = Zp - Zp % size_class | 734 | * wastage = Zp % class_size |
| 735 | * usage = Zp - wastage | ||
| 516 | * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... | 736 | * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... |
| 517 | * | 737 | * |
| 518 | * For example, for size class of 3/8 * PAGE_SIZE, we should | 738 | * For example, for size class of 3/8 * PAGE_SIZE, we should |
| @@ -571,35 +791,50 @@ static struct page *get_next_page(struct page *page) | |||
| 571 | 791 | ||
| 572 | /* | 792 | /* |
| 573 | * Encode <page, obj_idx> as a single handle value. | 793 | * Encode <page, obj_idx> as a single handle value. |
| 574 | * On hardware platforms with physical memory starting at 0x0 the pfn | 794 | * We use the least bit of handle for tagging. |
| 575 | * could be 0 so we ensure that the handle will never be 0 by adjusting the | ||
| 576 | * encoded obj_idx value before encoding. | ||
| 577 | */ | 795 | */ |
| 578 | static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) | 796 | static void *location_to_obj(struct page *page, unsigned long obj_idx) |
| 579 | { | 797 | { |
| 580 | unsigned long handle; | 798 | unsigned long obj; |
| 581 | 799 | ||
| 582 | if (!page) { | 800 | if (!page) { |
| 583 | BUG_ON(obj_idx); | 801 | BUG_ON(obj_idx); |
| 584 | return NULL; | 802 | return NULL; |
| 585 | } | 803 | } |
| 586 | 804 | ||
| 587 | handle = page_to_pfn(page) << OBJ_INDEX_BITS; | 805 | obj = page_to_pfn(page) << OBJ_INDEX_BITS; |
| 588 | handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); | 806 | obj |= ((obj_idx) & OBJ_INDEX_MASK); |
| 807 | obj <<= OBJ_TAG_BITS; | ||
| 589 | 808 | ||
| 590 | return (void *)handle; | 809 | return (void *)obj; |
| 591 | } | 810 | } |
| 592 | 811 | ||
| 593 | /* | 812 | /* |
| 594 | * Decode <page, obj_idx> pair from the given object handle. We adjust the | 813 | * Decode <page, obj_idx> pair from the given object handle. We adjust the |
| 595 | * decoded obj_idx back to its original value since it was adjusted in | 814 | * decoded obj_idx back to its original value since it was adjusted in |
| 596 | * obj_location_to_handle(). | 815 | * location_to_obj(). |
| 597 | */ | 816 | */ |
| 598 | static void obj_handle_to_location(unsigned long handle, struct page **page, | 817 | static void obj_to_location(unsigned long obj, struct page **page, |
| 599 | unsigned long *obj_idx) | 818 | unsigned long *obj_idx) |
| 600 | { | 819 | { |
| 601 | *page = pfn_to_page(handle >> OBJ_INDEX_BITS); | 820 | obj >>= OBJ_TAG_BITS; |
| 602 | *obj_idx = (handle & OBJ_INDEX_MASK) - 1; | 821 | *page = pfn_to_page(obj >> OBJ_INDEX_BITS); |
| 822 | *obj_idx = (obj & OBJ_INDEX_MASK); | ||
| 823 | } | ||
| 824 | |||
| 825 | static unsigned long handle_to_obj(unsigned long handle) | ||
| 826 | { | ||
| 827 | return *(unsigned long *)handle; | ||
| 828 | } | ||
| 829 | |||
| 830 | static unsigned long obj_to_head(struct size_class *class, struct page *page, | ||
| 831 | void *obj) | ||
| 832 | { | ||
| 833 | if (class->huge) { | ||
| 834 | VM_BUG_ON(!is_first_page(page)); | ||
| 835 | return *(unsigned long *)page_private(page); | ||
| 836 | } else | ||
| 837 | return *(unsigned long *)obj; | ||
| 603 | } | 838 | } |
| 604 | 839 | ||
| 605 | static unsigned long obj_idx_to_offset(struct page *page, | 840 | static unsigned long obj_idx_to_offset(struct page *page, |
| @@ -613,6 +848,25 @@ static unsigned long obj_idx_to_offset(struct page *page, | |||
| 613 | return off + obj_idx * class_size; | 848 | return off + obj_idx * class_size; |
| 614 | } | 849 | } |
| 615 | 850 | ||
| 851 | static inline int trypin_tag(unsigned long handle) | ||
| 852 | { | ||
| 853 | unsigned long *ptr = (unsigned long *)handle; | ||
| 854 | |||
| 855 | return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); | ||
| 856 | } | ||
| 857 | |||
| 858 | static void pin_tag(unsigned long handle) | ||
| 859 | { | ||
| 860 | while (!trypin_tag(handle)); | ||
| 861 | } | ||
| 862 | |||
| 863 | static void unpin_tag(unsigned long handle) | ||
| 864 | { | ||
| 865 | unsigned long *ptr = (unsigned long *)handle; | ||
| 866 | |||
| 867 | clear_bit_unlock(HANDLE_PIN_BIT, ptr); | ||
| 868 | } | ||
| 869 | |||
| 616 | static void reset_page(struct page *page) | 870 | static void reset_page(struct page *page) |
| 617 | { | 871 | { |
| 618 | clear_bit(PG_private, &page->flags); | 872 | clear_bit(PG_private, &page->flags); |
| @@ -674,7 +928,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
| 674 | link = (struct link_free *)vaddr + off / sizeof(*link); | 928 | link = (struct link_free *)vaddr + off / sizeof(*link); |
| 675 | 929 | ||
| 676 | while ((off += class->size) < PAGE_SIZE) { | 930 | while ((off += class->size) < PAGE_SIZE) { |
| 677 | link->next = obj_location_to_handle(page, i++); | 931 | link->next = location_to_obj(page, i++); |
| 678 | link += class->size / sizeof(*link); | 932 | link += class->size / sizeof(*link); |
| 679 | } | 933 | } |
| 680 | 934 | ||
| @@ -684,7 +938,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
| 684 | * page (if present) | 938 | * page (if present) |
| 685 | */ | 939 | */ |
| 686 | next_page = get_next_page(page); | 940 | next_page = get_next_page(page); |
| 687 | link->next = obj_location_to_handle(next_page, 0); | 941 | link->next = location_to_obj(next_page, 0); |
| 688 | kunmap_atomic(vaddr); | 942 | kunmap_atomic(vaddr); |
| 689 | page = next_page; | 943 | page = next_page; |
| 690 | off %= PAGE_SIZE; | 944 | off %= PAGE_SIZE; |
| @@ -738,7 +992,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) | |||
| 738 | 992 | ||
| 739 | init_zspage(first_page, class); | 993 | init_zspage(first_page, class); |
| 740 | 994 | ||
| 741 | first_page->freelist = obj_location_to_handle(first_page, 0); | 995 | first_page->freelist = location_to_obj(first_page, 0); |
| 742 | /* Maximum number of objects we can store in this zspage */ | 996 | /* Maximum number of objects we can store in this zspage */ |
| 743 | first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; | 997 | first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; |
| 744 | 998 | ||
| @@ -860,12 +1114,19 @@ static void __zs_unmap_object(struct mapping_area *area, | |||
| 860 | { | 1114 | { |
| 861 | int sizes[2]; | 1115 | int sizes[2]; |
| 862 | void *addr; | 1116 | void *addr; |
| 863 | char *buf = area->vm_buf; | 1117 | char *buf; |
| 864 | 1118 | ||
| 865 | /* no write fastpath */ | 1119 | /* no write fastpath */ |
| 866 | if (area->vm_mm == ZS_MM_RO) | 1120 | if (area->vm_mm == ZS_MM_RO) |
| 867 | goto out; | 1121 | goto out; |
| 868 | 1122 | ||
| 1123 | buf = area->vm_buf; | ||
| 1124 | if (!area->huge) { | ||
| 1125 | buf = buf + ZS_HANDLE_SIZE; | ||
| 1126 | size -= ZS_HANDLE_SIZE; | ||
| 1127 | off += ZS_HANDLE_SIZE; | ||
| 1128 | } | ||
| 1129 | |||
| 869 | sizes[0] = PAGE_SIZE - off; | 1130 | sizes[0] = PAGE_SIZE - off; |
| 870 | sizes[1] = size - sizes[0]; | 1131 | sizes[1] = size - sizes[0]; |
| 871 | 1132 | ||
| @@ -952,11 +1213,6 @@ static void init_zs_size_classes(void) | |||
| 952 | zs_size_classes = nr; | 1213 | zs_size_classes = nr; |
| 953 | } | 1214 | } |
| 954 | 1215 | ||
| 955 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) | ||
| 956 | { | ||
| 957 | return pages_per_zspage * PAGE_SIZE / size; | ||
| 958 | } | ||
| 959 | |||
| 960 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | 1216 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) |
| 961 | { | 1217 | { |
| 962 | if (prev->pages_per_zspage != pages_per_zspage) | 1218 | if (prev->pages_per_zspage != pages_per_zspage) |
| @@ -969,166 +1225,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | |||
| 969 | return true; | 1225 | return true; |
| 970 | } | 1226 | } |
| 971 | 1227 | ||
| 972 | #ifdef CONFIG_ZSMALLOC_STAT | 1228 | static bool zspage_full(struct page *page) |
| 973 | |||
| 974 | static inline void zs_stat_inc(struct size_class *class, | ||
| 975 | enum zs_stat_type type, unsigned long cnt) | ||
| 976 | { | ||
| 977 | class->stats.objs[type] += cnt; | ||
| 978 | } | ||
| 979 | |||
| 980 | static inline void zs_stat_dec(struct size_class *class, | ||
| 981 | enum zs_stat_type type, unsigned long cnt) | ||
| 982 | { | ||
| 983 | class->stats.objs[type] -= cnt; | ||
| 984 | } | ||
| 985 | |||
| 986 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 987 | enum zs_stat_type type) | ||
| 988 | { | ||
| 989 | return class->stats.objs[type]; | ||
| 990 | } | ||
| 991 | |||
| 992 | static int __init zs_stat_init(void) | ||
| 993 | { | ||
| 994 | if (!debugfs_initialized()) | ||
| 995 | return -ENODEV; | ||
| 996 | |||
| 997 | zs_stat_root = debugfs_create_dir("zsmalloc", NULL); | ||
| 998 | if (!zs_stat_root) | ||
| 999 | return -ENOMEM; | ||
| 1000 | |||
| 1001 | return 0; | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static void __exit zs_stat_exit(void) | ||
| 1005 | { | ||
| 1006 | debugfs_remove_recursive(zs_stat_root); | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | static int zs_stats_size_show(struct seq_file *s, void *v) | ||
| 1010 | { | 1229 | { |
| 1011 | int i; | 1230 | BUG_ON(!is_first_page(page)); |
| 1012 | struct zs_pool *pool = s->private; | ||
| 1013 | struct size_class *class; | ||
| 1014 | int objs_per_zspage; | ||
| 1015 | unsigned long obj_allocated, obj_used, pages_used; | ||
| 1016 | unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; | ||
| 1017 | |||
| 1018 | seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", | ||
| 1019 | "obj_allocated", "obj_used", "pages_used"); | ||
| 1020 | |||
| 1021 | for (i = 0; i < zs_size_classes; i++) { | ||
| 1022 | class = pool->size_class[i]; | ||
| 1023 | |||
| 1024 | if (class->index != i) | ||
| 1025 | continue; | ||
| 1026 | |||
| 1027 | spin_lock(&class->lock); | ||
| 1028 | obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); | ||
| 1029 | obj_used = zs_stat_get(class, OBJ_USED); | ||
| 1030 | spin_unlock(&class->lock); | ||
| 1031 | |||
| 1032 | objs_per_zspage = get_maxobj_per_zspage(class->size, | ||
| 1033 | class->pages_per_zspage); | ||
| 1034 | pages_used = obj_allocated / objs_per_zspage * | ||
| 1035 | class->pages_per_zspage; | ||
| 1036 | |||
| 1037 | seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, | ||
| 1038 | class->size, obj_allocated, obj_used, pages_used); | ||
| 1039 | |||
| 1040 | total_objs += obj_allocated; | ||
| 1041 | total_used_objs += obj_used; | ||
| 1042 | total_pages += pages_used; | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | seq_puts(s, "\n"); | ||
| 1046 | seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", | ||
| 1047 | total_objs, total_used_objs, total_pages); | ||
| 1048 | |||
| 1049 | return 0; | ||
| 1050 | } | ||
| 1051 | |||
| 1052 | static int zs_stats_size_open(struct inode *inode, struct file *file) | ||
| 1053 | { | ||
| 1054 | return single_open(file, zs_stats_size_show, inode->i_private); | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | static const struct file_operations zs_stat_size_ops = { | ||
| 1058 | .open = zs_stats_size_open, | ||
| 1059 | .read = seq_read, | ||
| 1060 | .llseek = seq_lseek, | ||
| 1061 | .release = single_release, | ||
| 1062 | }; | ||
| 1063 | |||
| 1064 | static int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 1065 | { | ||
| 1066 | struct dentry *entry; | ||
| 1067 | |||
| 1068 | if (!zs_stat_root) | ||
| 1069 | return -ENODEV; | ||
| 1070 | |||
| 1071 | entry = debugfs_create_dir(name, zs_stat_root); | ||
| 1072 | if (!entry) { | ||
| 1073 | pr_warn("debugfs dir <%s> creation failed\n", name); | ||
| 1074 | return -ENOMEM; | ||
| 1075 | } | ||
| 1076 | pool->stat_dentry = entry; | ||
| 1077 | |||
| 1078 | entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, | ||
| 1079 | pool->stat_dentry, pool, &zs_stat_size_ops); | ||
| 1080 | if (!entry) { | ||
| 1081 | pr_warn("%s: debugfs file entry <%s> creation failed\n", | ||
| 1082 | name, "obj_in_classes"); | ||
| 1083 | return -ENOMEM; | ||
| 1084 | } | ||
| 1085 | |||
| 1086 | return 0; | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | static void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 1090 | { | ||
| 1091 | debugfs_remove_recursive(pool->stat_dentry); | ||
| 1092 | } | ||
| 1093 | |||
| 1094 | #else /* CONFIG_ZSMALLOC_STAT */ | ||
| 1095 | |||
| 1096 | static inline void zs_stat_inc(struct size_class *class, | ||
| 1097 | enum zs_stat_type type, unsigned long cnt) | ||
| 1098 | { | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | static inline void zs_stat_dec(struct size_class *class, | ||
| 1102 | enum zs_stat_type type, unsigned long cnt) | ||
| 1103 | { | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 1107 | enum zs_stat_type type) | ||
| 1108 | { | ||
| 1109 | return 0; | ||
| 1110 | } | ||
| 1111 | |||
| 1112 | static int __init zs_stat_init(void) | ||
| 1113 | { | ||
| 1114 | return 0; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | static void __exit zs_stat_exit(void) | ||
| 1118 | { | ||
| 1119 | } | ||
| 1120 | |||
| 1121 | static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 1122 | { | ||
| 1123 | return 0; | ||
| 1124 | } | ||
| 1125 | 1231 | ||
| 1126 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | 1232 | return page->inuse == page->objects; |
| 1127 | { | ||
| 1128 | } | 1233 | } |
| 1129 | 1234 | ||
| 1130 | #endif | ||
| 1131 | |||
| 1132 | unsigned long zs_get_total_pages(struct zs_pool *pool) | 1235 | unsigned long zs_get_total_pages(struct zs_pool *pool) |
| 1133 | { | 1236 | { |
| 1134 | return atomic_long_read(&pool->pages_allocated); | 1237 | return atomic_long_read(&pool->pages_allocated); |
| @@ -1153,13 +1256,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
| 1153 | enum zs_mapmode mm) | 1256 | enum zs_mapmode mm) |
| 1154 | { | 1257 | { |
| 1155 | struct page *page; | 1258 | struct page *page; |
| 1156 | unsigned long obj_idx, off; | 1259 | unsigned long obj, obj_idx, off; |
| 1157 | 1260 | ||
| 1158 | unsigned int class_idx; | 1261 | unsigned int class_idx; |
| 1159 | enum fullness_group fg; | 1262 | enum fullness_group fg; |
| 1160 | struct size_class *class; | 1263 | struct size_class *class; |
| 1161 | struct mapping_area *area; | 1264 | struct mapping_area *area; |
| 1162 | struct page *pages[2]; | 1265 | struct page *pages[2]; |
| 1266 | void *ret; | ||
| 1163 | 1267 | ||
| 1164 | BUG_ON(!handle); | 1268 | BUG_ON(!handle); |
| 1165 | 1269 | ||
| @@ -1170,7 +1274,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
| 1170 | */ | 1274 | */ |
| 1171 | BUG_ON(in_interrupt()); | 1275 | BUG_ON(in_interrupt()); |
| 1172 | 1276 | ||
| 1173 | obj_handle_to_location(handle, &page, &obj_idx); | 1277 | /* From now on, migration cannot move the object */ |
| 1278 | pin_tag(handle); | ||
| 1279 | |||
| 1280 | obj = handle_to_obj(handle); | ||
| 1281 | obj_to_location(obj, &page, &obj_idx); | ||
| 1174 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1282 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
| 1175 | class = pool->size_class[class_idx]; | 1283 | class = pool->size_class[class_idx]; |
| 1176 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1284 | off = obj_idx_to_offset(page, obj_idx, class->size); |
| @@ -1180,7 +1288,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
| 1180 | if (off + class->size <= PAGE_SIZE) { | 1288 | if (off + class->size <= PAGE_SIZE) { |
| 1181 | /* this object is contained entirely within a page */ | 1289 | /* this object is contained entirely within a page */ |
| 1182 | area->vm_addr = kmap_atomic(page); | 1290 | area->vm_addr = kmap_atomic(page); |
| 1183 | return area->vm_addr + off; | 1291 | ret = area->vm_addr + off; |
| 1292 | goto out; | ||
| 1184 | } | 1293 | } |
| 1185 | 1294 | ||
| 1186 | /* this object spans two pages */ | 1295 | /* this object spans two pages */ |
| @@ -1188,14 +1297,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
| 1188 | pages[1] = get_next_page(page); | 1297 | pages[1] = get_next_page(page); |
| 1189 | BUG_ON(!pages[1]); | 1298 | BUG_ON(!pages[1]); |
| 1190 | 1299 | ||
| 1191 | return __zs_map_object(area, pages, off, class->size); | 1300 | ret = __zs_map_object(area, pages, off, class->size); |
| 1301 | out: | ||
| 1302 | if (!class->huge) | ||
| 1303 | ret += ZS_HANDLE_SIZE; | ||
| 1304 | |||
| 1305 | return ret; | ||
| 1192 | } | 1306 | } |
| 1193 | EXPORT_SYMBOL_GPL(zs_map_object); | 1307 | EXPORT_SYMBOL_GPL(zs_map_object); |
| 1194 | 1308 | ||
| 1195 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | 1309 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle) |
| 1196 | { | 1310 | { |
| 1197 | struct page *page; | 1311 | struct page *page; |
| 1198 | unsigned long obj_idx, off; | 1312 | unsigned long obj, obj_idx, off; |
| 1199 | 1313 | ||
| 1200 | unsigned int class_idx; | 1314 | unsigned int class_idx; |
| 1201 | enum fullness_group fg; | 1315 | enum fullness_group fg; |
| @@ -1204,7 +1318,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
| 1204 | 1318 | ||
| 1205 | BUG_ON(!handle); | 1319 | BUG_ON(!handle); |
| 1206 | 1320 | ||
| 1207 | obj_handle_to_location(handle, &page, &obj_idx); | 1321 | obj = handle_to_obj(handle); |
| 1322 | obj_to_location(obj, &page, &obj_idx); | ||
| 1208 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1323 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
| 1209 | class = pool->size_class[class_idx]; | 1324 | class = pool->size_class[class_idx]; |
| 1210 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1325 | off = obj_idx_to_offset(page, obj_idx, class->size); |
| @@ -1222,9 +1337,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
| 1222 | __zs_unmap_object(area, pages, off, class->size); | 1337 | __zs_unmap_object(area, pages, off, class->size); |
| 1223 | } | 1338 | } |
| 1224 | put_cpu_var(zs_map_area); | 1339 | put_cpu_var(zs_map_area); |
| 1340 | unpin_tag(handle); | ||
| 1225 | } | 1341 | } |
| 1226 | EXPORT_SYMBOL_GPL(zs_unmap_object); | 1342 | EXPORT_SYMBOL_GPL(zs_unmap_object); |
| 1227 | 1343 | ||
| 1344 | static unsigned long obj_malloc(struct page *first_page, | ||
| 1345 | struct size_class *class, unsigned long handle) | ||
| 1346 | { | ||
| 1347 | unsigned long obj; | ||
| 1348 | struct link_free *link; | ||
| 1349 | |||
| 1350 | struct page *m_page; | ||
| 1351 | unsigned long m_objidx, m_offset; | ||
| 1352 | void *vaddr; | ||
| 1353 | |||
| 1354 | handle |= OBJ_ALLOCATED_TAG; | ||
| 1355 | obj = (unsigned long)first_page->freelist; | ||
| 1356 | obj_to_location(obj, &m_page, &m_objidx); | ||
| 1357 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | ||
| 1358 | |||
| 1359 | vaddr = kmap_atomic(m_page); | ||
| 1360 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); | ||
| 1361 | first_page->freelist = link->next; | ||
| 1362 | if (!class->huge) | ||
| 1363 | /* record handle in the header of allocated chunk */ | ||
| 1364 | link->handle = handle; | ||
| 1365 | else | ||
| 1366 | /* record handle in first_page->private */ | ||
| 1367 | set_page_private(first_page, handle); | ||
| 1368 | kunmap_atomic(vaddr); | ||
| 1369 | first_page->inuse++; | ||
| 1370 | zs_stat_inc(class, OBJ_USED, 1); | ||
| 1371 | |||
| 1372 | return obj; | ||
| 1373 | } | ||
| 1374 | |||
| 1375 | |||
| 1228 | /** | 1376 | /** |
| 1229 | * zs_malloc - Allocate block of given size from pool. | 1377 | * zs_malloc - Allocate block of given size from pool. |
| 1230 | * @pool: pool to allocate from | 1378 | * @pool: pool to allocate from |
| @@ -1236,17 +1384,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object); | |||
| 1236 | */ | 1384 | */ |
| 1237 | unsigned long zs_malloc(struct zs_pool *pool, size_t size) | 1385 | unsigned long zs_malloc(struct zs_pool *pool, size_t size) |
| 1238 | { | 1386 | { |
| 1239 | unsigned long obj; | 1387 | unsigned long handle, obj; |
| 1240 | struct link_free *link; | ||
| 1241 | struct size_class *class; | 1388 | struct size_class *class; |
| 1242 | void *vaddr; | 1389 | struct page *first_page; |
| 1243 | |||
| 1244 | struct page *first_page, *m_page; | ||
| 1245 | unsigned long m_objidx, m_offset; | ||
| 1246 | 1390 | ||
| 1247 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) | 1391 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) |
| 1248 | return 0; | 1392 | return 0; |
| 1249 | 1393 | ||
| 1394 | handle = alloc_handle(pool); | ||
| 1395 | if (!handle) | ||
| 1396 | return 0; | ||
| 1397 | |||
| 1398 | /* extra space in chunk to keep the handle */ | ||
| 1399 | size += ZS_HANDLE_SIZE; | ||
| 1250 | class = pool->size_class[get_size_class_index(size)]; | 1400 | class = pool->size_class[get_size_class_index(size)]; |
| 1251 | 1401 | ||
| 1252 | spin_lock(&class->lock); | 1402 | spin_lock(&class->lock); |
| @@ -1255,8 +1405,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1255 | if (!first_page) { | 1405 | if (!first_page) { |
| 1256 | spin_unlock(&class->lock); | 1406 | spin_unlock(&class->lock); |
| 1257 | first_page = alloc_zspage(class, pool->flags); | 1407 | first_page = alloc_zspage(class, pool->flags); |
| 1258 | if (unlikely(!first_page)) | 1408 | if (unlikely(!first_page)) { |
| 1409 | free_handle(pool, handle); | ||
| 1259 | return 0; | 1410 | return 0; |
| 1411 | } | ||
| 1260 | 1412 | ||
| 1261 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | 1413 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); |
| 1262 | atomic_long_add(class->pages_per_zspage, | 1414 | atomic_long_add(class->pages_per_zspage, |
| @@ -1267,73 +1419,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1267 | class->size, class->pages_per_zspage)); | 1419 | class->size, class->pages_per_zspage)); |
| 1268 | } | 1420 | } |
| 1269 | 1421 | ||
| 1270 | obj = (unsigned long)first_page->freelist; | 1422 | obj = obj_malloc(first_page, class, handle); |
| 1271 | obj_handle_to_location(obj, &m_page, &m_objidx); | ||
| 1272 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | ||
| 1273 | |||
| 1274 | vaddr = kmap_atomic(m_page); | ||
| 1275 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); | ||
| 1276 | first_page->freelist = link->next; | ||
| 1277 | memset(link, POISON_INUSE, sizeof(*link)); | ||
| 1278 | kunmap_atomic(vaddr); | ||
| 1279 | |||
| 1280 | first_page->inuse++; | ||
| 1281 | zs_stat_inc(class, OBJ_USED, 1); | ||
| 1282 | /* Now move the zspage to another fullness group, if required */ | 1423 | /* Now move the zspage to another fullness group, if required */ |
| 1283 | fix_fullness_group(pool, first_page); | 1424 | fix_fullness_group(class, first_page); |
| 1425 | record_obj(handle, obj); | ||
| 1284 | spin_unlock(&class->lock); | 1426 | spin_unlock(&class->lock); |
| 1285 | 1427 | ||
| 1286 | return obj; | 1428 | return handle; |
| 1287 | } | 1429 | } |
| 1288 | EXPORT_SYMBOL_GPL(zs_malloc); | 1430 | EXPORT_SYMBOL_GPL(zs_malloc); |
| 1289 | 1431 | ||
| 1290 | void zs_free(struct zs_pool *pool, unsigned long obj) | 1432 | static void obj_free(struct zs_pool *pool, struct size_class *class, |
| 1433 | unsigned long obj) | ||
| 1291 | { | 1434 | { |
| 1292 | struct link_free *link; | 1435 | struct link_free *link; |
| 1293 | struct page *first_page, *f_page; | 1436 | struct page *first_page, *f_page; |
| 1294 | unsigned long f_objidx, f_offset; | 1437 | unsigned long f_objidx, f_offset; |
| 1295 | void *vaddr; | 1438 | void *vaddr; |
| 1296 | |||
| 1297 | int class_idx; | 1439 | int class_idx; |
| 1298 | struct size_class *class; | ||
| 1299 | enum fullness_group fullness; | 1440 | enum fullness_group fullness; |
| 1300 | 1441 | ||
| 1301 | if (unlikely(!obj)) | 1442 | BUG_ON(!obj); |
| 1302 | return; | ||
| 1303 | 1443 | ||
| 1304 | obj_handle_to_location(obj, &f_page, &f_objidx); | 1444 | obj &= ~OBJ_ALLOCATED_TAG; |
| 1445 | obj_to_location(obj, &f_page, &f_objidx); | ||
| 1305 | first_page = get_first_page(f_page); | 1446 | first_page = get_first_page(f_page); |
| 1306 | 1447 | ||
| 1307 | get_zspage_mapping(first_page, &class_idx, &fullness); | 1448 | get_zspage_mapping(first_page, &class_idx, &fullness); |
| 1308 | class = pool->size_class[class_idx]; | ||
| 1309 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); | 1449 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); |
| 1310 | 1450 | ||
| 1311 | spin_lock(&class->lock); | 1451 | vaddr = kmap_atomic(f_page); |
| 1312 | 1452 | ||
| 1313 | /* Insert this object in containing zspage's freelist */ | 1453 | /* Insert this object in containing zspage's freelist */ |
| 1314 | vaddr = kmap_atomic(f_page); | ||
| 1315 | link = (struct link_free *)(vaddr + f_offset); | 1454 | link = (struct link_free *)(vaddr + f_offset); |
| 1316 | link->next = first_page->freelist; | 1455 | link->next = first_page->freelist; |
| 1456 | if (class->huge) | ||
| 1457 | set_page_private(first_page, 0); | ||
| 1317 | kunmap_atomic(vaddr); | 1458 | kunmap_atomic(vaddr); |
| 1318 | first_page->freelist = (void *)obj; | 1459 | first_page->freelist = (void *)obj; |
| 1319 | |||
| 1320 | first_page->inuse--; | 1460 | first_page->inuse--; |
| 1321 | fullness = fix_fullness_group(pool, first_page); | ||
| 1322 | |||
| 1323 | zs_stat_dec(class, OBJ_USED, 1); | 1461 | zs_stat_dec(class, OBJ_USED, 1); |
| 1324 | if (fullness == ZS_EMPTY) | 1462 | } |
| 1463 | |||
| 1464 | void zs_free(struct zs_pool *pool, unsigned long handle) | ||
| 1465 | { | ||
| 1466 | struct page *first_page, *f_page; | ||
| 1467 | unsigned long obj, f_objidx; | ||
| 1468 | int class_idx; | ||
| 1469 | struct size_class *class; | ||
| 1470 | enum fullness_group fullness; | ||
| 1471 | |||
| 1472 | if (unlikely(!handle)) | ||
| 1473 | return; | ||
| 1474 | |||
| 1475 | pin_tag(handle); | ||
| 1476 | obj = handle_to_obj(handle); | ||
| 1477 | obj_to_location(obj, &f_page, &f_objidx); | ||
| 1478 | first_page = get_first_page(f_page); | ||
| 1479 | |||
| 1480 | get_zspage_mapping(first_page, &class_idx, &fullness); | ||
| 1481 | class = pool->size_class[class_idx]; | ||
| 1482 | |||
| 1483 | spin_lock(&class->lock); | ||
| 1484 | obj_free(pool, class, obj); | ||
| 1485 | fullness = fix_fullness_group(class, first_page); | ||
| 1486 | if (fullness == ZS_EMPTY) { | ||
| 1325 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | 1487 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( |
| 1326 | class->size, class->pages_per_zspage)); | 1488 | class->size, class->pages_per_zspage)); |
| 1327 | 1489 | atomic_long_sub(class->pages_per_zspage, | |
| 1490 | &pool->pages_allocated); | ||
| 1491 | free_zspage(first_page); | ||
| 1492 | } | ||
| 1328 | spin_unlock(&class->lock); | 1493 | spin_unlock(&class->lock); |
| 1494 | unpin_tag(handle); | ||
| 1495 | |||
| 1496 | free_handle(pool, handle); | ||
| 1497 | } | ||
| 1498 | EXPORT_SYMBOL_GPL(zs_free); | ||
| 1499 | |||
| 1500 | static void zs_object_copy(unsigned long src, unsigned long dst, | ||
| 1501 | struct size_class *class) | ||
| 1502 | { | ||
| 1503 | struct page *s_page, *d_page; | ||
| 1504 | unsigned long s_objidx, d_objidx; | ||
| 1505 | unsigned long s_off, d_off; | ||
| 1506 | void *s_addr, *d_addr; | ||
| 1507 | int s_size, d_size, size; | ||
| 1508 | int written = 0; | ||
| 1509 | |||
| 1510 | s_size = d_size = class->size; | ||
| 1511 | |||
| 1512 | obj_to_location(src, &s_page, &s_objidx); | ||
| 1513 | obj_to_location(dst, &d_page, &d_objidx); | ||
| 1514 | |||
| 1515 | s_off = obj_idx_to_offset(s_page, s_objidx, class->size); | ||
| 1516 | d_off = obj_idx_to_offset(d_page, d_objidx, class->size); | ||
| 1517 | |||
| 1518 | if (s_off + class->size > PAGE_SIZE) | ||
| 1519 | s_size = PAGE_SIZE - s_off; | ||
| 1520 | |||
| 1521 | if (d_off + class->size > PAGE_SIZE) | ||
| 1522 | d_size = PAGE_SIZE - d_off; | ||
| 1523 | |||
| 1524 | s_addr = kmap_atomic(s_page); | ||
| 1525 | d_addr = kmap_atomic(d_page); | ||
| 1526 | |||
| 1527 | while (1) { | ||
| 1528 | size = min(s_size, d_size); | ||
| 1529 | memcpy(d_addr + d_off, s_addr + s_off, size); | ||
| 1530 | written += size; | ||
| 1531 | |||
| 1532 | if (written == class->size) | ||
| 1533 | break; | ||
| 1534 | |||
| 1535 | s_off += size; | ||
| 1536 | s_size -= size; | ||
| 1537 | d_off += size; | ||
| 1538 | d_size -= size; | ||
| 1539 | |||
| 1540 | if (s_off >= PAGE_SIZE) { | ||
| 1541 | kunmap_atomic(d_addr); | ||
| 1542 | kunmap_atomic(s_addr); | ||
| 1543 | s_page = get_next_page(s_page); | ||
| 1544 | BUG_ON(!s_page); | ||
| 1545 | s_addr = kmap_atomic(s_page); | ||
| 1546 | d_addr = kmap_atomic(d_page); | ||
| 1547 | s_size = class->size - written; | ||
| 1548 | s_off = 0; | ||
| 1549 | } | ||
| 1550 | |||
| 1551 | if (d_off >= PAGE_SIZE) { | ||
| 1552 | kunmap_atomic(d_addr); | ||
| 1553 | d_page = get_next_page(d_page); | ||
| 1554 | BUG_ON(!d_page); | ||
| 1555 | d_addr = kmap_atomic(d_page); | ||
| 1556 | d_size = class->size - written; | ||
| 1557 | d_off = 0; | ||
| 1558 | } | ||
| 1559 | } | ||
| 1560 | |||
| 1561 | kunmap_atomic(d_addr); | ||
| 1562 | kunmap_atomic(s_addr); | ||
| 1563 | } | ||
| 1564 | |||
| 1565 | /* | ||
| 1566 | * Find alloced object in zspage from index object and | ||
| 1567 | * return handle. | ||
| 1568 | */ | ||
| 1569 | static unsigned long find_alloced_obj(struct page *page, int index, | ||
| 1570 | struct size_class *class) | ||
| 1571 | { | ||
| 1572 | unsigned long head; | ||
| 1573 | int offset = 0; | ||
| 1574 | unsigned long handle = 0; | ||
| 1575 | void *addr = kmap_atomic(page); | ||
| 1576 | |||
| 1577 | if (!is_first_page(page)) | ||
| 1578 | offset = page->index; | ||
| 1579 | offset += class->size * index; | ||
| 1580 | |||
| 1581 | while (offset < PAGE_SIZE) { | ||
| 1582 | head = obj_to_head(class, page, addr + offset); | ||
| 1583 | if (head & OBJ_ALLOCATED_TAG) { | ||
| 1584 | handle = head & ~OBJ_ALLOCATED_TAG; | ||
| 1585 | if (trypin_tag(handle)) | ||
| 1586 | break; | ||
| 1587 | handle = 0; | ||
| 1588 | } | ||
| 1589 | |||
| 1590 | offset += class->size; | ||
| 1591 | index++; | ||
| 1592 | } | ||
| 1593 | |||
| 1594 | kunmap_atomic(addr); | ||
| 1595 | return handle; | ||
| 1596 | } | ||
| 1597 | |||
| 1598 | struct zs_compact_control { | ||
| 1599 | /* Source page for migration which could be a subpage of zspage. */ | ||
| 1600 | struct page *s_page; | ||
| 1601 | /* Destination page for migration which should be a first page | ||
| 1602 | * of zspage. */ | ||
| 1603 | struct page *d_page; | ||
| 1604 | /* Starting object index within @s_page which used for live object | ||
| 1605 | * in the subpage. */ | ||
| 1606 | int index; | ||
| 1607 | /* how many of objects are migrated */ | ||
| 1608 | int nr_migrated; | ||
| 1609 | }; | ||
| 1610 | |||
| 1611 | static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | ||
| 1612 | struct zs_compact_control *cc) | ||
| 1613 | { | ||
| 1614 | unsigned long used_obj, free_obj; | ||
| 1615 | unsigned long handle; | ||
| 1616 | struct page *s_page = cc->s_page; | ||
| 1617 | struct page *d_page = cc->d_page; | ||
| 1618 | unsigned long index = cc->index; | ||
| 1619 | int nr_migrated = 0; | ||
| 1620 | int ret = 0; | ||
| 1621 | |||
| 1622 | while (1) { | ||
| 1623 | handle = find_alloced_obj(s_page, index, class); | ||
| 1624 | if (!handle) { | ||
| 1625 | s_page = get_next_page(s_page); | ||
| 1626 | if (!s_page) | ||
| 1627 | break; | ||
| 1628 | index = 0; | ||
| 1629 | continue; | ||
| 1630 | } | ||
| 1631 | |||
| 1632 | /* Stop if there is no more space */ | ||
| 1633 | if (zspage_full(d_page)) { | ||
| 1634 | unpin_tag(handle); | ||
| 1635 | ret = -ENOMEM; | ||
| 1636 | break; | ||
| 1637 | } | ||
| 1638 | |||
| 1639 | used_obj = handle_to_obj(handle); | ||
| 1640 | free_obj = obj_malloc(d_page, class, handle); | ||
| 1641 | zs_object_copy(used_obj, free_obj, class); | ||
| 1642 | index++; | ||
| 1643 | record_obj(handle, free_obj); | ||
| 1644 | unpin_tag(handle); | ||
| 1645 | obj_free(pool, class, used_obj); | ||
| 1646 | nr_migrated++; | ||
| 1647 | } | ||
| 1648 | |||
| 1649 | /* Remember last position in this iteration */ | ||
| 1650 | cc->s_page = s_page; | ||
| 1651 | cc->index = index; | ||
| 1652 | cc->nr_migrated = nr_migrated; | ||
| 1653 | |||
| 1654 | return ret; | ||
| 1655 | } | ||
| 1656 | |||
| 1657 | static struct page *alloc_target_page(struct size_class *class) | ||
| 1658 | { | ||
| 1659 | int i; | ||
| 1660 | struct page *page; | ||
| 1661 | |||
| 1662 | for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { | ||
| 1663 | page = class->fullness_list[i]; | ||
| 1664 | if (page) { | ||
| 1665 | remove_zspage(page, class, i); | ||
| 1666 | break; | ||
| 1667 | } | ||
| 1668 | } | ||
| 1669 | |||
| 1670 | return page; | ||
| 1671 | } | ||
| 1672 | |||
| 1673 | static void putback_zspage(struct zs_pool *pool, struct size_class *class, | ||
| 1674 | struct page *first_page) | ||
| 1675 | { | ||
| 1676 | enum fullness_group fullness; | ||
| 1677 | |||
| 1678 | BUG_ON(!is_first_page(first_page)); | ||
| 1679 | |||
| 1680 | fullness = get_fullness_group(first_page); | ||
| 1681 | insert_zspage(first_page, class, fullness); | ||
| 1682 | set_zspage_mapping(first_page, class->index, fullness); | ||
| 1329 | 1683 | ||
| 1330 | if (fullness == ZS_EMPTY) { | 1684 | if (fullness == ZS_EMPTY) { |
| 1685 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
| 1686 | class->size, class->pages_per_zspage)); | ||
| 1331 | atomic_long_sub(class->pages_per_zspage, | 1687 | atomic_long_sub(class->pages_per_zspage, |
| 1332 | &pool->pages_allocated); | 1688 | &pool->pages_allocated); |
| 1689 | |||
| 1333 | free_zspage(first_page); | 1690 | free_zspage(first_page); |
| 1334 | } | 1691 | } |
| 1335 | } | 1692 | } |
| 1336 | EXPORT_SYMBOL_GPL(zs_free); | 1693 | |
| 1694 | static struct page *isolate_source_page(struct size_class *class) | ||
| 1695 | { | ||
| 1696 | struct page *page; | ||
| 1697 | |||
| 1698 | page = class->fullness_list[ZS_ALMOST_EMPTY]; | ||
| 1699 | if (page) | ||
| 1700 | remove_zspage(page, class, ZS_ALMOST_EMPTY); | ||
| 1701 | |||
| 1702 | return page; | ||
| 1703 | } | ||
| 1704 | |||
| 1705 | static unsigned long __zs_compact(struct zs_pool *pool, | ||
| 1706 | struct size_class *class) | ||
| 1707 | { | ||
| 1708 | int nr_to_migrate; | ||
| 1709 | struct zs_compact_control cc; | ||
| 1710 | struct page *src_page; | ||
| 1711 | struct page *dst_page = NULL; | ||
| 1712 | unsigned long nr_total_migrated = 0; | ||
| 1713 | |||
| 1714 | spin_lock(&class->lock); | ||
| 1715 | while ((src_page = isolate_source_page(class))) { | ||
| 1716 | |||
| 1717 | BUG_ON(!is_first_page(src_page)); | ||
| 1718 | |||
| 1719 | /* The goal is to migrate all live objects in source page */ | ||
| 1720 | nr_to_migrate = src_page->inuse; | ||
| 1721 | cc.index = 0; | ||
| 1722 | cc.s_page = src_page; | ||
| 1723 | |||
| 1724 | while ((dst_page = alloc_target_page(class))) { | ||
| 1725 | cc.d_page = dst_page; | ||
| 1726 | /* | ||
| 1727 | * If there is no more space in dst_page, try to | ||
| 1728 | * allocate another zspage. | ||
| 1729 | */ | ||
| 1730 | if (!migrate_zspage(pool, class, &cc)) | ||
| 1731 | break; | ||
| 1732 | |||
| 1733 | putback_zspage(pool, class, dst_page); | ||
| 1734 | nr_total_migrated += cc.nr_migrated; | ||
| 1735 | nr_to_migrate -= cc.nr_migrated; | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | /* Stop if we couldn't find slot */ | ||
| 1739 | if (dst_page == NULL) | ||
| 1740 | break; | ||
| 1741 | |||
| 1742 | putback_zspage(pool, class, dst_page); | ||
| 1743 | putback_zspage(pool, class, src_page); | ||
| 1744 | spin_unlock(&class->lock); | ||
| 1745 | nr_total_migrated += cc.nr_migrated; | ||
| 1746 | cond_resched(); | ||
| 1747 | spin_lock(&class->lock); | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | if (src_page) | ||
| 1751 | putback_zspage(pool, class, src_page); | ||
| 1752 | |||
| 1753 | spin_unlock(&class->lock); | ||
| 1754 | |||
| 1755 | return nr_total_migrated; | ||
| 1756 | } | ||
| 1757 | |||
| 1758 | unsigned long zs_compact(struct zs_pool *pool) | ||
| 1759 | { | ||
| 1760 | int i; | ||
| 1761 | unsigned long nr_migrated = 0; | ||
| 1762 | struct size_class *class; | ||
| 1763 | |||
| 1764 | for (i = zs_size_classes - 1; i >= 0; i--) { | ||
| 1765 | class = pool->size_class[i]; | ||
| 1766 | if (!class) | ||
| 1767 | continue; | ||
| 1768 | if (class->index != i) | ||
| 1769 | continue; | ||
| 1770 | nr_migrated += __zs_compact(pool, class); | ||
| 1771 | } | ||
| 1772 | |||
| 1773 | return nr_migrated; | ||
| 1774 | } | ||
| 1775 | EXPORT_SYMBOL_GPL(zs_compact); | ||
| 1337 | 1776 | ||
| 1338 | /** | 1777 | /** |
| 1339 | * zs_create_pool - Creates an allocation pool to work from. | 1778 | * zs_create_pool - Creates an allocation pool to work from. |
| @@ -1355,20 +1794,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) | |||
| 1355 | if (!pool) | 1794 | if (!pool) |
| 1356 | return NULL; | 1795 | return NULL; |
| 1357 | 1796 | ||
| 1358 | pool->name = kstrdup(name, GFP_KERNEL); | ||
| 1359 | if (!pool->name) { | ||
| 1360 | kfree(pool); | ||
| 1361 | return NULL; | ||
| 1362 | } | ||
| 1363 | |||
| 1364 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), | 1797 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
| 1365 | GFP_KERNEL); | 1798 | GFP_KERNEL); |
| 1366 | if (!pool->size_class) { | 1799 | if (!pool->size_class) { |
| 1367 | kfree(pool->name); | ||
| 1368 | kfree(pool); | 1800 | kfree(pool); |
| 1369 | return NULL; | 1801 | return NULL; |
| 1370 | } | 1802 | } |
| 1371 | 1803 | ||
| 1804 | pool->name = kstrdup(name, GFP_KERNEL); | ||
| 1805 | if (!pool->name) | ||
| 1806 | goto err; | ||
| 1807 | |||
| 1808 | if (create_handle_cache(pool)) | ||
| 1809 | goto err; | ||
| 1810 | |||
| 1372 | /* | 1811 | /* |
| 1373 | * Iterate reversly, because, size of size_class that we want to use | 1812 | * Iterate reversly, because, size of size_class that we want to use |
| 1374 | * for merging should be larger or equal to current size. | 1813 | * for merging should be larger or equal to current size. |
| @@ -1406,6 +1845,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) | |||
| 1406 | class->size = size; | 1845 | class->size = size; |
| 1407 | class->index = i; | 1846 | class->index = i; |
| 1408 | class->pages_per_zspage = pages_per_zspage; | 1847 | class->pages_per_zspage = pages_per_zspage; |
| 1848 | if (pages_per_zspage == 1 && | ||
| 1849 | get_maxobj_per_zspage(size, pages_per_zspage) == 1) | ||
| 1850 | class->huge = true; | ||
| 1409 | spin_lock_init(&class->lock); | 1851 | spin_lock_init(&class->lock); |
| 1410 | pool->size_class[i] = class; | 1852 | pool->size_class[i] = class; |
| 1411 | 1853 | ||
| @@ -1450,6 +1892,7 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
| 1450 | kfree(class); | 1892 | kfree(class); |
| 1451 | } | 1893 | } |
| 1452 | 1894 | ||
| 1895 | destroy_handle_cache(pool); | ||
| 1453 | kfree(pool->size_class); | 1896 | kfree(pool->size_class); |
| 1454 | kfree(pool->name); | 1897 | kfree(pool->name); |
| 1455 | kfree(pool); | 1898 | kfree(pool); |
