author     Linus Torvalds <torvalds@linux-foundation.org>   2015-04-15 19:39:15 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-04-15 19:39:15 -0400
commit     eea3a00264cf243a28e4331566ce67b86059339d (patch)
tree       487f16389e0dfa32e9caa7604d1274a7dcda8f04 /mm
parent     e7c82412433a8039616c7314533a0a1c025d99bf (diff)
parent     e693d73c20ffdb06840c9378f367bad849ac0d5d (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patchbomb from Andrew Morton:
- the rest of MM
- various misc bits
- add ability to run /sbin/reboot at reboot time
- printk/vsprintf changes
- fiddle with seq_printf() return value
* akpm: (114 commits)
parisc: remove use of seq_printf return value
lru_cache: remove use of seq_printf return value
tracing: remove use of seq_printf return value
cgroup: remove use of seq_printf return value
proc: remove use of seq_printf return value
s390: remove use of seq_printf return value
cris fasttimer: remove use of seq_printf return value
cris: remove use of seq_printf return value
openrisc: remove use of seq_printf return value
ARM: plat-pxa: remove use of seq_printf return value
nios2: cpuinfo: remove use of seq_printf return value
microblaze: mb: remove use of seq_printf return value
ipc: remove use of seq_printf return value
rtc: remove use of seq_printf return value
power: wakeup: remove use of seq_printf return value
x86: mtrr: if: remove use of seq_printf return value
linux/bitmap.h: improve BITMAP_{LAST,FIRST}_WORD_MASK
MAINTAINERS: CREDITS: remove Stefano Brivio from B43
.mailmap: add Ricardo Ribalda
CREDITS: add Ricardo Ribalda Delgado
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/cma.c            |   5
-rw-r--r--  mm/cma_debug.c      |  41
-rw-r--r--  mm/compaction.c     |  60
-rw-r--r--  mm/gup.c            |   4
-rw-r--r--  mm/huge_memory.c    |  86
-rw-r--r--  mm/hugetlb.c        | 234
-rw-r--r--  mm/internal.h       |   4
-rw-r--r--  mm/kasan/kasan.c    |  13
-rw-r--r--  mm/ksm.c            |  10
-rw-r--r--  mm/memblock.c       |  18
-rw-r--r--  mm/memcontrol.c     |  47
-rw-r--r--  mm/memory-failure.c | 122
-rw-r--r--  mm/memory.c         |  56
-rw-r--r--  mm/memory_hotplug.c |   2
-rw-r--r--  mm/mempool.c        | 117
-rw-r--r--  mm/migrate.c        |   3
-rw-r--r--  mm/mmap.c           |  21
-rw-r--r--  mm/mremap.c         |  25
-rw-r--r--  mm/oom_kill.c       |   2
-rw-r--r--  mm/page-writeback.c |   3
-rw-r--r--  mm/page_alloc.c     |   6
-rw-r--r--  mm/rmap.c           |   6
-rw-r--r--  mm/slub.c           |   4
-rw-r--r--  mm/swap.c           |  34
-rw-r--r--  mm/swap_state.c     |   2
-rw-r--r--  mm/swapfile.c       |   2
-rw-r--r--  mm/truncate.c       |   2
-rw-r--r--  mm/util.c           |  41
-rw-r--r--  mm/vmalloc.c        |  95
-rw-r--r--  mm/zsmalloc.c       | 971
30 files changed, 1453 insertions, 583 deletions
diff --git a/mm/cma.c b/mm/cma.c
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -23,6 +23,7 @@
 # define DEBUG
 #endif
 #endif
+#define CREATE_TRACE_POINTS
 
 #include <linux/memblock.h>
 #include <linux/err.h>
@@ -34,6 +35,7 @@
 #include <linux/cma.h>
 #include <linux/highmem.h>
 #include <linux/io.h>
+#include <trace/events/cma.h>
 
 #include "cma.h"
 
@@ -414,6 +416,8 @@ struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
 		start = bitmap_no + mask + 1;
 	}
 
+	trace_cma_alloc(page ? pfn : -1UL, page, count, align);
+
 	pr_debug("%s(): returned %p\n", __func__, page);
 	return page;
 }
@@ -446,6 +450,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
 
 	free_contig_range(pfn, count);
 	cma_clear_bitmap(cma, pfn, count);
+	trace_cma_release(pfn, pages, count);
 
 	return true;
 }
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 0b377536ccde..7621ee34daa0 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -30,9 +30,44 @@ static int cma_debugfs_get(void *data, u64 *val)
 
 	return 0;
 }
-
 DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
 
+static int cma_used_get(void *data, u64 *val)
+{
+	struct cma *cma = data;
+	unsigned long used;
+
+	mutex_lock(&cma->lock);
+	/* pages counter is smaller than sizeof(int) */
+	used = bitmap_weight(cma->bitmap, (int)cma->count);
+	mutex_unlock(&cma->lock);
+	*val = (u64)used << cma->order_per_bit;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
+
+static int cma_maxchunk_get(void *data, u64 *val)
+{
+	struct cma *cma = data;
+	unsigned long maxchunk = 0;
+	unsigned long start, end = 0;
+
+	mutex_lock(&cma->lock);
+	for (;;) {
+		start = find_next_zero_bit(cma->bitmap, cma->count, end);
+		if (start >= cma->count)
+			break;
+		end = find_next_bit(cma->bitmap, cma->count, start);
+		maxchunk = max(end - start, maxchunk);
+	}
+	mutex_unlock(&cma->lock);
+	*val = (u64)maxchunk << cma->order_per_bit;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
+
 static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
 {
 	spin_lock(&cma->mem_head_lock);
@@ -91,7 +126,6 @@ static int cma_free_write(void *data, u64 val)
 
 	return cma_free_mem(cma, pages);
 }
-
 DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
 
 static int cma_alloc_mem(struct cma *cma, int count)
@@ -124,7 +158,6 @@ static int cma_alloc_write(void *data, u64 val)
 
 	return cma_alloc_mem(cma, pages);
 }
-
 DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
 
 static void cma_debugfs_add_one(struct cma *cma, int idx)
@@ -149,6 +182,8 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
 			&cma->count, &cma_debugfs_fops);
 	debugfs_create_file("order_per_bit", S_IRUGO, tmp,
 			&cma->order_per_bit, &cma_debugfs_fops);
+	debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops);
+	debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops);
 
 	u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
 	debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
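The new "used" and "maxchunk" files are both derived from the allocation bitmap: "used" is the number of set bits, "maxchunk" the longest run of clear bits, each scaled by order_per_bit. A minimal userspace sketch of the same scan, with a toy helper standing in for find_next_bit()/find_next_zero_bit() (the helper name and bitmap size are illustrative only, not the kernel implementation):

#include <stdio.h>

#define NBITS 32

/* toy bitmap helper standing in for find_next_bit()/find_next_zero_bit() */
static unsigned long next_bit(const unsigned char *map, unsigned long size,
			      unsigned long start, int want_set)
{
	for (unsigned long i = start; i < size; i++)
		if (!!(map[i / 8] & (1u << (i % 8))) == want_set)
			return i;
	return size;
}

int main(void)
{
	/* 1 = allocated chunk, 0 = free; mirrors cma->bitmap */
	unsigned char bitmap[NBITS / 8] = { 0x0f, 0x00, 0xf0, 0x01 };
	unsigned long used = 0, maxchunk = 0, start, end = 0;

	for (unsigned long i = 0; i < NBITS; i++)
		used += !!(bitmap[i / 8] & (1u << (i % 8)));

	for (;;) {	/* same loop shape as cma_maxchunk_get() */
		start = next_bit(bitmap, NBITS, end, 0);
		if (start >= NBITS)
			break;
		end = next_bit(bitmap, NBITS, start, 1);
		if (end - start > maxchunk)
			maxchunk = end - start;
	}
	printf("used=%lu maxchunk=%lu (in order_per_bit units)\n", used, maxchunk);
	return 0;
}

On the sample bitmap this prints used=9 maxchunk=16: nine allocated chunks and a largest free run of sixteen.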
diff --git a/mm/compaction.c b/mm/compaction.c
index a18201a8124e..018f08da99a2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc)
 	return false;
 }
 
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
-{
-	/* If the page is a large free page, then disallow migration */
-	if (PageBuddy(page)) {
-		/*
-		 * We are checking page_order without zone->lock taken. But
-		 * the only small danger is that we skip a potentially suitable
-		 * pageblock, so it's not worth to check order for valid range.
-		 */
-		if (page_order_unsafe(page) >= pageblock_order)
-			return false;
-	}
-
-	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
-	if (migrate_async_suitable(get_pageblock_migratetype(page)))
-		return true;
-
-	/* Otherwise skip the block */
-	return false;
-}
-
 /*
  * Isolate free pages onto a private freelist. If @strict is true, will abort
  * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
@@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
 #ifdef CONFIG_COMPACTION
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+	/* If the page is a large free page, then disallow migration */
+	if (PageBuddy(page)) {
+		/*
+		 * We are checking page_order without zone->lock taken. But
+		 * the only small danger is that we skip a potentially suitable
+		 * pageblock, so it's not worth to check order for valid range.
+		 */
+		if (page_order_unsafe(page) >= pageblock_order)
+			return false;
+	}
+
+	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+	if (migrate_async_suitable(get_pageblock_migratetype(page)))
+		return true;
+
+	/* Otherwise skip the block */
+	return false;
+}
+
 /*
  * Based on information in the current compact_control, find blocks
  * suitable for isolating free pages from and then isolate them.
@@ -1047,6 +1048,12 @@ typedef enum {
 } isolate_migrate_t;
 
 /*
+ * Allow userspace to control policy on scanning the unevictable LRU for
+ * compactable pages.
+ */
+int sysctl_compact_unevictable_allowed __read_mostly = 1;
+
+/*
  * Isolate all pages that can be migrated from the first suitable block,
  * starting at the block pointed to by the migrate scanner pfn within
  * compact_control.
@@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	unsigned long low_pfn, end_pfn;
 	struct page *page;
 	const isolate_mode_t isolate_mode =
+		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
 		(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
 
 	/*
@@ -1598,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 		INIT_LIST_HEAD(&cc->freepages);
 		INIT_LIST_HEAD(&cc->migratepages);
 
+		/*
+		 * When called via /proc/sys/vm/compact_memory
+		 * this makes sure we compact the whole zone regardless of
+		 * cached scanner positions.
+		 */
+		if (cc->order == -1)
+			__reset_isolation_suitable(zone);
+
 		if (cc->order == -1 || !compaction_deferred(zone, cc->order))
 			compact_zone(zone, cc);
 
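sysctl_compact_unevictable_allowed defaults to 1 and is consulted each time the migrate scanner builds its isolation mode, so a change takes effect on the next compaction pass. Assuming the knob is exposed as /proc/sys/vm/compact_unevictable_allowed by the sysctl table change made elsewhere in this series, a hedged userspace sketch for reading and clearing it:

#include <stdio.h>

/* Assumed procfs path; the sysctl table hookup is outside this diff. */
#define KNOB "/proc/sys/vm/compact_unevictable_allowed"

int main(void)
{
	FILE *f = fopen(KNOB, "r");
	int val = -1;

	if (!f) {
		perror(KNOB);
		return 1;
	}
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	printf("compact_unevictable_allowed = %d\n", val);

	/* Keep compaction away from the unevictable (e.g. mlocked) LRU; needs root. */
	f = fopen(KNOB, "w");
	if (f) {
		fprintf(f, "0\n");
		fclose(f);
	}
	return 0;
}

Clearing the knob avoids minor faults on mlocked memory at the cost of fewer candidate pages for compaction.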
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1019,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 		 *
 		 * for an example see gup_get_pte in arch/x86/mm/gup.c
 		 */
-		pte_t pte = ACCESS_ONCE(*ptep);
+		pte_t pte = READ_ONCE(*ptep);
 		struct page *page;
 
 		/*
@@ -1309,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	local_irq_save(flags);
 	pgdp = pgd_offset(mm, addr);
 	do {
-		pgd_t pgd = ACCESS_ONCE(*pgdp);
+		pgd_t pgd = READ_ONCE(*pgdp);
 
 		next = pgd_addr_end(addr, end);
 		if (pgd_none(pgd))
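The ACCESS_ONCE() to READ_ONCE() conversions repeated throughout this merge keep the same guarantee for scalars — the compiler must emit exactly one access and may neither re-read nor elide it — while READ_ONCE() also copes with non-scalar types such as pte_t on some configurations. A simplified, scalar-only userspace approximation (the real definitions live in include/linux/compiler.h and differ in detail):

#include <stdio.h>

/*
 * Scalar-only approximation of READ_ONCE()/WRITE_ONCE(): the volatile
 * cast forbids the compiler from fusing, repeating, or dropping the
 * access, which is what the gup/THP lockless fast paths rely on.
 */
#define READ_ONCE(x)		(*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

static int shared_flag;		/* imagine another thread updates this */

int main(void)
{
	WRITE_ONCE(shared_flag, 1);

	/*
	 * Without READ_ONCE the compiler could legally reload shared_flag
	 * between the check and the use and observe two different values.
	 */
	int snapshot = READ_ONCE(shared_flag);
	if (snapshot)
		printf("flag observed once: %d\n", snapshot);
	return 0;
}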
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3afb5cbe1312..078832cf3636 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
 
 static int khugepaged(void *none);
 static int khugepaged_slab_init(void);
+static void khugepaged_slab_exit(void);
 
 #define MM_SLOTS_HASH_BITS 10
 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void)
 	int nr_zones = 0;
 	unsigned long recommended_min;
 
-	if (!khugepaged_enabled())
-		return 0;
-
 	for_each_populated_zone(zone)
 		nr_zones++;
 
@@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
 	setup_per_zone_wmarks();
 	return 0;
 }
-late_initcall(set_recommended_min_free_kbytes);
 
-static int start_khugepaged(void)
+static int start_stop_khugepaged(void)
 {
 	int err = 0;
 	if (khugepaged_enabled()) {
@@ -156,6 +153,7 @@ static int start_khugepaged(void)
 			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
 			err = PTR_ERR(khugepaged_thread);
 			khugepaged_thread = NULL;
+			goto fail;
 		}
 
 		if (!list_empty(&khugepaged_scan.mm_head))
@@ -166,7 +164,7 @@ static int start_khugepaged(void)
 		kthread_stop(khugepaged_thread);
 		khugepaged_thread = NULL;
 	}
-
+fail:
 	return err;
 }
 
@@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void)
 	struct page *zero_page;
 retry:
 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
-		return ACCESS_ONCE(huge_zero_page);
+		return READ_ONCE(huge_zero_page);
 
 	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 			HPAGE_PMD_ORDER);
@@ -202,7 +200,7 @@ retry:
 	/* We take additional reference here. It will be put back by shrinker */
 	atomic_set(&huge_zero_refcount, 2);
 	preempt_enable();
-	return ACCESS_ONCE(huge_zero_page);
+	return READ_ONCE(huge_zero_page);
 }
 
 static void put_huge_zero_page(void)
@@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
 	int err;
 
 	mutex_lock(&khugepaged_mutex);
-	err = start_khugepaged();
+	err = start_stop_khugepaged();
 	mutex_unlock(&khugepaged_mutex);
 
 	if (err)
@@ -634,27 +632,38 @@ static int __init hugepage_init(void)
 
 	err = hugepage_init_sysfs(&hugepage_kobj);
 	if (err)
-		return err;
+		goto err_sysfs;
 
 	err = khugepaged_slab_init();
 	if (err)
-		goto out;
+		goto err_slab;
 
-	register_shrinker(&huge_zero_page_shrinker);
+	err = register_shrinker(&huge_zero_page_shrinker);
+	if (err)
+		goto err_hzp_shrinker;
 
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
 	 * is likely to save. The admin can still enable it through /sys.
 	 */
-	if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
 		transparent_hugepage_flags = 0;
+		return 0;
+	}
 
-	start_khugepaged();
+	err = start_stop_khugepaged();
+	if (err)
+		goto err_khugepaged;
 
 	return 0;
-out:
+err_khugepaged:
+	unregister_shrinker(&huge_zero_page_shrinker);
+err_hzp_shrinker:
+	khugepaged_slab_exit();
+err_slab:
 	hugepage_exit_sysfs(hugepage_kobj);
+err_sysfs:
 	return err;
 }
 subsys_initcall(hugepage_init);
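The reworked hugepage_init() above now checks every setup step and unwinds the earlier ones in reverse order through a chain of labels, instead of leaking the sysfs entries and slab cache on failure. A standalone sketch of the same unwind pattern with hypothetical setup/teardown steps (the names are placeholders, not kernel APIs):

#include <stdio.h>

/* Hypothetical setup/teardown pairs, standing in for sysfs/slab/shrinker init. */
static int setup_a(void)     { puts("A up");     return 0; }
static void teardown_a(void) { puts("A down"); }
static int setup_b(void)     { puts("B up");     return 0; }
static void teardown_b(void) { puts("B down"); }
static int setup_c(void)     { puts("C failed"); return -1; /* simulate failure */ }

static int subsystem_init(void)
{
	int err;

	err = setup_a();
	if (err)
		goto err_a;
	err = setup_b();
	if (err)
		goto err_b;
	err = setup_c();
	if (err)
		goto err_c;
	return 0;

err_c:			/* unwind in reverse order of setup */
	teardown_b();
err_b:
	teardown_a();
err_a:
	return err;
}

int main(void)
{
	return subsystem_init() ? 1 : 0;
}

Each label undoes exactly the steps that succeeded before the failing one, so the function either fully initializes or leaves no partial state behind.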
@@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long haddr, pmd_t *pmd,
-					struct page *page)
+					struct page *page, gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
@@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+	if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
 		return VM_FAULT_OOM;
 
 	pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
-	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t huge_gfp;			/* for allocation and charge */
 
 	ptl = pmd_lockptr(mm, pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		gfp_t gfp;
-
-		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
-		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
 
@@ -1130,8 +1138,7 @@ alloc:
 		goto out;
 	}
 
-	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg))) {
+	if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
@@ -1976,6 +1983,11 @@ static int __init khugepaged_slab_init(void)
 	return 0;
 }
 
+static void __init khugepaged_slab_exit(void)
+{
+	kmem_cache_destroy(mm_slot_cache);
+}
+
 static inline struct mm_slot *alloc_mm_slot(void)
 {
 	if (!mm_slot_cache)	/* initialization failed */
@@ -2323,19 +2335,13 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 	return true;
 }
 
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
-	gfp_t flags;
-
 	VM_BUG_ON_PAGE(*hpage, *hpage);
 
-	/* Only allocate from the target node */
-	flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
-		__GFP_THISNODE;
-
 	/*
 	 * Before allocating the hugepage, release the mmap_sem read lock.
 	 * The allocation can take potentially a long time if it involves
@@ -2344,7 +2350,7 @@ static struct page
 	 */
 	up_read(&mm->mmap_sem);
 
-	*hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER);
+	*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -2397,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 	return true;
 }
 
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
 	up_read(&mm->mmap_sem);
 	VM_BUG_ON(!*hpage);
+
 	return *hpage;
 }
 #endif
@@ -2438,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm,
 	struct mem_cgroup *memcg;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t gfp;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
+	/* Only allocate from the target node */
+	gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+		__GFP_THISNODE;
+
 	/* release the mmap_sem read lock. */
-	new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+	new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
 	if (!new_page)
 		return;
 
 	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg)))
+					   gfp, &memcg)))
 		return;
 
 	/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8874c8ad55aa..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 	spin_unlock(&spool->lock);
 
 	/* If no pages are used, and no other handles to the subpool
-	 * remain, free the subpool the subpool remain */
-	if (free)
+	 * remain, give up any reservations mased on minimum size and
+	 * free the subpool */
+	if (free) {
+		if (spool->min_hpages != -1)
+			hugetlb_acct_memory(spool->hstate,
+						-spool->min_hpages);
 		kfree(spool);
+	}
 }
 
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+						long min_hpages)
 {
 	struct hugepage_subpool *spool;
 
-	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
 	if (!spool)
 		return NULL;
 
 	spin_lock_init(&spool->lock);
 	spool->count = 1;
-	spool->max_hpages = nr_blocks;
-	spool->used_hpages = 0;
+	spool->max_hpages = max_hpages;
+	spool->hstate = h;
+	spool->min_hpages = min_hpages;
+
+	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+		kfree(spool);
+		return NULL;
+	}
+	spool->rsv_hpages = min_hpages;
 
 	return spool;
 }
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
 	unlock_or_release_subpool(spool);
 }
 
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * the request. Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward). The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be manitained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
 				      long delta)
 {
-	int ret = 0;
+	long ret = delta;
 
 	if (!spool)
-		return 0;
+		return ret;
 
 	spin_lock(&spool->lock);
-	if ((spool->used_hpages + delta) <= spool->max_hpages) {
-		spool->used_hpages += delta;
-	} else {
-		ret = -ENOMEM;
+
+	if (spool->max_hpages != -1) {		/* maximum size accounting */
+		if ((spool->used_hpages + delta) <= spool->max_hpages)
+			spool->used_hpages += delta;
+		else {
+			ret = -ENOMEM;
+			goto unlock_ret;
+		}
+	}
+
+	if (spool->min_hpages != -1) {		/* minimum size accounting */
+		if (delta > spool->rsv_hpages) {
+			/*
+			 * Asking for more reserves than those already taken on
+			 * behalf of subpool. Return difference.
+			 */
+			ret = delta - spool->rsv_hpages;
+			spool->rsv_hpages = 0;
+		} else {
+			ret = 0;	/* reserves already accounted for */
+			spool->rsv_hpages -= delta;
+		}
 	}
-	spin_unlock(&spool->lock);
 
+unlock_ret:
+	spin_unlock(&spool->lock);
 	return ret;
 }
 
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
 				       long delta)
 {
+	long ret = delta;
+
 	if (!spool)
-		return;
+		return delta;
 
 	spin_lock(&spool->lock);
-	spool->used_hpages -= delta;
-	/* If hugetlbfs_put_super couldn't free spool due to
-	 * an outstanding quota reference, free it now. */
+
+	if (spool->max_hpages != -1)		/* maximum size accounting */
+		spool->used_hpages -= delta;
+
+	if (spool->min_hpages != -1) {		/* minimum size accounting */
+		if (spool->rsv_hpages + delta <= spool->min_hpages)
+			ret = 0;
+		else
+			ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+		spool->rsv_hpages += delta;
+		if (spool->rsv_hpages > spool->min_hpages)
+			spool->rsv_hpages = spool->min_hpages;
+	}
+
+	/*
+	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
+	 * quota reference, free it now.
+	 */
 	unlock_or_release_subpool(spool);
+
+	return ret;
 }
 
 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
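The min/max bookkeeping above is easiest to follow with numbers. With max_hpages = -1 (no cap) and min_hpages = 10, hugepage_new_subpool() pre-charges 10 pages to the global pool and sets rsv_hpages = 10; a later get_pages(4) is covered entirely by that reserve (returns 0, rsv drops to 6), while a get_pages(8) returns 2, because only two pages beyond the remaining reserve need fresh global charging. A toy userspace model of just that arithmetic (no locking, not the kernel structures):

#include <stdio.h>

/* Toy model of the counters used by hugepage_subpool_get_pages(). */
struct subpool {
	long max_hpages;	/* -1 = no maximum */
	long min_hpages;	/* -1 = no minimum */
	long used_hpages;
	long rsv_hpages;	/* global reservations held back for the minimum */
};

/* Returns pages the *global* pool must additionally charge, or -1 on overflow. */
static long subpool_get(struct subpool *sp, long delta)
{
	long ret = delta;

	if (sp->max_hpages != -1) {
		if (sp->used_hpages + delta > sp->max_hpages)
			return -1;
		sp->used_hpages += delta;
	}
	if (sp->min_hpages != -1) {
		if (delta > sp->rsv_hpages) {
			ret = delta - sp->rsv_hpages;
			sp->rsv_hpages = 0;
		} else {
			ret = 0;
			sp->rsv_hpages -= delta;
		}
	}
	return ret;
}

int main(void)
{
	struct subpool sp = { .max_hpages = -1, .min_hpages = 10,
			      .used_hpages = 0, .rsv_hpages = 10 };
	long charge;

	charge = subpool_get(&sp, 4);
	printf("get(4): global charge %ld, rsv now %ld\n", charge, sp.rsv_hpages); /* 0, 6 */
	charge = subpool_get(&sp, 8);
	printf("get(8): global charge %ld, rsv now %ld\n", charge, sp.rsv_hpages); /* 2, 0 */
	return 0;
}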
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
 	return NULL;
 }
 
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageHuge(page), page);
+	return PageHead(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+static void set_page_huge_active(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+	SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+	ClearPagePrivate(&page[1]);
+}
+
 void free_huge_page(struct page *page)
 {
 	/*
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
 	restore_reserve = PagePrivate(page);
 	ClearPagePrivate(page);
 
+	/*
+	 * A return code of zero implies that the subpool will be under its
+	 * minimum size if the reservation is not restored after page is free.
+	 * Therefore, force restore_reserve operation.
+	 */
+	if (hugepage_subpool_put_pages(spool, 1) == 0)
+		restore_reserve = true;
+
 	spin_lock(&hugetlb_lock);
+	clear_page_huge_active(page);
 	hugetlb_cgroup_uncharge_page(hstate_index(h),
 				     pages_per_huge_page(h), page);
 	if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1386,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (chg < 0)
 		return ERR_PTR(-ENOMEM);
 	if (chg || avoid_reserve)
-		if (hugepage_subpool_get_pages(spool, 1))
+		if (hugepage_subpool_get_pages(spool, 1) < 0)
 			return ERR_PTR(-ENOSPC);
 
 	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2454,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 	struct resv_map *resv = vma_resv_map(vma);
 	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve, start, end;
+	long gbl_reserve;
 
 	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		return;
@@ -2466,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 	kref_put(&resv->refs, resv_map_release);
 
 	if (reserve) {
-		hugetlb_acct_memory(h, -reserve);
-		hugepage_subpool_put_pages(spool, reserve);
+		/*
+		 * Decrement reserve counts. The global reserve count may be
+		 * adjusted if the subpool has a minimum size.
+		 */
+		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+		hugetlb_acct_memory(h, -gbl_reserve);
 	}
 }
 
@@ -2891,6 +2998,7 @@ retry_avoidcopy:
 	copy_user_huge_page(new_page, old_page, address, vma,
 			    pages_per_huge_page(h));
 	__SetPageUptodate(new_page);
+	set_page_huge_active(new_page);
 
 	mmun_start = address & huge_page_mask(h);
 	mmun_end = mmun_start + huge_page_size(h);
@@ -3003,6 +3111,7 @@ retry:
 	}
 	clear_huge_page(page, address, pages_per_huge_page(h));
 	__SetPageUptodate(page);
+	set_page_huge_active(page);
 
 	if (vma->vm_flags & VM_MAYSHARE) {
 		int err;
@@ -3447,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	struct hstate *h = hstate_inode(inode);
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	struct resv_map *resv_map;
+	long gbl_reserve;
 
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
@@ -3483,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
 		goto out_err;
 	}
 
-	/* There must be enough pages in the subpool for the mapping */
-	if (hugepage_subpool_get_pages(spool, chg)) {
+	/*
+	 * There must be enough pages in the subpool for the mapping. If
+	 * the subpool has a minimum size, there may be some global
+	 * reservations already in place (gbl_reserve).
+	 */
+	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+	if (gbl_reserve < 0) {
 		ret = -ENOSPC;
 		goto out_err;
 	}
@@ -3493,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * Check enough hugepages are available for the reservation.
 	 * Hand the pages back to the subpool if there are not
 	 */
-	ret = hugetlb_acct_memory(h, chg);
+	ret = hugetlb_acct_memory(h, gbl_reserve);
 	if (ret < 0) {
-		hugepage_subpool_put_pages(spool, chg);
+		/* put back original number of pages, chg */
+		(void)hugepage_subpool_put_pages(spool, chg);
 		goto out_err;
 	}
 
@@ -3525,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	struct resv_map *resv_map = inode_resv_map(inode);
 	long chg = 0;
 	struct hugepage_subpool *spool = subpool_inode(inode);
+	long gbl_reserve;
 
 	if (resv_map)
 		chg = region_truncate(resv_map, offset);
@@ -3532,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugepage_subpool_put_pages(spool, (chg - freed));
-	hugetlb_acct_memory(h, -(chg - freed));
+	/*
+	 * If the subpool has a minimum size, the number of global
+	 * reservations to be released may be adjusted.
+	 */
+	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+	hugetlb_acct_memory(h, -gbl_reserve);
 }
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3775,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 #ifdef CONFIG_MEMORY_FAILURE
 
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
-	struct page *page;
-	struct page *tmp;
-	struct hstate *h = page_hstate(hpage);
-	int nid = page_to_nid(hpage);
-
-	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
-		if (page == hpage)
-			return 1;
-	return 0;
-}
-
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
@@ -3800,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
 	int ret = -EBUSY;
 
 	spin_lock(&hugetlb_lock);
-	if (is_hugepage_on_freelist(hpage)) {
+	/*
+	 * Just checking !page_huge_active is not enough, because that could be
+	 * an isolated/hwpoisoned hugepage (which have >0 refcount).
+	 */
+	if (!page_huge_active(hpage) && !page_count(hpage)) {
 		/*
 		 * Hwpoisoned hugepage isn't linked to activelist or freelist,
 		 * but dangling hpage->lru can trigger list-debug warnings
@@ -3820,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
 
 bool isolate_huge_page(struct page *page, struct list_head *list)
 {
+	bool ret = true;
+
 	VM_BUG_ON_PAGE(!PageHead(page), page);
-	if (!get_page_unless_zero(page))
-		return false;
 	spin_lock(&hugetlb_lock);
+	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+		ret = false;
+		goto unlock;
+	}
+	clear_page_huge_active(page);
 	list_move_tail(&page->lru, list);
+unlock:
 	spin_unlock(&hugetlb_lock);
-	return true;
+	return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 	spin_lock(&hugetlb_lock);
+	set_page_huge_active(page);
 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
 	spin_unlock(&hugetlb_lock);
 	put_page(page);
 }
-
-bool is_hugepage_active(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageHuge(page), page);
-	/*
-	 * This function can be called for a tail page because the caller,
-	 * scan_movable_pages, scans through a given pfn-range which typically
-	 * covers one memory block. In systems using gigantic hugepage (1GB
-	 * for x86_64,) a hugepage is larger than a memory block, and we don't
-	 * support migrating such large hugepages for now, so return false
-	 * when called for tail pages.
-	 */
-	if (PageTail(page))
-		return false;
-	/*
-	 * Refcount of a hwpoisoned hugepages is 1, but they are not active,
-	 * so we should return false for them.
-	 */
-	if (unlikely(PageHWPoison(page)))
-		return false;
-	return page_count(page) > 0;
-}
diff --git a/mm/internal.h b/mm/internal.h
index edaab69a9c35..a25e359a4039 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -224,13 +224,13 @@ static inline unsigned long page_order(struct page *page)
 * PageBuddy() should be checked first by the caller to minimize race window,
 * and invalid values must be handled gracefully.
 *
- * ACCESS_ONCE is used so that if the caller assigns the result into a local
+ * READ_ONCE is used so that if the caller assigns the result into a local
 * variable and e.g. tests it for valid range before using, the compiler cannot
 * decide to remove the variable and inline the page_private(page) multiple
 * times, potentially observing different values in the tests and the actual
 * use of the result.
 */
-#define page_order_unsafe(page)		ACCESS_ONCE(page_private(page))
+#define page_order_unsafe(page)		READ_ONCE(page_private(page))
 
 static inline bool is_cow_mapping(vm_flags_t flags)
 {
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 936d81661c47..6c513a63ea84 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size)
 		kasan_kmalloc(page->slab_cache, object, size);
 }
 
+void kasan_kfree(void *ptr)
+{
+	struct page *page;
+
+	page = virt_to_head_page(ptr);
+
+	if (unlikely(!PageSlab(page)))
+		kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+				KASAN_FREE_PAGE);
+	else
+		kasan_slab_free(page->slab_cache, ptr);
+}
+
 void kasan_kfree_large(const void *ptr)
 {
 	struct page *page = virt_to_page(ptr);
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
 	expected_mapping = (void *)stable_node +
 				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
 again:
-	kpfn = ACCESS_ONCE(stable_node->kpfn);
+	kpfn = READ_ONCE(stable_node->kpfn);
 	page = pfn_to_page(kpfn);
 
 	/*
@@ -551,7 +551,7 @@ again:
 	 * but on Alpha we need to be more careful.
 	 */
 	smp_read_barrier_depends();
-	if (ACCESS_ONCE(page->mapping) != expected_mapping)
+	if (READ_ONCE(page->mapping) != expected_mapping)
 		goto stale;
 
 	/*
@@ -577,14 +577,14 @@ again:
 		cpu_relax();
 	}
 
-	if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+	if (READ_ONCE(page->mapping) != expected_mapping) {
 		put_page(page);
 		goto stale;
 	}
 
 	if (lock_it) {
 		lock_page(page);
-		if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+		if (READ_ONCE(page->mapping) != expected_mapping) {
 			unlock_page(page);
 			put_page(page);
 			goto stale;
@@ -600,7 +600,7 @@ stale:
 	 * before checking whether node->kpfn has been changed.
 	 */
 	smp_rmb();
-	if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
+	if (READ_ONCE(stable_node->kpfn) != kpfn)
 		goto again;
 	remove_node_from_stable_tree(stable_node);
 	return NULL;
diff --git a/mm/memblock.c b/mm/memblock.c
index 3f37a0bca5d5..9318b567ed79 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -580,10 +580,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
 	return memblock_add_range(&memblock.memory, base, size, nid, 0);
 }
 
+static int __init_memblock memblock_add_region(phys_addr_t base,
+						phys_addr_t size,
+						int nid,
+						unsigned long flags)
+{
+	struct memblock_type *_rgn = &memblock.memory;
+
+	memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
+		     (unsigned long long)base,
+		     (unsigned long long)base + size - 1,
+		     flags, (void *)_RET_IP_);
+
+	return memblock_add_range(_rgn, base, size, nid, flags);
+}
+
 int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
 {
-	return memblock_add_range(&memblock.memory, base, size,
-				  MAX_NUMNODES, 0);
+	return memblock_add_region(base, size, MAX_NUMNODES, 0);
 }
 
 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3f09b2dda5f..14c2f2017e37 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -259,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | |||
259 | * page cache and RSS per cgroup. We would eventually like to provide | 259 | * page cache and RSS per cgroup. We would eventually like to provide |
260 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 260 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
261 | * to help the administrator determine what knobs to tune. | 261 | * to help the administrator determine what knobs to tune. |
262 | * | ||
263 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | ||
264 | * we hit the water mark. May be even add a low water mark, such that | ||
265 | * no reclaim occurs from a cgroup at it's low water mark, this is | ||
266 | * a feature that will be implemented much later in the future. | ||
267 | */ | 262 | */ |
268 | struct mem_cgroup { | 263 | struct mem_cgroup { |
269 | struct cgroup_subsys_state css; | 264 | struct cgroup_subsys_state css; |
@@ -460,6 +455,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) | |||
460 | return memcg->css.id; | 455 | return memcg->css.id; |
461 | } | 456 | } |
462 | 457 | ||
458 | /* | ||
459 | * A helper function to get mem_cgroup from ID. must be called under | ||
460 | * rcu_read_lock(). The caller is responsible for calling | ||
461 | * css_tryget_online() if the mem_cgroup is used for charging. (dropping | ||
462 | * refcnt from swap can be called against removed memcg.) | ||
463 | */ | ||
463 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) | 464 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) |
464 | { | 465 | { |
465 | struct cgroup_subsys_state *css; | 466 | struct cgroup_subsys_state *css; |
@@ -673,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | |||
673 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) | 674 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) |
674 | { | 675 | { |
675 | unsigned long nr_pages = page_counter_read(&memcg->memory); | 676 | unsigned long nr_pages = page_counter_read(&memcg->memory); |
676 | unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); | 677 | unsigned long soft_limit = READ_ONCE(memcg->soft_limit); |
677 | unsigned long excess = 0; | 678 | unsigned long excess = 0; |
678 | 679 | ||
679 | if (nr_pages > soft_limit) | 680 | if (nr_pages > soft_limit) |
@@ -1041,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1041 | goto out_unlock; | 1042 | goto out_unlock; |
1042 | 1043 | ||
1043 | do { | 1044 | do { |
1044 | pos = ACCESS_ONCE(iter->position); | 1045 | pos = READ_ONCE(iter->position); |
1045 | /* | 1046 | /* |
1046 | * A racing update may change the position and | 1047 | * A racing update may change the position and |
1047 | * put the last reference, hence css_tryget(), | 1048 | * put the last reference, hence css_tryget(), |
@@ -1358,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) | |||
1358 | unsigned long limit; | 1359 | unsigned long limit; |
1359 | 1360 | ||
1360 | count = page_counter_read(&memcg->memory); | 1361 | count = page_counter_read(&memcg->memory); |
1361 | limit = ACCESS_ONCE(memcg->memory.limit); | 1362 | limit = READ_ONCE(memcg->memory.limit); |
1362 | if (count < limit) | 1363 | if (count < limit) |
1363 | margin = limit - count; | 1364 | margin = limit - count; |
1364 | 1365 | ||
1365 | if (do_swap_account) { | 1366 | if (do_swap_account) { |
1366 | count = page_counter_read(&memcg->memsw); | 1367 | count = page_counter_read(&memcg->memsw); |
1367 | limit = ACCESS_ONCE(memcg->memsw.limit); | 1368 | limit = READ_ONCE(memcg->memsw.limit); |
1368 | if (count <= limit) | 1369 | if (count <= limit) |
1369 | margin = min(margin, limit - count); | 1370 | margin = min(margin, limit - count); |
1370 | } | 1371 | } |
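Many hunks in this merge swap ACCESS_ONCE() for READ_ONCE(); both tell the compiler to perform exactly one load and never re-read the location, READ_ONCE() being the newer form that also copes with non-scalar types. The single snapshot matters in mem_cgroup_margin() above because the same limit value has to feed both the comparison and the subtraction. A simplified userspace sketch of the idea (READ_ONCE_SKETCH is a stand-in, not the kernel macro):

    /* Simplified model of the kernel macro: force exactly one load. */
    #define READ_ONCE_SKETCH(x) (*(const volatile __typeof__(x) *)&(x))

    struct counter {
            unsigned long usage;
            unsigned long limit;    /* may be updated concurrently */
    };

    /* Mirrors mem_cgroup_margin(): snapshot the limit once so the
     * comparison and the subtraction see the same value even if a
     * concurrent writer changes c->limit in between. */
    static unsigned long margin(struct counter *c)
    {
            unsigned long limit = READ_ONCE_SKETCH(c->limit);
            unsigned long usage = READ_ONCE_SKETCH(c->usage);

            return usage < limit ? limit - usage : 0;
    }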
@@ -2349,20 +2350,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | |||
2349 | } | 2350 | } |
2350 | 2351 | ||
2351 | /* | 2352 | /* |
2352 | * A helper function to get mem_cgroup from ID. must be called under | ||
2353 | * rcu_read_lock(). The caller is responsible for calling | ||
2354 | * css_tryget_online() if the mem_cgroup is used for charging. (dropping | ||
2355 | * refcnt from swap can be called against removed memcg.) | ||
2356 | */ | ||
2357 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | ||
2358 | { | ||
2359 | /* ID 0 is unused ID */ | ||
2360 | if (!id) | ||
2361 | return NULL; | ||
2362 | return mem_cgroup_from_id(id); | ||
2363 | } | ||
2364 | |||
2365 | /* | ||
2366 | * try_get_mem_cgroup_from_page - look up page's memcg association | 2353 | * try_get_mem_cgroup_from_page - look up page's memcg association |
2367 | * @page: the page | 2354 | * @page: the page |
2368 | * | 2355 | * |
@@ -2388,7 +2375,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2388 | ent.val = page_private(page); | 2375 | ent.val = page_private(page); |
2389 | id = lookup_swap_cgroup_id(ent); | 2376 | id = lookup_swap_cgroup_id(ent); |
2390 | rcu_read_lock(); | 2377 | rcu_read_lock(); |
2391 | memcg = mem_cgroup_lookup(id); | 2378 | memcg = mem_cgroup_from_id(id); |
2392 | if (memcg && !css_tryget_online(&memcg->css)) | 2379 | if (memcg && !css_tryget_online(&memcg->css)) |
2393 | memcg = NULL; | 2380 | memcg = NULL; |
2394 | rcu_read_unlock(); | 2381 | rcu_read_unlock(); |
@@ -2650,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
2650 | return cachep; | 2637 | return cachep; |
2651 | 2638 | ||
2652 | memcg = get_mem_cgroup_from_mm(current->mm); | 2639 | memcg = get_mem_cgroup_from_mm(current->mm); |
2653 | kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); | 2640 | kmemcg_id = READ_ONCE(memcg->kmemcg_id); |
2654 | if (kmemcg_id < 0) | 2641 | if (kmemcg_id < 0) |
2655 | goto out; | 2642 | goto out; |
2656 | 2643 | ||
@@ -5020,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
5020 | * tunable will only affect upcoming migrations, not the current one. | 5007 | * tunable will only affect upcoming migrations, not the current one. |
5021 | * So we need to save it, and keep it going. | 5008 | * So we need to save it, and keep it going. |
5022 | */ | 5009 | */ |
5023 | move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); | 5010 | move_flags = READ_ONCE(memcg->move_charge_at_immigrate); |
5024 | if (move_flags) { | 5011 | if (move_flags) { |
5025 | struct mm_struct *mm; | 5012 | struct mm_struct *mm; |
5026 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 5013 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
@@ -5254,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, | |||
5254 | static int memory_low_show(struct seq_file *m, void *v) | 5241 | static int memory_low_show(struct seq_file *m, void *v) |
5255 | { | 5242 | { |
5256 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5243 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5257 | unsigned long low = ACCESS_ONCE(memcg->low); | 5244 | unsigned long low = READ_ONCE(memcg->low); |
5258 | 5245 | ||
5259 | if (low == PAGE_COUNTER_MAX) | 5246 | if (low == PAGE_COUNTER_MAX) |
5260 | seq_puts(m, "max\n"); | 5247 | seq_puts(m, "max\n"); |
@@ -5284,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, | |||
5284 | static int memory_high_show(struct seq_file *m, void *v) | 5271 | static int memory_high_show(struct seq_file *m, void *v) |
5285 | { | 5272 | { |
5286 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5273 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5287 | unsigned long high = ACCESS_ONCE(memcg->high); | 5274 | unsigned long high = READ_ONCE(memcg->high); |
5288 | 5275 | ||
5289 | if (high == PAGE_COUNTER_MAX) | 5276 | if (high == PAGE_COUNTER_MAX) |
5290 | seq_puts(m, "max\n"); | 5277 | seq_puts(m, "max\n"); |
@@ -5314,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, | |||
5314 | static int memory_max_show(struct seq_file *m, void *v) | 5301 | static int memory_max_show(struct seq_file *m, void *v) |
5315 | { | 5302 | { |
5316 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5303 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5317 | unsigned long max = ACCESS_ONCE(memcg->memory.limit); | 5304 | unsigned long max = READ_ONCE(memcg->memory.limit); |
5318 | 5305 | ||
5319 | if (max == PAGE_COUNTER_MAX) | 5306 | if (max == PAGE_COUNTER_MAX) |
5320 | seq_puts(m, "max\n"); | 5307 | seq_puts(m, "max\n"); |
@@ -5869,7 +5856,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) | |||
5869 | 5856 | ||
5870 | id = swap_cgroup_record(entry, 0); | 5857 | id = swap_cgroup_record(entry, 0); |
5871 | rcu_read_lock(); | 5858 | rcu_read_lock(); |
5872 | memcg = mem_cgroup_lookup(id); | 5859 | memcg = mem_cgroup_from_id(id); |
5873 | if (memcg) { | 5860 | if (memcg) { |
5874 | if (!mem_cgroup_is_root(memcg)) | 5861 | if (!mem_cgroup_is_root(memcg)) |
5875 | page_counter_uncharge(&memcg->memsw, 1); | 5862 | page_counter_uncharge(&memcg->memsw, 1); |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d487f8dc6d39..d9359b770cd9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -521,6 +521,52 @@ static const char *action_name[] = { | |||
521 | [RECOVERED] = "Recovered", | 521 | [RECOVERED] = "Recovered", |
522 | }; | 522 | }; |
523 | 523 | ||
524 | enum action_page_type { | ||
525 | MSG_KERNEL, | ||
526 | MSG_KERNEL_HIGH_ORDER, | ||
527 | MSG_SLAB, | ||
528 | MSG_DIFFERENT_COMPOUND, | ||
529 | MSG_POISONED_HUGE, | ||
530 | MSG_HUGE, | ||
531 | MSG_FREE_HUGE, | ||
532 | MSG_UNMAP_FAILED, | ||
533 | MSG_DIRTY_SWAPCACHE, | ||
534 | MSG_CLEAN_SWAPCACHE, | ||
535 | MSG_DIRTY_MLOCKED_LRU, | ||
536 | MSG_CLEAN_MLOCKED_LRU, | ||
537 | MSG_DIRTY_UNEVICTABLE_LRU, | ||
538 | MSG_CLEAN_UNEVICTABLE_LRU, | ||
539 | MSG_DIRTY_LRU, | ||
540 | MSG_CLEAN_LRU, | ||
541 | MSG_TRUNCATED_LRU, | ||
542 | MSG_BUDDY, | ||
543 | MSG_BUDDY_2ND, | ||
544 | MSG_UNKNOWN, | ||
545 | }; | ||
546 | |||
547 | static const char * const action_page_types[] = { | ||
548 | [MSG_KERNEL] = "reserved kernel page", | ||
549 | [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", | ||
550 | [MSG_SLAB] = "kernel slab page", | ||
551 | [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", | ||
552 | [MSG_POISONED_HUGE] = "huge page already hardware poisoned", | ||
553 | [MSG_HUGE] = "huge page", | ||
554 | [MSG_FREE_HUGE] = "free huge page", | ||
555 | [MSG_UNMAP_FAILED] = "unmapping failed page", | ||
556 | [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", | ||
557 | [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", | ||
558 | [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", | ||
559 | [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", | ||
560 | [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", | ||
561 | [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", | ||
562 | [MSG_DIRTY_LRU] = "dirty LRU page", | ||
563 | [MSG_CLEAN_LRU] = "clean LRU page", | ||
564 | [MSG_TRUNCATED_LRU] = "already truncated LRU page", | ||
565 | [MSG_BUDDY] = "free buddy page", | ||
566 | [MSG_BUDDY_2ND] = "free buddy page (2nd try)", | ||
567 | [MSG_UNKNOWN] = "unknown page", | ||
568 | }; | ||
569 | |||
524 | /* | 570 | /* |
525 | * XXX: It is possible that a page is isolated from LRU cache, | 571 | * XXX: It is possible that a page is isolated from LRU cache, |
526 | * and then kept in swap cache or failed to remove from page cache. | 572 | * and then kept in swap cache or failed to remove from page cache. |
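The new enum action_page_type plus the action_page_types[] table turns the ad-hoc message strings into compact identifiers that are only rendered as text at print time. A standalone sketch of the enum-indexed string table pattern, with made-up names:

    #include <stdio.h>

    enum report_type { MSG_OK, MSG_RETRY, MSG_FAILED, NR_MSG_TYPES };

    /* Designated initializers keep the strings tied to the enum values
     * even if entries are later added or reordered. */
    static const char * const report_names[NR_MSG_TYPES] = {
            [MSG_OK]     = "completed",
            [MSG_RETRY]  = "needs retry",
            [MSG_FAILED] = "failed",
    };

    static const char *report_name(enum report_type t)
    {
            if ((unsigned int)t >= NR_MSG_TYPES || !report_names[t])
                    return "unknown";
            return report_names[t];
    }

    int main(void)
    {
            printf("status: %s\n", report_name(MSG_RETRY));
            return 0;
    }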
@@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
777 | static struct page_state { | 823 | static struct page_state { |
778 | unsigned long mask; | 824 | unsigned long mask; |
779 | unsigned long res; | 825 | unsigned long res; |
780 | char *msg; | 826 | enum action_page_type type; |
781 | int (*action)(struct page *p, unsigned long pfn); | 827 | int (*action)(struct page *p, unsigned long pfn); |
782 | } error_states[] = { | 828 | } error_states[] = { |
783 | { reserved, reserved, "reserved kernel", me_kernel }, | 829 | { reserved, reserved, MSG_KERNEL, me_kernel }, |
784 | /* | 830 | /* |
785 | * free pages are specially detected outside this table: | 831 | * free pages are specially detected outside this table: |
786 | * PG_buddy pages only make a small fraction of all free pages. | 832 | * PG_buddy pages only make a small fraction of all free pages. |
@@ -791,31 +837,31 @@ static struct page_state { | |||
791 | * currently unused objects without touching them. But just | 837 | * currently unused objects without touching them. But just |
792 | * treat it as standard kernel for now. | 838 | * treat it as standard kernel for now. |
793 | */ | 839 | */ |
794 | { slab, slab, "kernel slab", me_kernel }, | 840 | { slab, slab, MSG_SLAB, me_kernel }, |
795 | 841 | ||
796 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | 842 | #ifdef CONFIG_PAGEFLAGS_EXTENDED |
797 | { head, head, "huge", me_huge_page }, | 843 | { head, head, MSG_HUGE, me_huge_page }, |
798 | { tail, tail, "huge", me_huge_page }, | 844 | { tail, tail, MSG_HUGE, me_huge_page }, |
799 | #else | 845 | #else |
800 | { compound, compound, "huge", me_huge_page }, | 846 | { compound, compound, MSG_HUGE, me_huge_page }, |
801 | #endif | 847 | #endif |
802 | 848 | ||
803 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, | 849 | { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, |
804 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, | 850 | { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, |
805 | 851 | ||
806 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, | 852 | { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, |
807 | { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, | 853 | { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, |
808 | 854 | ||
809 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | 855 | { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, |
810 | { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, | 856 | { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, |
811 | 857 | ||
812 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, | 858 | { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, |
813 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 859 | { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, |
814 | 860 | ||
815 | /* | 861 | /* |
816 | * Catchall entry: must be at end. | 862 | * Catchall entry: must be at end. |
817 | */ | 863 | */ |
818 | { 0, 0, "unknown page state", me_unknown }, | 864 | { 0, 0, MSG_UNKNOWN, me_unknown }, |
819 | }; | 865 | }; |
820 | 866 | ||
821 | #undef dirty | 867 | #undef dirty |
@@ -835,10 +881,10 @@ static struct page_state { | |||
835 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of | 881 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of |
836 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). | 882 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). |
837 | */ | 883 | */ |
838 | static void action_result(unsigned long pfn, char *msg, int result) | 884 | static void action_result(unsigned long pfn, enum action_page_type type, int result) |
839 | { | 885 | { |
840 | pr_err("MCE %#lx: %s page recovery: %s\n", | 886 | pr_err("MCE %#lx: recovery action for %s: %s\n", |
841 | pfn, msg, action_name[result]); | 887 | pfn, action_page_types[type], action_name[result]); |
842 | } | 888 | } |
843 | 889 | ||
844 | static int page_action(struct page_state *ps, struct page *p, | 890 | static int page_action(struct page_state *ps, struct page *p, |
@@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p, | |||
854 | count--; | 900 | count--; |
855 | if (count != 0) { | 901 | if (count != 0) { |
856 | printk(KERN_ERR | 902 | printk(KERN_ERR |
857 | "MCE %#lx: %s page still referenced by %d users\n", | 903 | "MCE %#lx: %s still referenced by %d users\n", |
858 | pfn, ps->msg, count); | 904 | pfn, action_page_types[ps->type], count); |
859 | result = FAILED; | 905 | result = FAILED; |
860 | } | 906 | } |
861 | action_result(pfn, ps->msg, result); | 907 | action_result(pfn, ps->type, result); |
862 | 908 | ||
863 | /* Could do more checks here if page looks ok */ | 909 | /* Could do more checks here if page looks ok */ |
864 | /* | 910 | /* |
@@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1106 | if (!(flags & MF_COUNT_INCREASED) && | 1152 | if (!(flags & MF_COUNT_INCREASED) && |
1107 | !get_page_unless_zero(hpage)) { | 1153 | !get_page_unless_zero(hpage)) { |
1108 | if (is_free_buddy_page(p)) { | 1154 | if (is_free_buddy_page(p)) { |
1109 | action_result(pfn, "free buddy", DELAYED); | 1155 | action_result(pfn, MSG_BUDDY, DELAYED); |
1110 | return 0; | 1156 | return 0; |
1111 | } else if (PageHuge(hpage)) { | 1157 | } else if (PageHuge(hpage)) { |
1112 | /* | 1158 | /* |
@@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1123 | } | 1169 | } |
1124 | set_page_hwpoison_huge_page(hpage); | 1170 | set_page_hwpoison_huge_page(hpage); |
1125 | res = dequeue_hwpoisoned_huge_page(hpage); | 1171 | res = dequeue_hwpoisoned_huge_page(hpage); |
1126 | action_result(pfn, "free huge", | 1172 | action_result(pfn, MSG_FREE_HUGE, |
1127 | res ? IGNORED : DELAYED); | 1173 | res ? IGNORED : DELAYED); |
1128 | unlock_page(hpage); | 1174 | unlock_page(hpage); |
1129 | return res; | 1175 | return res; |
1130 | } else { | 1176 | } else { |
1131 | action_result(pfn, "high order kernel", IGNORED); | 1177 | action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); |
1132 | return -EBUSY; | 1178 | return -EBUSY; |
1133 | } | 1179 | } |
1134 | } | 1180 | } |
@@ -1150,9 +1196,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1150 | */ | 1196 | */ |
1151 | if (is_free_buddy_page(p)) { | 1197 | if (is_free_buddy_page(p)) { |
1152 | if (flags & MF_COUNT_INCREASED) | 1198 | if (flags & MF_COUNT_INCREASED) |
1153 | action_result(pfn, "free buddy", DELAYED); | 1199 | action_result(pfn, MSG_BUDDY, DELAYED); |
1154 | else | 1200 | else |
1155 | action_result(pfn, "free buddy, 2nd try", DELAYED); | 1201 | action_result(pfn, MSG_BUDDY_2ND, |
1202 | DELAYED); | ||
1156 | return 0; | 1203 | return 0; |
1157 | } | 1204 | } |
1158 | } | 1205 | } |
@@ -1165,7 +1212,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1165 | * If this happens just bail out. | 1212 | * If this happens just bail out. |
1166 | */ | 1213 | */ |
1167 | if (compound_head(p) != hpage) { | 1214 | if (compound_head(p) != hpage) { |
1168 | action_result(pfn, "different compound page after locking", IGNORED); | 1215 | action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); |
1169 | res = -EBUSY; | 1216 | res = -EBUSY; |
1170 | goto out; | 1217 | goto out; |
1171 | } | 1218 | } |
@@ -1205,8 +1252,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1205 | * on the head page to show that the hugepage is hwpoisoned | 1252 | * on the head page to show that the hugepage is hwpoisoned |
1206 | */ | 1253 | */ |
1207 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { | 1254 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { |
1208 | action_result(pfn, "hugepage already hardware poisoned", | 1255 | action_result(pfn, MSG_POISONED_HUGE, IGNORED); |
1209 | IGNORED); | ||
1210 | unlock_page(hpage); | 1256 | unlock_page(hpage); |
1211 | put_page(hpage); | 1257 | put_page(hpage); |
1212 | return 0; | 1258 | return 0; |
@@ -1235,7 +1281,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1235 | */ | 1281 | */ |
1236 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) | 1282 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) |
1237 | != SWAP_SUCCESS) { | 1283 | != SWAP_SUCCESS) { |
1238 | action_result(pfn, "unmapping failed", IGNORED); | 1284 | action_result(pfn, MSG_UNMAP_FAILED, IGNORED); |
1239 | res = -EBUSY; | 1285 | res = -EBUSY; |
1240 | goto out; | 1286 | goto out; |
1241 | } | 1287 | } |
@@ -1244,7 +1290,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1244 | * Torn down by someone else? | 1290 | * Torn down by someone else? |
1245 | */ | 1291 | */ |
1246 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { | 1292 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
1247 | action_result(pfn, "already truncated LRU", IGNORED); | 1293 | action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); |
1248 | res = -EBUSY; | 1294 | res = -EBUSY; |
1249 | goto out; | 1295 | goto out; |
1250 | } | 1296 | } |
@@ -1540,8 +1586,18 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1540 | } | 1586 | } |
1541 | unlock_page(hpage); | 1587 | unlock_page(hpage); |
1542 | 1588 | ||
1543 | /* Keep page count to indicate a given hugepage is isolated. */ | 1589 | ret = isolate_huge_page(hpage, &pagelist); |
1544 | list_move(&hpage->lru, &pagelist); | 1590 | if (ret) { |
1591 | /* | ||
1592 | * get_any_page() and isolate_huge_page() takes a refcount each, | ||
1593 | * so need to drop one here. | ||
1594 | */ | ||
1595 | put_page(hpage); | ||
1596 | } else { | ||
1597 | pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); | ||
1598 | return -EBUSY; | ||
1599 | } | ||
1600 | |||
1545 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, | 1601 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1546 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1602 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1547 | if (ret) { | 1603 | if (ret) { |
diff --git a/mm/memory.c b/mm/memory.c index ac20b2a6a0c3..22e037e3364e 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
690 | /* | 690 | /* |
691 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y | 691 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y |
692 | */ | 692 | */ |
693 | if (vma->vm_ops) | 693 | pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", |
694 | printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", | 694 | vma->vm_file, |
695 | vma->vm_ops->fault); | 695 | vma->vm_ops ? vma->vm_ops->fault : NULL, |
696 | if (vma->vm_file) | 696 | vma->vm_file ? vma->vm_file->f_op->mmap : NULL, |
697 | printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", | 697 | mapping ? mapping->a_ops->readpage : NULL); |
698 | vma->vm_file->f_op->mmap); | ||
699 | dump_stack(); | 698 | dump_stack(); |
700 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 699 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
701 | } | 700 | } |
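print_bad_pte() now emits a single pr_alert() instead of two conditional printk()s, pushing the NULL checks into the argument list; %pD and %pf are kernel vsprintf extensions that print a struct file's name and a function pointer's symbol. A plain C sketch of the "guard the argument, not the call" shape, with made-up types and printf() standing in for pr_alert():

    #include <stdio.h>

    struct ops  { const char *fault_name; };
    struct file { const char *name; const struct ops *f_op; };

    /* One diagnostic line; each argument degrades gracefully when the
     * corresponding object is absent. */
    static void report(const struct file *file, const struct ops *vm_ops)
    {
            printf("file:%s fault:%s mmap:%s\n",
                   file ? file->name : "(none)",
                   vm_ops ? vm_ops->fault_name : "(null)",
                   (file && file->f_op) ? file->f_op->fault_name : "(null)");
    }

    int main(void)
    {
            report(NULL, NULL);
            return 0;
    }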
@@ -2181,6 +2180,42 @@ oom: | |||
2181 | return VM_FAULT_OOM; | 2180 | return VM_FAULT_OOM; |
2182 | } | 2181 | } |
2183 | 2182 | ||
2183 | /* | ||
2184 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | ||
2185 | * mapping | ||
2186 | */ | ||
2187 | static int wp_pfn_shared(struct mm_struct *mm, | ||
2188 | struct vm_area_struct *vma, unsigned long address, | ||
2189 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | ||
2190 | pmd_t *pmd) | ||
2191 | { | ||
2192 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | ||
2193 | struct vm_fault vmf = { | ||
2194 | .page = NULL, | ||
2195 | .pgoff = linear_page_index(vma, address), | ||
2196 | .virtual_address = (void __user *)(address & PAGE_MASK), | ||
2197 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | ||
2198 | }; | ||
2199 | int ret; | ||
2200 | |||
2201 | pte_unmap_unlock(page_table, ptl); | ||
2202 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | ||
2203 | if (ret & VM_FAULT_ERROR) | ||
2204 | return ret; | ||
2205 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
2206 | /* | ||
2207 | * We might have raced with another page fault while we | ||
2208 | * released the pte_offset_map_lock. | ||
2209 | */ | ||
2210 | if (!pte_same(*page_table, orig_pte)) { | ||
2211 | pte_unmap_unlock(page_table, ptl); | ||
2212 | return 0; | ||
2213 | } | ||
2214 | } | ||
2215 | return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, | ||
2216 | NULL, 0, 0); | ||
2217 | } | ||
2218 | |||
2184 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | 2219 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, |
2185 | unsigned long address, pte_t *page_table, | 2220 | unsigned long address, pte_t *page_table, |
2186 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, | 2221 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, |
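wp_pfn_shared() above gives shared VM_PFNMAP/VM_MIXEDMAP mappings a hook on write faults: the PTE lock is dropped, vm_ops->pfn_mkwrite() is called, and the PTE is re-checked before being marked writable. A sketch of what a driver-side hook could look like; the device, its lock and its dirty bitmap are hypothetical, only the callback slot and its (vma, vmf) signature come from the hunk above:

    #include <linux/mm.h>
    #include <linux/spinlock.h>
    #include <linux/bitops.h>

    /* Hypothetical per-device state. */
    struct my_dev {
            spinlock_t lock;
            unsigned long *dirty_bitmap;
    };

    /* Called before a read-only pfn mapping is made writable, so the
     * driver can note that this offset is about to be dirtied. */
    static int my_dev_pfn_mkwrite(struct vm_area_struct *vma,
                                  struct vm_fault *vmf)
    {
            struct my_dev *dev = vma->vm_private_data;

            spin_lock(&dev->lock);
            __set_bit(vmf->pgoff, dev->dirty_bitmap);
            spin_unlock(&dev->lock);

            /* No VM_FAULT_* error bits: the core goes on to reuse the PTE. */
            return 0;
    }

    static const struct vm_operations_struct my_dev_vm_ops = {
            .pfn_mkwrite = my_dev_pfn_mkwrite,
    };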
@@ -2259,13 +2294,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2259 | * VM_PFNMAP VMA. | 2294 | * VM_PFNMAP VMA. |
2260 | * | 2295 | * |
2261 | * We should not cow pages in a shared writeable mapping. | 2296 | * We should not cow pages in a shared writeable mapping. |
2262 | * Just mark the pages writable as we can't do any dirty | 2297 | * Just mark the pages writable and/or call ops->pfn_mkwrite. |
2263 | * accounting on raw pfn maps. | ||
2264 | */ | 2298 | */ |
2265 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2299 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2266 | (VM_WRITE|VM_SHARED)) | 2300 | (VM_WRITE|VM_SHARED)) |
2267 | return wp_page_reuse(mm, vma, address, page_table, ptl, | 2301 | return wp_pfn_shared(mm, vma, address, page_table, ptl, |
2268 | orig_pte, old_page, 0, 0); | 2302 | orig_pte, pmd); |
2269 | 2303 | ||
2270 | pte_unmap_unlock(page_table, ptl); | 2304 | pte_unmap_unlock(page_table, ptl); |
2271 | return wp_page_copy(mm, vma, address, page_table, pmd, | 2305 | return wp_page_copy(mm, vma, address, page_table, pmd, |
@@ -2845,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | |||
2845 | struct vm_fault vmf; | 2879 | struct vm_fault vmf; |
2846 | int off; | 2880 | int off; |
2847 | 2881 | ||
2848 | nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 2882 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
2849 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 2883 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
2850 | 2884 | ||
2851 | start_addr = max(address & mask, vma->vm_start); | 2885 | start_addr = max(address & mask, vma->vm_start); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e2e8014fb755..457bde530cbe 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1373,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) | |||
1373 | if (PageLRU(page)) | 1373 | if (PageLRU(page)) |
1374 | return pfn; | 1374 | return pfn; |
1375 | if (PageHuge(page)) { | 1375 | if (PageHuge(page)) { |
1376 | if (is_hugepage_active(page)) | 1376 | if (page_huge_active(page)) |
1377 | return pfn; | 1377 | return pfn; |
1378 | else | 1378 | else |
1379 | pfn = round_up(pfn + 1, | 1379 | pfn = round_up(pfn + 1, |
diff --git a/mm/mempool.c b/mm/mempool.c index 949970db2874..2cc08de8b1db 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -6,26 +6,138 @@ | |||
6 | * extreme VM load. | 6 | * extreme VM load. |
7 | * | 7 | * |
8 | * started by Ingo Molnar, Copyright (C) 2001 | 8 | * started by Ingo Molnar, Copyright (C) 2001 |
9 | * debugging by David Rientjes, Copyright (C) 2015 | ||
9 | */ | 10 | */ |
10 | 11 | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
12 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/highmem.h> | ||
15 | #include <linux/kasan.h> | ||
13 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
14 | #include <linux/export.h> | 17 | #include <linux/export.h> |
15 | #include <linux/mempool.h> | 18 | #include <linux/mempool.h> |
16 | #include <linux/blkdev.h> | 19 | #include <linux/blkdev.h> |
17 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
21 | #include "slab.h" | ||
22 | |||
23 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) | ||
24 | static void poison_error(mempool_t *pool, void *element, size_t size, | ||
25 | size_t byte) | ||
26 | { | ||
27 | const int nr = pool->curr_nr; | ||
28 | const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); | ||
29 | const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); | ||
30 | int i; | ||
31 | |||
32 | pr_err("BUG: mempool element poison mismatch\n"); | ||
33 | pr_err("Mempool %p size %zu\n", pool, size); | ||
34 | pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); | ||
35 | for (i = start; i < end; i++) | ||
36 | pr_cont("%x ", *(u8 *)(element + i)); | ||
37 | pr_cont("%s\n", end < size ? "..." : ""); | ||
38 | dump_stack(); | ||
39 | } | ||
40 | |||
41 | static void __check_element(mempool_t *pool, void *element, size_t size) | ||
42 | { | ||
43 | u8 *obj = element; | ||
44 | size_t i; | ||
45 | |||
46 | for (i = 0; i < size; i++) { | ||
47 | u8 exp = (i < size - 1) ? POISON_FREE : POISON_END; | ||
48 | |||
49 | if (obj[i] != exp) { | ||
50 | poison_error(pool, element, size, i); | ||
51 | return; | ||
52 | } | ||
53 | } | ||
54 | memset(obj, POISON_INUSE, size); | ||
55 | } | ||
56 | |||
57 | static void check_element(mempool_t *pool, void *element) | ||
58 | { | ||
59 | /* Mempools backed by slab allocator */ | ||
60 | if (pool->free == mempool_free_slab || pool->free == mempool_kfree) | ||
61 | __check_element(pool, element, ksize(element)); | ||
62 | |||
63 | /* Mempools backed by page allocator */ | ||
64 | if (pool->free == mempool_free_pages) { | ||
65 | int order = (int)(long)pool->pool_data; | ||
66 | void *addr = kmap_atomic((struct page *)element); | ||
67 | |||
68 | __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); | ||
69 | kunmap_atomic(addr); | ||
70 | } | ||
71 | } | ||
72 | |||
73 | static void __poison_element(void *element, size_t size) | ||
74 | { | ||
75 | u8 *obj = element; | ||
76 | |||
77 | memset(obj, POISON_FREE, size - 1); | ||
78 | obj[size - 1] = POISON_END; | ||
79 | } | ||
80 | |||
81 | static void poison_element(mempool_t *pool, void *element) | ||
82 | { | ||
83 | /* Mempools backed by slab allocator */ | ||
84 | if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) | ||
85 | __poison_element(element, ksize(element)); | ||
86 | |||
87 | /* Mempools backed by page allocator */ | ||
88 | if (pool->alloc == mempool_alloc_pages) { | ||
89 | int order = (int)(long)pool->pool_data; | ||
90 | void *addr = kmap_atomic((struct page *)element); | ||
91 | |||
92 | __poison_element(addr, 1UL << (PAGE_SHIFT + order)); | ||
93 | kunmap_atomic(addr); | ||
94 | } | ||
95 | } | ||
96 | #else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ | ||
97 | static inline void check_element(mempool_t *pool, void *element) | ||
98 | { | ||
99 | } | ||
100 | static inline void poison_element(mempool_t *pool, void *element) | ||
101 | { | ||
102 | } | ||
103 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ | ||
104 | |||
105 | static void kasan_poison_element(mempool_t *pool, void *element) | ||
106 | { | ||
107 | if (pool->alloc == mempool_alloc_slab) | ||
108 | kasan_slab_free(pool->pool_data, element); | ||
109 | if (pool->alloc == mempool_kmalloc) | ||
110 | kasan_kfree(element); | ||
111 | if (pool->alloc == mempool_alloc_pages) | ||
112 | kasan_free_pages(element, (unsigned long)pool->pool_data); | ||
113 | } | ||
114 | |||
115 | static void kasan_unpoison_element(mempool_t *pool, void *element) | ||
116 | { | ||
117 | if (pool->alloc == mempool_alloc_slab) | ||
118 | kasan_slab_alloc(pool->pool_data, element); | ||
119 | if (pool->alloc == mempool_kmalloc) | ||
120 | kasan_krealloc(element, (size_t)pool->pool_data); | ||
121 | if (pool->alloc == mempool_alloc_pages) | ||
122 | kasan_alloc_pages(element, (unsigned long)pool->pool_data); | ||
123 | } | ||
18 | 124 | ||
19 | static void add_element(mempool_t *pool, void *element) | 125 | static void add_element(mempool_t *pool, void *element) |
20 | { | 126 | { |
21 | BUG_ON(pool->curr_nr >= pool->min_nr); | 127 | BUG_ON(pool->curr_nr >= pool->min_nr); |
128 | poison_element(pool, element); | ||
129 | kasan_poison_element(pool, element); | ||
22 | pool->elements[pool->curr_nr++] = element; | 130 | pool->elements[pool->curr_nr++] = element; |
23 | } | 131 | } |
24 | 132 | ||
25 | static void *remove_element(mempool_t *pool) | 133 | static void *remove_element(mempool_t *pool) |
26 | { | 134 | { |
27 | BUG_ON(pool->curr_nr <= 0); | 135 | void *element = pool->elements[--pool->curr_nr]; |
28 | return pool->elements[--pool->curr_nr]; | 136 | |
137 | BUG_ON(pool->curr_nr < 0); | ||
138 | check_element(pool, element); | ||
139 | kasan_unpoison_element(pool, element); | ||
140 | return element; | ||
29 | } | 141 | } |
30 | 142 | ||
31 | /** | 143 | /** |
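The mempool changes poison elements while they sit on the reserve list (POISON_FREE bytes terminated by POISON_END, verified again on removal) and tell KASAN the idle elements are off limits, which is also why the VM_BUG_ON(mem->ctor) just below insists that slab-backed pools use caches without constructors. A hedged sketch of setting up such a pool; the cache name and element type are invented, while mempool_create()/mempool_alloc_slab()/mempool_free_slab() are the helpers visible in this file:

    #include <linux/errno.h>
    #include <linux/init.h>
    #include <linux/list.h>
    #include <linux/mempool.h>
    #include <linux/slab.h>

    struct io_unit {
            struct list_head list;
            char payload[256];
    };

    static struct kmem_cache *io_cache;
    static mempool_t *io_pool;

    static int __init io_pool_init(void)
    {
            /* No constructor: debug builds overwrite idle elements with poison. */
            io_cache = kmem_cache_create("io_unit", sizeof(struct io_unit),
                                         0, 0, NULL);
            if (!io_cache)
                    return -ENOMEM;

            /* Keep at least four elements in reserve for forward progress. */
            io_pool = mempool_create(4, mempool_alloc_slab, mempool_free_slab,
                                     io_cache);
            if (!io_pool) {
                    kmem_cache_destroy(io_cache);
                    return -ENOMEM;
            }
            return 0;
    }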
@@ -334,6 +446,7 @@ EXPORT_SYMBOL(mempool_free); | |||
334 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) | 446 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) |
335 | { | 447 | { |
336 | struct kmem_cache *mem = pool_data; | 448 | struct kmem_cache *mem = pool_data; |
449 | VM_BUG_ON(mem->ctor); | ||
337 | return kmem_cache_alloc(mem, gfp_mask); | 450 | return kmem_cache_alloc(mem, gfp_mask); |
338 | } | 451 | } |
339 | EXPORT_SYMBOL(mempool_alloc_slab); | 452 | EXPORT_SYMBOL(mempool_alloc_slab); |
diff --git a/mm/migrate.c b/mm/migrate.c index a65ff72ab739..f53838fe3dfe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
537 | * Please do not reorder this without considering how mm/ksm.c's | 537 | * Please do not reorder this without considering how mm/ksm.c's |
538 | * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). | 538 | * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). |
539 | */ | 539 | */ |
540 | ClearPageSwapCache(page); | 540 | if (PageSwapCache(page)) |
541 | ClearPageSwapCache(page); | ||
541 | ClearPagePrivate(page); | 542 | ClearPagePrivate(page); |
542 | set_page_private(page, 0); | 543 | set_page_private(page, 0); |
543 | 544 | ||
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -1133,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * | |||
1133 | * by another page fault trying to merge _that_. But that's ok: if it | 1133 | * by another page fault trying to merge _that_. But that's ok: if it |
1134 | * is being set up, that automatically means that it will be a singleton | 1134 | * is being set up, that automatically means that it will be a singleton |
1135 | * acceptable for merging, so we can do all of this optimistically. But | 1135 | * acceptable for merging, so we can do all of this optimistically. But |
1136 | * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. | 1136 | * we do that READ_ONCE() to make sure that we never re-load the pointer. |
1137 | * | 1137 | * |
1138 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only | 1138 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only |
1139 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid | 1139 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid |
@@ -1147,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * | |||
1147 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) | 1147 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) |
1148 | { | 1148 | { |
1149 | if (anon_vma_compatible(a, b)) { | 1149 | if (anon_vma_compatible(a, b)) { |
1150 | struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); | 1150 | struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); |
1151 | 1151 | ||
1152 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) | 1152 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) |
1153 | return anon_vma; | 1153 | return anon_vma; |
@@ -1551,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1551 | 1551 | ||
1552 | /* Clear old maps */ | 1552 | /* Clear old maps */ |
1553 | error = -ENOMEM; | 1553 | error = -ENOMEM; |
1554 | munmap_back: | 1554 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
1555 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { | 1555 | &rb_parent)) { |
1556 | if (do_munmap(mm, addr, len)) | 1556 | if (do_munmap(mm, addr, len)) |
1557 | return -ENOMEM; | 1557 | return -ENOMEM; |
1558 | goto munmap_back; | ||
1559 | } | 1558 | } |
1560 | 1559 | ||
1561 | /* | 1560 | /* |
@@ -1571,7 +1570,8 @@ munmap_back: | |||
1571 | /* | 1570 | /* |
1572 | * Can we just expand an old mapping? | 1571 | * Can we just expand an old mapping? |
1573 | */ | 1572 | */ |
1574 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); | 1573 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, |
1574 | NULL); | ||
1575 | if (vma) | 1575 | if (vma) |
1576 | goto out; | 1576 | goto out; |
1577 | 1577 | ||
@@ -2100,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
2100 | actual_size = size; | 2100 | actual_size = size; |
2101 | if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) | 2101 | if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) |
2102 | actual_size -= PAGE_SIZE; | 2102 | actual_size -= PAGE_SIZE; |
2103 | if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) | 2103 | if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) |
2104 | return -ENOMEM; | 2104 | return -ENOMEM; |
2105 | 2105 | ||
2106 | /* mlock limit tests */ | 2106 | /* mlock limit tests */ |
@@ -2108,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
2108 | unsigned long locked; | 2108 | unsigned long locked; |
2109 | unsigned long limit; | 2109 | unsigned long limit; |
2110 | locked = mm->locked_vm + grow; | 2110 | locked = mm->locked_vm + grow; |
2111 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); | 2111 | limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
2112 | limit >>= PAGE_SHIFT; | 2112 | limit >>= PAGE_SHIFT; |
2113 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 2113 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
2114 | return -ENOMEM; | 2114 | return -ENOMEM; |
@@ -2739,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2739 | /* | 2739 | /* |
2740 | * Clear old maps. this also does some error checking for us | 2740 | * Clear old maps. this also does some error checking for us |
2741 | */ | 2741 | */ |
2742 | munmap_back: | 2742 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
2743 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { | 2743 | &rb_parent)) { |
2744 | if (do_munmap(mm, addr, len)) | 2744 | if (do_munmap(mm, addr, len)) |
2745 | return -ENOMEM; | 2745 | return -ENOMEM; |
2746 | goto munmap_back; | ||
2747 | } | 2746 | } |
2748 | 2747 | ||
2749 | /* Check against address space limits *after* clearing old maps... */ | 2748 | /* Check against address space limits *after* clearing old maps... */ |
diff --git a/mm/mremap.c b/mm/mremap.c index 2dc44b1cb1df..034e2d360652 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -345,25 +345,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
345 | struct vm_area_struct *vma = find_vma(mm, addr); | 345 | struct vm_area_struct *vma = find_vma(mm, addr); |
346 | 346 | ||
347 | if (!vma || vma->vm_start > addr) | 347 | if (!vma || vma->vm_start > addr) |
348 | goto Efault; | 348 | return ERR_PTR(-EFAULT); |
349 | 349 | ||
350 | if (is_vm_hugetlb_page(vma)) | 350 | if (is_vm_hugetlb_page(vma)) |
351 | goto Einval; | 351 | return ERR_PTR(-EINVAL); |
352 | 352 | ||
353 | /* We can't remap across vm area boundaries */ | 353 | /* We can't remap across vm area boundaries */ |
354 | if (old_len > vma->vm_end - addr) | 354 | if (old_len > vma->vm_end - addr) |
355 | goto Efault; | 355 | return ERR_PTR(-EFAULT); |
356 | 356 | ||
357 | /* Need to be careful about a growing mapping */ | 357 | /* Need to be careful about a growing mapping */ |
358 | if (new_len > old_len) { | 358 | if (new_len > old_len) { |
359 | unsigned long pgoff; | 359 | unsigned long pgoff; |
360 | 360 | ||
361 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) | 361 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) |
362 | goto Efault; | 362 | return ERR_PTR(-EFAULT); |
363 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; | 363 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; |
364 | pgoff += vma->vm_pgoff; | 364 | pgoff += vma->vm_pgoff; |
365 | if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) | 365 | if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) |
366 | goto Einval; | 366 | return ERR_PTR(-EINVAL); |
367 | } | 367 | } |
368 | 368 | ||
369 | if (vma->vm_flags & VM_LOCKED) { | 369 | if (vma->vm_flags & VM_LOCKED) { |
@@ -372,29 +372,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
372 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 372 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
373 | locked += new_len - old_len; | 373 | locked += new_len - old_len; |
374 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 374 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
375 | goto Eagain; | 375 | return ERR_PTR(-EAGAIN); |
376 | } | 376 | } |
377 | 377 | ||
378 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) | 378 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) |
379 | goto Enomem; | 379 | return ERR_PTR(-ENOMEM); |
380 | 380 | ||
381 | if (vma->vm_flags & VM_ACCOUNT) { | 381 | if (vma->vm_flags & VM_ACCOUNT) { |
382 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; | 382 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; |
383 | if (security_vm_enough_memory_mm(mm, charged)) | 383 | if (security_vm_enough_memory_mm(mm, charged)) |
384 | goto Efault; | 384 | return ERR_PTR(-ENOMEM); |
385 | *p = charged; | 385 | *p = charged; |
386 | } | 386 | } |
387 | 387 | ||
388 | return vma; | 388 | return vma; |
389 | |||
390 | Efault: /* very odd choice for most of the cases, but... */ | ||
391 | return ERR_PTR(-EFAULT); | ||
392 | Einval: | ||
393 | return ERR_PTR(-EINVAL); | ||
394 | Enomem: | ||
395 | return ERR_PTR(-ENOMEM); | ||
396 | Eagain: | ||
397 | return ERR_PTR(-EAGAIN); | ||
398 | } | 389 | } |
399 | 390 | ||
400 | static unsigned long mremap_to(unsigned long addr, unsigned long old_len, | 391 | static unsigned long mremap_to(unsigned long addr, unsigned long old_len, |
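vma_to_resize() now returns ERR_PTR(-errno) at each failure site instead of jumping to one-line labels. The convention encodes a small negative errno in the pointer value, and callers unpack it with IS_ERR()/PTR_ERR(). A self-contained sketch with simplified local stand-ins for the <linux/err.h> helpers:

    #include <errno.h>
    #include <stdio.h>

    /* Simplified userspace models of the <linux/err.h> helpers. */
    #define MAX_ERRNO 4095
    static inline void *ERR_PTR(long error)    { return (void *)error; }
    static inline long  PTR_ERR(const void *p) { return (long)p; }
    static inline int   IS_ERR(const void *p)
    {
            return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }

    struct vma { unsigned long start, end; };

    /* Returns a vma or an encoded -errno, like vma_to_resize(). */
    static struct vma *lookup_vma(unsigned long addr)
    {
            static struct vma v = { 0x1000, 0x2000 };

            if (addr < v.start || addr >= v.end)
                    return ERR_PTR(-EFAULT);
            return &v;
    }

    int main(void)
    {
            struct vma *vma = lookup_vma(0x5000);

            if (IS_ERR(vma))
                    printf("lookup failed: %ld\n", PTR_ERR(vma));
            return 0;
    }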
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 52628c819bf7..2b665da1b3c9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly; | |||
408 | static DECLARE_RWSEM(oom_sem); | 408 | static DECLARE_RWSEM(oom_sem); |
409 | 409 | ||
410 | /** | 410 | /** |
411 | * mark_tsk_oom_victim - marks the given taks as OOM victim. | 411 | * mark_tsk_oom_victim - marks the given task as OOM victim. |
412 | * @tsk: task to mark | 412 | * @tsk: task to mark |
413 | * | 413 | * |
414 | * Has to be called with oom_sem taken for read and never after | 414 | * Has to be called with oom_sem taken for read and never after |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0372411f38fc..5daf5568b9e1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2228,7 +2228,8 @@ int set_page_dirty(struct page *page) | |||
2228 | * it will confuse readahead and make it restart the size rampup | 2228 | * it will confuse readahead and make it restart the size rampup |
2229 | * process. But it's a trivial problem. | 2229 | * process. But it's a trivial problem. |
2230 | */ | 2230 | */ |
2231 | ClearPageReclaim(page); | 2231 | if (PageReclaim(page)) |
2232 | ClearPageReclaim(page); | ||
2232 | #ifdef CONFIG_BLOCK | 2233 | #ifdef CONFIG_BLOCK |
2233 | if (!spd) | 2234 | if (!spd) |
2234 | spd = __set_page_dirty_buffers; | 2235 | spd = __set_page_dirty_buffers; |
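Like the ClearPageSwapCache() hunk in mm/migrate.c earlier, this change tests the flag before clearing it; the apparent intent is to skip the atomic read-modify-write (and the dirtied cache line) in the common case where the bit is not set. The same shape in portable C11:

    #include <stdatomic.h>
    #include <stdint.h>

    #define PG_RECLAIM (1u << 3)    /* illustrative bit position */

    /* The plain load is cheap; the locked RMW only happens when the
     * bit really needs clearing. */
    static void clear_reclaim_if_set(_Atomic uint32_t *flags)
    {
            if (atomic_load_explicit(flags, memory_order_relaxed) & PG_RECLAIM)
                    atomic_fetch_and_explicit(flags, ~PG_RECLAIM,
                                              memory_order_relaxed);
    }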
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1b849500640c..ebffa0e4a9c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1371,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1371 | int to_drain, batch; | 1371 | int to_drain, batch; |
1372 | 1372 | ||
1373 | local_irq_save(flags); | 1373 | local_irq_save(flags); |
1374 | batch = ACCESS_ONCE(pcp->batch); | 1374 | batch = READ_ONCE(pcp->batch); |
1375 | to_drain = min(pcp->count, batch); | 1375 | to_drain = min(pcp->count, batch); |
1376 | if (to_drain > 0) { | 1376 | if (to_drain > 0) { |
1377 | free_pcppages_bulk(zone, to_drain, pcp); | 1377 | free_pcppages_bulk(zone, to_drain, pcp); |
@@ -1570,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold) | |||
1570 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1570 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
1571 | pcp->count++; | 1571 | pcp->count++; |
1572 | if (pcp->count >= pcp->high) { | 1572 | if (pcp->count >= pcp->high) { |
1573 | unsigned long batch = ACCESS_ONCE(pcp->batch); | 1573 | unsigned long batch = READ_ONCE(pcp->batch); |
1574 | free_pcppages_bulk(zone, batch, pcp); | 1574 | free_pcppages_bulk(zone, batch, pcp); |
1575 | pcp->count -= batch; | 1575 | pcp->count -= batch; |
1576 | } | 1576 | } |
@@ -6207,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, | |||
6207 | mask <<= (BITS_PER_LONG - bitidx - 1); | 6207 | mask <<= (BITS_PER_LONG - bitidx - 1); |
6208 | flags <<= (BITS_PER_LONG - bitidx - 1); | 6208 | flags <<= (BITS_PER_LONG - bitidx - 1); |
6209 | 6209 | ||
6210 | word = ACCESS_ONCE(bitmap[word_bitidx]); | 6210 | word = READ_ONCE(bitmap[word_bitidx]); |
6211 | for (;;) { | 6211 | for (;;) { |
6212 | old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); | 6212 | old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); |
6213 | if (word == old_word) | 6213 | if (word == old_word) |
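set_pfnblock_flags_mask() updates a few bits in one bitmap word without a lock: snapshot the word with READ_ONCE(), compute the new value, and let cmpxchg() retry if another CPU changed the word in the meantime. An equivalent retry loop in portable C11 (names are local to the sketch):

    #include <stdatomic.h>

    /* Replace the bits selected by mask with flags, locklessly. On
     * failure, compare_exchange reloads the current value into old,
     * so the loop just recomputes and retries. */
    static void set_bits_masked(_Atomic unsigned long *word,
                                unsigned long mask, unsigned long flags)
    {
            unsigned long old = atomic_load_explicit(word, memory_order_relaxed);
            unsigned long val;

            do {
                    val = (old & ~mask) | (flags & mask);
            } while (!atomic_compare_exchange_weak_explicit(word, &old, val,
                                                            memory_order_relaxed,
                                                            memory_order_relaxed));
    }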
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -456,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) | |||
456 | unsigned long anon_mapping; | 456 | unsigned long anon_mapping; |
457 | 457 | ||
458 | rcu_read_lock(); | 458 | rcu_read_lock(); |
459 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | 459 | anon_mapping = (unsigned long)READ_ONCE(page->mapping); |
460 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | 460 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
461 | goto out; | 461 | goto out; |
462 | if (!page_mapped(page)) | 462 | if (!page_mapped(page)) |
@@ -500,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) | |||
500 | unsigned long anon_mapping; | 500 | unsigned long anon_mapping; |
501 | 501 | ||
502 | rcu_read_lock(); | 502 | rcu_read_lock(); |
503 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | 503 | anon_mapping = (unsigned long)READ_ONCE(page->mapping); |
504 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | 504 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
505 | goto out; | 505 | goto out; |
506 | if (!page_mapped(page)) | 506 | if (!page_mapped(page)) |
507 | goto out; | 507 | goto out; |
508 | 508 | ||
509 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 509 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
510 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 510 | root_anon_vma = READ_ONCE(anon_vma->root); |
511 | if (down_read_trylock(&root_anon_vma->rwsem)) { | 511 | if (down_read_trylock(&root_anon_vma->rwsem)) { |
512 | /* | 512 | /* |
513 | * If the page is still mapped, then this anon_vma is still | 513 | * If the page is still mapped, then this anon_vma is still |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -4277,7 +4277,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4277 | int node; | 4277 | int node; |
4278 | struct page *page; | 4278 | struct page *page; |
4279 | 4279 | ||
4280 | page = ACCESS_ONCE(c->page); | 4280 | page = READ_ONCE(c->page); |
4281 | if (!page) | 4281 | if (!page) |
4282 | continue; | 4282 | continue; |
4283 | 4283 | ||
@@ -4292,7 +4292,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4292 | total += x; | 4292 | total += x; |
4293 | nodes[node] += x; | 4293 | nodes[node] += x; |
4294 | 4294 | ||
4295 | page = ACCESS_ONCE(c->partial); | 4295 | page = READ_ONCE(c->partial); |
4296 | if (page) { | 4296 | if (page) { |
4297 | node = page_to_nid(page); | 4297 | node = page_to_nid(page); |
4298 | if (flags & SO_TOTAL) | 4298 | if (flags & SO_TOTAL) |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
34 | #include <linux/hugetlb.h> | ||
34 | 35 | ||
35 | #include "internal.h" | 36 | #include "internal.h" |
36 | 37 | ||
@@ -42,7 +43,7 @@ int page_cluster; | |||
42 | 43 | ||
43 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); | 44 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); |
44 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); | 45 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
45 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | 46 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); |
46 | 47 | ||
47 | /* | 48 | /* |
48 | * This path almost never happens for VM activity - pages are normally | 49 | * This path almost never happens for VM activity - pages are normally |
@@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page) | |||
75 | { | 76 | { |
76 | compound_page_dtor *dtor; | 77 | compound_page_dtor *dtor; |
77 | 78 | ||
78 | __page_cache_release(page); | 79 | /* |
80 | * __page_cache_release() is supposed to be called for thp, not for | ||
81 | * hugetlb. This is because hugetlb page does never have PageLRU set | ||
82 | * (it's never listed to any LRU lists) and no memcg routines should | ||
83 | * be called for hugetlb (it has a separate hugetlb_cgroup.) | ||
84 | */ | ||
85 | if (!PageHuge(page)) | ||
86 | __page_cache_release(page); | ||
79 | dtor = get_compound_page_dtor(page); | 87 | dtor = get_compound_page_dtor(page); |
80 | (*dtor)(page); | 88 | (*dtor)(page); |
81 | } | 89 | } |
@@ -743,7 +751,7 @@ void lru_cache_add_active_or_unevictable(struct page *page, | |||
743 | * be write it out by flusher threads as this is much more effective | 751 | * be write it out by flusher threads as this is much more effective |
744 | * than the single-page writeout from reclaim. | 752 | * than the single-page writeout from reclaim. |
745 | */ | 753 | */ |
746 | static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, | 754 | static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, |
747 | void *arg) | 755 | void *arg) |
748 | { | 756 | { |
749 | int lru, file; | 757 | int lru, file; |
@@ -811,36 +819,36 @@ void lru_add_drain_cpu(int cpu) | |||
811 | local_irq_restore(flags); | 819 | local_irq_restore(flags); |
812 | } | 820 | } |
813 | 821 | ||
814 | pvec = &per_cpu(lru_deactivate_pvecs, cpu); | 822 | pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); |
815 | if (pagevec_count(pvec)) | 823 | if (pagevec_count(pvec)) |
816 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | 824 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
817 | 825 | ||
818 | activate_page_drain(cpu); | 826 | activate_page_drain(cpu); |
819 | } | 827 | } |
820 | 828 | ||
821 | /** | 829 | /** |
822 | * deactivate_page - forcefully deactivate a page | 830 | * deactivate_file_page - forcefully deactivate a file page |
823 | * @page: page to deactivate | 831 | * @page: page to deactivate |
824 | * | 832 | * |
825 | * This function hints the VM that @page is a good reclaim candidate, | 833 | * This function hints the VM that @page is a good reclaim candidate, |
826 | * for example if its invalidation fails due to the page being dirty | 834 | * for example if its invalidation fails due to the page being dirty |
827 | * or under writeback. | 835 | * or under writeback. |
828 | */ | 836 | */ |
829 | void deactivate_page(struct page *page) | 837 | void deactivate_file_page(struct page *page) |
830 | { | 838 | { |
831 | /* | 839 | /* |
832 | * In a workload with many unevictable page such as mprotect, unevictable | 840 | * In a workload with many unevictable page such as mprotect, |
833 | * page deactivation for accelerating reclaim is pointless. | 841 | * unevictable page deactivation for accelerating reclaim is pointless. |
834 | */ | 842 | */ |
835 | if (PageUnevictable(page)) | 843 | if (PageUnevictable(page)) |
836 | return; | 844 | return; |
837 | 845 | ||
838 | if (likely(get_page_unless_zero(page))) { | 846 | if (likely(get_page_unless_zero(page))) { |
839 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); | 847 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); |
840 | 848 | ||
841 | if (!pagevec_add(pvec, page)) | 849 | if (!pagevec_add(pvec, page)) |
842 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | 850 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
843 | put_cpu_var(lru_deactivate_pvecs); | 851 | put_cpu_var(lru_deactivate_file_pvecs); |
844 | } | 852 | } |
845 | } | 853 | } |
846 | 854 | ||
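deactivate_file_page() keeps the usual pagevec batching shape after the rename: pages are staged in a per-CPU vector obtained with get_cpu_var(), and only a full vector is handed to pagevec_lru_move_fn(), so the LRU lock is taken once per batch instead of once per page. A generic sketch of the same batching idea with a per-thread buffer (everything here is illustrative, not the kernel API):

    #include <stddef.h>

    #define BATCH_SIZE 14   /* in the same spirit as a pagevec */

    struct batch {
            size_t count;
            void *items[BATCH_SIZE];
    };

    /* One batch per thread, mirroring the per-CPU pagevec. */
    static _Thread_local struct batch local_batch;

    /* Stand-in for the expensive per-batch step; pagevec_lru_move_fn()
     * above would take the LRU lock once for the whole batch. */
    static void flush_batch(struct batch *b)
    {
            /* ... process b->items[0 .. b->count-1] under a lock ... */
            b->count = 0;
    }

    /* Adding is cheap and lock-free; the costly work is amortised. */
    static void add_item(void *item)
    {
            struct batch *b = &local_batch;

            b->items[b->count++] = item;
            if (b->count == BATCH_SIZE)
                    flush_batch(b);
    }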
@@ -872,7 +880,7 @@ void lru_add_drain_all(void) | |||
872 | 880 | ||
873 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || | 881 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || |
874 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || | 882 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || |
875 | pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || | 883 | pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || |
876 | need_activate_page_drain(cpu)) { | 884 | need_activate_page_drain(cpu)) { |
877 | INIT_WORK(work, lru_add_drain_per_cpu); | 885 | INIT_WORK(work, lru_add_drain_per_cpu); |
878 | schedule_work_on(cpu, work); | 886 | schedule_work_on(cpu, work); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 405923f77334..8bc8e66138da 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) | |||
390 | unsigned int pages, max_pages, last_ra; | 390 | unsigned int pages, max_pages, last_ra; |
391 | static atomic_t last_readahead_pages; | 391 | static atomic_t last_readahead_pages; |
392 | 392 | ||
393 | max_pages = 1 << ACCESS_ONCE(page_cluster); | 393 | max_pages = 1 << READ_ONCE(page_cluster); |
394 | if (max_pages <= 1) | 394 | if (max_pages <= 1) |
395 | return 1; | 395 | return 1; |
396 | 396 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 63f55ccb9b26..a7e72103f23b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1312 | else | 1312 | else |
1313 | continue; | 1313 | continue; |
1314 | } | 1314 | } |
1315 | count = ACCESS_ONCE(si->swap_map[i]); | 1315 | count = READ_ONCE(si->swap_map[i]); |
1316 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1316 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1317 | break; | 1317 | break; |
1318 | } | 1318 | } |
diff --git a/mm/truncate.c b/mm/truncate.c index 7a9d8a3cb143..66af9031fae8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -490,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
490 | * of interest and try to speed up its reclaim. | 490 | * of interest and try to speed up its reclaim. |
491 | */ | 491 | */ |
492 | if (!ret) | 492 | if (!ret) |
493 | deactivate_page(page); | 493 | deactivate_file_page(page); |
494 | count += ret; | 494 | count += ret; |
495 | } | 495 | } |
496 | pagevec_remove_exceptionals(&pvec); | 496 | pagevec_remove_exceptionals(&pvec); |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -325,9 +325,37 @@ void kvfree(const void *addr) | |||
325 | } | 325 | } |
326 | EXPORT_SYMBOL(kvfree); | 326 | EXPORT_SYMBOL(kvfree); |
327 | 327 | ||
328 | static inline void *__page_rmapping(struct page *page) | ||
329 | { | ||
330 | unsigned long mapping; | ||
331 | |||
332 | mapping = (unsigned long)page->mapping; | ||
333 | mapping &= ~PAGE_MAPPING_FLAGS; | ||
334 | |||
335 | return (void *)mapping; | ||
336 | } | ||
337 | |||
338 | /* Neutral page->mapping pointer to address_space or anon_vma or other */ | ||
339 | void *page_rmapping(struct page *page) | ||
340 | { | ||
341 | page = compound_head(page); | ||
342 | return __page_rmapping(page); | ||
343 | } | ||
344 | |||
345 | struct anon_vma *page_anon_vma(struct page *page) | ||
346 | { | ||
347 | unsigned long mapping; | ||
348 | |||
349 | page = compound_head(page); | ||
350 | mapping = (unsigned long)page->mapping; | ||
351 | if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | ||
352 | return NULL; | ||
353 | return __page_rmapping(page); | ||
354 | } | ||
355 | |||
328 | struct address_space *page_mapping(struct page *page) | 356 | struct address_space *page_mapping(struct page *page) |
329 | { | 357 | { |
330 | struct address_space *mapping = page->mapping; | 358 | unsigned long mapping; |
331 | 359 | ||
332 | /* This happens if someone calls flush_dcache_page on slab page */ | 360 | /* This happens if someone calls flush_dcache_page on slab page */ |
333 | if (unlikely(PageSlab(page))) | 361 | if (unlikely(PageSlab(page))) |
@@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page) | |||
337 | swp_entry_t entry; | 365 | swp_entry_t entry; |
338 | 366 | ||
339 | entry.val = page_private(page); | 367 | entry.val = page_private(page); |
340 | mapping = swap_address_space(entry); | 368 | return swap_address_space(entry); |
341 | } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) | 369 | } |
342 | mapping = NULL; | 370 | |
343 | return mapping; | 371 | mapping = (unsigned long)page->mapping; |
372 | if (mapping & PAGE_MAPPING_FLAGS) | ||
373 | return NULL; | ||
374 | return page->mapping; | ||
344 | } | 375 | } |
345 | 376 | ||
346 | int overcommit_ratio_handler(struct ctl_table *table, int write, | 377 | int overcommit_ratio_handler(struct ctl_table *table, int write, |
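page_mapping(), page_rmapping() and page_anon_vma() all decode the same trick: page->mapping is an aligned pointer whose low PAGE_MAPPING_FLAGS bits record what it points to (an address_space, an anon_vma, or something else, as the comment above puts it), so masking the bits off recovers the real pointer. A standalone sketch of pointer tagging under the usual alignment assumption (names are made up):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define TAG_ANON 0x1UL
    #define TAG_MASK 0x3UL  /* low bits are free because targets are >= 4-byte aligned */

    struct anon_thing { int refcount; };

    static void *untag(uintptr_t raw)      { return (void *)(raw & ~TAG_MASK); }
    static uintptr_t tag_of(uintptr_t raw) { return raw & TAG_MASK; }

    int main(void)
    {
            static struct anon_thing av;    /* statics are suitably aligned */
            uintptr_t mapping = (uintptr_t)&av | TAG_ANON;

            assert(tag_of(mapping) == TAG_ANON);
            printf("anon object at %p\n", untag(mapping));
            return 0;
    }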
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a5bbdd3b5d67..2faaa2976447 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -765,7 +765,7 @@ struct vmap_block { | |||
765 | spinlock_t lock; | 765 | spinlock_t lock; |
766 | struct vmap_area *va; | 766 | struct vmap_area *va; |
767 | unsigned long free, dirty; | 767 | unsigned long free, dirty; |
768 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | 768 | unsigned long dirty_min, dirty_max; /*< dirty range */ |
769 | struct list_head free_list; | 769 | struct list_head free_list; |
770 | struct rcu_head rcu_head; | 770 | struct rcu_head rcu_head; |
771 | struct list_head purge; | 771 | struct list_head purge; |
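Editor's note on this hunk: the per-page dirty bitmap becomes a single [dirty_min, dirty_max) page-index range. A minimal sketch of the new bookkeeping, reusing the struct vmap_block fields and VMAP_BBMAP_BITS from this file (illustration only, not part of the patch; sketch_mark_dirty is a made-up name):

    static void sketch_mark_dirty(struct vmap_block *vb, unsigned long off,
                                  unsigned int order)
    {
            /* "empty" is encoded as dirty_min == VMAP_BBMAP_BITS, dirty_max == 0 */
            vb->dirty_min = min(vb->dirty_min, off);                  /* lowest dirty page */
            vb->dirty_max = max(vb->dirty_max, off + (1UL << order)); /* one past the highest */
    }

This mirrors what vb_free() does further down: freeing only widens the range, and vm_unmap_aliases() flushes [dirty_min, dirty_max) instead of scanning a bitmap.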
@@ -796,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr) | |||
796 | return addr; | 796 | return addr; |
797 | } | 797 | } |
798 | 798 | ||
799 | static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | 799 | static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) |
800 | { | ||
801 | unsigned long addr; | ||
802 | |||
803 | addr = va_start + (pages_off << PAGE_SHIFT); | ||
804 | BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); | ||
805 | return (void *)addr; | ||
806 | } | ||
807 | |||
808 | /** | ||
809 | * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this | ||
810 | * block. Of course pages number can't exceed VMAP_BBMAP_BITS | ||
811 | * @order: how many 2^order pages should be occupied in newly allocated block | ||
812 | * @gfp_mask: flags for the page level allocator | ||
813 | * | ||
814 | * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) | ||
815 | */ | ||
816 | static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) | ||
800 | { | 817 | { |
801 | struct vmap_block_queue *vbq; | 818 | struct vmap_block_queue *vbq; |
802 | struct vmap_block *vb; | 819 | struct vmap_block *vb; |
803 | struct vmap_area *va; | 820 | struct vmap_area *va; |
804 | unsigned long vb_idx; | 821 | unsigned long vb_idx; |
805 | int node, err; | 822 | int node, err; |
823 | void *vaddr; | ||
806 | 824 | ||
807 | node = numa_node_id(); | 825 | node = numa_node_id(); |
808 | 826 | ||
@@ -826,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
826 | return ERR_PTR(err); | 844 | return ERR_PTR(err); |
827 | } | 845 | } |
828 | 846 | ||
847 | vaddr = vmap_block_vaddr(va->va_start, 0); | ||
829 | spin_lock_init(&vb->lock); | 848 | spin_lock_init(&vb->lock); |
830 | vb->va = va; | 849 | vb->va = va; |
831 | vb->free = VMAP_BBMAP_BITS; | 850 | /* At least something should be left free */ |
851 | BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); | ||
852 | vb->free = VMAP_BBMAP_BITS - (1UL << order); | ||
832 | vb->dirty = 0; | 853 | vb->dirty = 0; |
833 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); | 854 | vb->dirty_min = VMAP_BBMAP_BITS; |
855 | vb->dirty_max = 0; | ||
834 | INIT_LIST_HEAD(&vb->free_list); | 856 | INIT_LIST_HEAD(&vb->free_list); |
835 | 857 | ||
836 | vb_idx = addr_to_vb_idx(va->va_start); | 858 | vb_idx = addr_to_vb_idx(va->va_start); |
@@ -842,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
842 | 864 | ||
843 | vbq = &get_cpu_var(vmap_block_queue); | 865 | vbq = &get_cpu_var(vmap_block_queue); |
844 | spin_lock(&vbq->lock); | 866 | spin_lock(&vbq->lock); |
845 | list_add_rcu(&vb->free_list, &vbq->free); | 867 | list_add_tail_rcu(&vb->free_list, &vbq->free); |
846 | spin_unlock(&vbq->lock); | 868 | spin_unlock(&vbq->lock); |
847 | put_cpu_var(vmap_block_queue); | 869 | put_cpu_var(vmap_block_queue); |
848 | 870 | ||
849 | return vb; | 871 | return vaddr; |
850 | } | 872 | } |
851 | 873 | ||
852 | static void free_vmap_block(struct vmap_block *vb) | 874 | static void free_vmap_block(struct vmap_block *vb) |
@@ -881,7 +903,8 @@ static void purge_fragmented_blocks(int cpu) | |||
881 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { | 903 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { |
882 | vb->free = 0; /* prevent further allocs after releasing lock */ | 904 | vb->free = 0; /* prevent further allocs after releasing lock */ |
883 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ | 905 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ |
884 | bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); | 906 | vb->dirty_min = 0; |
907 | vb->dirty_max = VMAP_BBMAP_BITS; | ||
885 | spin_lock(&vbq->lock); | 908 | spin_lock(&vbq->lock); |
886 | list_del_rcu(&vb->free_list); | 909 | list_del_rcu(&vb->free_list); |
887 | spin_unlock(&vbq->lock); | 910 | spin_unlock(&vbq->lock); |
@@ -910,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
910 | { | 933 | { |
911 | struct vmap_block_queue *vbq; | 934 | struct vmap_block_queue *vbq; |
912 | struct vmap_block *vb; | 935 | struct vmap_block *vb; |
913 | unsigned long addr = 0; | 936 | void *vaddr = NULL; |
914 | unsigned int order; | 937 | unsigned int order; |
915 | 938 | ||
916 | BUG_ON(size & ~PAGE_MASK); | 939 | BUG_ON(size & ~PAGE_MASK); |
@@ -925,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
925 | } | 948 | } |
926 | order = get_order(size); | 949 | order = get_order(size); |
927 | 950 | ||
928 | again: | ||
929 | rcu_read_lock(); | 951 | rcu_read_lock(); |
930 | vbq = &get_cpu_var(vmap_block_queue); | 952 | vbq = &get_cpu_var(vmap_block_queue); |
931 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | 953 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { |
932 | int i; | 954 | unsigned long pages_off; |
933 | 955 | ||
934 | spin_lock(&vb->lock); | 956 | spin_lock(&vb->lock); |
935 | if (vb->free < 1UL << order) | 957 | if (vb->free < (1UL << order)) { |
936 | goto next; | 958 | spin_unlock(&vb->lock); |
959 | continue; | ||
960 | } | ||
937 | 961 | ||
938 | i = VMAP_BBMAP_BITS - vb->free; | 962 | pages_off = VMAP_BBMAP_BITS - vb->free; |
939 | addr = vb->va->va_start + (i << PAGE_SHIFT); | 963 | vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); |
940 | BUG_ON(addr_to_vb_idx(addr) != | ||
941 | addr_to_vb_idx(vb->va->va_start)); | ||
942 | vb->free -= 1UL << order; | 964 | vb->free -= 1UL << order; |
943 | if (vb->free == 0) { | 965 | if (vb->free == 0) { |
944 | spin_lock(&vbq->lock); | 966 | spin_lock(&vbq->lock); |
945 | list_del_rcu(&vb->free_list); | 967 | list_del_rcu(&vb->free_list); |
946 | spin_unlock(&vbq->lock); | 968 | spin_unlock(&vbq->lock); |
947 | } | 969 | } |
970 | |||
948 | spin_unlock(&vb->lock); | 971 | spin_unlock(&vb->lock); |
949 | break; | 972 | break; |
950 | next: | ||
951 | spin_unlock(&vb->lock); | ||
952 | } | 973 | } |
953 | 974 | ||
954 | put_cpu_var(vmap_block_queue); | 975 | put_cpu_var(vmap_block_queue); |
955 | rcu_read_unlock(); | 976 | rcu_read_unlock(); |
956 | 977 | ||
957 | if (!addr) { | 978 | /* Allocate new block if nothing was found */ |
958 | vb = new_vmap_block(gfp_mask); | 979 | if (!vaddr) |
959 | if (IS_ERR(vb)) | 980 | vaddr = new_vmap_block(order, gfp_mask); |
960 | return vb; | ||
961 | goto again; | ||
962 | } | ||
963 | 981 | ||
964 | return (void *)addr; | 982 | return vaddr; |
965 | } | 983 | } |
966 | 984 | ||
967 | static void vb_free(const void *addr, unsigned long size) | 985 | static void vb_free(const void *addr, unsigned long size) |
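Editor's note: within a block, allocation is a bump allocator. vb->free counts the pages still free at the tail, so the offset of the next allocation is simply the number of pages already handed out. A sketch of just that arithmetic under the same field names (illustration only; the real code in the hunk above does this under vb->lock):

    static unsigned long sketch_vb_alloc_off(struct vmap_block *vb,
                                             unsigned int order)
    {
            unsigned long pages_off = VMAP_BBMAP_BITS - vb->free;

            /* caller has already checked vb->free >= 1UL << order */
            vb->free -= 1UL << order;       /* hand out 2^order pages */
            return pages_off;               /* page offset inside the block */
    }

vmap_block_vaddr() then turns the page offset into a virtual address as va_start + (pages_off << PAGE_SHIFT), which is why new_vmap_block() can return a usable vaddr directly instead of the block itself.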
@@ -979,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size) | |||
979 | order = get_order(size); | 997 | order = get_order(size); |
980 | 998 | ||
981 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); | 999 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); |
1000 | offset >>= PAGE_SHIFT; | ||
982 | 1001 | ||
983 | vb_idx = addr_to_vb_idx((unsigned long)addr); | 1002 | vb_idx = addr_to_vb_idx((unsigned long)addr); |
984 | rcu_read_lock(); | 1003 | rcu_read_lock(); |
@@ -989,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size) | |||
989 | vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); | 1008 | vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); |
990 | 1009 | ||
991 | spin_lock(&vb->lock); | 1010 | spin_lock(&vb->lock); |
992 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); | 1011 | |
1012 | /* Expand dirty range */ | ||
1013 | vb->dirty_min = min(vb->dirty_min, offset); | ||
1014 | vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); | ||
993 | 1015 | ||
994 | vb->dirty += 1UL << order; | 1016 | vb->dirty += 1UL << order; |
995 | if (vb->dirty == VMAP_BBMAP_BITS) { | 1017 | if (vb->dirty == VMAP_BBMAP_BITS) { |
@@ -1028,25 +1050,18 @@ void vm_unmap_aliases(void) | |||
1028 | 1050 | ||
1029 | rcu_read_lock(); | 1051 | rcu_read_lock(); |
1030 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | 1052 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { |
1031 | int i, j; | ||
1032 | |||
1033 | spin_lock(&vb->lock); | 1053 | spin_lock(&vb->lock); |
1034 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); | 1054 | if (vb->dirty) { |
1035 | if (i < VMAP_BBMAP_BITS) { | 1055 | unsigned long va_start = vb->va->va_start; |
1036 | unsigned long s, e; | 1056 | unsigned long s, e; |
1037 | 1057 | ||
1038 | j = find_last_bit(vb->dirty_map, | 1058 | s = va_start + (vb->dirty_min << PAGE_SHIFT); |
1039 | VMAP_BBMAP_BITS); | 1059 | e = va_start + (vb->dirty_max << PAGE_SHIFT); |
1040 | j = j + 1; /* need exclusive index */ | ||
1041 | 1060 | ||
1042 | s = vb->va->va_start + (i << PAGE_SHIFT); | 1061 | start = min(s, start); |
1043 | e = vb->va->va_start + (j << PAGE_SHIFT); | 1062 | end = max(e, end); |
1044 | flush = 1; | ||
1045 | 1063 | ||
1046 | if (s < start) | 1064 | flush = 1; |
1047 | start = s; | ||
1048 | if (e > end) | ||
1049 | end = e; | ||
1050 | } | 1065 | } |
1051 | spin_unlock(&vb->lock); | 1066 | spin_unlock(&vb->lock); |
1052 | } | 1067 | } |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0dec1fa5f656..08bd7a3d464a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -12,35 +12,6 @@ | |||
12 | */ | 12 | */ |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * This allocator is designed for use with zram. Thus, the allocator is | ||
16 | * supposed to work well under low memory conditions. In particular, it | ||
17 | * never attempts higher order page allocation which is very likely to | ||
18 | * fail under memory pressure. On the other hand, if we just use single | ||
19 | * (0-order) pages, it would suffer from very high fragmentation -- | ||
20 | * any object of size PAGE_SIZE/2 or larger would occupy an entire page. | ||
21 | * This was one of the major issues with its predecessor (xvmalloc). | ||
22 | * | ||
23 | * To overcome these issues, zsmalloc allocates a bunch of 0-order pages | ||
24 | * and links them together using various 'struct page' fields. These linked | ||
25 | * pages act as a single higher-order page i.e. an object can span 0-order | ||
26 | * page boundaries. The code refers to these linked pages as a single entity | ||
27 | * called zspage. | ||
28 | * | ||
29 | * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE | ||
30 | * since this satisfies the requirements of all its current users (in the | ||
31 | * worst case, page is incompressible and is thus stored "as-is" i.e. in | ||
32 | * uncompressed form). For allocation requests larger than this size, failure | ||
33 | * is returned (see zs_malloc). | ||
34 | * | ||
35 | * Additionally, zs_malloc() does not return a dereferenceable pointer. | ||
36 | * Instead, it returns an opaque handle (unsigned long) which encodes actual | ||
37 | * location of the allocated object. The reason for this indirection is that | ||
38 | * zsmalloc does not keep zspages permanently mapped since that would cause | ||
39 | * issues on 32-bit systems where the VA region for kernel space mappings | ||
40 | * is very small. So, before using the allocating memory, the object has to | ||
41 | * be mapped using zs_map_object() to get a usable pointer and subsequently | ||
42 | * unmapped using zs_unmap_object(). | ||
43 | * | ||
44 | * Following is how we use various fields and flags of underlying | 15 | * Following is how we use various fields and flags of underlying |
45 | * struct page(s) to form a zspage. | 16 | * struct page(s) to form a zspage. |
46 | * | 17 | * |
@@ -57,6 +28,8 @@ | |||
57 | * | 28 | * |
58 | * page->private (union with page->first_page): refers to the | 29 | * page->private (union with page->first_page): refers to the |
59 | * component page after the first page | 30 | * component page after the first page |
31 | * If the page is the first_page of a huge object, it stores the handle. | ||
32 | * See size_class->huge. | ||
60 | * page->freelist: points to the first free object in zspage. | 33 | * page->freelist: points to the first free object in zspage. |
61 | * Free objects are linked together using in-place | 34 | * Free objects are linked together using in-place |
62 | * metadata. | 35 | * metadata. |
@@ -78,6 +51,7 @@ | |||
78 | 51 | ||
79 | #include <linux/module.h> | 52 | #include <linux/module.h> |
80 | #include <linux/kernel.h> | 53 | #include <linux/kernel.h> |
54 | #include <linux/sched.h> | ||
81 | #include <linux/bitops.h> | 55 | #include <linux/bitops.h> |
82 | #include <linux/errno.h> | 56 | #include <linux/errno.h> |
83 | #include <linux/highmem.h> | 57 | #include <linux/highmem.h> |
@@ -110,6 +84,8 @@ | |||
110 | #define ZS_MAX_ZSPAGE_ORDER 2 | 84 | #define ZS_MAX_ZSPAGE_ORDER 2 |
111 | #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) | 85 | #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) |
112 | 86 | ||
87 | #define ZS_HANDLE_SIZE (sizeof(unsigned long)) | ||
88 | |||
113 | /* | 89 | /* |
114 | * Object location (<PFN>, <obj_idx>) is encoded as | 90 | * Object location (<PFN>, <obj_idx>) is encoded as |
115 | * as single (unsigned long) handle value. | 91 | * as single (unsigned long) handle value. |
@@ -133,13 +109,33 @@ | |||
133 | #endif | 109 | #endif |
134 | #endif | 110 | #endif |
135 | #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) | 111 | #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) |
136 | #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) | 112 | |
113 | /* | ||
114 | * The memory allocated for a handle stores the object's position, | ||
115 | * encoded as <page, obj_idx>; the encoding leaves the least | ||
116 | * significant bit unused (see obj_to_location). | ||
117 | * We use that bit to synchronize object access between the user | ||
118 | * and migration. | ||
119 | */ | ||
120 | #define HANDLE_PIN_BIT 0 | ||
121 | |||
122 | /* | ||
123 | * Head in allocated object should have OBJ_ALLOCATED_TAG | ||
124 | * to identify the object was allocated or not. | ||
125 | * It's okay to add the status bit in the least bit because | ||
126 | * header keeps handle which is 4byte-aligned address so we | ||
127 | * have room for two bit at least. | ||
128 | */ | ||
129 | #define OBJ_ALLOCATED_TAG 1 | ||
130 | #define OBJ_TAG_BITS 1 | ||
131 | #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) | ||
137 | #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) | 132 | #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) |
138 | 133 | ||
139 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) | 134 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) |
140 | /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ | 135 | /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ |
141 | #define ZS_MIN_ALLOC_SIZE \ | 136 | #define ZS_MIN_ALLOC_SIZE \ |
142 | MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) | 137 | MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) |
138 | /* each chunk includes extra space to keep handle */ | ||
143 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE | 139 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE |
144 | 140 | ||
145 | /* | 141 | /* |
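Editor's note to make the bit budget concrete, assuming a 64-bit build with 4 KiB pages and MAX_PHYSMEM_BITS = 46 (one possible configuration, not the only one): _PFN_BITS = 46 - 12 = 34, so OBJ_INDEX_BITS = 64 - 34 - 1 = 29 and OBJ_INDEX_MASK allows 2^29 object indexes. The one bit given up is OBJ_TAG_BITS: location_to_obj() below shifts the encoded value left by it, so the word that record_obj() stores in the handle slot keeps its least significant bit clear for HANDLE_PIN_BIT, while the handle pointer written into an allocated object's header is at least 4-byte aligned and therefore has room for OBJ_ALLOCATED_TAG in its low bit.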
@@ -172,6 +168,8 @@ enum fullness_group { | |||
172 | enum zs_stat_type { | 168 | enum zs_stat_type { |
173 | OBJ_ALLOCATED, | 169 | OBJ_ALLOCATED, |
174 | OBJ_USED, | 170 | OBJ_USED, |
171 | CLASS_ALMOST_FULL, | ||
172 | CLASS_ALMOST_EMPTY, | ||
175 | NR_ZS_STAT_TYPE, | 173 | NR_ZS_STAT_TYPE, |
176 | }; | 174 | }; |
177 | 175 | ||
@@ -216,6 +214,8 @@ struct size_class { | |||
216 | 214 | ||
217 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | 215 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ |
218 | int pages_per_zspage; | 216 | int pages_per_zspage; |
217 | /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ | ||
218 | bool huge; | ||
219 | 219 | ||
220 | #ifdef CONFIG_ZSMALLOC_STAT | 220 | #ifdef CONFIG_ZSMALLOC_STAT |
221 | struct zs_size_stat stats; | 221 | struct zs_size_stat stats; |
@@ -233,14 +233,24 @@ struct size_class { | |||
233 | * This must be power of 2 and less than or equal to ZS_ALIGN | 233 | * This must be power of 2 and less than or equal to ZS_ALIGN |
234 | */ | 234 | */ |
235 | struct link_free { | 235 | struct link_free { |
236 | /* Handle of next free chunk (encodes <PFN, obj_idx>) */ | 236 | union { |
237 | void *next; | 237 | /* |
238 | * Position of next free chunk (encodes <PFN, obj_idx>) | ||
239 | * It's valid for non-allocated object | ||
240 | */ | ||
241 | void *next; | ||
242 | /* | ||
243 | * Handle of allocated object. | ||
244 | */ | ||
245 | unsigned long handle; | ||
246 | }; | ||
238 | }; | 247 | }; |
239 | 248 | ||
240 | struct zs_pool { | 249 | struct zs_pool { |
241 | char *name; | 250 | char *name; |
242 | 251 | ||
243 | struct size_class **size_class; | 252 | struct size_class **size_class; |
253 | struct kmem_cache *handle_cachep; | ||
244 | 254 | ||
245 | gfp_t flags; /* allocation flags used when growing pool */ | 255 | gfp_t flags; /* allocation flags used when growing pool */ |
246 | atomic_long_t pages_allocated; | 256 | atomic_long_t pages_allocated; |
@@ -267,8 +277,37 @@ struct mapping_area { | |||
267 | #endif | 277 | #endif |
268 | char *vm_addr; /* address of kmap_atomic()'ed pages */ | 278 | char *vm_addr; /* address of kmap_atomic()'ed pages */ |
269 | enum zs_mapmode vm_mm; /* mapping mode */ | 279 | enum zs_mapmode vm_mm; /* mapping mode */ |
280 | bool huge; | ||
270 | }; | 281 | }; |
271 | 282 | ||
283 | static int create_handle_cache(struct zs_pool *pool) | ||
284 | { | ||
285 | pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, | ||
286 | 0, 0, NULL); | ||
287 | return pool->handle_cachep ? 0 : 1; | ||
288 | } | ||
289 | |||
290 | static void destroy_handle_cache(struct zs_pool *pool) | ||
291 | { | ||
292 | kmem_cache_destroy(pool->handle_cachep); | ||
293 | } | ||
294 | |||
295 | static unsigned long alloc_handle(struct zs_pool *pool) | ||
296 | { | ||
297 | return (unsigned long)kmem_cache_alloc(pool->handle_cachep, | ||
298 | pool->flags & ~__GFP_HIGHMEM); | ||
299 | } | ||
300 | |||
301 | static void free_handle(struct zs_pool *pool, unsigned long handle) | ||
302 | { | ||
303 | kmem_cache_free(pool->handle_cachep, (void *)handle); | ||
304 | } | ||
305 | |||
306 | static void record_obj(unsigned long handle, unsigned long obj) | ||
307 | { | ||
308 | *(unsigned long *)handle = obj; | ||
309 | } | ||
310 | |||
272 | /* zpool driver */ | 311 | /* zpool driver */ |
273 | 312 | ||
274 | #ifdef CONFIG_ZPOOL | 313 | #ifdef CONFIG_ZPOOL |
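Editor's note: these helpers make the handle a level of indirection, a small kmem_cache slot whose contents are the encoded object location, so the location can change under compaction while callers keep the same handle value. A usage sketch (illustration only; sketch_handle_roundtrip is a made-up name, and handle_to_obj() is the reader added further down in this patch):

    static int sketch_handle_roundtrip(struct zs_pool *pool, unsigned long obj)
    {
            unsigned long handle = alloc_handle(pool);

            if (!handle)
                    return -ENOMEM;

            record_obj(handle, obj);                /* slot now holds the location */
            WARN_ON(handle_to_obj(handle) != obj);  /* the indirection round-trips */
            free_handle(pool, handle);
            return 0;
    }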
@@ -346,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = { | |||
346 | MODULE_ALIAS("zpool-zsmalloc"); | 385 | MODULE_ALIAS("zpool-zsmalloc"); |
347 | #endif /* CONFIG_ZPOOL */ | 386 | #endif /* CONFIG_ZPOOL */ |
348 | 387 | ||
388 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) | ||
389 | { | ||
390 | return pages_per_zspage * PAGE_SIZE / size; | ||
391 | } | ||
392 | |||
349 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ | 393 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ |
350 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); | 394 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); |
351 | 395 | ||
@@ -396,9 +440,182 @@ static int get_size_class_index(int size) | |||
396 | idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, | 440 | idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, |
397 | ZS_SIZE_CLASS_DELTA); | 441 | ZS_SIZE_CLASS_DELTA); |
398 | 442 | ||
399 | return idx; | 443 | return min(zs_size_classes - 1, idx); |
444 | } | ||
445 | |||
446 | #ifdef CONFIG_ZSMALLOC_STAT | ||
447 | |||
448 | static inline void zs_stat_inc(struct size_class *class, | ||
449 | enum zs_stat_type type, unsigned long cnt) | ||
450 | { | ||
451 | class->stats.objs[type] += cnt; | ||
452 | } | ||
453 | |||
454 | static inline void zs_stat_dec(struct size_class *class, | ||
455 | enum zs_stat_type type, unsigned long cnt) | ||
456 | { | ||
457 | class->stats.objs[type] -= cnt; | ||
458 | } | ||
459 | |||
460 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
461 | enum zs_stat_type type) | ||
462 | { | ||
463 | return class->stats.objs[type]; | ||
464 | } | ||
465 | |||
466 | static int __init zs_stat_init(void) | ||
467 | { | ||
468 | if (!debugfs_initialized()) | ||
469 | return -ENODEV; | ||
470 | |||
471 | zs_stat_root = debugfs_create_dir("zsmalloc", NULL); | ||
472 | if (!zs_stat_root) | ||
473 | return -ENOMEM; | ||
474 | |||
475 | return 0; | ||
476 | } | ||
477 | |||
478 | static void __exit zs_stat_exit(void) | ||
479 | { | ||
480 | debugfs_remove_recursive(zs_stat_root); | ||
481 | } | ||
482 | |||
483 | static int zs_stats_size_show(struct seq_file *s, void *v) | ||
484 | { | ||
485 | int i; | ||
486 | struct zs_pool *pool = s->private; | ||
487 | struct size_class *class; | ||
488 | int objs_per_zspage; | ||
489 | unsigned long class_almost_full, class_almost_empty; | ||
490 | unsigned long obj_allocated, obj_used, pages_used; | ||
491 | unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; | ||
492 | unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; | ||
493 | |||
494 | seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", | ||
495 | "class", "size", "almost_full", "almost_empty", | ||
496 | "obj_allocated", "obj_used", "pages_used", | ||
497 | "pages_per_zspage"); | ||
498 | |||
499 | for (i = 0; i < zs_size_classes; i++) { | ||
500 | class = pool->size_class[i]; | ||
501 | |||
502 | if (class->index != i) | ||
503 | continue; | ||
504 | |||
505 | spin_lock(&class->lock); | ||
506 | class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); | ||
507 | class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); | ||
508 | obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); | ||
509 | obj_used = zs_stat_get(class, OBJ_USED); | ||
510 | spin_unlock(&class->lock); | ||
511 | |||
512 | objs_per_zspage = get_maxobj_per_zspage(class->size, | ||
513 | class->pages_per_zspage); | ||
514 | pages_used = obj_allocated / objs_per_zspage * | ||
515 | class->pages_per_zspage; | ||
516 | |||
517 | seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", | ||
518 | i, class->size, class_almost_full, class_almost_empty, | ||
519 | obj_allocated, obj_used, pages_used, | ||
520 | class->pages_per_zspage); | ||
521 | |||
522 | total_class_almost_full += class_almost_full; | ||
523 | total_class_almost_empty += class_almost_empty; | ||
524 | total_objs += obj_allocated; | ||
525 | total_used_objs += obj_used; | ||
526 | total_pages += pages_used; | ||
527 | } | ||
528 | |||
529 | seq_puts(s, "\n"); | ||
530 | seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", | ||
531 | "Total", "", total_class_almost_full, | ||
532 | total_class_almost_empty, total_objs, | ||
533 | total_used_objs, total_pages); | ||
534 | |||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | static int zs_stats_size_open(struct inode *inode, struct file *file) | ||
539 | { | ||
540 | return single_open(file, zs_stats_size_show, inode->i_private); | ||
541 | } | ||
542 | |||
543 | static const struct file_operations zs_stat_size_ops = { | ||
544 | .open = zs_stats_size_open, | ||
545 | .read = seq_read, | ||
546 | .llseek = seq_lseek, | ||
547 | .release = single_release, | ||
548 | }; | ||
549 | |||
550 | static int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
551 | { | ||
552 | struct dentry *entry; | ||
553 | |||
554 | if (!zs_stat_root) | ||
555 | return -ENODEV; | ||
556 | |||
557 | entry = debugfs_create_dir(name, zs_stat_root); | ||
558 | if (!entry) { | ||
559 | pr_warn("debugfs dir <%s> creation failed\n", name); | ||
560 | return -ENOMEM; | ||
561 | } | ||
562 | pool->stat_dentry = entry; | ||
563 | |||
564 | entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, | ||
565 | pool->stat_dentry, pool, &zs_stat_size_ops); | ||
566 | if (!entry) { | ||
567 | pr_warn("%s: debugfs file entry <%s> creation failed\n", | ||
568 | name, "classes"); | ||
569 | return -ENOMEM; | ||
570 | } | ||
571 | |||
572 | return 0; | ||
573 | } | ||
574 | |||
575 | static void zs_pool_stat_destroy(struct zs_pool *pool) | ||
576 | { | ||
577 | debugfs_remove_recursive(pool->stat_dentry); | ||
578 | } | ||
579 | |||
580 | #else /* CONFIG_ZSMALLOC_STAT */ | ||
581 | |||
582 | static inline void zs_stat_inc(struct size_class *class, | ||
583 | enum zs_stat_type type, unsigned long cnt) | ||
584 | { | ||
585 | } | ||
586 | |||
587 | static inline void zs_stat_dec(struct size_class *class, | ||
588 | enum zs_stat_type type, unsigned long cnt) | ||
589 | { | ||
590 | } | ||
591 | |||
592 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
593 | enum zs_stat_type type) | ||
594 | { | ||
595 | return 0; | ||
596 | } | ||
597 | |||
598 | static int __init zs_stat_init(void) | ||
599 | { | ||
600 | return 0; | ||
601 | } | ||
602 | |||
603 | static void __exit zs_stat_exit(void) | ||
604 | { | ||
605 | } | ||
606 | |||
607 | static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
608 | { | ||
609 | return 0; | ||
610 | } | ||
611 | |||
612 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | ||
613 | { | ||
400 | } | 614 | } |
401 | 615 | ||
616 | #endif | ||
617 | |||
618 | |||
402 | /* | 619 | /* |
403 | * For each size class, zspages are divided into different groups | 620 | * For each size class, zspages are divided into different groups |
404 | * depending on how "full" they are. This was done so that we could | 621 | * depending on how "full" they are. This was done so that we could |
@@ -419,7 +636,7 @@ static enum fullness_group get_fullness_group(struct page *page) | |||
419 | fg = ZS_EMPTY; | 636 | fg = ZS_EMPTY; |
420 | else if (inuse == max_objects) | 637 | else if (inuse == max_objects) |
421 | fg = ZS_FULL; | 638 | fg = ZS_FULL; |
422 | else if (inuse <= max_objects / fullness_threshold_frac) | 639 | else if (inuse <= 3 * max_objects / fullness_threshold_frac) |
423 | fg = ZS_ALMOST_EMPTY; | 640 | fg = ZS_ALMOST_EMPTY; |
424 | else | 641 | else |
425 | fg = ZS_ALMOST_FULL; | 642 | fg = ZS_ALMOST_FULL; |
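Editor's note: assuming fullness_threshold_frac is still 4 in this file, the change means a zspage counts as ZS_ALMOST_EMPTY while up to 3/4 of its objects are in use, instead of only up to 1/4, which gives the compaction code introduced later in this patch a much larger pool of source zspages to drain.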
@@ -448,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class, | |||
448 | list_add_tail(&page->lru, &(*head)->lru); | 665 | list_add_tail(&page->lru, &(*head)->lru); |
449 | 666 | ||
450 | *head = page; | 667 | *head = page; |
668 | zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? | ||
669 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); | ||
451 | } | 670 | } |
452 | 671 | ||
453 | /* | 672 | /* |
@@ -473,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class, | |||
473 | struct page, lru); | 692 | struct page, lru); |
474 | 693 | ||
475 | list_del_init(&page->lru); | 694 | list_del_init(&page->lru); |
695 | zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? | ||
696 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); | ||
476 | } | 697 | } |
477 | 698 | ||
478 | /* | 699 | /* |
@@ -484,11 +705,10 @@ static void remove_zspage(struct page *page, struct size_class *class, | |||
484 | * page from the freelist of the old fullness group to that of the new | 705 | * page from the freelist of the old fullness group to that of the new |
485 | * fullness group. | 706 | * fullness group. |
486 | */ | 707 | */ |
487 | static enum fullness_group fix_fullness_group(struct zs_pool *pool, | 708 | static enum fullness_group fix_fullness_group(struct size_class *class, |
488 | struct page *page) | 709 | struct page *page) |
489 | { | 710 | { |
490 | int class_idx; | 711 | int class_idx; |
491 | struct size_class *class; | ||
492 | enum fullness_group currfg, newfg; | 712 | enum fullness_group currfg, newfg; |
493 | 713 | ||
494 | BUG_ON(!is_first_page(page)); | 714 | BUG_ON(!is_first_page(page)); |
@@ -498,7 +718,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, | |||
498 | if (newfg == currfg) | 718 | if (newfg == currfg) |
499 | goto out; | 719 | goto out; |
500 | 720 | ||
501 | class = pool->size_class[class_idx]; | ||
502 | remove_zspage(page, class, currfg); | 721 | remove_zspage(page, class, currfg); |
503 | insert_zspage(page, class, newfg); | 722 | insert_zspage(page, class, newfg); |
504 | set_zspage_mapping(page, class_idx, newfg); | 723 | set_zspage_mapping(page, class_idx, newfg); |
@@ -512,7 +731,8 @@ out: | |||
512 | * to form a zspage for each size class. This is important | 731 | * to form a zspage for each size class. This is important |
513 | * to reduce wastage due to unusable space left at end of | 732 | * to reduce wastage due to unusable space left at end of |
514 | * each zspage which is given as: | 733 | * each zspage which is given as: |
515 | * wastage = Zp - Zp % size_class | 734 | * wastage = Zp % class_size |
735 | * usage = Zp - wastage | ||
516 | * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... | 736 | * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... |
517 | * | 737 | * |
518 | * For example, for size class of 3/8 * PAGE_SIZE, we should | 738 | * For example, for size class of 3/8 * PAGE_SIZE, we should |
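Editor's note putting numbers on the corrected formula, assuming 4 KiB pages: the 3/8 * PAGE_SIZE class has class_size = 1536, so a one-page zspage wastes 4096 % 1536 = 1024 bytes, a two-page zspage wastes 8192 % 1536 = 512, and a three-page zspage wastes 12288 % 1536 = 0, which is why k = 3 pages is the right zspage size for that class.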
@@ -571,35 +791,50 @@ static struct page *get_next_page(struct page *page) | |||
571 | 791 | ||
572 | /* | 792 | /* |
573 | * Encode <page, obj_idx> as a single handle value. | 793 | * Encode <page, obj_idx> as a single handle value. |
574 | * On hardware platforms with physical memory starting at 0x0 the pfn | 794 | * We use the least bit of handle for tagging. |
575 | * could be 0 so we ensure that the handle will never be 0 by adjusting the | ||
576 | * encoded obj_idx value before encoding. | ||
577 | */ | 795 | */ |
578 | static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) | 796 | static void *location_to_obj(struct page *page, unsigned long obj_idx) |
579 | { | 797 | { |
580 | unsigned long handle; | 798 | unsigned long obj; |
581 | 799 | ||
582 | if (!page) { | 800 | if (!page) { |
583 | BUG_ON(obj_idx); | 801 | BUG_ON(obj_idx); |
584 | return NULL; | 802 | return NULL; |
585 | } | 803 | } |
586 | 804 | ||
587 | handle = page_to_pfn(page) << OBJ_INDEX_BITS; | 805 | obj = page_to_pfn(page) << OBJ_INDEX_BITS; |
588 | handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); | 806 | obj |= ((obj_idx) & OBJ_INDEX_MASK); |
807 | obj <<= OBJ_TAG_BITS; | ||
589 | 808 | ||
590 | return (void *)handle; | 809 | return (void *)obj; |
591 | } | 810 | } |
592 | 811 | ||
593 | /* | 812 | /* |
594 | * Decode <page, obj_idx> pair from the given object handle. We adjust the | 813 | * Decode <page, obj_idx> pair from the given object handle. We adjust the |
595 | * decoded obj_idx back to its original value since it was adjusted in | 814 | * decoded obj_idx back to its original value since it was adjusted in |
596 | * obj_location_to_handle(). | 815 | * location_to_obj(). |
597 | */ | 816 | */ |
598 | static void obj_handle_to_location(unsigned long handle, struct page **page, | 817 | static void obj_to_location(unsigned long obj, struct page **page, |
599 | unsigned long *obj_idx) | 818 | unsigned long *obj_idx) |
600 | { | 819 | { |
601 | *page = pfn_to_page(handle >> OBJ_INDEX_BITS); | 820 | obj >>= OBJ_TAG_BITS; |
602 | *obj_idx = (handle & OBJ_INDEX_MASK) - 1; | 821 | *page = pfn_to_page(obj >> OBJ_INDEX_BITS); |
822 | *obj_idx = (obj & OBJ_INDEX_MASK); | ||
823 | } | ||
824 | |||
825 | static unsigned long handle_to_obj(unsigned long handle) | ||
826 | { | ||
827 | return *(unsigned long *)handle; | ||
828 | } | ||
829 | |||
830 | static unsigned long obj_to_head(struct size_class *class, struct page *page, | ||
831 | void *obj) | ||
832 | { | ||
833 | if (class->huge) { | ||
834 | VM_BUG_ON(!is_first_page(page)); | ||
835 | return *(unsigned long *)page_private(page); | ||
836 | } else | ||
837 | return *(unsigned long *)obj; | ||
603 | } | 838 | } |
604 | 839 | ||
605 | static unsigned long obj_idx_to_offset(struct page *page, | 840 | static unsigned long obj_idx_to_offset(struct page *page, |
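Editor's note: the new encode/decode pair round-trips as below. A standalone sketch (illustration only, not part of the patch; pfn and idx are illustrative names, the macros are the ones defined earlier in this file):

    static unsigned long sketch_encode(unsigned long pfn, unsigned long idx)
    {
            unsigned long obj = (pfn << OBJ_INDEX_BITS) | (idx & OBJ_INDEX_MASK);

            return obj << OBJ_TAG_BITS;     /* low bit left clear for tagging */
    }

    static void sketch_decode(unsigned long obj, unsigned long *pfn,
                              unsigned long *idx)
    {
            obj >>= OBJ_TAG_BITS;
            *pfn = obj >> OBJ_INDEX_BITS;
            *idx = obj & OBJ_INDEX_MASK;
    }

The +1 bias the old obj_location_to_handle() applied to avoid a zero handle is gone: the value returned to callers is now the handle pointer itself, which is never zero.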
@@ -613,6 +848,25 @@ static unsigned long obj_idx_to_offset(struct page *page, | |||
613 | return off + obj_idx * class_size; | 848 | return off + obj_idx * class_size; |
614 | } | 849 | } |
615 | 850 | ||
851 | static inline int trypin_tag(unsigned long handle) | ||
852 | { | ||
853 | unsigned long *ptr = (unsigned long *)handle; | ||
854 | |||
855 | return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); | ||
856 | } | ||
857 | |||
858 | static void pin_tag(unsigned long handle) | ||
859 | { | ||
860 | while (!trypin_tag(handle)); | ||
861 | } | ||
862 | |||
863 | static void unpin_tag(unsigned long handle) | ||
864 | { | ||
865 | unsigned long *ptr = (unsigned long *)handle; | ||
866 | |||
867 | clear_bit_unlock(HANDLE_PIN_BIT, ptr); | ||
868 | } | ||
869 | |||
616 | static void reset_page(struct page *page) | 870 | static void reset_page(struct page *page) |
617 | { | 871 | { |
618 | clear_bit(PG_private, &page->flags); | 872 | clear_bit(PG_private, &page->flags); |
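Editor's note: pin_tag()/unpin_tag() turn the low bit of the word stored in the handle slot into a per-object bit lock. A sketch of the intended calling pattern (illustration only; sketch_pinned_access is a made-up name):

    static void sketch_pinned_access(unsigned long handle)
    {
            unsigned long obj;

            pin_tag(handle);                /* spin until HANDLE_PIN_BIT is ours */
            obj = handle_to_obj(handle);    /* location is stable while pinned */
            (void)obj;                      /* ...map and use the object here... */
            unpin_tag(handle);              /* clear_bit_unlock() releases it */
    }

zs_map_object()/zs_unmap_object() below follow this pattern, which is what keeps migration from moving an object while it is mapped.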
@@ -674,7 +928,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
674 | link = (struct link_free *)vaddr + off / sizeof(*link); | 928 | link = (struct link_free *)vaddr + off / sizeof(*link); |
675 | 929 | ||
676 | while ((off += class->size) < PAGE_SIZE) { | 930 | while ((off += class->size) < PAGE_SIZE) { |
677 | link->next = obj_location_to_handle(page, i++); | 931 | link->next = location_to_obj(page, i++); |
678 | link += class->size / sizeof(*link); | 932 | link += class->size / sizeof(*link); |
679 | } | 933 | } |
680 | 934 | ||
@@ -684,7 +938,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
684 | * page (if present) | 938 | * page (if present) |
685 | */ | 939 | */ |
686 | next_page = get_next_page(page); | 940 | next_page = get_next_page(page); |
687 | link->next = obj_location_to_handle(next_page, 0); | 941 | link->next = location_to_obj(next_page, 0); |
688 | kunmap_atomic(vaddr); | 942 | kunmap_atomic(vaddr); |
689 | page = next_page; | 943 | page = next_page; |
690 | off %= PAGE_SIZE; | 944 | off %= PAGE_SIZE; |
@@ -738,7 +992,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) | |||
738 | 992 | ||
739 | init_zspage(first_page, class); | 993 | init_zspage(first_page, class); |
740 | 994 | ||
741 | first_page->freelist = obj_location_to_handle(first_page, 0); | 995 | first_page->freelist = location_to_obj(first_page, 0); |
742 | /* Maximum number of objects we can store in this zspage */ | 996 | /* Maximum number of objects we can store in this zspage */ |
743 | first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; | 997 | first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; |
744 | 998 | ||
@@ -860,12 +1114,19 @@ static void __zs_unmap_object(struct mapping_area *area, | |||
860 | { | 1114 | { |
861 | int sizes[2]; | 1115 | int sizes[2]; |
862 | void *addr; | 1116 | void *addr; |
863 | char *buf = area->vm_buf; | 1117 | char *buf; |
864 | 1118 | ||
865 | /* no write fastpath */ | 1119 | /* no write fastpath */ |
866 | if (area->vm_mm == ZS_MM_RO) | 1120 | if (area->vm_mm == ZS_MM_RO) |
867 | goto out; | 1121 | goto out; |
868 | 1122 | ||
1123 | buf = area->vm_buf; | ||
1124 | if (!area->huge) { | ||
1125 | buf = buf + ZS_HANDLE_SIZE; | ||
1126 | size -= ZS_HANDLE_SIZE; | ||
1127 | off += ZS_HANDLE_SIZE; | ||
1128 | } | ||
1129 | |||
869 | sizes[0] = PAGE_SIZE - off; | 1130 | sizes[0] = PAGE_SIZE - off; |
870 | sizes[1] = size - sizes[0]; | 1131 | sizes[1] = size - sizes[0]; |
871 | 1132 | ||
@@ -952,11 +1213,6 @@ static void init_zs_size_classes(void) | |||
952 | zs_size_classes = nr; | 1213 | zs_size_classes = nr; |
953 | } | 1214 | } |
954 | 1215 | ||
955 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) | ||
956 | { | ||
957 | return pages_per_zspage * PAGE_SIZE / size; | ||
958 | } | ||
959 | |||
960 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | 1216 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) |
961 | { | 1217 | { |
962 | if (prev->pages_per_zspage != pages_per_zspage) | 1218 | if (prev->pages_per_zspage != pages_per_zspage) |
@@ -969,166 +1225,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | |||
969 | return true; | 1225 | return true; |
970 | } | 1226 | } |
971 | 1227 | ||
972 | #ifdef CONFIG_ZSMALLOC_STAT | 1228 | static bool zspage_full(struct page *page) |
973 | |||
974 | static inline void zs_stat_inc(struct size_class *class, | ||
975 | enum zs_stat_type type, unsigned long cnt) | ||
976 | { | ||
977 | class->stats.objs[type] += cnt; | ||
978 | } | ||
979 | |||
980 | static inline void zs_stat_dec(struct size_class *class, | ||
981 | enum zs_stat_type type, unsigned long cnt) | ||
982 | { | ||
983 | class->stats.objs[type] -= cnt; | ||
984 | } | ||
985 | |||
986 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
987 | enum zs_stat_type type) | ||
988 | { | ||
989 | return class->stats.objs[type]; | ||
990 | } | ||
991 | |||
992 | static int __init zs_stat_init(void) | ||
993 | { | ||
994 | if (!debugfs_initialized()) | ||
995 | return -ENODEV; | ||
996 | |||
997 | zs_stat_root = debugfs_create_dir("zsmalloc", NULL); | ||
998 | if (!zs_stat_root) | ||
999 | return -ENOMEM; | ||
1000 | |||
1001 | return 0; | ||
1002 | } | ||
1003 | |||
1004 | static void __exit zs_stat_exit(void) | ||
1005 | { | ||
1006 | debugfs_remove_recursive(zs_stat_root); | ||
1007 | } | ||
1008 | |||
1009 | static int zs_stats_size_show(struct seq_file *s, void *v) | ||
1010 | { | 1229 | { |
1011 | int i; | 1230 | BUG_ON(!is_first_page(page)); |
1012 | struct zs_pool *pool = s->private; | ||
1013 | struct size_class *class; | ||
1014 | int objs_per_zspage; | ||
1015 | unsigned long obj_allocated, obj_used, pages_used; | ||
1016 | unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; | ||
1017 | |||
1018 | seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", | ||
1019 | "obj_allocated", "obj_used", "pages_used"); | ||
1020 | |||
1021 | for (i = 0; i < zs_size_classes; i++) { | ||
1022 | class = pool->size_class[i]; | ||
1023 | |||
1024 | if (class->index != i) | ||
1025 | continue; | ||
1026 | |||
1027 | spin_lock(&class->lock); | ||
1028 | obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); | ||
1029 | obj_used = zs_stat_get(class, OBJ_USED); | ||
1030 | spin_unlock(&class->lock); | ||
1031 | |||
1032 | objs_per_zspage = get_maxobj_per_zspage(class->size, | ||
1033 | class->pages_per_zspage); | ||
1034 | pages_used = obj_allocated / objs_per_zspage * | ||
1035 | class->pages_per_zspage; | ||
1036 | |||
1037 | seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, | ||
1038 | class->size, obj_allocated, obj_used, pages_used); | ||
1039 | |||
1040 | total_objs += obj_allocated; | ||
1041 | total_used_objs += obj_used; | ||
1042 | total_pages += pages_used; | ||
1043 | } | ||
1044 | |||
1045 | seq_puts(s, "\n"); | ||
1046 | seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", | ||
1047 | total_objs, total_used_objs, total_pages); | ||
1048 | |||
1049 | return 0; | ||
1050 | } | ||
1051 | |||
1052 | static int zs_stats_size_open(struct inode *inode, struct file *file) | ||
1053 | { | ||
1054 | return single_open(file, zs_stats_size_show, inode->i_private); | ||
1055 | } | ||
1056 | |||
1057 | static const struct file_operations zs_stat_size_ops = { | ||
1058 | .open = zs_stats_size_open, | ||
1059 | .read = seq_read, | ||
1060 | .llseek = seq_lseek, | ||
1061 | .release = single_release, | ||
1062 | }; | ||
1063 | |||
1064 | static int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
1065 | { | ||
1066 | struct dentry *entry; | ||
1067 | |||
1068 | if (!zs_stat_root) | ||
1069 | return -ENODEV; | ||
1070 | |||
1071 | entry = debugfs_create_dir(name, zs_stat_root); | ||
1072 | if (!entry) { | ||
1073 | pr_warn("debugfs dir <%s> creation failed\n", name); | ||
1074 | return -ENOMEM; | ||
1075 | } | ||
1076 | pool->stat_dentry = entry; | ||
1077 | |||
1078 | entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, | ||
1079 | pool->stat_dentry, pool, &zs_stat_size_ops); | ||
1080 | if (!entry) { | ||
1081 | pr_warn("%s: debugfs file entry <%s> creation failed\n", | ||
1082 | name, "obj_in_classes"); | ||
1083 | return -ENOMEM; | ||
1084 | } | ||
1085 | |||
1086 | return 0; | ||
1087 | } | ||
1088 | |||
1089 | static void zs_pool_stat_destroy(struct zs_pool *pool) | ||
1090 | { | ||
1091 | debugfs_remove_recursive(pool->stat_dentry); | ||
1092 | } | ||
1093 | |||
1094 | #else /* CONFIG_ZSMALLOC_STAT */ | ||
1095 | |||
1096 | static inline void zs_stat_inc(struct size_class *class, | ||
1097 | enum zs_stat_type type, unsigned long cnt) | ||
1098 | { | ||
1099 | } | ||
1100 | |||
1101 | static inline void zs_stat_dec(struct size_class *class, | ||
1102 | enum zs_stat_type type, unsigned long cnt) | ||
1103 | { | ||
1104 | } | ||
1105 | |||
1106 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
1107 | enum zs_stat_type type) | ||
1108 | { | ||
1109 | return 0; | ||
1110 | } | ||
1111 | |||
1112 | static int __init zs_stat_init(void) | ||
1113 | { | ||
1114 | return 0; | ||
1115 | } | ||
1116 | |||
1117 | static void __exit zs_stat_exit(void) | ||
1118 | { | ||
1119 | } | ||
1120 | |||
1121 | static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
1122 | { | ||
1123 | return 0; | ||
1124 | } | ||
1125 | 1231 | ||
1126 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | 1232 | return page->inuse == page->objects; |
1127 | { | ||
1128 | } | 1233 | } |
1129 | 1234 | ||
1130 | #endif | ||
1131 | |||
1132 | unsigned long zs_get_total_pages(struct zs_pool *pool) | 1235 | unsigned long zs_get_total_pages(struct zs_pool *pool) |
1133 | { | 1236 | { |
1134 | return atomic_long_read(&pool->pages_allocated); | 1237 | return atomic_long_read(&pool->pages_allocated); |
@@ -1153,13 +1256,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
1153 | enum zs_mapmode mm) | 1256 | enum zs_mapmode mm) |
1154 | { | 1257 | { |
1155 | struct page *page; | 1258 | struct page *page; |
1156 | unsigned long obj_idx, off; | 1259 | unsigned long obj, obj_idx, off; |
1157 | 1260 | ||
1158 | unsigned int class_idx; | 1261 | unsigned int class_idx; |
1159 | enum fullness_group fg; | 1262 | enum fullness_group fg; |
1160 | struct size_class *class; | 1263 | struct size_class *class; |
1161 | struct mapping_area *area; | 1264 | struct mapping_area *area; |
1162 | struct page *pages[2]; | 1265 | struct page *pages[2]; |
1266 | void *ret; | ||
1163 | 1267 | ||
1164 | BUG_ON(!handle); | 1268 | BUG_ON(!handle); |
1165 | 1269 | ||
@@ -1170,7 +1274,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
1170 | */ | 1274 | */ |
1171 | BUG_ON(in_interrupt()); | 1275 | BUG_ON(in_interrupt()); |
1172 | 1276 | ||
1173 | obj_handle_to_location(handle, &page, &obj_idx); | 1277 | /* From now on, migration cannot move the object */ |
1278 | pin_tag(handle); | ||
1279 | |||
1280 | obj = handle_to_obj(handle); | ||
1281 | obj_to_location(obj, &page, &obj_idx); | ||
1174 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1282 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
1175 | class = pool->size_class[class_idx]; | 1283 | class = pool->size_class[class_idx]; |
1176 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1284 | off = obj_idx_to_offset(page, obj_idx, class->size); |
@@ -1180,7 +1288,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
1180 | if (off + class->size <= PAGE_SIZE) { | 1288 | if (off + class->size <= PAGE_SIZE) { |
1181 | /* this object is contained entirely within a page */ | 1289 | /* this object is contained entirely within a page */ |
1182 | area->vm_addr = kmap_atomic(page); | 1290 | area->vm_addr = kmap_atomic(page); |
1183 | return area->vm_addr + off; | 1291 | ret = area->vm_addr + off; |
1292 | goto out; | ||
1184 | } | 1293 | } |
1185 | 1294 | ||
1186 | /* this object spans two pages */ | 1295 | /* this object spans two pages */ |
@@ -1188,14 +1297,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
1188 | pages[1] = get_next_page(page); | 1297 | pages[1] = get_next_page(page); |
1189 | BUG_ON(!pages[1]); | 1298 | BUG_ON(!pages[1]); |
1190 | 1299 | ||
1191 | return __zs_map_object(area, pages, off, class->size); | 1300 | ret = __zs_map_object(area, pages, off, class->size); |
1301 | out: | ||
1302 | if (!class->huge) | ||
1303 | ret += ZS_HANDLE_SIZE; | ||
1304 | |||
1305 | return ret; | ||
1192 | } | 1306 | } |
1193 | EXPORT_SYMBOL_GPL(zs_map_object); | 1307 | EXPORT_SYMBOL_GPL(zs_map_object); |
1194 | 1308 | ||
1195 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | 1309 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle) |
1196 | { | 1310 | { |
1197 | struct page *page; | 1311 | struct page *page; |
1198 | unsigned long obj_idx, off; | 1312 | unsigned long obj, obj_idx, off; |
1199 | 1313 | ||
1200 | unsigned int class_idx; | 1314 | unsigned int class_idx; |
1201 | enum fullness_group fg; | 1315 | enum fullness_group fg; |
@@ -1204,7 +1318,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1204 | 1318 | ||
1205 | BUG_ON(!handle); | 1319 | BUG_ON(!handle); |
1206 | 1320 | ||
1207 | obj_handle_to_location(handle, &page, &obj_idx); | 1321 | obj = handle_to_obj(handle); |
1322 | obj_to_location(obj, &page, &obj_idx); | ||
1208 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1323 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
1209 | class = pool->size_class[class_idx]; | 1324 | class = pool->size_class[class_idx]; |
1210 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1325 | off = obj_idx_to_offset(page, obj_idx, class->size); |
@@ -1222,9 +1337,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1222 | __zs_unmap_object(area, pages, off, class->size); | 1337 | __zs_unmap_object(area, pages, off, class->size); |
1223 | } | 1338 | } |
1224 | put_cpu_var(zs_map_area); | 1339 | put_cpu_var(zs_map_area); |
1340 | unpin_tag(handle); | ||
1225 | } | 1341 | } |
1226 | EXPORT_SYMBOL_GPL(zs_unmap_object); | 1342 | EXPORT_SYMBOL_GPL(zs_unmap_object); |
1227 | 1343 | ||
1344 | static unsigned long obj_malloc(struct page *first_page, | ||
1345 | struct size_class *class, unsigned long handle) | ||
1346 | { | ||
1347 | unsigned long obj; | ||
1348 | struct link_free *link; | ||
1349 | |||
1350 | struct page *m_page; | ||
1351 | unsigned long m_objidx, m_offset; | ||
1352 | void *vaddr; | ||
1353 | |||
1354 | handle |= OBJ_ALLOCATED_TAG; | ||
1355 | obj = (unsigned long)first_page->freelist; | ||
1356 | obj_to_location(obj, &m_page, &m_objidx); | ||
1357 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | ||
1358 | |||
1359 | vaddr = kmap_atomic(m_page); | ||
1360 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); | ||
1361 | first_page->freelist = link->next; | ||
1362 | if (!class->huge) | ||
1363 | /* record handle in the header of allocated chunk */ | ||
1364 | link->handle = handle; | ||
1365 | else | ||
1366 | /* record handle in first_page->private */ | ||
1367 | set_page_private(first_page, handle); | ||
1368 | kunmap_atomic(vaddr); | ||
1369 | first_page->inuse++; | ||
1370 | zs_stat_inc(class, OBJ_USED, 1); | ||
1371 | |||
1372 | return obj; | ||
1373 | } | ||
1374 | |||
1375 | |||
1228 | /** | 1376 | /** |
1229 | * zs_malloc - Allocate block of given size from pool. | 1377 | * zs_malloc - Allocate block of given size from pool. |
1230 | * @pool: pool to allocate from | 1378 | * @pool: pool to allocate from |
@@ -1236,17 +1384,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object); | |||
1236 | */ | 1384 | */ |
1237 | unsigned long zs_malloc(struct zs_pool *pool, size_t size) | 1385 | unsigned long zs_malloc(struct zs_pool *pool, size_t size) |
1238 | { | 1386 | { |
1239 | unsigned long obj; | 1387 | unsigned long handle, obj; |
1240 | struct link_free *link; | ||
1241 | struct size_class *class; | 1388 | struct size_class *class; |
1242 | void *vaddr; | 1389 | struct page *first_page; |
1243 | |||
1244 | struct page *first_page, *m_page; | ||
1245 | unsigned long m_objidx, m_offset; | ||
1246 | 1390 | ||
1247 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) | 1391 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) |
1248 | return 0; | 1392 | return 0; |
1249 | 1393 | ||
1394 | handle = alloc_handle(pool); | ||
1395 | if (!handle) | ||
1396 | return 0; | ||
1397 | |||
1398 | /* extra space in chunk to keep the handle */ | ||
1399 | size += ZS_HANDLE_SIZE; | ||
1250 | class = pool->size_class[get_size_class_index(size)]; | 1400 | class = pool->size_class[get_size_class_index(size)]; |
1251 | 1401 | ||
1252 | spin_lock(&class->lock); | 1402 | spin_lock(&class->lock); |
@@ -1255,8 +1405,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1255 | if (!first_page) { | 1405 | if (!first_page) { |
1256 | spin_unlock(&class->lock); | 1406 | spin_unlock(&class->lock); |
1257 | first_page = alloc_zspage(class, pool->flags); | 1407 | first_page = alloc_zspage(class, pool->flags); |
1258 | if (unlikely(!first_page)) | 1408 | if (unlikely(!first_page)) { |
1409 | free_handle(pool, handle); | ||
1259 | return 0; | 1410 | return 0; |
1411 | } | ||
1260 | 1412 | ||
1261 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | 1413 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); |
1262 | atomic_long_add(class->pages_per_zspage, | 1414 | atomic_long_add(class->pages_per_zspage, |
@@ -1267,73 +1419,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1267 | class->size, class->pages_per_zspage)); | 1419 | class->size, class->pages_per_zspage)); |
1268 | } | 1420 | } |
1269 | 1421 | ||
1270 | obj = (unsigned long)first_page->freelist; | 1422 | obj = obj_malloc(first_page, class, handle); |
1271 | obj_handle_to_location(obj, &m_page, &m_objidx); | ||
1272 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | ||
1273 | |||
1274 | vaddr = kmap_atomic(m_page); | ||
1275 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); | ||
1276 | first_page->freelist = link->next; | ||
1277 | memset(link, POISON_INUSE, sizeof(*link)); | ||
1278 | kunmap_atomic(vaddr); | ||
1279 | |||
1280 | first_page->inuse++; | ||
1281 | zs_stat_inc(class, OBJ_USED, 1); | ||
1282 | /* Now move the zspage to another fullness group, if required */ | 1423 | /* Now move the zspage to another fullness group, if required */ |
1283 | fix_fullness_group(pool, first_page); | 1424 | fix_fullness_group(class, first_page); |
1425 | record_obj(handle, obj); | ||
1284 | spin_unlock(&class->lock); | 1426 | spin_unlock(&class->lock); |
1285 | 1427 | ||
1286 | return obj; | 1428 | return handle; |
1287 | } | 1429 | } |
1288 | EXPORT_SYMBOL_GPL(zs_malloc); | 1430 | EXPORT_SYMBOL_GPL(zs_malloc); |
1289 | 1431 | ||
1290 | void zs_free(struct zs_pool *pool, unsigned long obj) | 1432 | static void obj_free(struct zs_pool *pool, struct size_class *class, |
1433 | unsigned long obj) | ||
1291 | { | 1434 | { |
1292 | struct link_free *link; | 1435 | struct link_free *link; |
1293 | struct page *first_page, *f_page; | 1436 | struct page *first_page, *f_page; |
1294 | unsigned long f_objidx, f_offset; | 1437 | unsigned long f_objidx, f_offset; |
1295 | void *vaddr; | 1438 | void *vaddr; |
1296 | |||
1297 | int class_idx; | 1439 | int class_idx; |
1298 | struct size_class *class; | ||
1299 | enum fullness_group fullness; | 1440 | enum fullness_group fullness; |
1300 | 1441 | ||
1301 | if (unlikely(!obj)) | 1442 | BUG_ON(!obj); |
1302 | return; | ||
1303 | 1443 | ||
1304 | obj_handle_to_location(obj, &f_page, &f_objidx); | 1444 | obj &= ~OBJ_ALLOCATED_TAG; |
1445 | obj_to_location(obj, &f_page, &f_objidx); | ||
1305 | first_page = get_first_page(f_page); | 1446 | first_page = get_first_page(f_page); |
1306 | 1447 | ||
1307 | get_zspage_mapping(first_page, &class_idx, &fullness); | 1448 | get_zspage_mapping(first_page, &class_idx, &fullness); |
1308 | class = pool->size_class[class_idx]; | ||
1309 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); | 1449 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); |
1310 | 1450 | ||
1311 | spin_lock(&class->lock); | 1451 | vaddr = kmap_atomic(f_page); |
1312 | 1452 | ||
1313 | /* Insert this object in containing zspage's freelist */ | 1453 | /* Insert this object in containing zspage's freelist */ |
1314 | vaddr = kmap_atomic(f_page); | ||
1315 | link = (struct link_free *)(vaddr + f_offset); | 1454 | link = (struct link_free *)(vaddr + f_offset); |
1316 | link->next = first_page->freelist; | 1455 | link->next = first_page->freelist; |
1456 | if (class->huge) | ||
1457 | set_page_private(first_page, 0); | ||
1317 | kunmap_atomic(vaddr); | 1458 | kunmap_atomic(vaddr); |
1318 | first_page->freelist = (void *)obj; | 1459 | first_page->freelist = (void *)obj; |
1319 | |||
1320 | first_page->inuse--; | 1460 | first_page->inuse--; |
1321 | fullness = fix_fullness_group(pool, first_page); | ||
1322 | |||
1323 | zs_stat_dec(class, OBJ_USED, 1); | 1461 | zs_stat_dec(class, OBJ_USED, 1); |
1324 | if (fullness == ZS_EMPTY) | 1462 | } |
1463 | |||
1464 | void zs_free(struct zs_pool *pool, unsigned long handle) | ||
1465 | { | ||
1466 | struct page *first_page, *f_page; | ||
1467 | unsigned long obj, f_objidx; | ||
1468 | int class_idx; | ||
1469 | struct size_class *class; | ||
1470 | enum fullness_group fullness; | ||
1471 | |||
1472 | if (unlikely(!handle)) | ||
1473 | return; | ||
1474 | |||
1475 | pin_tag(handle); | ||
1476 | obj = handle_to_obj(handle); | ||
1477 | obj_to_location(obj, &f_page, &f_objidx); | ||
1478 | first_page = get_first_page(f_page); | ||
1479 | |||
1480 | get_zspage_mapping(first_page, &class_idx, &fullness); | ||
1481 | class = pool->size_class[class_idx]; | ||
1482 | |||
1483 | spin_lock(&class->lock); | ||
1484 | obj_free(pool, class, obj); | ||
1485 | fullness = fix_fullness_group(class, first_page); | ||
1486 | if (fullness == ZS_EMPTY) { | ||
1325 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | 1487 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( |
1326 | class->size, class->pages_per_zspage)); | 1488 | class->size, class->pages_per_zspage)); |
1327 | 1489 | atomic_long_sub(class->pages_per_zspage, | |
1490 | &pool->pages_allocated); | ||
1491 | free_zspage(first_page); | ||
1492 | } | ||
1328 | spin_unlock(&class->lock); | 1493 | spin_unlock(&class->lock); |
1494 | unpin_tag(handle); | ||
1495 | |||
1496 | free_handle(pool, handle); | ||
1497 | } | ||
1498 | EXPORT_SYMBOL_GPL(zs_free); | ||
1499 | |||
1500 | static void zs_object_copy(unsigned long src, unsigned long dst, | ||
1501 | struct size_class *class) | ||
1502 | { | ||
1503 | struct page *s_page, *d_page; | ||
1504 | unsigned long s_objidx, d_objidx; | ||
1505 | unsigned long s_off, d_off; | ||
1506 | void *s_addr, *d_addr; | ||
1507 | int s_size, d_size, size; | ||
1508 | int written = 0; | ||
1509 | |||
1510 | s_size = d_size = class->size; | ||
1511 | |||
1512 | obj_to_location(src, &s_page, &s_objidx); | ||
1513 | obj_to_location(dst, &d_page, &d_objidx); | ||
1514 | |||
1515 | s_off = obj_idx_to_offset(s_page, s_objidx, class->size); | ||
1516 | d_off = obj_idx_to_offset(d_page, d_objidx, class->size); | ||
1517 | |||
1518 | if (s_off + class->size > PAGE_SIZE) | ||
1519 | s_size = PAGE_SIZE - s_off; | ||
1520 | |||
1521 | if (d_off + class->size > PAGE_SIZE) | ||
1522 | d_size = PAGE_SIZE - d_off; | ||
1523 | |||
1524 | s_addr = kmap_atomic(s_page); | ||
1525 | d_addr = kmap_atomic(d_page); | ||
1526 | |||
1527 | while (1) { | ||
1528 | size = min(s_size, d_size); | ||
1529 | memcpy(d_addr + d_off, s_addr + s_off, size); | ||
1530 | written += size; | ||
1531 | |||
1532 | if (written == class->size) | ||
1533 | break; | ||
1534 | |||
1535 | s_off += size; | ||
1536 | s_size -= size; | ||
1537 | d_off += size; | ||
1538 | d_size -= size; | ||
1539 | |||
1540 | if (s_off >= PAGE_SIZE) { | ||
1541 | kunmap_atomic(d_addr); | ||
1542 | kunmap_atomic(s_addr); | ||
1543 | s_page = get_next_page(s_page); | ||
1544 | BUG_ON(!s_page); | ||
1545 | s_addr = kmap_atomic(s_page); | ||
1546 | d_addr = kmap_atomic(d_page); | ||
1547 | s_size = class->size - written; | ||
1548 | s_off = 0; | ||
1549 | } | ||
1550 | |||
1551 | if (d_off >= PAGE_SIZE) { | ||
1552 | kunmap_atomic(d_addr); | ||
1553 | d_page = get_next_page(d_page); | ||
1554 | BUG_ON(!d_page); | ||
1555 | d_addr = kmap_atomic(d_page); | ||
1556 | d_size = class->size - written; | ||
1557 | d_off = 0; | ||
1558 | } | ||
1559 | } | ||
1560 | |||
1561 | kunmap_atomic(d_addr); | ||
1562 | kunmap_atomic(s_addr); | ||
1563 | } | ||
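The subtle case zs_object_copy() handles is an object that straddles a page boundary on either the source or the destination side, where the copy proceeds in chunks and remaps as each page runs out. A standalone sketch of the same two-chunk arithmetic, using a flat buffer in place of kmap_atomic()'d pages (all sizes and offsets below are made up for illustration):

#include <assert.h>
#include <string.h>

#define PAGE_SIZE 4096

int main(void)
{
	char src[2 * PAGE_SIZE], dst[2 * PAGE_SIZE];
	size_t obj_size = 3264;	/* hypothetical class->size */
	size_t s_off = 2048;	/* source object starts near the end of page 0 */
	size_t d_off = 512;	/* destination object fits inside one page */
	size_t first = PAGE_SIZE - s_off;	/* 2048 bytes left in page 0 */

	memset(src + s_off, 0xaa, obj_size);

	/* chunk 1: copy up to the source page boundary */
	memcpy(dst + d_off, src + s_off, first);
	/* chunk 2: copy the remainder from the start of the next page */
	memcpy(dst + d_off + first, src + PAGE_SIZE, obj_size - first);

	assert(memcmp(dst + d_off, src + s_off, obj_size) == 0);
	return 0;
}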
1564 | |||
1565 | /* | ||
1566 | * Find an allocated object in the zspage, starting from the given | ||
1567 | * index, and return its handle. | ||
1568 | */ | ||
1569 | static unsigned long find_alloced_obj(struct page *page, int index, | ||
1570 | struct size_class *class) | ||
1571 | { | ||
1572 | unsigned long head; | ||
1573 | int offset = 0; | ||
1574 | unsigned long handle = 0; | ||
1575 | void *addr = kmap_atomic(page); | ||
1576 | |||
1577 | if (!is_first_page(page)) | ||
1578 | offset = page->index; | ||
1579 | offset += class->size * index; | ||
1580 | |||
1581 | while (offset < PAGE_SIZE) { | ||
1582 | head = obj_to_head(class, page, addr + offset); | ||
1583 | if (head & OBJ_ALLOCATED_TAG) { | ||
1584 | handle = head & ~OBJ_ALLOCATED_TAG; | ||
1585 | if (trypin_tag(handle)) | ||
1586 | break; | ||
1587 | handle = 0; | ||
1588 | } | ||
1589 | |||
1590 | offset += class->size; | ||
1591 | index++; | ||
1592 | } | ||
1593 | |||
1594 | kunmap_atomic(addr); | ||
1595 | return handle; | ||
1596 | } | ||
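find_alloced_obj() tells live objects apart from free slots by a tag bit (OBJ_ALLOCATED_TAG) kept in the low bit of the stored head word, which works because handles are at least word aligned. A tiny standalone sketch of that convention (the tag value of 1 and the helper names are assumptions for illustration, not taken from the patch):

#include <assert.h>

#define OBJ_ALLOCATED_TAG 1UL	/* assumed: low bit marks an allocated object */

/* store a handle with the allocated tag set */
static unsigned long tag_handle(unsigned long handle)
{
	return handle | OBJ_ALLOCATED_TAG;
}

/* recover the handle, or 0 if the slot holds a free-list link (tag clear) */
static unsigned long untag_head(unsigned long head)
{
	if (head & OBJ_ALLOCATED_TAG)
		return head & ~OBJ_ALLOCATED_TAG;
	return 0;
}

int main(void)
{
	unsigned long handle = 0x1000;	/* hypothetical, word-aligned value */
	unsigned long head = tag_handle(handle);

	assert(untag_head(head) == handle);
	assert(untag_head(0x2000) == 0);	/* untagged word: not allocated */
	return 0;
}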
1597 | |||
1598 | struct zs_compact_control { | ||
1599 | /* Source page for migration; may be a subpage of a zspage. */ | ||
1600 | struct page *s_page; | ||
1601 | /* Destination page for migration; should be the first page | ||
1602 | * of a zspage. */ | ||
1603 | struct page *d_page; | ||
1604 | /* Starting object index within @s_page from which to look for | ||
1605 | * live objects in the subpage. */ | ||
1606 | int index; | ||
1607 | /* number of objects migrated */ | ||
1608 | int nr_migrated; | ||
1609 | }; | ||
1610 | |||
1611 | static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | ||
1612 | struct zs_compact_control *cc) | ||
1613 | { | ||
1614 | unsigned long used_obj, free_obj; | ||
1615 | unsigned long handle; | ||
1616 | struct page *s_page = cc->s_page; | ||
1617 | struct page *d_page = cc->d_page; | ||
1618 | unsigned long index = cc->index; | ||
1619 | int nr_migrated = 0; | ||
1620 | int ret = 0; | ||
1621 | |||
1622 | while (1) { | ||
1623 | handle = find_alloced_obj(s_page, index, class); | ||
1624 | if (!handle) { | ||
1625 | s_page = get_next_page(s_page); | ||
1626 | if (!s_page) | ||
1627 | break; | ||
1628 | index = 0; | ||
1629 | continue; | ||
1630 | } | ||
1631 | |||
1632 | /* Stop if there is no more space */ | ||
1633 | if (zspage_full(d_page)) { | ||
1634 | unpin_tag(handle); | ||
1635 | ret = -ENOMEM; | ||
1636 | break; | ||
1637 | } | ||
1638 | |||
1639 | used_obj = handle_to_obj(handle); | ||
1640 | free_obj = obj_malloc(d_page, class, handle); | ||
1641 | zs_object_copy(used_obj, free_obj, class); | ||
1642 | index++; | ||
1643 | record_obj(handle, free_obj); | ||
1644 | unpin_tag(handle); | ||
1645 | obj_free(pool, class, used_obj); | ||
1646 | nr_migrated++; | ||
1647 | } | ||
1648 | |||
1649 | /* Remember last position in this iteration */ | ||
1650 | cc->s_page = s_page; | ||
1651 | cc->index = index; | ||
1652 | cc->nr_migrated = nr_migrated; | ||
1653 | |||
1654 | return ret; | ||
1655 | } | ||
1656 | |||
1657 | static struct page *alloc_target_page(struct size_class *class) | ||
1658 | { | ||
1659 | int i; | ||
1660 | struct page *page; | ||
1661 | |||
1662 | for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { | ||
1663 | page = class->fullness_list[i]; | ||
1664 | if (page) { | ||
1665 | remove_zspage(page, class, i); | ||
1666 | break; | ||
1667 | } | ||
1668 | } | ||
1669 | |||
1670 | return page; | ||
1671 | } | ||
1672 | |||
1673 | static void putback_zspage(struct zs_pool *pool, struct size_class *class, | ||
1674 | struct page *first_page) | ||
1675 | { | ||
1676 | enum fullness_group fullness; | ||
1677 | |||
1678 | BUG_ON(!is_first_page(first_page)); | ||
1679 | |||
1680 | fullness = get_fullness_group(first_page); | ||
1681 | insert_zspage(first_page, class, fullness); | ||
1682 | set_zspage_mapping(first_page, class->index, fullness); | ||
1329 | 1683 | ||
1330 | if (fullness == ZS_EMPTY) { | 1684 | if (fullness == ZS_EMPTY) { |
1685 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
1686 | class->size, class->pages_per_zspage)); | ||
1331 | atomic_long_sub(class->pages_per_zspage, | 1687 | atomic_long_sub(class->pages_per_zspage, |
1332 | &pool->pages_allocated); | 1688 | &pool->pages_allocated); |
1689 | |||
1333 | free_zspage(first_page); | 1690 | free_zspage(first_page); |
1334 | } | 1691 | } |
1335 | } | 1692 | } |
1336 | EXPORT_SYMBOL_GPL(zs_free); | 1693 | |
1694 | static struct page *isolate_source_page(struct size_class *class) | ||
1695 | { | ||
1696 | struct page *page; | ||
1697 | |||
1698 | page = class->fullness_list[ZS_ALMOST_EMPTY]; | ||
1699 | if (page) | ||
1700 | remove_zspage(page, class, ZS_ALMOST_EMPTY); | ||
1701 | |||
1702 | return page; | ||
1703 | } | ||
1704 | |||
1705 | static unsigned long __zs_compact(struct zs_pool *pool, | ||
1706 | struct size_class *class) | ||
1707 | { | ||
1708 | int nr_to_migrate; | ||
1709 | struct zs_compact_control cc; | ||
1710 | struct page *src_page; | ||
1711 | struct page *dst_page = NULL; | ||
1712 | unsigned long nr_total_migrated = 0; | ||
1713 | |||
1714 | spin_lock(&class->lock); | ||
1715 | while ((src_page = isolate_source_page(class))) { | ||
1716 | |||
1717 | BUG_ON(!is_first_page(src_page)); | ||
1718 | |||
1719 | /* The goal is to migrate all live objects in the source page */ | ||
1720 | nr_to_migrate = src_page->inuse; | ||
1721 | cc.index = 0; | ||
1722 | cc.s_page = src_page; | ||
1723 | |||
1724 | while ((dst_page = alloc_target_page(class))) { | ||
1725 | cc.d_page = dst_page; | ||
1726 | /* | ||
1727 | * If there is no more space in dst_page, pick another | ||
1728 | * target zspage from the fullness lists. | ||
1729 | */ | ||
1730 | if (!migrate_zspage(pool, class, &cc)) | ||
1731 | break; | ||
1732 | |||
1733 | putback_zspage(pool, class, dst_page); | ||
1734 | nr_total_migrated += cc.nr_migrated; | ||
1735 | nr_to_migrate -= cc.nr_migrated; | ||
1736 | } | ||
1737 | |||
1738 | /* Stop if we couldn't find a target page */ | ||
1739 | if (dst_page == NULL) | ||
1740 | break; | ||
1741 | |||
1742 | putback_zspage(pool, class, dst_page); | ||
1743 | putback_zspage(pool, class, src_page); | ||
1744 | spin_unlock(&class->lock); | ||
1745 | nr_total_migrated += cc.nr_migrated; | ||
1746 | cond_resched(); | ||
1747 | spin_lock(&class->lock); | ||
1748 | } | ||
1749 | |||
1750 | if (src_page) | ||
1751 | putback_zspage(pool, class, src_page); | ||
1752 | |||
1753 | spin_unlock(&class->lock); | ||
1754 | |||
1755 | return nr_total_migrated; | ||
1756 | } | ||
1757 | |||
1758 | unsigned long zs_compact(struct zs_pool *pool) | ||
1759 | { | ||
1760 | int i; | ||
1761 | unsigned long nr_migrated = 0; | ||
1762 | struct size_class *class; | ||
1763 | |||
1764 | for (i = zs_size_classes - 1; i >= 0; i--) { | ||
1765 | class = pool->size_class[i]; | ||
1766 | if (!class) | ||
1767 | continue; | ||
1768 | if (class->index != i) | ||
1769 | continue; | ||
1770 | nr_migrated += __zs_compact(pool, class); | ||
1771 | } | ||
1772 | |||
1773 | return nr_migrated; | ||
1774 | } | ||
1775 | EXPORT_SYMBOL_GPL(zs_compact); | ||
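zs_compact() simply loops over every populated size class, compacting each one under its class lock, and reports the total number of objects it moved; when and how often to run it is left to the pool's user. A hypothetical caller might look like this (the function name and message are illustrative, not from this series):

#include <linux/printk.h>
#include <linux/zsmalloc.h>

/* Illustrative only: run one compaction pass and note how much it did. */
static void try_compact(struct zs_pool *pool)
{
	unsigned long migrated = zs_compact(pool);

	if (migrated)
		pr_debug("zsmalloc: compaction migrated %lu objects\n", migrated);
}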
1337 | 1776 | ||
1338 | /** | 1777 | /** |
1339 | * zs_create_pool - Creates an allocation pool to work from. | 1778 | * zs_create_pool - Creates an allocation pool to work from. |
@@ -1355,20 +1794,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) | |||
1355 | if (!pool) | 1794 | if (!pool) |
1356 | return NULL; | 1795 | return NULL; |
1357 | 1796 | ||
1358 | pool->name = kstrdup(name, GFP_KERNEL); | ||
1359 | if (!pool->name) { | ||
1360 | kfree(pool); | ||
1361 | return NULL; | ||
1362 | } | ||
1363 | |||
1364 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), | 1797 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
1365 | GFP_KERNEL); | 1798 | GFP_KERNEL); |
1366 | if (!pool->size_class) { | 1799 | if (!pool->size_class) { |
1367 | kfree(pool->name); | ||
1368 | kfree(pool); | 1800 | kfree(pool); |
1369 | return NULL; | 1801 | return NULL; |
1370 | } | 1802 | } |
1371 | 1803 | ||
1804 | pool->name = kstrdup(name, GFP_KERNEL); | ||
1805 | if (!pool->name) | ||
1806 | goto err; | ||
1807 | |||
1808 | if (create_handle_cache(pool)) | ||
1809 | goto err; | ||
1810 | |||
1372 | /* | 1811 | /* |
1373 | * Iterate in reverse, because the size of the size_class that we want | 1812 | * Iterate in reverse, because the size of the size_class that we want |
1374 | * to use for merging should be larger than or equal to the current size. | 1813 | * to use for merging should be larger than or equal to the current size. |
@@ -1406,6 +1845,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) | |||
1406 | class->size = size; | 1845 | class->size = size; |
1407 | class->index = i; | 1846 | class->index = i; |
1408 | class->pages_per_zspage = pages_per_zspage; | 1847 | class->pages_per_zspage = pages_per_zspage; |
1848 | if (pages_per_zspage == 1 && | ||
1849 | get_maxobj_per_zspage(size, pages_per_zspage) == 1) | ||
1850 | class->huge = true; | ||
1409 | spin_lock_init(&class->lock); | 1851 | spin_lock_init(&class->lock); |
1410 | pool->size_class[i] = class; | 1852 | pool->size_class[i] = class; |
1411 | 1853 | ||
@@ -1450,6 +1892,7 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
1450 | kfree(class); | 1892 | kfree(class); |
1451 | } | 1893 | } |
1452 | 1894 | ||
1895 | destroy_handle_cache(pool); | ||
1453 | kfree(pool->size_class); | 1896 | kfree(pool->size_class); |
1454 | kfree(pool->name); | 1897 | kfree(pool->name); |
1455 | kfree(pool); | 1898 | kfree(pool); |
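Taken together, zs_create_pool() now allocates the size-class array first and funnels failures of the name copy and the new handle cache through a single err label, while zs_destroy_pool() tears the same pieces down, handle cache included. A minimal create/destroy pairing as a module might do it (the pool name and GFP flags are only examples):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/zsmalloc.h>

static struct zs_pool *example_pool;

/* Illustrative only: create a pool at init time, destroy it on exit. */
static int __init example_init(void)
{
	example_pool = zs_create_pool("example", GFP_NOIO | __GFP_HIGHMEM);
	if (!example_pool)
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	zs_destroy_pool(example_pool);
}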