author     Linus Torvalds <torvalds@linux-foundation.org>    2015-04-15 19:39:15 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2015-04-15 19:39:15 -0400
commit     eea3a00264cf243a28e4331566ce67b86059339d (patch)
tree       487f16389e0dfa32e9caa7604d1274a7dcda8f04 /mm
parent     e7c82412433a8039616c7314533a0a1c025d99bf (diff)
parent     e693d73c20ffdb06840c9378f367bad849ac0d5d (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patchbomb from Andrew Morton:

 - the rest of MM
 - various misc bits
 - add ability to run /sbin/reboot at reboot time
 - printk/vsprintf changes
 - fiddle with seq_printf() return value

* akpm: (114 commits)
  parisc: remove use of seq_printf return value
  lru_cache: remove use of seq_printf return value
  tracing: remove use of seq_printf return value
  cgroup: remove use of seq_printf return value
  proc: remove use of seq_printf return value
  s390: remove use of seq_printf return value
  cris fasttimer: remove use of seq_printf return value
  cris: remove use of seq_printf return value
  openrisc: remove use of seq_printf return value
  ARM: plat-pxa: remove use of seq_printf return value
  nios2: cpuinfo: remove use of seq_printf return value
  microblaze: mb: remove use of seq_printf return value
  ipc: remove use of seq_printf return value
  rtc: remove use of seq_printf return value
  power: wakeup: remove use of seq_printf return value
  x86: mtrr: if: remove use of seq_printf return value
  linux/bitmap.h: improve BITMAP_{LAST,FIRST}_WORD_MASK
  MAINTAINERS: CREDITS: remove Stefano Brivio from B43
  .mailmap: add Ricardo Ribalda
  CREDITS: add Ricardo Ribalda Delgado
  ...
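A large share of the 114 commits apply one mechanical conversion: callers stop using the value returned by seq_printf(). As a rough illustration of the pattern (a hedged sketch, not a hunk from this merge; "demo_show" and "demo_value" are made-up names), a converted ->show() callback ends up calling seq_printf() for its side effect only and returning 0, with seq_has_overflowed() available for the rare caller that still needs to detect buffer overflow:

#include <linux/seq_file.h>

/*
 * Illustrative sketch only -- not taken from this merge.  The old form
 * was "return seq_printf(m, ...);"; the converted form ignores the
 * return value and reports success unconditionally.
 */
static int demo_show(struct seq_file *m, void *v)
{
	int demo_value = 42;	/* stand-in for whatever the file exports */

	seq_printf(m, "value: %d\n", demo_value);

	return 0;
}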
Diffstat (limited to 'mm')
-rw-r--r--   mm/cma.c              |    5
-rw-r--r--   mm/cma_debug.c        |   41
-rw-r--r--   mm/compaction.c       |   60
-rw-r--r--   mm/gup.c              |    4
-rw-r--r--   mm/huge_memory.c      |   86
-rw-r--r--   mm/hugetlb.c          |  234
-rw-r--r--   mm/internal.h         |    4
-rw-r--r--   mm/kasan/kasan.c      |   13
-rw-r--r--   mm/ksm.c              |   10
-rw-r--r--   mm/memblock.c         |   18
-rw-r--r--   mm/memcontrol.c       |   47
-rw-r--r--   mm/memory-failure.c   |  122
-rw-r--r--   mm/memory.c           |   56
-rw-r--r--   mm/memory_hotplug.c   |    2
-rw-r--r--   mm/mempool.c          |  117
-rw-r--r--   mm/migrate.c          |    3
-rw-r--r--   mm/mmap.c             |   21
-rw-r--r--   mm/mremap.c           |   25
-rw-r--r--   mm/oom_kill.c         |    2
-rw-r--r--   mm/page-writeback.c   |    3
-rw-r--r--   mm/page_alloc.c       |    6
-rw-r--r--   mm/rmap.c             |    6
-rw-r--r--   mm/slub.c             |    4
-rw-r--r--   mm/swap.c             |   34
-rw-r--r--   mm/swap_state.c       |    2
-rw-r--r--   mm/swapfile.c         |    2
-rw-r--r--   mm/truncate.c         |    2
-rw-r--r--   mm/util.c             |   41
-rw-r--r--   mm/vmalloc.c          |   95
-rw-r--r--   mm/zsmalloc.c         |  971
30 files changed, 1453 insertions, 583 deletions
diff --git a/mm/cma.c b/mm/cma.c
index 47203faaf65e..3a7a67b93394 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -23,6 +23,7 @@
23# define DEBUG 23# define DEBUG
24#endif 24#endif
25#endif 25#endif
26#define CREATE_TRACE_POINTS
26 27
27#include <linux/memblock.h> 28#include <linux/memblock.h>
28#include <linux/err.h> 29#include <linux/err.h>
@@ -34,6 +35,7 @@
34#include <linux/cma.h> 35#include <linux/cma.h>
35#include <linux/highmem.h> 36#include <linux/highmem.h>
36#include <linux/io.h> 37#include <linux/io.h>
38#include <trace/events/cma.h>
37 39
38#include "cma.h" 40#include "cma.h"
39 41
@@ -414,6 +416,8 @@ struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
414 start = bitmap_no + mask + 1; 416 start = bitmap_no + mask + 1;
415 } 417 }
416 418
419 trace_cma_alloc(page ? pfn : -1UL, page, count, align);
420
417 pr_debug("%s(): returned %p\n", __func__, page); 421 pr_debug("%s(): returned %p\n", __func__, page);
418 return page; 422 return page;
419} 423}
@@ -446,6 +450,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
446 450
447 free_contig_range(pfn, count); 451 free_contig_range(pfn, count);
448 cma_clear_bitmap(cma, pfn, count); 452 cma_clear_bitmap(cma, pfn, count);
453 trace_cma_release(pfn, pages, count);
449 454
450 return true; 455 return true;
451} 456}
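The trace_cma_alloc()/trace_cma_release() calls above rely on the new <trace/events/cma.h> header pulled in via CREATE_TRACE_POINTS. For orientation, a tracepoint with that call signature could be declared roughly as follows; this is a sketch of the TRACE_EVENT shape, not a verbatim copy of the header added by this series, which may differ in field layout and format string:

#include <linux/tracepoint.h>
#include <linux/mm_types.h>

TRACE_EVENT(cma_alloc,

	TP_PROTO(unsigned long pfn, const struct page *page,
		 unsigned int count, unsigned int align),

	TP_ARGS(pfn, page, count, align),

	TP_STRUCT__entry(
		__field(unsigned long, pfn)
		__field(const struct page *, page)
		__field(unsigned int, count)
		__field(unsigned int, align)
	),

	TP_fast_assign(
		__entry->pfn = pfn;
		__entry->page = page;
		__entry->count = count;
		__entry->align = align;
	),

	TP_printk("pfn=%lx page=%p count=%u align=%u",
		  __entry->pfn, __entry->page,
		  __entry->count, __entry->align)
);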
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 0b377536ccde..7621ee34daa0 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -30,9 +30,44 @@ static int cma_debugfs_get(void *data, u64 *val)
30 30
31 return 0; 31 return 0;
32} 32}
33
34DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); 33DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
35 34
35static int cma_used_get(void *data, u64 *val)
36{
37 struct cma *cma = data;
38 unsigned long used;
39
40 mutex_lock(&cma->lock);
41 /* pages counter is smaller than sizeof(int) */
42 used = bitmap_weight(cma->bitmap, (int)cma->count);
43 mutex_unlock(&cma->lock);
44 *val = (u64)used << cma->order_per_bit;
45
46 return 0;
47}
48DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
49
50static int cma_maxchunk_get(void *data, u64 *val)
51{
52 struct cma *cma = data;
53 unsigned long maxchunk = 0;
54 unsigned long start, end = 0;
55
56 mutex_lock(&cma->lock);
57 for (;;) {
58 start = find_next_zero_bit(cma->bitmap, cma->count, end);
59 if (start >= cma->count)
60 break;
61 end = find_next_bit(cma->bitmap, cma->count, start);
62 maxchunk = max(end - start, maxchunk);
63 }
64 mutex_unlock(&cma->lock);
65 *val = (u64)maxchunk << cma->order_per_bit;
66
67 return 0;
68}
69DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
70
36static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) 71static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
37{ 72{
38 spin_lock(&cma->mem_head_lock); 73 spin_lock(&cma->mem_head_lock);
@@ -91,7 +126,6 @@ static int cma_free_write(void *data, u64 val)
91 126
92 return cma_free_mem(cma, pages); 127 return cma_free_mem(cma, pages);
93} 128}
94
95DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); 129DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
96 130
97static int cma_alloc_mem(struct cma *cma, int count) 131static int cma_alloc_mem(struct cma *cma, int count)
@@ -124,7 +158,6 @@ static int cma_alloc_write(void *data, u64 val)
124 158
125 return cma_alloc_mem(cma, pages); 159 return cma_alloc_mem(cma, pages);
126} 160}
127
128DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); 161DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
129 162
130static void cma_debugfs_add_one(struct cma *cma, int idx) 163static void cma_debugfs_add_one(struct cma *cma, int idx)
@@ -149,6 +182,8 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
149 &cma->count, &cma_debugfs_fops); 182 &cma->count, &cma_debugfs_fops);
150 debugfs_create_file("order_per_bit", S_IRUGO, tmp, 183 debugfs_create_file("order_per_bit", S_IRUGO, tmp,
151 &cma->order_per_bit, &cma_debugfs_fops); 184 &cma->order_per_bit, &cma_debugfs_fops);
185 debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops);
186 debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops);
152 187
153 u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); 188 u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
154 debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); 189 debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
diff --git a/mm/compaction.c b/mm/compaction.c
index a18201a8124e..018f08da99a2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc)
391 return false; 391 return false;
392} 392}
393 393
394/* Returns true if the page is within a block suitable for migration to */
395static bool suitable_migration_target(struct page *page)
396{
397 /* If the page is a large free page, then disallow migration */
398 if (PageBuddy(page)) {
399 /*
400 * We are checking page_order without zone->lock taken. But
401 * the only small danger is that we skip a potentially suitable
402 * pageblock, so it's not worth to check order for valid range.
403 */
404 if (page_order_unsafe(page) >= pageblock_order)
405 return false;
406 }
407
408 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
409 if (migrate_async_suitable(get_pageblock_migratetype(page)))
410 return true;
411
412 /* Otherwise skip the block */
413 return false;
414}
415
416/* 394/*
417 * Isolate free pages onto a private freelist. If @strict is true, will abort 395 * Isolate free pages onto a private freelist. If @strict is true, will abort
418 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock 396 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
@@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
896 874
897#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 875#endif /* CONFIG_COMPACTION || CONFIG_CMA */
898#ifdef CONFIG_COMPACTION 876#ifdef CONFIG_COMPACTION
877
878/* Returns true if the page is within a block suitable for migration to */
879static bool suitable_migration_target(struct page *page)
880{
881 /* If the page is a large free page, then disallow migration */
882 if (PageBuddy(page)) {
883 /*
884 * We are checking page_order without zone->lock taken. But
885 * the only small danger is that we skip a potentially suitable
886 * pageblock, so it's not worth to check order for valid range.
887 */
888 if (page_order_unsafe(page) >= pageblock_order)
889 return false;
890 }
891
892 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
893 if (migrate_async_suitable(get_pageblock_migratetype(page)))
894 return true;
895
896 /* Otherwise skip the block */
897 return false;
898}
899
899/* 900/*
900 * Based on information in the current compact_control, find blocks 901 * Based on information in the current compact_control, find blocks
901 * suitable for isolating free pages from and then isolate them. 902 * suitable for isolating free pages from and then isolate them.
@@ -1047,6 +1048,12 @@ typedef enum {
1047} isolate_migrate_t; 1048} isolate_migrate_t;
1048 1049
1049/* 1050/*
1051 * Allow userspace to control policy on scanning the unevictable LRU for
1052 * compactable pages.
1053 */
1054int sysctl_compact_unevictable_allowed __read_mostly = 1;
1055
1056/*
1050 * Isolate all pages that can be migrated from the first suitable block, 1057 * Isolate all pages that can be migrated from the first suitable block,
1051 * starting at the block pointed to by the migrate scanner pfn within 1058 * starting at the block pointed to by the migrate scanner pfn within
1052 * compact_control. 1059 * compact_control.
@@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1057 unsigned long low_pfn, end_pfn; 1064 unsigned long low_pfn, end_pfn;
1058 struct page *page; 1065 struct page *page;
1059 const isolate_mode_t isolate_mode = 1066 const isolate_mode_t isolate_mode =
1067 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1060 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); 1068 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1061 1069
1062 /* 1070 /*
@@ -1598,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1598 INIT_LIST_HEAD(&cc->freepages); 1606 INIT_LIST_HEAD(&cc->freepages);
1599 INIT_LIST_HEAD(&cc->migratepages); 1607 INIT_LIST_HEAD(&cc->migratepages);
1600 1608
1609 /*
1610 * When called via /proc/sys/vm/compact_memory
1611 * this makes sure we compact the whole zone regardless of
1612 * cached scanner positions.
1613 */
1614 if (cc->order == -1)
1615 __reset_isolation_suitable(zone);
1616
1601 if (cc->order == -1 || !compaction_deferred(zone, cc->order)) 1617 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
1602 compact_zone(zone, cc); 1618 compact_zone(zone, cc);
1603 1619
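The new sysctl_compact_unevictable_allowed knob is exposed to userspace as /proc/sys/vm/compact_unevictable_allowed; the ctl_table entry itself lives in kernel/sysctl.c and is therefore outside this mm-only diffstat. A plausible shape for that entry is sketched below; this is an assumption for illustration, and the handler and bounds in the real table may differ:

	/* Hypothetical kernel/sysctl.c entry -- not part of this diff. */
	{
		.procname	= "compact_unevictable_allowed",
		.data		= &sysctl_compact_unevictable_allowed,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,	/* assumes the usual 0/1 bounds */
		.extra2		= &one,
	},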
diff --git a/mm/gup.c b/mm/gup.c
index ca7b607ab671..6297f6bccfb1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1019,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1019 * 1019 *
1020 * for an example see gup_get_pte in arch/x86/mm/gup.c 1020 * for an example see gup_get_pte in arch/x86/mm/gup.c
1021 */ 1021 */
1022 pte_t pte = ACCESS_ONCE(*ptep); 1022 pte_t pte = READ_ONCE(*ptep);
1023 struct page *page; 1023 struct page *page;
1024 1024
1025 /* 1025 /*
@@ -1309,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1309 local_irq_save(flags); 1309 local_irq_save(flags);
1310 pgdp = pgd_offset(mm, addr); 1310 pgdp = pgd_offset(mm, addr);
1311 do { 1311 do {
1312 pgd_t pgd = ACCESS_ONCE(*pgdp); 1312 pgd_t pgd = READ_ONCE(*pgdp);
1313 1313
1314 next = pgd_addr_end(addr, end); 1314 next = pgd_addr_end(addr, end);
1315 if (pgd_none(pgd)) 1315 if (pgd_none(pgd))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3afb5cbe1312..078832cf3636 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
67 67
68static int khugepaged(void *none); 68static int khugepaged(void *none);
69static int khugepaged_slab_init(void); 69static int khugepaged_slab_init(void);
70static void khugepaged_slab_exit(void);
70 71
71#define MM_SLOTS_HASH_BITS 10 72#define MM_SLOTS_HASH_BITS 10
72static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 73static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void)
109 int nr_zones = 0; 110 int nr_zones = 0;
110 unsigned long recommended_min; 111 unsigned long recommended_min;
111 112
112 if (!khugepaged_enabled())
113 return 0;
114
115 for_each_populated_zone(zone) 113 for_each_populated_zone(zone)
116 nr_zones++; 114 nr_zones++;
117 115
@@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
143 setup_per_zone_wmarks(); 141 setup_per_zone_wmarks();
144 return 0; 142 return 0;
145} 143}
146late_initcall(set_recommended_min_free_kbytes);
147 144
148static int start_khugepaged(void) 145static int start_stop_khugepaged(void)
149{ 146{
150 int err = 0; 147 int err = 0;
151 if (khugepaged_enabled()) { 148 if (khugepaged_enabled()) {
@@ -156,6 +153,7 @@ static int start_khugepaged(void)
156 pr_err("khugepaged: kthread_run(khugepaged) failed\n"); 153 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
157 err = PTR_ERR(khugepaged_thread); 154 err = PTR_ERR(khugepaged_thread);
158 khugepaged_thread = NULL; 155 khugepaged_thread = NULL;
156 goto fail;
159 } 157 }
160 158
161 if (!list_empty(&khugepaged_scan.mm_head)) 159 if (!list_empty(&khugepaged_scan.mm_head))
@@ -166,7 +164,7 @@ static int start_khugepaged(void)
166 kthread_stop(khugepaged_thread); 164 kthread_stop(khugepaged_thread);
167 khugepaged_thread = NULL; 165 khugepaged_thread = NULL;
168 } 166 }
169 167fail:
170 return err; 168 return err;
171} 169}
172 170
@@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void)
183 struct page *zero_page; 181 struct page *zero_page;
184retry: 182retry:
185 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 183 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
186 return ACCESS_ONCE(huge_zero_page); 184 return READ_ONCE(huge_zero_page);
187 185
188 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
189 HPAGE_PMD_ORDER); 187 HPAGE_PMD_ORDER);
@@ -202,7 +200,7 @@ retry:
202 /* We take additional reference here. It will be put back by shrinker */ 200 /* We take additional reference here. It will be put back by shrinker */
203 atomic_set(&huge_zero_refcount, 2); 201 atomic_set(&huge_zero_refcount, 2);
204 preempt_enable(); 202 preempt_enable();
205 return ACCESS_ONCE(huge_zero_page); 203 return READ_ONCE(huge_zero_page);
206} 204}
207 205
208static void put_huge_zero_page(void) 206static void put_huge_zero_page(void)
@@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
300 int err; 298 int err;
301 299
302 mutex_lock(&khugepaged_mutex); 300 mutex_lock(&khugepaged_mutex);
303 err = start_khugepaged(); 301 err = start_stop_khugepaged();
304 mutex_unlock(&khugepaged_mutex); 302 mutex_unlock(&khugepaged_mutex);
305 303
306 if (err) 304 if (err)
@@ -634,27 +632,38 @@ static int __init hugepage_init(void)
634 632
635 err = hugepage_init_sysfs(&hugepage_kobj); 633 err = hugepage_init_sysfs(&hugepage_kobj);
636 if (err) 634 if (err)
637 return err; 635 goto err_sysfs;
638 636
639 err = khugepaged_slab_init(); 637 err = khugepaged_slab_init();
640 if (err) 638 if (err)
641 goto out; 639 goto err_slab;
642 640
643 register_shrinker(&huge_zero_page_shrinker); 641 err = register_shrinker(&huge_zero_page_shrinker);
642 if (err)
643 goto err_hzp_shrinker;
644 644
645 /* 645 /*
646 * By default disable transparent hugepages on smaller systems, 646 * By default disable transparent hugepages on smaller systems,
647 * where the extra memory used could hurt more than TLB overhead 647 * where the extra memory used could hurt more than TLB overhead
648 * is likely to save. The admin can still enable it through /sys. 648 * is likely to save. The admin can still enable it through /sys.
649 */ 649 */
650 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) 650 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
651 transparent_hugepage_flags = 0; 651 transparent_hugepage_flags = 0;
652 return 0;
653 }
652 654
653 start_khugepaged(); 655 err = start_stop_khugepaged();
656 if (err)
657 goto err_khugepaged;
654 658
655 return 0; 659 return 0;
656out: 660err_khugepaged:
661 unregister_shrinker(&huge_zero_page_shrinker);
662err_hzp_shrinker:
663 khugepaged_slab_exit();
664err_slab:
657 hugepage_exit_sysfs(hugepage_kobj); 665 hugepage_exit_sysfs(hugepage_kobj);
666err_sysfs:
658 return err; 667 return err;
659} 668}
660subsys_initcall(hugepage_init); 669subsys_initcall(hugepage_init);
@@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
708static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 717static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
709 struct vm_area_struct *vma, 718 struct vm_area_struct *vma,
710 unsigned long haddr, pmd_t *pmd, 719 unsigned long haddr, pmd_t *pmd,
711 struct page *page) 720 struct page *page, gfp_t gfp)
712{ 721{
713 struct mem_cgroup *memcg; 722 struct mem_cgroup *memcg;
714 pgtable_t pgtable; 723 pgtable_t pgtable;
@@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
716 725
717 VM_BUG_ON_PAGE(!PageCompound(page), page); 726 VM_BUG_ON_PAGE(!PageCompound(page), page);
718 727
719 if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) 728 if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
720 return VM_FAULT_OOM; 729 return VM_FAULT_OOM;
721 730
722 pgtable = pte_alloc_one(mm, haddr); 731 pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
822 count_vm_event(THP_FAULT_FALLBACK); 831 count_vm_event(THP_FAULT_FALLBACK);
823 return VM_FAULT_FALLBACK; 832 return VM_FAULT_FALLBACK;
824 } 833 }
825 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { 834 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
826 put_page(page); 835 put_page(page);
827 count_vm_event(THP_FAULT_FALLBACK); 836 count_vm_event(THP_FAULT_FALLBACK);
828 return VM_FAULT_FALLBACK; 837 return VM_FAULT_FALLBACK;
@@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1080 unsigned long haddr; 1089 unsigned long haddr;
1081 unsigned long mmun_start; /* For mmu_notifiers */ 1090 unsigned long mmun_start; /* For mmu_notifiers */
1082 unsigned long mmun_end; /* For mmu_notifiers */ 1091 unsigned long mmun_end; /* For mmu_notifiers */
1092 gfp_t huge_gfp; /* for allocation and charge */
1083 1093
1084 ptl = pmd_lockptr(mm, pmd); 1094 ptl = pmd_lockptr(mm, pmd);
1085 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1095 VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1106alloc: 1116alloc:
1107 if (transparent_hugepage_enabled(vma) && 1117 if (transparent_hugepage_enabled(vma) &&
1108 !transparent_hugepage_debug_cow()) { 1118 !transparent_hugepage_debug_cow()) {
1109 gfp_t gfp; 1119 huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1110 1120 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
1111 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1112 new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
1113 } else 1121 } else
1114 new_page = NULL; 1122 new_page = NULL;
1115 1123
@@ -1130,8 +1138,7 @@ alloc:
1130 goto out; 1138 goto out;
1131 } 1139 }
1132 1140
1133 if (unlikely(mem_cgroup_try_charge(new_page, mm, 1141 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
1134 GFP_TRANSHUGE, &memcg))) {
1135 put_page(new_page); 1142 put_page(new_page);
1136 if (page) { 1143 if (page) {
1137 split_huge_page(page); 1144 split_huge_page(page);
@@ -1976,6 +1983,11 @@ static int __init khugepaged_slab_init(void)
1976 return 0; 1983 return 0;
1977} 1984}
1978 1985
1986static void __init khugepaged_slab_exit(void)
1987{
1988 kmem_cache_destroy(mm_slot_cache);
1989}
1990
1979static inline struct mm_slot *alloc_mm_slot(void) 1991static inline struct mm_slot *alloc_mm_slot(void)
1980{ 1992{
1981 if (!mm_slot_cache) /* initialization failed */ 1993 if (!mm_slot_cache) /* initialization failed */
@@ -2323,19 +2335,13 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2323 return true; 2335 return true;
2324} 2336}
2325 2337
2326static struct page 2338static struct page *
2327*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2339khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2328 struct vm_area_struct *vma, unsigned long address, 2340 struct vm_area_struct *vma, unsigned long address,
2329 int node) 2341 int node)
2330{ 2342{
2331 gfp_t flags;
2332
2333 VM_BUG_ON_PAGE(*hpage, *hpage); 2343 VM_BUG_ON_PAGE(*hpage, *hpage);
2334 2344
2335 /* Only allocate from the target node */
2336 flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
2337 __GFP_THISNODE;
2338
2339 /* 2345 /*
2340 * Before allocating the hugepage, release the mmap_sem read lock. 2346 * Before allocating the hugepage, release the mmap_sem read lock.
2341 * The allocation can take potentially a long time if it involves 2347 * The allocation can take potentially a long time if it involves
@@ -2344,7 +2350,7 @@ static struct page
2344 */ 2350 */
2345 up_read(&mm->mmap_sem); 2351 up_read(&mm->mmap_sem);
2346 2352
2347 *hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER); 2353 *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
2348 if (unlikely(!*hpage)) { 2354 if (unlikely(!*hpage)) {
2349 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2355 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2350 *hpage = ERR_PTR(-ENOMEM); 2356 *hpage = ERR_PTR(-ENOMEM);
@@ -2397,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2397 return true; 2403 return true;
2398} 2404}
2399 2405
2400static struct page 2406static struct page *
2401*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2407khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2402 struct vm_area_struct *vma, unsigned long address, 2408 struct vm_area_struct *vma, unsigned long address,
2403 int node) 2409 int node)
2404{ 2410{
2405 up_read(&mm->mmap_sem); 2411 up_read(&mm->mmap_sem);
2406 VM_BUG_ON(!*hpage); 2412 VM_BUG_ON(!*hpage);
2413
2407 return *hpage; 2414 return *hpage;
2408} 2415}
2409#endif 2416#endif
@@ -2438,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm,
2438 struct mem_cgroup *memcg; 2445 struct mem_cgroup *memcg;
2439 unsigned long mmun_start; /* For mmu_notifiers */ 2446 unsigned long mmun_start; /* For mmu_notifiers */
2440 unsigned long mmun_end; /* For mmu_notifiers */ 2447 unsigned long mmun_end; /* For mmu_notifiers */
2448 gfp_t gfp;
2441 2449
2442 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2450 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2443 2451
2452 /* Only allocate from the target node */
2453 gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
2454 __GFP_THISNODE;
2455
2444 /* release the mmap_sem read lock. */ 2456 /* release the mmap_sem read lock. */
2445 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); 2457 new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
2446 if (!new_page) 2458 if (!new_page)
2447 return; 2459 return;
2448 2460
2449 if (unlikely(mem_cgroup_try_charge(new_page, mm, 2461 if (unlikely(mem_cgroup_try_charge(new_page, mm,
2450 GFP_TRANSHUGE, &memcg))) 2462 gfp, &memcg)))
2451 return; 2463 return;
2452 2464
2453 /* 2465 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8874c8ad55aa..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
61static int num_fault_mutexes; 61static int num_fault_mutexes;
62static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; 62static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
63 63
64/* Forward declaration */
65static int hugetlb_acct_memory(struct hstate *h, long delta);
66
64static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 67static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
65{ 68{
66 bool free = (spool->count == 0) && (spool->used_hpages == 0); 69 bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
68 spin_unlock(&spool->lock); 71 spin_unlock(&spool->lock);
69 72
70 /* If no pages are used, and no other handles to the subpool 73 /* If no pages are used, and no other handles to the subpool
71 remain, free the subpool the subpool remain */ 74 remain, give up any reservations based on minimum size and
72 if (free) 75 * free the subpool */
76 if (free) {
77 if (spool->min_hpages != -1)
78 hugetlb_acct_memory(spool->hstate,
79 -spool->min_hpages);
73 kfree(spool); 80 kfree(spool);
81 }
74} 82}
75 83
76struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) 84struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
85 long min_hpages)
77{ 86{
78 struct hugepage_subpool *spool; 87 struct hugepage_subpool *spool;
79 88
80 spool = kmalloc(sizeof(*spool), GFP_KERNEL); 89 spool = kzalloc(sizeof(*spool), GFP_KERNEL);
81 if (!spool) 90 if (!spool)
82 return NULL; 91 return NULL;
83 92
84 spin_lock_init(&spool->lock); 93 spin_lock_init(&spool->lock);
85 spool->count = 1; 94 spool->count = 1;
86 spool->max_hpages = nr_blocks; 95 spool->max_hpages = max_hpages;
87 spool->used_hpages = 0; 96 spool->hstate = h;
97 spool->min_hpages = min_hpages;
98
99 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
100 kfree(spool);
101 return NULL;
102 }
103 spool->rsv_hpages = min_hpages;
88 104
89 return spool; 105 return spool;
90} 106}
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
97 unlock_or_release_subpool(spool); 113 unlock_or_release_subpool(spool);
98} 114}
99 115
100static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, 116/*
117 * Subpool accounting for allocating and reserving pages.
118 * Return -ENOMEM if there are not enough resources to satisfy the
119 * request. Otherwise, return the number of pages by which the
120 * global pools must be adjusted (upward). The returned value may
121 * only be different than the passed value (delta) in the case where
122 * a subpool minimum size must be maintained.
123 */
124static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
101 long delta) 125 long delta)
102{ 126{
103 int ret = 0; 127 long ret = delta;
104 128
105 if (!spool) 129 if (!spool)
106 return 0; 130 return ret;
107 131
108 spin_lock(&spool->lock); 132 spin_lock(&spool->lock);
109 if ((spool->used_hpages + delta) <= spool->max_hpages) { 133
110 spool->used_hpages += delta; 134 if (spool->max_hpages != -1) { /* maximum size accounting */
111 } else { 135 if ((spool->used_hpages + delta) <= spool->max_hpages)
112 ret = -ENOMEM; 136 spool->used_hpages += delta;
137 else {
138 ret = -ENOMEM;
139 goto unlock_ret;
140 }
141 }
142
143 if (spool->min_hpages != -1) { /* minimum size accounting */
144 if (delta > spool->rsv_hpages) {
145 /*
146 * Asking for more reserves than those already taken on
147 * behalf of subpool. Return difference.
148 */
149 ret = delta - spool->rsv_hpages;
150 spool->rsv_hpages = 0;
151 } else {
152 ret = 0; /* reserves already accounted for */
153 spool->rsv_hpages -= delta;
154 }
113 } 155 }
114 spin_unlock(&spool->lock);
115 156
157unlock_ret:
158 spin_unlock(&spool->lock);
116 return ret; 159 return ret;
117} 160}
118 161
119static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, 162/*
163 * Subpool accounting for freeing and unreserving pages.
164 * Return the number of global page reservations that must be dropped.
165 * The return value may only be different than the passed value (delta)
166 * in the case where a subpool minimum size must be maintained.
167 */
168static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
120 long delta) 169 long delta)
121{ 170{
171 long ret = delta;
172
122 if (!spool) 173 if (!spool)
123 return; 174 return delta;
124 175
125 spin_lock(&spool->lock); 176 spin_lock(&spool->lock);
126 spool->used_hpages -= delta; 177
127 /* If hugetlbfs_put_super couldn't free spool due to 178 if (spool->max_hpages != -1) /* maximum size accounting */
128 * an outstanding quota reference, free it now. */ 179 spool->used_hpages -= delta;
180
181 if (spool->min_hpages != -1) { /* minimum size accounting */
182 if (spool->rsv_hpages + delta <= spool->min_hpages)
183 ret = 0;
184 else
185 ret = spool->rsv_hpages + delta - spool->min_hpages;
186
187 spool->rsv_hpages += delta;
188 if (spool->rsv_hpages > spool->min_hpages)
189 spool->rsv_hpages = spool->min_hpages;
190 }
191
192 /*
193 * If hugetlbfs_put_super couldn't free spool due to an outstanding
194 * quota reference, free it now.
195 */
129 unlock_or_release_subpool(spool); 196 unlock_or_release_subpool(spool);
197
198 return ret;
130} 199}
131 200
132static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 201static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
855 return NULL; 924 return NULL;
856} 925}
857 926
927/*
928 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
929 * to hstate->hugepage_activelist.)
930 *
931 * This function can be called for tail pages, but never returns true for them.
932 */
933bool page_huge_active(struct page *page)
934{
935 VM_BUG_ON_PAGE(!PageHuge(page), page);
936 return PageHead(page) && PagePrivate(&page[1]);
937}
938
939/* never called for tail page */
940static void set_page_huge_active(struct page *page)
941{
942 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
943 SetPagePrivate(&page[1]);
944}
945
946static void clear_page_huge_active(struct page *page)
947{
948 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
949 ClearPagePrivate(&page[1]);
950}
951
858void free_huge_page(struct page *page) 952void free_huge_page(struct page *page)
859{ 953{
860 /* 954 /*
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
874 restore_reserve = PagePrivate(page); 968 restore_reserve = PagePrivate(page);
875 ClearPagePrivate(page); 969 ClearPagePrivate(page);
876 970
971 /*
972 * A return code of zero implies that the subpool will be under its
973 * minimum size if the reservation is not restored after page is free.
974 * Therefore, force restore_reserve operation.
975 */
976 if (hugepage_subpool_put_pages(spool, 1) == 0)
977 restore_reserve = true;
978
877 spin_lock(&hugetlb_lock); 979 spin_lock(&hugetlb_lock);
980 clear_page_huge_active(page);
878 hugetlb_cgroup_uncharge_page(hstate_index(h), 981 hugetlb_cgroup_uncharge_page(hstate_index(h),
879 pages_per_huge_page(h), page); 982 pages_per_huge_page(h), page);
880 if (restore_reserve) 983 if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
891 enqueue_huge_page(h, page); 994 enqueue_huge_page(h, page);
892 } 995 }
893 spin_unlock(&hugetlb_lock); 996 spin_unlock(&hugetlb_lock);
894 hugepage_subpool_put_pages(spool, 1);
895} 997}
896 998
897static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 999static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1386,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1386 if (chg < 0) 1488 if (chg < 0)
1387 return ERR_PTR(-ENOMEM); 1489 return ERR_PTR(-ENOMEM);
1388 if (chg || avoid_reserve) 1490 if (chg || avoid_reserve)
1389 if (hugepage_subpool_get_pages(spool, 1)) 1491 if (hugepage_subpool_get_pages(spool, 1) < 0)
1390 return ERR_PTR(-ENOSPC); 1492 return ERR_PTR(-ENOSPC);
1391 1493
1392 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1494 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2454,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2454 struct resv_map *resv = vma_resv_map(vma); 2556 struct resv_map *resv = vma_resv_map(vma);
2455 struct hugepage_subpool *spool = subpool_vma(vma); 2557 struct hugepage_subpool *spool = subpool_vma(vma);
2456 unsigned long reserve, start, end; 2558 unsigned long reserve, start, end;
2559 long gbl_reserve;
2457 2560
2458 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2561 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2459 return; 2562 return;
@@ -2466,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2466 kref_put(&resv->refs, resv_map_release); 2569 kref_put(&resv->refs, resv_map_release);
2467 2570
2468 if (reserve) { 2571 if (reserve) {
2469 hugetlb_acct_memory(h, -reserve); 2572 /*
2470 hugepage_subpool_put_pages(spool, reserve); 2573 * Decrement reserve counts. The global reserve count may be
2574 * adjusted if the subpool has a minimum size.
2575 */
2576 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
2577 hugetlb_acct_memory(h, -gbl_reserve);
2471 } 2578 }
2472} 2579}
2473 2580
@@ -2891,6 +2998,7 @@ retry_avoidcopy:
2891 copy_user_huge_page(new_page, old_page, address, vma, 2998 copy_user_huge_page(new_page, old_page, address, vma,
2892 pages_per_huge_page(h)); 2999 pages_per_huge_page(h));
2893 __SetPageUptodate(new_page); 3000 __SetPageUptodate(new_page);
3001 set_page_huge_active(new_page);
2894 3002
2895 mmun_start = address & huge_page_mask(h); 3003 mmun_start = address & huge_page_mask(h);
2896 mmun_end = mmun_start + huge_page_size(h); 3004 mmun_end = mmun_start + huge_page_size(h);
@@ -3003,6 +3111,7 @@ retry:
3003 } 3111 }
3004 clear_huge_page(page, address, pages_per_huge_page(h)); 3112 clear_huge_page(page, address, pages_per_huge_page(h));
3005 __SetPageUptodate(page); 3113 __SetPageUptodate(page);
3114 set_page_huge_active(page);
3006 3115
3007 if (vma->vm_flags & VM_MAYSHARE) { 3116 if (vma->vm_flags & VM_MAYSHARE) {
3008 int err; 3117 int err;
@@ -3447,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
3447 struct hstate *h = hstate_inode(inode); 3556 struct hstate *h = hstate_inode(inode);
3448 struct hugepage_subpool *spool = subpool_inode(inode); 3557 struct hugepage_subpool *spool = subpool_inode(inode);
3449 struct resv_map *resv_map; 3558 struct resv_map *resv_map;
3559 long gbl_reserve;
3450 3560
3451 /* 3561 /*
3452 * Only apply hugepage reservation if asked. At fault time, an 3562 * Only apply hugepage reservation if asked. At fault time, an
@@ -3483,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
3483 goto out_err; 3593 goto out_err;
3484 } 3594 }
3485 3595
3486 /* There must be enough pages in the subpool for the mapping */ 3596 /*
3487 if (hugepage_subpool_get_pages(spool, chg)) { 3597 * There must be enough pages in the subpool for the mapping. If
3598 * the subpool has a minimum size, there may be some global
3599 * reservations already in place (gbl_reserve).
3600 */
3601 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
3602 if (gbl_reserve < 0) {
3488 ret = -ENOSPC; 3603 ret = -ENOSPC;
3489 goto out_err; 3604 goto out_err;
3490 } 3605 }
@@ -3493,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
3493 * Check enough hugepages are available for the reservation. 3608 * Check enough hugepages are available for the reservation.
3494 * Hand the pages back to the subpool if there are not 3609 * Hand the pages back to the subpool if there are not
3495 */ 3610 */
3496 ret = hugetlb_acct_memory(h, chg); 3611 ret = hugetlb_acct_memory(h, gbl_reserve);
3497 if (ret < 0) { 3612 if (ret < 0) {
3498 hugepage_subpool_put_pages(spool, chg); 3613 /* put back original number of pages, chg */
3614 (void)hugepage_subpool_put_pages(spool, chg);
3499 goto out_err; 3615 goto out_err;
3500 } 3616 }
3501 3617
@@ -3525,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3525 struct resv_map *resv_map = inode_resv_map(inode); 3641 struct resv_map *resv_map = inode_resv_map(inode);
3526 long chg = 0; 3642 long chg = 0;
3527 struct hugepage_subpool *spool = subpool_inode(inode); 3643 struct hugepage_subpool *spool = subpool_inode(inode);
3644 long gbl_reserve;
3528 3645
3529 if (resv_map) 3646 if (resv_map)
3530 chg = region_truncate(resv_map, offset); 3647 chg = region_truncate(resv_map, offset);
@@ -3532,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3532 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3649 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
3533 spin_unlock(&inode->i_lock); 3650 spin_unlock(&inode->i_lock);
3534 3651
3535 hugepage_subpool_put_pages(spool, (chg - freed)); 3652 /*
3536 hugetlb_acct_memory(h, -(chg - freed)); 3653 * If the subpool has a minimum size, the number of global
3654 * reservations to be released may be adjusted.
3655 */
3656 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
3657 hugetlb_acct_memory(h, -gbl_reserve);
3537} 3658}
3538 3659
3539#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 3660#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3775,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
3775 3896
3776#ifdef CONFIG_MEMORY_FAILURE 3897#ifdef CONFIG_MEMORY_FAILURE
3777 3898
3778/* Should be called in hugetlb_lock */
3779static int is_hugepage_on_freelist(struct page *hpage)
3780{
3781 struct page *page;
3782 struct page *tmp;
3783 struct hstate *h = page_hstate(hpage);
3784 int nid = page_to_nid(hpage);
3785
3786 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
3787 if (page == hpage)
3788 return 1;
3789 return 0;
3790}
3791
3792/* 3899/*
3793 * This function is called from memory failure code. 3900 * This function is called from memory failure code.
3794 * Assume the caller holds page lock of the head page. 3901 * Assume the caller holds page lock of the head page.
@@ -3800,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3800 int ret = -EBUSY; 3907 int ret = -EBUSY;
3801 3908
3802 spin_lock(&hugetlb_lock); 3909 spin_lock(&hugetlb_lock);
3803 if (is_hugepage_on_freelist(hpage)) { 3910 /*
3911 * Just checking !page_huge_active is not enough, because that could be
3912 * an isolated/hwpoisoned hugepage (which have >0 refcount).
3913 */
3914 if (!page_huge_active(hpage) && !page_count(hpage)) {
3804 /* 3915 /*
3805 * Hwpoisoned hugepage isn't linked to activelist or freelist, 3916 * Hwpoisoned hugepage isn't linked to activelist or freelist,
3806 * but dangling hpage->lru can trigger list-debug warnings 3917 * but dangling hpage->lru can trigger list-debug warnings
@@ -3820,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3820 3931
3821bool isolate_huge_page(struct page *page, struct list_head *list) 3932bool isolate_huge_page(struct page *page, struct list_head *list)
3822{ 3933{
3934 bool ret = true;
3935
3823 VM_BUG_ON_PAGE(!PageHead(page), page); 3936 VM_BUG_ON_PAGE(!PageHead(page), page);
3824 if (!get_page_unless_zero(page))
3825 return false;
3826 spin_lock(&hugetlb_lock); 3937 spin_lock(&hugetlb_lock);
3938 if (!page_huge_active(page) || !get_page_unless_zero(page)) {
3939 ret = false;
3940 goto unlock;
3941 }
3942 clear_page_huge_active(page);
3827 list_move_tail(&page->lru, list); 3943 list_move_tail(&page->lru, list);
3944unlock:
3828 spin_unlock(&hugetlb_lock); 3945 spin_unlock(&hugetlb_lock);
3829 return true; 3946 return ret;
3830} 3947}
3831 3948
3832void putback_active_hugepage(struct page *page) 3949void putback_active_hugepage(struct page *page)
3833{ 3950{
3834 VM_BUG_ON_PAGE(!PageHead(page), page); 3951 VM_BUG_ON_PAGE(!PageHead(page), page);
3835 spin_lock(&hugetlb_lock); 3952 spin_lock(&hugetlb_lock);
3953 set_page_huge_active(page);
3836 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 3954 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3837 spin_unlock(&hugetlb_lock); 3955 spin_unlock(&hugetlb_lock);
3838 put_page(page); 3956 put_page(page);
3839} 3957}
3840
3841bool is_hugepage_active(struct page *page)
3842{
3843 VM_BUG_ON_PAGE(!PageHuge(page), page);
3844 /*
3845 * This function can be called for a tail page because the caller,
3846 * scan_movable_pages, scans through a given pfn-range which typically
3847 * covers one memory block. In systems using gigantic hugepage (1GB
3848 * for x86_64,) a hugepage is larger than a memory block, and we don't
3849 * support migrating such large hugepages for now, so return false
3850 * when called for tail pages.
3851 */
3852 if (PageTail(page))
3853 return false;
3854 /*
3855 * Refcount of a hwpoisoned hugepages is 1, but they are not active,
3856 * so we should return false for them.
3857 */
3858 if (unlikely(PageHWPoison(page)))
3859 return false;
3860 return page_count(page) > 0;
3861}
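To make the new subpool return-value convention concrete, here is a worked example of the minimum-size accounting above, with made-up numbers (a subpool created with max_hpages == -1 and min_hpages == 10, so rsv_hpages starts at 10 and ten huge pages are reserved globally via hugetlb_acct_memory() at creation):

/*
 * Worked example -- illustrative numbers only, not from the patch.
 *
 *   hugepage_subpool_get_pages(spool, 3)  returns 0
 *       the request is covered by the subpool's own reserve;
 *       rsv_hpages drops to 7 and no extra global pages are needed
 *
 *   hugepage_subpool_get_pages(spool, 12) returns 5
 *       7 pages come from the remaining reserve (rsv_hpages -> 0),
 *       so the caller must account only 5 more pages globally
 *
 *   hugepage_subpool_put_pages(spool, 12) returns 2
 *       10 of the returned pages refill the reserve back up to
 *       min_hpages, so only the remaining 2 global reservations
 *       are dropped by the caller
 */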
diff --git a/mm/internal.h b/mm/internal.h
index edaab69a9c35..a25e359a4039 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -224,13 +224,13 @@ static inline unsigned long page_order(struct page *page)
224 * PageBuddy() should be checked first by the caller to minimize race window, 224 * PageBuddy() should be checked first by the caller to minimize race window,
225 * and invalid values must be handled gracefully. 225 * and invalid values must be handled gracefully.
226 * 226 *
227 * ACCESS_ONCE is used so that if the caller assigns the result into a local 227 * READ_ONCE is used so that if the caller assigns the result into a local
228 * variable and e.g. tests it for valid range before using, the compiler cannot 228 * variable and e.g. tests it for valid range before using, the compiler cannot
229 * decide to remove the variable and inline the page_private(page) multiple 229 * decide to remove the variable and inline the page_private(page) multiple
230 * times, potentially observing different values in the tests and the actual 230 * times, potentially observing different values in the tests and the actual
231 * use of the result. 231 * use of the result.
232 */ 232 */
233#define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) 233#define page_order_unsafe(page) READ_ONCE(page_private(page))
234 234
235static inline bool is_cow_mapping(vm_flags_t flags) 235static inline bool is_cow_mapping(vm_flags_t flags)
236{ 236{
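The comment above is why the ACCESS_ONCE() to READ_ONCE() conversions in this merge are not purely cosmetic: the macro forces a single load of page_private(page). A minimal sketch of the hazard it prevents follows; the caller and use_order() are hypothetical, added only to illustrate the comment:

/*
 * Hypothetical caller, for illustration only.  With a plain
 * page_private(page) read the compiler may legally reload the value
 * after the range check; READ_ONCE() pins it to one load, so the
 * value that is used is the value that was checked.
 */
static void demo_use_page_order(struct page *page)
{
	/* assumes PageBuddy(page) was just checked by the caller */
	unsigned long order = page_order_unsafe(page);	/* one READ_ONCE load */

	if (order < MAX_ORDER)
		use_order(order);	/* hypothetical consumer */
}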
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 936d81661c47..6c513a63ea84 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size)
389 kasan_kmalloc(page->slab_cache, object, size); 389 kasan_kmalloc(page->slab_cache, object, size);
390} 390}
391 391
392void kasan_kfree(void *ptr)
393{
394 struct page *page;
395
396 page = virt_to_head_page(ptr);
397
398 if (unlikely(!PageSlab(page)))
399 kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
400 KASAN_FREE_PAGE);
401 else
402 kasan_slab_free(page->slab_cache, ptr);
403}
404
392void kasan_kfree_large(const void *ptr) 405void kasan_kfree_large(const void *ptr)
393{ 406{
394 struct page *page = virt_to_page(ptr); 407 struct page *page = virt_to_page(ptr);
diff --git a/mm/ksm.c b/mm/ksm.c
index 4162dce2eb44..7ee101eaacdf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
542 expected_mapping = (void *)stable_node + 542 expected_mapping = (void *)stable_node +
543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
544again: 544again:
545 kpfn = ACCESS_ONCE(stable_node->kpfn); 545 kpfn = READ_ONCE(stable_node->kpfn);
546 page = pfn_to_page(kpfn); 546 page = pfn_to_page(kpfn);
547 547
548 /* 548 /*
@@ -551,7 +551,7 @@ again:
551 * but on Alpha we need to be more careful. 551 * but on Alpha we need to be more careful.
552 */ 552 */
553 smp_read_barrier_depends(); 553 smp_read_barrier_depends();
554 if (ACCESS_ONCE(page->mapping) != expected_mapping) 554 if (READ_ONCE(page->mapping) != expected_mapping)
555 goto stale; 555 goto stale;
556 556
557 /* 557 /*
@@ -577,14 +577,14 @@ again:
577 cpu_relax(); 577 cpu_relax();
578 } 578 }
579 579
580 if (ACCESS_ONCE(page->mapping) != expected_mapping) { 580 if (READ_ONCE(page->mapping) != expected_mapping) {
581 put_page(page); 581 put_page(page);
582 goto stale; 582 goto stale;
583 } 583 }
584 584
585 if (lock_it) { 585 if (lock_it) {
586 lock_page(page); 586 lock_page(page);
587 if (ACCESS_ONCE(page->mapping) != expected_mapping) { 587 if (READ_ONCE(page->mapping) != expected_mapping) {
588 unlock_page(page); 588 unlock_page(page);
589 put_page(page); 589 put_page(page);
590 goto stale; 590 goto stale;
@@ -600,7 +600,7 @@ stale:
600 * before checking whether node->kpfn has been changed. 600 * before checking whether node->kpfn has been changed.
601 */ 601 */
602 smp_rmb(); 602 smp_rmb();
603 if (ACCESS_ONCE(stable_node->kpfn) != kpfn) 603 if (READ_ONCE(stable_node->kpfn) != kpfn)
604 goto again; 604 goto again;
605 remove_node_from_stable_tree(stable_node); 605 remove_node_from_stable_tree(stable_node);
606 return NULL; 606 return NULL;
diff --git a/mm/memblock.c b/mm/memblock.c
index 3f37a0bca5d5..9318b567ed79 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -580,10 +580,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
580 return memblock_add_range(&memblock.memory, base, size, nid, 0); 580 return memblock_add_range(&memblock.memory, base, size, nid, 0);
581} 581}
582 582
583static int __init_memblock memblock_add_region(phys_addr_t base,
584 phys_addr_t size,
585 int nid,
586 unsigned long flags)
587{
588 struct memblock_type *_rgn = &memblock.memory;
589
590 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
591 (unsigned long long)base,
592 (unsigned long long)base + size - 1,
593 flags, (void *)_RET_IP_);
594
595 return memblock_add_range(_rgn, base, size, nid, flags);
596}
597
583int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 598int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
584{ 599{
585 return memblock_add_range(&memblock.memory, base, size, 600 return memblock_add_region(base, size, MAX_NUMNODES, 0);
586 MAX_NUMNODES, 0);
587} 601}
588 602
589/** 603/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c3f09b2dda5f..14c2f2017e37 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -259,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
259 * page cache and RSS per cgroup. We would eventually like to provide 259 * page cache and RSS per cgroup. We would eventually like to provide
260 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 260 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
261 * to help the administrator determine what knobs to tune. 261 * to help the administrator determine what knobs to tune.
262 *
263 * TODO: Add a water mark for the memory controller. Reclaim will begin when
264 * we hit the water mark. May be even add a low water mark, such that
265 * no reclaim occurs from a cgroup at it's low water mark, this is
266 * a feature that will be implemented much later in the future.
267 */ 262 */
268struct mem_cgroup { 263struct mem_cgroup {
269 struct cgroup_subsys_state css; 264 struct cgroup_subsys_state css;
@@ -460,6 +455,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
460 return memcg->css.id; 455 return memcg->css.id;
461} 456}
462 457
458/*
459 * A helper function to get mem_cgroup from ID. must be called under
460 * rcu_read_lock(). The caller is responsible for calling
461 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
462 * refcnt from swap can be called against removed memcg.)
463 */
463static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 464static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
464{ 465{
465 struct cgroup_subsys_state *css; 466 struct cgroup_subsys_state *css;
@@ -673,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
673static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 674static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
674{ 675{
675 unsigned long nr_pages = page_counter_read(&memcg->memory); 676 unsigned long nr_pages = page_counter_read(&memcg->memory);
676 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); 677 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
677 unsigned long excess = 0; 678 unsigned long excess = 0;
678 679
679 if (nr_pages > soft_limit) 680 if (nr_pages > soft_limit)
@@ -1041,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1041 goto out_unlock; 1042 goto out_unlock;
1042 1043
1043 do { 1044 do {
1044 pos = ACCESS_ONCE(iter->position); 1045 pos = READ_ONCE(iter->position);
1045 /* 1046 /*
1046 * A racing update may change the position and 1047 * A racing update may change the position and
1047 * put the last reference, hence css_tryget(), 1048 * put the last reference, hence css_tryget(),
@@ -1358,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1358 unsigned long limit; 1359 unsigned long limit;
1359 1360
1360 count = page_counter_read(&memcg->memory); 1361 count = page_counter_read(&memcg->memory);
1361 limit = ACCESS_ONCE(memcg->memory.limit); 1362 limit = READ_ONCE(memcg->memory.limit);
1362 if (count < limit) 1363 if (count < limit)
1363 margin = limit - count; 1364 margin = limit - count;
1364 1365
1365 if (do_swap_account) { 1366 if (do_swap_account) {
1366 count = page_counter_read(&memcg->memsw); 1367 count = page_counter_read(&memcg->memsw);
1367 limit = ACCESS_ONCE(memcg->memsw.limit); 1368 limit = READ_ONCE(memcg->memsw.limit);
1368 if (count <= limit) 1369 if (count <= limit)
1369 margin = min(margin, limit - count); 1370 margin = min(margin, limit - count);
1370 } 1371 }
@@ -2349,20 +2350,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2349} 2350}
2350 2351
2351/* 2352/*
2352 * A helper function to get mem_cgroup from ID. must be called under
2353 * rcu_read_lock(). The caller is responsible for calling
2354 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
2355 * refcnt from swap can be called against removed memcg.)
2356 */
2357static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2358{
2359 /* ID 0 is unused ID */
2360 if (!id)
2361 return NULL;
2362 return mem_cgroup_from_id(id);
2363}
2364
2365/*
2366 * try_get_mem_cgroup_from_page - look up page's memcg association 2353 * try_get_mem_cgroup_from_page - look up page's memcg association
2367 * @page: the page 2354 * @page: the page
2368 * 2355 *
@@ -2388,7 +2375,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2388 ent.val = page_private(page); 2375 ent.val = page_private(page);
2389 id = lookup_swap_cgroup_id(ent); 2376 id = lookup_swap_cgroup_id(ent);
2390 rcu_read_lock(); 2377 rcu_read_lock();
2391 memcg = mem_cgroup_lookup(id); 2378 memcg = mem_cgroup_from_id(id);
2392 if (memcg && !css_tryget_online(&memcg->css)) 2379 if (memcg && !css_tryget_online(&memcg->css))
2393 memcg = NULL; 2380 memcg = NULL;
2394 rcu_read_unlock(); 2381 rcu_read_unlock();
@@ -2650,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2650 return cachep; 2637 return cachep;
2651 2638
2652 memcg = get_mem_cgroup_from_mm(current->mm); 2639 memcg = get_mem_cgroup_from_mm(current->mm);
2653 kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); 2640 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2654 if (kmemcg_id < 0) 2641 if (kmemcg_id < 0)
2655 goto out; 2642 goto out;
2656 2643
@@ -5020,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5020 * tunable will only affect upcoming migrations, not the current one. 5007 * tunable will only affect upcoming migrations, not the current one.
5021 * So we need to save it, and keep it going. 5008 * So we need to save it, and keep it going.
5022 */ 5009 */
5023 move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); 5010 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5024 if (move_flags) { 5011 if (move_flags) {
5025 struct mm_struct *mm; 5012 struct mm_struct *mm;
5026 struct mem_cgroup *from = mem_cgroup_from_task(p); 5013 struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5254,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
5254static int memory_low_show(struct seq_file *m, void *v) 5241static int memory_low_show(struct seq_file *m, void *v)
5255{ 5242{
5256 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5243 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5257 unsigned long low = ACCESS_ONCE(memcg->low); 5244 unsigned long low = READ_ONCE(memcg->low);
5258 5245
5259 if (low == PAGE_COUNTER_MAX) 5246 if (low == PAGE_COUNTER_MAX)
5260 seq_puts(m, "max\n"); 5247 seq_puts(m, "max\n");
@@ -5284,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
5284static int memory_high_show(struct seq_file *m, void *v) 5271static int memory_high_show(struct seq_file *m, void *v)
5285{ 5272{
5286 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5273 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5287 unsigned long high = ACCESS_ONCE(memcg->high); 5274 unsigned long high = READ_ONCE(memcg->high);
5288 5275
5289 if (high == PAGE_COUNTER_MAX) 5276 if (high == PAGE_COUNTER_MAX)
5290 seq_puts(m, "max\n"); 5277 seq_puts(m, "max\n");
@@ -5314,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
5314static int memory_max_show(struct seq_file *m, void *v) 5301static int memory_max_show(struct seq_file *m, void *v)
5315{ 5302{
5316 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5303 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5317 unsigned long max = ACCESS_ONCE(memcg->memory.limit); 5304 unsigned long max = READ_ONCE(memcg->memory.limit);
5318 5305
5319 if (max == PAGE_COUNTER_MAX) 5306 if (max == PAGE_COUNTER_MAX)
5320 seq_puts(m, "max\n"); 5307 seq_puts(m, "max\n");
@@ -5869,7 +5856,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
5869 5856
5870 id = swap_cgroup_record(entry, 0); 5857 id = swap_cgroup_record(entry, 0);
5871 rcu_read_lock(); 5858 rcu_read_lock();
5872 memcg = mem_cgroup_lookup(id); 5859 memcg = mem_cgroup_from_id(id);
5873 if (memcg) { 5860 if (memcg) {
5874 if (!mem_cgroup_is_root(memcg)) 5861 if (!mem_cgroup_is_root(memcg))
5875 page_counter_uncharge(&memcg->memsw, 1); 5862 page_counter_uncharge(&memcg->memsw, 1);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d487f8dc6d39..d9359b770cd9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -521,6 +521,52 @@ static const char *action_name[] = {
521 [RECOVERED] = "Recovered", 521 [RECOVERED] = "Recovered",
522}; 522};
523 523
524enum action_page_type {
525 MSG_KERNEL,
526 MSG_KERNEL_HIGH_ORDER,
527 MSG_SLAB,
528 MSG_DIFFERENT_COMPOUND,
529 MSG_POISONED_HUGE,
530 MSG_HUGE,
531 MSG_FREE_HUGE,
532 MSG_UNMAP_FAILED,
533 MSG_DIRTY_SWAPCACHE,
534 MSG_CLEAN_SWAPCACHE,
535 MSG_DIRTY_MLOCKED_LRU,
536 MSG_CLEAN_MLOCKED_LRU,
537 MSG_DIRTY_UNEVICTABLE_LRU,
538 MSG_CLEAN_UNEVICTABLE_LRU,
539 MSG_DIRTY_LRU,
540 MSG_CLEAN_LRU,
541 MSG_TRUNCATED_LRU,
542 MSG_BUDDY,
543 MSG_BUDDY_2ND,
544 MSG_UNKNOWN,
545};
546
547static const char * const action_page_types[] = {
548 [MSG_KERNEL] = "reserved kernel page",
549 [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
550 [MSG_SLAB] = "kernel slab page",
551 [MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
552 [MSG_POISONED_HUGE] = "huge page already hardware poisoned",
553 [MSG_HUGE] = "huge page",
554 [MSG_FREE_HUGE] = "free huge page",
555 [MSG_UNMAP_FAILED] = "unmapping failed page",
556 [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
557 [MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
558 [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
559 [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
560 [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
561 [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
562 [MSG_DIRTY_LRU] = "dirty LRU page",
563 [MSG_CLEAN_LRU] = "clean LRU page",
564 [MSG_TRUNCATED_LRU] = "already truncated LRU page",
565 [MSG_BUDDY] = "free buddy page",
566 [MSG_BUDDY_2ND] = "free buddy page (2nd try)",
567 [MSG_UNKNOWN] = "unknown page",
568};
569
524/* 570/*
525 * XXX: It is possible that a page is isolated from LRU cache, 571 * XXX: It is possible that a page is isolated from LRU cache,
526 * and then kept in swap cache or failed to remove from page cache. 572 * and then kept in swap cache or failed to remove from page cache.
@@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
777static struct page_state { 823static struct page_state {
778 unsigned long mask; 824 unsigned long mask;
779 unsigned long res; 825 unsigned long res;
780 char *msg; 826 enum action_page_type type;
781 int (*action)(struct page *p, unsigned long pfn); 827 int (*action)(struct page *p, unsigned long pfn);
782} error_states[] = { 828} error_states[] = {
783 { reserved, reserved, "reserved kernel", me_kernel }, 829 { reserved, reserved, MSG_KERNEL, me_kernel },
784 /* 830 /*
785 * free pages are specially detected outside this table: 831 * free pages are specially detected outside this table:
786 * PG_buddy pages only make a small fraction of all free pages. 832 * PG_buddy pages only make a small fraction of all free pages.
@@ -791,31 +837,31 @@ static struct page_state {
791 * currently unused objects without touching them. But just 837 * currently unused objects without touching them. But just
792 * treat it as standard kernel for now. 838 * treat it as standard kernel for now.
793 */ 839 */
794 { slab, slab, "kernel slab", me_kernel }, 840 { slab, slab, MSG_SLAB, me_kernel },
795 841
796#ifdef CONFIG_PAGEFLAGS_EXTENDED 842#ifdef CONFIG_PAGEFLAGS_EXTENDED
797 { head, head, "huge", me_huge_page }, 843 { head, head, MSG_HUGE, me_huge_page },
798 { tail, tail, "huge", me_huge_page }, 844 { tail, tail, MSG_HUGE, me_huge_page },
799#else 845#else
800 { compound, compound, "huge", me_huge_page }, 846 { compound, compound, MSG_HUGE, me_huge_page },
801#endif 847#endif
802 848
803 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, 849 { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
804 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 850 { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
805 851
806 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 852 { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
807 { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, 853 { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
808 854
809 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, 855 { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
810 { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, 856 { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
811 857
812 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 858 { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty },
813 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 859 { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean },
814 860
815 /* 861 /*
816 * Catchall entry: must be at end. 862 * Catchall entry: must be at end.
817 */ 863 */
818 { 0, 0, "unknown page state", me_unknown }, 864 { 0, 0, MSG_UNKNOWN, me_unknown },
819}; 865};
820 866
821#undef dirty 867#undef dirty
@@ -835,10 +881,10 @@ static struct page_state {
835 * "Dirty/Clean" indication is not 100% accurate due to the possibility of 881 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
836 * setting PG_dirty outside page lock. See also comment above set_page_dirty(). 882 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
837 */ 883 */
838static void action_result(unsigned long pfn, char *msg, int result) 884static void action_result(unsigned long pfn, enum action_page_type type, int result)
839{ 885{
840 pr_err("MCE %#lx: %s page recovery: %s\n", 886 pr_err("MCE %#lx: recovery action for %s: %s\n",
841 pfn, msg, action_name[result]); 887 pfn, action_page_types[type], action_name[result]);
842} 888}
843 889
844static int page_action(struct page_state *ps, struct page *p, 890static int page_action(struct page_state *ps, struct page *p,
@@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p,
854 count--; 900 count--;
855 if (count != 0) { 901 if (count != 0) {
856 printk(KERN_ERR 902 printk(KERN_ERR
857 "MCE %#lx: %s page still referenced by %d users\n", 903 "MCE %#lx: %s still referenced by %d users\n",
858 pfn, ps->msg, count); 904 pfn, action_page_types[ps->type], count);
859 result = FAILED; 905 result = FAILED;
860 } 906 }
861 action_result(pfn, ps->msg, result); 907 action_result(pfn, ps->type, result);
862 908
863 /* Could do more checks here if page looks ok */ 909 /* Could do more checks here if page looks ok */
864 /* 910 /*
@@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1106 if (!(flags & MF_COUNT_INCREASED) && 1152 if (!(flags & MF_COUNT_INCREASED) &&
1107 !get_page_unless_zero(hpage)) { 1153 !get_page_unless_zero(hpage)) {
1108 if (is_free_buddy_page(p)) { 1154 if (is_free_buddy_page(p)) {
1109 action_result(pfn, "free buddy", DELAYED); 1155 action_result(pfn, MSG_BUDDY, DELAYED);
1110 return 0; 1156 return 0;
1111 } else if (PageHuge(hpage)) { 1157 } else if (PageHuge(hpage)) {
1112 /* 1158 /*
@@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1123 } 1169 }
1124 set_page_hwpoison_huge_page(hpage); 1170 set_page_hwpoison_huge_page(hpage);
1125 res = dequeue_hwpoisoned_huge_page(hpage); 1171 res = dequeue_hwpoisoned_huge_page(hpage);
1126 action_result(pfn, "free huge", 1172 action_result(pfn, MSG_FREE_HUGE,
1127 res ? IGNORED : DELAYED); 1173 res ? IGNORED : DELAYED);
1128 unlock_page(hpage); 1174 unlock_page(hpage);
1129 return res; 1175 return res;
1130 } else { 1176 } else {
1131 action_result(pfn, "high order kernel", IGNORED); 1177 action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED);
1132 return -EBUSY; 1178 return -EBUSY;
1133 } 1179 }
1134 } 1180 }
@@ -1150,9 +1196,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1150 */ 1196 */
1151 if (is_free_buddy_page(p)) { 1197 if (is_free_buddy_page(p)) {
1152 if (flags & MF_COUNT_INCREASED) 1198 if (flags & MF_COUNT_INCREASED)
1153 action_result(pfn, "free buddy", DELAYED); 1199 action_result(pfn, MSG_BUDDY, DELAYED);
1154 else 1200 else
1155 action_result(pfn, "free buddy, 2nd try", DELAYED); 1201 action_result(pfn, MSG_BUDDY_2ND,
1202 DELAYED);
1156 return 0; 1203 return 0;
1157 } 1204 }
1158 } 1205 }
@@ -1165,7 +1212,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1165 * If this happens just bail out. 1212 * If this happens just bail out.
1166 */ 1213 */
1167 if (compound_head(p) != hpage) { 1214 if (compound_head(p) != hpage) {
1168 action_result(pfn, "different compound page after locking", IGNORED); 1215 action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED);
1169 res = -EBUSY; 1216 res = -EBUSY;
1170 goto out; 1217 goto out;
1171 } 1218 }
@@ -1205,8 +1252,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1205 * on the head page to show that the hugepage is hwpoisoned 1252 * on the head page to show that the hugepage is hwpoisoned
1206 */ 1253 */
1207 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { 1254 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1208 action_result(pfn, "hugepage already hardware poisoned", 1255 action_result(pfn, MSG_POISONED_HUGE, IGNORED);
1209 IGNORED);
1210 unlock_page(hpage); 1256 unlock_page(hpage);
1211 put_page(hpage); 1257 put_page(hpage);
1212 return 0; 1258 return 0;
@@ -1235,7 +1281,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1235 */ 1281 */
1236 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) 1282 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1237 != SWAP_SUCCESS) { 1283 != SWAP_SUCCESS) {
1238 action_result(pfn, "unmapping failed", IGNORED); 1284 action_result(pfn, MSG_UNMAP_FAILED, IGNORED);
1239 res = -EBUSY; 1285 res = -EBUSY;
1240 goto out; 1286 goto out;
1241 } 1287 }
@@ -1244,7 +1290,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1244 * Torn down by someone else? 1290 * Torn down by someone else?
1245 */ 1291 */
1246 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1292 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1247 action_result(pfn, "already truncated LRU", IGNORED); 1293 action_result(pfn, MSG_TRUNCATED_LRU, IGNORED);
1248 res = -EBUSY; 1294 res = -EBUSY;
1249 goto out; 1295 goto out;
1250 } 1296 }
@@ -1540,8 +1586,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1540 } 1586 }
1541 unlock_page(hpage); 1587 unlock_page(hpage);
1542 1588
1543 /* Keep page count to indicate a given hugepage is isolated. */ 1589 ret = isolate_huge_page(hpage, &pagelist);
1544 list_move(&hpage->lru, &pagelist); 1590 if (ret) {
1591 /*
1592 * get_any_page() and isolate_huge_page() takes a refcount each,
1593 * so need to drop one here.
1594 */
1595 put_page(hpage);
1596 } else {
1597 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1598 return -EBUSY;
1599 }
1600
1545 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1601 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1546 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1602 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1547 if (ret) { 1603 if (ret) {
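
The memory-failure rework replaces free-form message strings with an enum that indexes a single string table, so one identifier can drive both the printk text and any later consumers. A small standalone sketch of that pattern, reproducing only a handful of the enum values:

#include <stdio.h>

/* Sketch of the enum-plus-lookup-table pattern the patch introduces;
 * only a few of the kernel's action_page_type values are shown. */
enum action_page_type {
	MSG_KERNEL,
	MSG_SLAB,
	MSG_BUDDY,
	MSG_UNKNOWN,
};

static const char * const action_page_types[] = {
	[MSG_KERNEL]	= "reserved kernel page",
	[MSG_SLAB]	= "kernel slab page",
	[MSG_BUDDY]	= "free buddy page",
	[MSG_UNKNOWN]	= "unknown page",
};

static void action_result(unsigned long pfn, enum action_page_type type,
			  const char *result)
{
	printf("MCE %#lx: recovery action for %s: %s\n",
	       pfn, action_page_types[type], result);
}

int main(void)
{
	action_result(0x1234, MSG_BUDDY, "Delayed");
	return 0;
}
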
diff --git a/mm/memory.c b/mm/memory.c
index ac20b2a6a0c3..22e037e3364e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
690 /* 690 /*
691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
692 */ 692 */
693 if (vma->vm_ops) 693 pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
694 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", 694 vma->vm_file,
695 vma->vm_ops->fault); 695 vma->vm_ops ? vma->vm_ops->fault : NULL,
696 if (vma->vm_file) 696 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
697 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", 697 mapping ? mapping->a_ops->readpage : NULL);
698 vma->vm_file->f_op->mmap);
699 dump_stack(); 698 dump_stack();
700 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 699 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
701} 700}
@@ -2181,6 +2180,42 @@ oom:
2181 return VM_FAULT_OOM; 2180 return VM_FAULT_OOM;
2182} 2181}
2183 2182
2183/*
2184 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2185 * mapping
2186 */
2187static int wp_pfn_shared(struct mm_struct *mm,
2188 struct vm_area_struct *vma, unsigned long address,
2189 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
2190 pmd_t *pmd)
2191{
2192 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2193 struct vm_fault vmf = {
2194 .page = NULL,
2195 .pgoff = linear_page_index(vma, address),
2196 .virtual_address = (void __user *)(address & PAGE_MASK),
2197 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2198 };
2199 int ret;
2200
2201 pte_unmap_unlock(page_table, ptl);
2202 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
2203 if (ret & VM_FAULT_ERROR)
2204 return ret;
2205 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2206 /*
2207 * We might have raced with another page fault while we
2208 * released the pte_offset_map_lock.
2209 */
2210 if (!pte_same(*page_table, orig_pte)) {
2211 pte_unmap_unlock(page_table, ptl);
2212 return 0;
2213 }
2214 }
2215 return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
2216 NULL, 0, 0);
2217}
2218
2184static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, 2219static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2185 unsigned long address, pte_t *page_table, 2220 unsigned long address, pte_t *page_table,
2186 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, 2221 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
@@ -2259,13 +2294,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2259 * VM_PFNMAP VMA. 2294 * VM_PFNMAP VMA.
2260 * 2295 *
2261 * We should not cow pages in a shared writeable mapping. 2296 * We should not cow pages in a shared writeable mapping.
2262 * Just mark the pages writable as we can't do any dirty 2297 * Just mark the pages writable and/or call ops->pfn_mkwrite.
2263 * accounting on raw pfn maps.
2264 */ 2298 */
2265 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2299 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2266 (VM_WRITE|VM_SHARED)) 2300 (VM_WRITE|VM_SHARED))
2267 return wp_page_reuse(mm, vma, address, page_table, ptl, 2301 return wp_pfn_shared(mm, vma, address, page_table, ptl,
2268 orig_pte, old_page, 0, 0); 2302 orig_pte, pmd);
2269 2303
2270 pte_unmap_unlock(page_table, ptl); 2304 pte_unmap_unlock(page_table, ptl);
2271 return wp_page_copy(mm, vma, address, page_table, pmd, 2305 return wp_page_copy(mm, vma, address, page_table, pmd,
@@ -2845,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2845 struct vm_fault vmf; 2879 struct vm_fault vmf;
2846 int off; 2880 int off;
2847 2881
2848 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; 2882 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2849 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 2883 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2850 2884
2851 start_addr = max(address & mask, vma->vm_start); 2885 start_addr = max(address & mask, vma->vm_start);
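
The new wp_pfn_shared() helper cannot hold the page-table lock across ->pfn_mkwrite(), so it drops the lock, calls the hook, re-takes the lock and re-checks that the PTE is unchanged. A userspace analogue of that drop/re-take/re-validate dance, with a mutex standing in for the PTE lock and all helper names invented:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
static unsigned long pte;			/* stands in for *page_table */

static int call_mkwrite_hook(void) { return 0; }	/* may sleep for real */

static bool handle_write_fault(void)
{
	unsigned long orig_pte;

	pthread_mutex_lock(&ptl);
	orig_pte = pte;				/* snapshot while locked */
	pthread_mutex_unlock(&ptl);		/* hook may sleep: drop the lock */

	if (call_mkwrite_hook())
		return false;

	pthread_mutex_lock(&ptl);
	if (pte != orig_pte) {			/* raced with another fault: bail */
		pthread_mutex_unlock(&ptl);
		return false;
	}
	/* ... mark the entry writable while still holding the lock ... */
	pthread_mutex_unlock(&ptl);
	return true;
}

int main(void)
{
	printf("%d\n", handle_write_fault());
	return 0;
}
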
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e2e8014fb755..457bde530cbe 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1373,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1373 if (PageLRU(page)) 1373 if (PageLRU(page))
1374 return pfn; 1374 return pfn;
1375 if (PageHuge(page)) { 1375 if (PageHuge(page)) {
1376 if (is_hugepage_active(page)) 1376 if (page_huge_active(page))
1377 return pfn; 1377 return pfn;
1378 else 1378 else
1379 pfn = round_up(pfn + 1, 1379 pfn = round_up(pfn + 1,
diff --git a/mm/mempool.c b/mm/mempool.c
index 949970db2874..2cc08de8b1db 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -6,26 +6,138 @@
6 * extreme VM load. 6 * extreme VM load.
7 * 7 *
8 * started by Ingo Molnar, Copyright (C) 2001 8 * started by Ingo Molnar, Copyright (C) 2001
9 * debugging by David Rientjes, Copyright (C) 2015
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/highmem.h>
15#include <linux/kasan.h>
13#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
14#include <linux/export.h> 17#include <linux/export.h>
15#include <linux/mempool.h> 18#include <linux/mempool.h>
16#include <linux/blkdev.h> 19#include <linux/blkdev.h>
17#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include "slab.h"
22
23#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
24static void poison_error(mempool_t *pool, void *element, size_t size,
25 size_t byte)
26{
27 const int nr = pool->curr_nr;
28 const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
29 const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
30 int i;
31
32 pr_err("BUG: mempool element poison mismatch\n");
33 pr_err("Mempool %p size %zu\n", pool, size);
34 pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
35 for (i = start; i < end; i++)
36 pr_cont("%x ", *(u8 *)(element + i));
37 pr_cont("%s\n", end < size ? "..." : "");
38 dump_stack();
39}
40
41static void __check_element(mempool_t *pool, void *element, size_t size)
42{
43 u8 *obj = element;
44 size_t i;
45
46 for (i = 0; i < size; i++) {
47 u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
48
49 if (obj[i] != exp) {
50 poison_error(pool, element, size, i);
51 return;
52 }
53 }
54 memset(obj, POISON_INUSE, size);
55}
56
57static void check_element(mempool_t *pool, void *element)
58{
59 /* Mempools backed by slab allocator */
60 if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
61 __check_element(pool, element, ksize(element));
62
63 /* Mempools backed by page allocator */
64 if (pool->free == mempool_free_pages) {
65 int order = (int)(long)pool->pool_data;
66 void *addr = kmap_atomic((struct page *)element);
67
68 __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
69 kunmap_atomic(addr);
70 }
71}
72
73static void __poison_element(void *element, size_t size)
74{
75 u8 *obj = element;
76
77 memset(obj, POISON_FREE, size - 1);
78 obj[size - 1] = POISON_END;
79}
80
81static void poison_element(mempool_t *pool, void *element)
82{
83 /* Mempools backed by slab allocator */
84 if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
85 __poison_element(element, ksize(element));
86
87 /* Mempools backed by page allocator */
88 if (pool->alloc == mempool_alloc_pages) {
89 int order = (int)(long)pool->pool_data;
90 void *addr = kmap_atomic((struct page *)element);
91
92 __poison_element(addr, 1UL << (PAGE_SHIFT + order));
93 kunmap_atomic(addr);
94 }
95}
96#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
97static inline void check_element(mempool_t *pool, void *element)
98{
99}
100static inline void poison_element(mempool_t *pool, void *element)
101{
102}
103#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
104
105static void kasan_poison_element(mempool_t *pool, void *element)
106{
107 if (pool->alloc == mempool_alloc_slab)
108 kasan_slab_free(pool->pool_data, element);
109 if (pool->alloc == mempool_kmalloc)
110 kasan_kfree(element);
111 if (pool->alloc == mempool_alloc_pages)
112 kasan_free_pages(element, (unsigned long)pool->pool_data);
113}
114
115static void kasan_unpoison_element(mempool_t *pool, void *element)
116{
117 if (pool->alloc == mempool_alloc_slab)
118 kasan_slab_alloc(pool->pool_data, element);
119 if (pool->alloc == mempool_kmalloc)
120 kasan_krealloc(element, (size_t)pool->pool_data);
121 if (pool->alloc == mempool_alloc_pages)
122 kasan_alloc_pages(element, (unsigned long)pool->pool_data);
123}
18 124
19static void add_element(mempool_t *pool, void *element) 125static void add_element(mempool_t *pool, void *element)
20{ 126{
21 BUG_ON(pool->curr_nr >= pool->min_nr); 127 BUG_ON(pool->curr_nr >= pool->min_nr);
128 poison_element(pool, element);
129 kasan_poison_element(pool, element);
22 pool->elements[pool->curr_nr++] = element; 130 pool->elements[pool->curr_nr++] = element;
23} 131}
24 132
25static void *remove_element(mempool_t *pool) 133static void *remove_element(mempool_t *pool)
26{ 134{
27 BUG_ON(pool->curr_nr <= 0); 135 void *element = pool->elements[--pool->curr_nr];
28 return pool->elements[--pool->curr_nr]; 136
137 BUG_ON(pool->curr_nr < 0);
138 check_element(pool, element);
139 kasan_unpoison_element(pool, element);
140 return element;
29} 141}
30 142
31/** 143/**
@@ -334,6 +446,7 @@ EXPORT_SYMBOL(mempool_free);
334void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) 446void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
335{ 447{
336 struct kmem_cache *mem = pool_data; 448 struct kmem_cache *mem = pool_data;
449 VM_BUG_ON(mem->ctor);
337 return kmem_cache_alloc(mem, gfp_mask); 450 return kmem_cache_alloc(mem, gfp_mask);
338} 451}
339EXPORT_SYMBOL(mempool_alloc_slab); 452EXPORT_SYMBOL(mempool_alloc_slab);
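
The mempool debugging support poisons elements while they sit in the reserve pool and verifies the poison when they are handed back out, so stray writes to parked elements are caught. A compact userspace sketch of the poison/check cycle; the byte values mirror the kernel's poison.h constants:

#include <stdio.h>
#include <string.h>

#define POISON_INUSE	0x5a
#define POISON_FREE	0x6b
#define POISON_END	0xa5

static void poison_element(unsigned char *obj, size_t size)
{
	memset(obj, POISON_FREE, size - 1);
	obj[size - 1] = POISON_END;
}

static int check_element(unsigned char *obj, size_t size)
{
	for (size_t i = 0; i < size; i++) {
		unsigned char exp = (i < size - 1) ? POISON_FREE : POISON_END;

		if (obj[i] != exp) {
			fprintf(stderr, "poison mismatch at byte %zu\n", i);
			return -1;
		}
	}
	memset(obj, POISON_INUSE, size);	/* element now handed out */
	return 0;
}

int main(void)
{
	unsigned char element[32];

	poison_element(element, sizeof(element));	/* parked in the pool */
	return check_element(element, sizeof(element));	/* taken back out */
}
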
diff --git a/mm/migrate.c b/mm/migrate.c
index a65ff72ab739..f53838fe3dfe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
537 * Please do not reorder this without considering how mm/ksm.c's 537 * Please do not reorder this without considering how mm/ksm.c's
538 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). 538 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
539 */ 539 */
540 ClearPageSwapCache(page); 540 if (PageSwapCache(page))
541 ClearPageSwapCache(page);
541 ClearPagePrivate(page); 542 ClearPagePrivate(page);
542 set_page_private(page, 0); 543 set_page_private(page, 0);
543 544
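
The migrate.c hunk (like the set_page_dirty() one further down) now tests a page flag before clearing it, so the atomic read-modify-write is skipped when the bit is already clear. A tiny sketch of that test-before-clear idea, with an invented flag bit and helpers:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PG_SWAPCACHE 5			/* made-up bit number for the example */

static _Atomic unsigned long page_flags;

static bool test_bit(int nr)  { return page_flags & (1UL << nr); }
static void clear_bit(int nr) { atomic_fetch_and(&page_flags, ~(1UL << nr)); }

int main(void)
{
	if (test_bit(PG_SWAPCACHE))		/* cheap read first */
		clear_bit(PG_SWAPCACHE);	/* costly RMW only when needed */
	printf("flags=%#lx\n", (unsigned long)page_flags);
	return 0;
}
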
diff --git a/mm/mmap.c b/mm/mmap.c
index 06a6076c92e5..bb50cacc3ea5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1133,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1133 * by another page fault trying to merge _that_. But that's ok: if it 1133 * by another page fault trying to merge _that_. But that's ok: if it
1134 * is being set up, that automatically means that it will be a singleton 1134 * is being set up, that automatically means that it will be a singleton
1135 * acceptable for merging, so we can do all of this optimistically. But 1135 * acceptable for merging, so we can do all of this optimistically. But
1136 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. 1136 * we do that READ_ONCE() to make sure that we never re-load the pointer.
1137 * 1137 *
1138 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1138 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1139 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1139 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
@@ -1147,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1147static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1147static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1148{ 1148{
1149 if (anon_vma_compatible(a, b)) { 1149 if (anon_vma_compatible(a, b)) {
1150 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); 1150 struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1151 1151
1152 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1152 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1153 return anon_vma; 1153 return anon_vma;
@@ -1551,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1551 1551
1552 /* Clear old maps */ 1552 /* Clear old maps */
1553 error = -ENOMEM; 1553 error = -ENOMEM;
1554munmap_back: 1554 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
1555 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 1555 &rb_parent)) {
1556 if (do_munmap(mm, addr, len)) 1556 if (do_munmap(mm, addr, len))
1557 return -ENOMEM; 1557 return -ENOMEM;
1558 goto munmap_back;
1559 } 1558 }
1560 1559
1561 /* 1560 /*
@@ -1571,7 +1570,8 @@ munmap_back:
1571 /* 1570 /*
1572 * Can we just expand an old mapping? 1571 * Can we just expand an old mapping?
1573 */ 1572 */
1574 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); 1573 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
1574 NULL);
1575 if (vma) 1575 if (vma)
1576 goto out; 1576 goto out;
1577 1577
@@ -2100,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2100 actual_size = size; 2100 actual_size = size;
2101 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) 2101 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
2102 actual_size -= PAGE_SIZE; 2102 actual_size -= PAGE_SIZE;
2103 if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2103 if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2104 return -ENOMEM; 2104 return -ENOMEM;
2105 2105
2106 /* mlock limit tests */ 2106 /* mlock limit tests */
@@ -2108,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2108 unsigned long locked; 2108 unsigned long locked;
2109 unsigned long limit; 2109 unsigned long limit;
2110 locked = mm->locked_vm + grow; 2110 locked = mm->locked_vm + grow;
2111 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); 2111 limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2112 limit >>= PAGE_SHIFT; 2112 limit >>= PAGE_SHIFT;
2113 if (locked > limit && !capable(CAP_IPC_LOCK)) 2113 if (locked > limit && !capable(CAP_IPC_LOCK))
2114 return -ENOMEM; 2114 return -ENOMEM;
@@ -2739,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2739 /* 2739 /*
2740 * Clear old maps. this also does some error checking for us 2740 * Clear old maps. this also does some error checking for us
2741 */ 2741 */
2742 munmap_back: 2742 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
2743 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 2743 &rb_parent)) {
2744 if (do_munmap(mm, addr, len)) 2744 if (do_munmap(mm, addr, len))
2745 return -ENOMEM; 2745 return -ENOMEM;
2746 goto munmap_back;
2747 } 2746 }
2748 2747
2749 /* Check against address space limits *after* clearing old maps... */ 2748 /* Check against address space limits *after* clearing old maps... */
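
Both mmap_region() and do_brk() drop the munmap_back: label in favour of a plain while loop. The shape of that conversion, using invented stand-ins for find_vma_links() and do_munmap():

#include <stdio.h>

static int overlaps_left = 2;

static int find_overlap(void)  { return overlaps_left > 0; }
static int unmap_overlap(void) { overlaps_left--; return 0; }

int main(void)
{
	while (find_overlap()) {	/* was: munmap_back: if (...) ... goto munmap_back */
		if (unmap_overlap())
			return -1;
	}
	printf("no overlapping mappings left\n");
	return 0;
}
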
diff --git a/mm/mremap.c b/mm/mremap.c
index 2dc44b1cb1df..034e2d360652 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -345,25 +345,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
345 struct vm_area_struct *vma = find_vma(mm, addr); 345 struct vm_area_struct *vma = find_vma(mm, addr);
346 346
347 if (!vma || vma->vm_start > addr) 347 if (!vma || vma->vm_start > addr)
348 goto Efault; 348 return ERR_PTR(-EFAULT);
349 349
350 if (is_vm_hugetlb_page(vma)) 350 if (is_vm_hugetlb_page(vma))
351 goto Einval; 351 return ERR_PTR(-EINVAL);
352 352
353 /* We can't remap across vm area boundaries */ 353 /* We can't remap across vm area boundaries */
354 if (old_len > vma->vm_end - addr) 354 if (old_len > vma->vm_end - addr)
355 goto Efault; 355 return ERR_PTR(-EFAULT);
356 356
357 /* Need to be careful about a growing mapping */ 357 /* Need to be careful about a growing mapping */
358 if (new_len > old_len) { 358 if (new_len > old_len) {
359 unsigned long pgoff; 359 unsigned long pgoff;
360 360
361 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) 361 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
362 goto Efault; 362 return ERR_PTR(-EFAULT);
363 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; 363 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
364 pgoff += vma->vm_pgoff; 364 pgoff += vma->vm_pgoff;
365 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) 365 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
366 goto Einval; 366 return ERR_PTR(-EINVAL);
367 } 367 }
368 368
369 if (vma->vm_flags & VM_LOCKED) { 369 if (vma->vm_flags & VM_LOCKED) {
@@ -372,29 +372,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
372 lock_limit = rlimit(RLIMIT_MEMLOCK); 372 lock_limit = rlimit(RLIMIT_MEMLOCK);
373 locked += new_len - old_len; 373 locked += new_len - old_len;
374 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 374 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
375 goto Eagain; 375 return ERR_PTR(-EAGAIN);
376 } 376 }
377 377
378 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) 378 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
379 goto Enomem; 379 return ERR_PTR(-ENOMEM);
380 380
381 if (vma->vm_flags & VM_ACCOUNT) { 381 if (vma->vm_flags & VM_ACCOUNT) {
382 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; 382 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
383 if (security_vm_enough_memory_mm(mm, charged)) 383 if (security_vm_enough_memory_mm(mm, charged))
384 goto Efault; 384 return ERR_PTR(-ENOMEM);
385 *p = charged; 385 *p = charged;
386 } 386 }
387 387
388 return vma; 388 return vma;
389
390Efault: /* very odd choice for most of the cases, but... */
391 return ERR_PTR(-EFAULT);
392Einval:
393 return ERR_PTR(-EINVAL);
394Enomem:
395 return ERR_PTR(-ENOMEM);
396Eagain:
397 return ERR_PTR(-EAGAIN);
398} 389}
399 390
400static unsigned long mremap_to(unsigned long addr, unsigned long old_len, 391static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
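
vma_to_resize() now returns ERR_PTR(-errno) directly at each failure site instead of jumping to per-error labels. For readers unfamiliar with the convention, a userspace sketch of how ERR_PTR()/IS_ERR() encode an errno in a pointer value; the real helpers live in include/linux/err.h:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long error)      { return (void *)error; }
static inline long  ptr_err(const void *ptr) { return (long)ptr; }
static inline int   is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *vma_to_resize(int valid)
{
	if (!valid)
		return err_ptr(-EFAULT);	/* fail directly, no goto label */
	return (void *)0x1000;			/* stand-in for a real vma */
}

int main(void)
{
	void *vma = vma_to_resize(0);

	if (is_err(vma))
		printf("lookup failed: %ld\n", ptr_err(vma));
	return 0;
}
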
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 52628c819bf7..2b665da1b3c9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly;
408static DECLARE_RWSEM(oom_sem); 408static DECLARE_RWSEM(oom_sem);
409 409
410/** 410/**
411 * mark_tsk_oom_victim - marks the given taks as OOM victim. 411 * mark_tsk_oom_victim - marks the given task as OOM victim.
412 * @tsk: task to mark 412 * @tsk: task to mark
413 * 413 *
414 * Has to be called with oom_sem taken for read and never after 414 * Has to be called with oom_sem taken for read and never after
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0372411f38fc..5daf5568b9e1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2228,7 +2228,8 @@ int set_page_dirty(struct page *page)
2228 * it will confuse readahead and make it restart the size rampup 2228 * it will confuse readahead and make it restart the size rampup
2229 * process. But it's a trivial problem. 2229 * process. But it's a trivial problem.
2230 */ 2230 */
2231 ClearPageReclaim(page); 2231 if (PageReclaim(page))
2232 ClearPageReclaim(page);
2232#ifdef CONFIG_BLOCK 2233#ifdef CONFIG_BLOCK
2233 if (!spd) 2234 if (!spd)
2234 spd = __set_page_dirty_buffers; 2235 spd = __set_page_dirty_buffers;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1b849500640c..ebffa0e4a9c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1371,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1371 int to_drain, batch; 1371 int to_drain, batch;
1372 1372
1373 local_irq_save(flags); 1373 local_irq_save(flags);
1374 batch = ACCESS_ONCE(pcp->batch); 1374 batch = READ_ONCE(pcp->batch);
1375 to_drain = min(pcp->count, batch); 1375 to_drain = min(pcp->count, batch);
1376 if (to_drain > 0) { 1376 if (to_drain > 0) {
1377 free_pcppages_bulk(zone, to_drain, pcp); 1377 free_pcppages_bulk(zone, to_drain, pcp);
@@ -1570,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold)
1570 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1570 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1571 pcp->count++; 1571 pcp->count++;
1572 if (pcp->count >= pcp->high) { 1572 if (pcp->count >= pcp->high) {
1573 unsigned long batch = ACCESS_ONCE(pcp->batch); 1573 unsigned long batch = READ_ONCE(pcp->batch);
1574 free_pcppages_bulk(zone, batch, pcp); 1574 free_pcppages_bulk(zone, batch, pcp);
1575 pcp->count -= batch; 1575 pcp->count -= batch;
1576 } 1576 }
@@ -6207,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6207 mask <<= (BITS_PER_LONG - bitidx - 1); 6207 mask <<= (BITS_PER_LONG - bitidx - 1);
6208 flags <<= (BITS_PER_LONG - bitidx - 1); 6208 flags <<= (BITS_PER_LONG - bitidx - 1);
6209 6209
6210 word = ACCESS_ONCE(bitmap[word_bitidx]); 6210 word = READ_ONCE(bitmap[word_bitidx]);
6211 for (;;) { 6211 for (;;) {
6212 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6212 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6213 if (word == old_word) 6213 if (word == old_word)
diff --git a/mm/rmap.c b/mm/rmap.c
index c161a14b6a8f..24dd3f9fee27 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -456,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
456 unsigned long anon_mapping; 456 unsigned long anon_mapping;
457 457
458 rcu_read_lock(); 458 rcu_read_lock();
459 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 459 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
461 goto out; 461 goto out;
462 if (!page_mapped(page)) 462 if (!page_mapped(page))
@@ -500,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
500 unsigned long anon_mapping; 500 unsigned long anon_mapping;
501 501
502 rcu_read_lock(); 502 rcu_read_lock();
503 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 503 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
504 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 504 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
505 goto out; 505 goto out;
506 if (!page_mapped(page)) 506 if (!page_mapped(page))
507 goto out; 507 goto out;
508 508
509 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 509 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
510 root_anon_vma = ACCESS_ONCE(anon_vma->root); 510 root_anon_vma = READ_ONCE(anon_vma->root);
511 if (down_read_trylock(&root_anon_vma->rwsem)) { 511 if (down_read_trylock(&root_anon_vma->rwsem)) {
512 /* 512 /*
513 * If the page is still mapped, then this anon_vma is still 513 * If the page is still mapped, then this anon_vma is still
diff --git a/mm/slub.c b/mm/slub.c
index 0fdd6c1e1f82..54c0876b43d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4277,7 +4277,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4277 int node; 4277 int node;
4278 struct page *page; 4278 struct page *page;
4279 4279
4280 page = ACCESS_ONCE(c->page); 4280 page = READ_ONCE(c->page);
4281 if (!page) 4281 if (!page)
4282 continue; 4282 continue;
4283 4283
@@ -4292,7 +4292,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4292 total += x; 4292 total += x;
4293 nodes[node] += x; 4293 nodes[node] += x;
4294 4294
4295 page = ACCESS_ONCE(c->partial); 4295 page = READ_ONCE(c->partial);
4296 if (page) { 4296 if (page) {
4297 node = page_to_nid(page); 4297 node = page_to_nid(page);
4298 if (flags & SO_TOTAL) 4298 if (flags & SO_TOTAL)
diff --git a/mm/swap.c b/mm/swap.c
index cd3a5e64cea9..a7251a8ed532 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
34 35
35#include "internal.h" 36#include "internal.h"
36 37
@@ -42,7 +43,7 @@ int page_cluster;
42 43
43static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); 44static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
44static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 45static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
45static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 46static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
46 47
47/* 48/*
48 * This path almost never happens for VM activity - pages are normally 49 * This path almost never happens for VM activity - pages are normally
@@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page)
75{ 76{
76 compound_page_dtor *dtor; 77 compound_page_dtor *dtor;
77 78
78 __page_cache_release(page); 79 /*
80 * __page_cache_release() is supposed to be called for thp, not for
81 * hugetlb. This is because hugetlb page does never have PageLRU set
82 * (it's never listed to any LRU lists) and no memcg routines should
83 * be called for hugetlb (it has a separate hugetlb_cgroup.)
84 */
85 if (!PageHuge(page))
86 __page_cache_release(page);
79 dtor = get_compound_page_dtor(page); 87 dtor = get_compound_page_dtor(page);
80 (*dtor)(page); 88 (*dtor)(page);
81} 89}
@@ -743,7 +751,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
743 * be write it out by flusher threads as this is much more effective 751 * be write it out by flusher threads as this is much more effective
744 * than the single-page writeout from reclaim. 752 * than the single-page writeout from reclaim.
745 */ 753 */
746static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, 754static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
747 void *arg) 755 void *arg)
748{ 756{
749 int lru, file; 757 int lru, file;
@@ -811,36 +819,36 @@ void lru_add_drain_cpu(int cpu)
811 local_irq_restore(flags); 819 local_irq_restore(flags);
812 } 820 }
813 821
814 pvec = &per_cpu(lru_deactivate_pvecs, cpu); 822 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
815 if (pagevec_count(pvec)) 823 if (pagevec_count(pvec))
816 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 824 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
817 825
818 activate_page_drain(cpu); 826 activate_page_drain(cpu);
819} 827}
820 828
821/** 829/**
822 * deactivate_page - forcefully deactivate a page 830 * deactivate_file_page - forcefully deactivate a file page
823 * @page: page to deactivate 831 * @page: page to deactivate
824 * 832 *
825 * This function hints the VM that @page is a good reclaim candidate, 833 * This function hints the VM that @page is a good reclaim candidate,
826 * for example if its invalidation fails due to the page being dirty 834 * for example if its invalidation fails due to the page being dirty
827 * or under writeback. 835 * or under writeback.
828 */ 836 */
829void deactivate_page(struct page *page) 837void deactivate_file_page(struct page *page)
830{ 838{
831 /* 839 /*
832 * In a workload with many unevictable page such as mprotect, unevictable 840 * In a workload with many unevictable page such as mprotect,
833 * page deactivation for accelerating reclaim is pointless. 841 * unevictable page deactivation for accelerating reclaim is pointless.
834 */ 842 */
835 if (PageUnevictable(page)) 843 if (PageUnevictable(page))
836 return; 844 return;
837 845
838 if (likely(get_page_unless_zero(page))) { 846 if (likely(get_page_unless_zero(page))) {
839 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); 847 struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
840 848
841 if (!pagevec_add(pvec, page)) 849 if (!pagevec_add(pvec, page))
842 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 850 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
843 put_cpu_var(lru_deactivate_pvecs); 851 put_cpu_var(lru_deactivate_file_pvecs);
844 } 852 }
845} 853}
846 854
@@ -872,7 +880,7 @@ void lru_add_drain_all(void)
872 880
873 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || 881 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
874 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || 882 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
875 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || 883 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
876 need_activate_page_drain(cpu)) { 884 need_activate_page_drain(cpu)) {
877 INIT_WORK(work, lru_add_drain_per_cpu); 885 INIT_WORK(work, lru_add_drain_per_cpu);
878 schedule_work_on(cpu, work); 886 schedule_work_on(cpu, work);
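
deactivate_file_page() keeps batching pages through a per-CPU pagevec and only moves them on the LRU once the vector is full. A userspace sketch of that batching pattern, with an arbitrary batch size and an invented flush routine:

#include <stdio.h>

#define PAGEVEC_SIZE 14			/* small fixed batch; exact size unimportant here */

struct pagevec {
	unsigned int nr;
	void *pages[PAGEVEC_SIZE];
};

/* returns remaining space; 0 means "now full, flush it" */
static int pagevec_add(struct pagevec *pvec, void *page)
{
	pvec->pages[pvec->nr++] = page;
	return PAGEVEC_SIZE - pvec->nr;
}

static void flush(struct pagevec *pvec)
{
	printf("moving %u pages in one pass\n", pvec->nr);
	pvec->nr = 0;
}

int main(void)
{
	static struct pagevec pvec;	/* one per CPU in the kernel */
	int dummy[100];

	for (int i = 0; i < 100; i++)
		if (!pagevec_add(&pvec, &dummy[i]))
			flush(&pvec);
	if (pvec.nr)
		flush(&pvec);
	return 0;
}
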
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 405923f77334..8bc8e66138da 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
390 unsigned int pages, max_pages, last_ra; 390 unsigned int pages, max_pages, last_ra;
391 static atomic_t last_readahead_pages; 391 static atomic_t last_readahead_pages;
392 392
393 max_pages = 1 << ACCESS_ONCE(page_cluster); 393 max_pages = 1 << READ_ONCE(page_cluster);
394 if (max_pages <= 1) 394 if (max_pages <= 1)
395 return 1; 395 return 1;
396 396
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63f55ccb9b26..a7e72103f23b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1312 else 1312 else
1313 continue; 1313 continue;
1314 } 1314 }
1315 count = ACCESS_ONCE(si->swap_map[i]); 1315 count = READ_ONCE(si->swap_map[i]);
1316 if (count && swap_count(count) != SWAP_MAP_BAD) 1316 if (count && swap_count(count) != SWAP_MAP_BAD)
1317 break; 1317 break;
1318 } 1318 }
diff --git a/mm/truncate.c b/mm/truncate.c
index 7a9d8a3cb143..66af9031fae8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -490,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
490 * of interest and try to speed up its reclaim. 490 * of interest and try to speed up its reclaim.
491 */ 491 */
492 if (!ret) 492 if (!ret)
493 deactivate_page(page); 493 deactivate_file_page(page);
494 count += ret; 494 count += ret;
495 } 495 }
496 pagevec_remove_exceptionals(&pvec); 496 pagevec_remove_exceptionals(&pvec);
diff --git a/mm/util.c b/mm/util.c
index 3981ae9d1b15..68ff8a5361e7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -325,9 +325,37 @@ void kvfree(const void *addr)
325} 325}
326EXPORT_SYMBOL(kvfree); 326EXPORT_SYMBOL(kvfree);
327 327
328static inline void *__page_rmapping(struct page *page)
329{
330 unsigned long mapping;
331
332 mapping = (unsigned long)page->mapping;
333 mapping &= ~PAGE_MAPPING_FLAGS;
334
335 return (void *)mapping;
336}
337
338/* Neutral page->mapping pointer to address_space or anon_vma or other */
339void *page_rmapping(struct page *page)
340{
341 page = compound_head(page);
342 return __page_rmapping(page);
343}
344
345struct anon_vma *page_anon_vma(struct page *page)
346{
347 unsigned long mapping;
348
349 page = compound_head(page);
350 mapping = (unsigned long)page->mapping;
351 if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
352 return NULL;
353 return __page_rmapping(page);
354}
355
328struct address_space *page_mapping(struct page *page) 356struct address_space *page_mapping(struct page *page)
329{ 357{
330 struct address_space *mapping = page->mapping; 358 unsigned long mapping;
331 359
332 /* This happens if someone calls flush_dcache_page on slab page */ 360 /* This happens if someone calls flush_dcache_page on slab page */
333 if (unlikely(PageSlab(page))) 361 if (unlikely(PageSlab(page)))
@@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page)
337 swp_entry_t entry; 365 swp_entry_t entry;
338 366
339 entry.val = page_private(page); 367 entry.val = page_private(page);
340 mapping = swap_address_space(entry); 368 return swap_address_space(entry);
341 } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) 369 }
342 mapping = NULL; 370
343 return mapping; 371 mapping = (unsigned long)page->mapping;
372 if (mapping & PAGE_MAPPING_FLAGS)
373 return NULL;
374 return page->mapping;
344} 375}
345 376
346int overcommit_ratio_handler(struct ctl_table *table, int write, 377int overcommit_ratio_handler(struct ctl_table *table, int write,
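
The new page_rmapping()/page_anon_vma() helpers decode page->mapping as a tagged pointer: the low bits carry flags and must be masked off before the value is used as a pointer. A sketch of that layout; the flag values here mirror PAGE_MAPPING_ANON and PAGE_MAPPING_KSM:

#include <stdio.h>

#define PAGE_MAPPING_ANON	1UL
#define PAGE_MAPPING_KSM	2UL
#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)

static void *page_rmapping(void *mapping_field)
{
	unsigned long mapping = (unsigned long)mapping_field;

	return (void *)(mapping & ~PAGE_MAPPING_FLAGS);	/* strip the tag bits */
}

int main(void)
{
	static long anon_vma;		/* aligned stand-in for a struct anon_vma */
	void *tagged = (void *)((unsigned long)&anon_vma | PAGE_MAPPING_ANON);

	printf("decoded ok: %d\n", page_rmapping(tagged) == &anon_vma);
	return 0;
}
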
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a5bbdd3b5d67..2faaa2976447 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -765,7 +765,7 @@ struct vmap_block {
765 spinlock_t lock; 765 spinlock_t lock;
766 struct vmap_area *va; 766 struct vmap_area *va;
767 unsigned long free, dirty; 767 unsigned long free, dirty;
768 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 768 unsigned long dirty_min, dirty_max; /*< dirty range */
769 struct list_head free_list; 769 struct list_head free_list;
770 struct rcu_head rcu_head; 770 struct rcu_head rcu_head;
771 struct list_head purge; 771 struct list_head purge;
@@ -796,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr)
796 return addr; 796 return addr;
797} 797}
798 798
799static struct vmap_block *new_vmap_block(gfp_t gfp_mask) 799static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
800{
801 unsigned long addr;
802
803 addr = va_start + (pages_off << PAGE_SHIFT);
804 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
805 return (void *)addr;
806}
807
808/**
809 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
810 * block. Of course pages number can't exceed VMAP_BBMAP_BITS
811 * @order: how many 2^order pages should be occupied in newly allocated block
812 * @gfp_mask: flags for the page level allocator
813 *
814 * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
815 */
816static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
800{ 817{
801 struct vmap_block_queue *vbq; 818 struct vmap_block_queue *vbq;
802 struct vmap_block *vb; 819 struct vmap_block *vb;
803 struct vmap_area *va; 820 struct vmap_area *va;
804 unsigned long vb_idx; 821 unsigned long vb_idx;
805 int node, err; 822 int node, err;
823 void *vaddr;
806 824
807 node = numa_node_id(); 825 node = numa_node_id();
808 826
@@ -826,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
826 return ERR_PTR(err); 844 return ERR_PTR(err);
827 } 845 }
828 846
847 vaddr = vmap_block_vaddr(va->va_start, 0);
829 spin_lock_init(&vb->lock); 848 spin_lock_init(&vb->lock);
830 vb->va = va; 849 vb->va = va;
831 vb->free = VMAP_BBMAP_BITS; 850 /* At least something should be left free */
851 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
852 vb->free = VMAP_BBMAP_BITS - (1UL << order);
832 vb->dirty = 0; 853 vb->dirty = 0;
833 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 854 vb->dirty_min = VMAP_BBMAP_BITS;
855 vb->dirty_max = 0;
834 INIT_LIST_HEAD(&vb->free_list); 856 INIT_LIST_HEAD(&vb->free_list);
835 857
836 vb_idx = addr_to_vb_idx(va->va_start); 858 vb_idx = addr_to_vb_idx(va->va_start);
@@ -842,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
842 864
843 vbq = &get_cpu_var(vmap_block_queue); 865 vbq = &get_cpu_var(vmap_block_queue);
844 spin_lock(&vbq->lock); 866 spin_lock(&vbq->lock);
845 list_add_rcu(&vb->free_list, &vbq->free); 867 list_add_tail_rcu(&vb->free_list, &vbq->free);
846 spin_unlock(&vbq->lock); 868 spin_unlock(&vbq->lock);
847 put_cpu_var(vmap_block_queue); 869 put_cpu_var(vmap_block_queue);
848 870
849 return vb; 871 return vaddr;
850} 872}
851 873
852static void free_vmap_block(struct vmap_block *vb) 874static void free_vmap_block(struct vmap_block *vb)
@@ -881,7 +903,8 @@ static void purge_fragmented_blocks(int cpu)
881 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 903 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
882 vb->free = 0; /* prevent further allocs after releasing lock */ 904 vb->free = 0; /* prevent further allocs after releasing lock */
883 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 905 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
884 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); 906 vb->dirty_min = 0;
907 vb->dirty_max = VMAP_BBMAP_BITS;
885 spin_lock(&vbq->lock); 908 spin_lock(&vbq->lock);
886 list_del_rcu(&vb->free_list); 909 list_del_rcu(&vb->free_list);
887 spin_unlock(&vbq->lock); 910 spin_unlock(&vbq->lock);
@@ -910,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
910{ 933{
911 struct vmap_block_queue *vbq; 934 struct vmap_block_queue *vbq;
912 struct vmap_block *vb; 935 struct vmap_block *vb;
913 unsigned long addr = 0; 936 void *vaddr = NULL;
914 unsigned int order; 937 unsigned int order;
915 938
916 BUG_ON(size & ~PAGE_MASK); 939 BUG_ON(size & ~PAGE_MASK);
@@ -925,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
925 } 948 }
926 order = get_order(size); 949 order = get_order(size);
927 950
928again:
929 rcu_read_lock(); 951 rcu_read_lock();
930 vbq = &get_cpu_var(vmap_block_queue); 952 vbq = &get_cpu_var(vmap_block_queue);
931 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 953 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
932 int i; 954 unsigned long pages_off;
933 955
934 spin_lock(&vb->lock); 956 spin_lock(&vb->lock);
935 if (vb->free < 1UL << order) 957 if (vb->free < (1UL << order)) {
936 goto next; 958 spin_unlock(&vb->lock);
959 continue;
960 }
937 961
938 i = VMAP_BBMAP_BITS - vb->free; 962 pages_off = VMAP_BBMAP_BITS - vb->free;
939 addr = vb->va->va_start + (i << PAGE_SHIFT); 963 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
940 BUG_ON(addr_to_vb_idx(addr) !=
941 addr_to_vb_idx(vb->va->va_start));
942 vb->free -= 1UL << order; 964 vb->free -= 1UL << order;
943 if (vb->free == 0) { 965 if (vb->free == 0) {
944 spin_lock(&vbq->lock); 966 spin_lock(&vbq->lock);
945 list_del_rcu(&vb->free_list); 967 list_del_rcu(&vb->free_list);
946 spin_unlock(&vbq->lock); 968 spin_unlock(&vbq->lock);
947 } 969 }
970
948 spin_unlock(&vb->lock); 971 spin_unlock(&vb->lock);
949 break; 972 break;
950next:
951 spin_unlock(&vb->lock);
952 } 973 }
953 974
954 put_cpu_var(vmap_block_queue); 975 put_cpu_var(vmap_block_queue);
955 rcu_read_unlock(); 976 rcu_read_unlock();
956 977
957 if (!addr) { 978 /* Allocate new block if nothing was found */
958 vb = new_vmap_block(gfp_mask); 979 if (!vaddr)
959 if (IS_ERR(vb)) 980 vaddr = new_vmap_block(order, gfp_mask);
960 return vb;
961 goto again;
962 }
963 981
964 return (void *)addr; 982 return vaddr;
965} 983}
966 984
967static void vb_free(const void *addr, unsigned long size) 985static void vb_free(const void *addr, unsigned long size)
@@ -979,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size)
979 order = get_order(size); 997 order = get_order(size);
980 998
981 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); 999 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
1000 offset >>= PAGE_SHIFT;
982 1001
983 vb_idx = addr_to_vb_idx((unsigned long)addr); 1002 vb_idx = addr_to_vb_idx((unsigned long)addr);
984 rcu_read_lock(); 1003 rcu_read_lock();
@@ -989,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size)
989 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); 1008 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
990 1009
991 spin_lock(&vb->lock); 1010 spin_lock(&vb->lock);
992 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); 1011
1012 /* Expand dirty range */
1013 vb->dirty_min = min(vb->dirty_min, offset);
1014 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
993 1015
994 vb->dirty += 1UL << order; 1016 vb->dirty += 1UL << order;
995 if (vb->dirty == VMAP_BBMAP_BITS) { 1017 if (vb->dirty == VMAP_BBMAP_BITS) {
@@ -1028,25 +1050,18 @@ void vm_unmap_aliases(void)
1028 1050
1029 rcu_read_lock(); 1051 rcu_read_lock();
1030 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1052 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1031 int i, j;
1032
1033 spin_lock(&vb->lock); 1053 spin_lock(&vb->lock);
1034 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); 1054 if (vb->dirty) {
1035 if (i < VMAP_BBMAP_BITS) { 1055 unsigned long va_start = vb->va->va_start;
1036 unsigned long s, e; 1056 unsigned long s, e;
1037 1057
1038 j = find_last_bit(vb->dirty_map, 1058 s = va_start + (vb->dirty_min << PAGE_SHIFT);
1039 VMAP_BBMAP_BITS); 1059 e = va_start + (vb->dirty_max << PAGE_SHIFT);
1040 j = j + 1; /* need exclusive index */
1041 1060
1042 s = vb->va->va_start + (i << PAGE_SHIFT); 1061 start = min(s, start);
1043 e = vb->va->va_start + (j << PAGE_SHIFT); 1062 end = max(e, end);
1044 flush = 1;
1045 1063
1046 if (s < start) 1064 flush = 1;
1047 start = s;
1048 if (e > end)
1049 end = e;
1050 } 1065 }
1051 spin_unlock(&vb->lock); 1066 spin_unlock(&vb->lock);
1052 } 1067 }
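
The vmap_block rework replaces the per-page dirty bitmap with a [dirty_min, dirty_max) range, since the free path only needs a span to flush rather than the exact set of dirty pages. A sketch of the range-tracking idea, with an arbitrary stand-in value for VMAP_BBMAP_BITS:

#include <stdio.h>

#define VMAP_BBMAP_BITS 1024UL		/* value arbitrary for the example */

struct vmap_block_sketch {
	unsigned long dirty_min;	/* inclusive */
	unsigned long dirty_max;	/* exclusive */
};

static void block_init(struct vmap_block_sketch *vb)
{
	vb->dirty_min = VMAP_BBMAP_BITS;	/* empty range: min above max */
	vb->dirty_max = 0;
}

static void mark_dirty(struct vmap_block_sketch *vb,
		       unsigned long off, unsigned long npages)
{
	if (off < vb->dirty_min)
		vb->dirty_min = off;
	if (off + npages > vb->dirty_max)
		vb->dirty_max = off + npages;
}

int main(void)
{
	struct vmap_block_sketch vb;

	block_init(&vb);
	mark_dirty(&vb, 10, 4);		/* two frees far apart ... */
	mark_dirty(&vb, 100, 1);
	printf("flush pages [%lu, %lu)\n", vb.dirty_min, vb.dirty_max);	/* ... one flush span */
	return 0;
}
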
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0dec1fa5f656..08bd7a3d464a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -12,35 +12,6 @@
12 */ 12 */
13 13
14/* 14/*
15 * This allocator is designed for use with zram. Thus, the allocator is
16 * supposed to work well under low memory conditions. In particular, it
17 * never attempts higher order page allocation which is very likely to
18 * fail under memory pressure. On the other hand, if we just use single
19 * (0-order) pages, it would suffer from very high fragmentation --
20 * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
21 * This was one of the major issues with its predecessor (xvmalloc).
22 *
23 * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
24 * and links them together using various 'struct page' fields. These linked
25 * pages act as a single higher-order page i.e. an object can span 0-order
26 * page boundaries. The code refers to these linked pages as a single entity
27 * called zspage.
28 *
29 * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
30 * since this satisfies the requirements of all its current users (in the
31 * worst case, page is incompressible and is thus stored "as-is" i.e. in
32 * uncompressed form). For allocation requests larger than this size, failure
33 * is returned (see zs_malloc).
34 *
35 * Additionally, zs_malloc() does not return a dereferenceable pointer.
36 * Instead, it returns an opaque handle (unsigned long) which encodes actual
37 * location of the allocated object. The reason for this indirection is that
38 * zsmalloc does not keep zspages permanently mapped since that would cause
39 * issues on 32-bit systems where the VA region for kernel space mappings
40 * is very small. So, before using the allocating memory, the object has to
41 * be mapped using zs_map_object() to get a usable pointer and subsequently
42 * unmapped using zs_unmap_object().
43 *
44 * Following is how we use various fields and flags of underlying 15 * Following is how we use various fields and flags of underlying
45 * struct page(s) to form a zspage. 16 * struct page(s) to form a zspage.
46 * 17 *
@@ -57,6 +28,8 @@
57 * 28 *
58 * page->private (union with page->first_page): refers to the 29 * page->private (union with page->first_page): refers to the
59 * component page after the first page 30 * component page after the first page
31 * If the page is first_page for huge object, it stores handle.
32 * Look at size_class->huge.
60 * page->freelist: points to the first free object in zspage. 33 * page->freelist: points to the first free object in zspage.
61 * Free objects are linked together using in-place 34 * Free objects are linked together using in-place
62 * metadata. 35 * metadata.
@@ -78,6 +51,7 @@
78 51
79#include <linux/module.h> 52#include <linux/module.h>
80#include <linux/kernel.h> 53#include <linux/kernel.h>
54#include <linux/sched.h>
81#include <linux/bitops.h> 55#include <linux/bitops.h>
82#include <linux/errno.h> 56#include <linux/errno.h>
83#include <linux/highmem.h> 57#include <linux/highmem.h>
@@ -110,6 +84,8 @@
110#define ZS_MAX_ZSPAGE_ORDER 2 84#define ZS_MAX_ZSPAGE_ORDER 2
111#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) 85#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
112 86
87#define ZS_HANDLE_SIZE (sizeof(unsigned long))
88
113/* 89/*
114 * Object location (<PFN>, <obj_idx>) is encoded as 90 * Object location (<PFN>, <obj_idx>) is encoded as
115 * as single (unsigned long) handle value. 91 * as single (unsigned long) handle value.
@@ -133,13 +109,33 @@
133#endif 109#endif
134#endif 110#endif
135#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) 111#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
136#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) 112
113/*
114 * Memory for allocating for handle keeps object position by
115 * encoding <page, obj_idx> and the encoded value has a room
116 * in least bit(ie, look at obj_to_location).
117 * We use the bit to synchronize between object access by
118 * user and migration.
119 */
120#define HANDLE_PIN_BIT 0
121
122/*
123 * Head in allocated object should have OBJ_ALLOCATED_TAG
124 * to identify the object was allocated or not.
125 * It's okay to add the status bit in the least bit because
126 * header keeps handle which is 4byte-aligned address so we
127 * have room for two bit at least.
128 */
129#define OBJ_ALLOCATED_TAG 1
130#define OBJ_TAG_BITS 1
131#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
137#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) 132#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
138 133
139#define MAX(a, b) ((a) >= (b) ? (a) : (b)) 134#define MAX(a, b) ((a) >= (b) ? (a) : (b))
140/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ 135/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
141#define ZS_MIN_ALLOC_SIZE \ 136#define ZS_MIN_ALLOC_SIZE \
142 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) 137 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
138/* each chunk includes extra space to keep handle */
143#define ZS_MAX_ALLOC_SIZE PAGE_SIZE 139#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
144 140
145/* 141/*
@@ -172,6 +168,8 @@ enum fullness_group {
172enum zs_stat_type { 168enum zs_stat_type {
173 OBJ_ALLOCATED, 169 OBJ_ALLOCATED,
174 OBJ_USED, 170 OBJ_USED,
171 CLASS_ALMOST_FULL,
172 CLASS_ALMOST_EMPTY,
175 NR_ZS_STAT_TYPE, 173 NR_ZS_STAT_TYPE,
176}; 174};
177 175
@@ -216,6 +214,8 @@ struct size_class {
216 214
217 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 215 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
218 int pages_per_zspage; 216 int pages_per_zspage;
217 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
218 bool huge;
219 219
220#ifdef CONFIG_ZSMALLOC_STAT 220#ifdef CONFIG_ZSMALLOC_STAT
221 struct zs_size_stat stats; 221 struct zs_size_stat stats;
@@ -233,14 +233,24 @@ struct size_class {
233 * This must be power of 2 and less than or equal to ZS_ALIGN 233 * This must be power of 2 and less than or equal to ZS_ALIGN
234 */ 234 */
235struct link_free { 235struct link_free {
236 /* Handle of next free chunk (encodes <PFN, obj_idx>) */ 236 union {
237 void *next; 237 /*
238 * Position of next free chunk (encodes <PFN, obj_idx>)
239 * It is valid only for a non-allocated object
240 */
241 void *next;
242 /*
243 * Handle of allocated object.
244 */
245 unsigned long handle;
246 };
238}; 247};
239 248
240struct zs_pool { 249struct zs_pool {
241 char *name; 250 char *name;
242 251
243 struct size_class **size_class; 252 struct size_class **size_class;
253 struct kmem_cache *handle_cachep;
244 254
245 gfp_t flags; /* allocation flags used when growing pool */ 255 gfp_t flags; /* allocation flags used when growing pool */
246 atomic_long_t pages_allocated; 256 atomic_long_t pages_allocated;
@@ -267,8 +277,37 @@ struct mapping_area {
267#endif 277#endif
268 char *vm_addr; /* address of kmap_atomic()'ed pages */ 278 char *vm_addr; /* address of kmap_atomic()'ed pages */
269 enum zs_mapmode vm_mm; /* mapping mode */ 279 enum zs_mapmode vm_mm; /* mapping mode */
280 bool huge;
270}; 281};
271 282
283static int create_handle_cache(struct zs_pool *pool)
284{
285 pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
286 0, 0, NULL);
287 return pool->handle_cachep ? 0 : 1;
288}
289
290static void destroy_handle_cache(struct zs_pool *pool)
291{
292 kmem_cache_destroy(pool->handle_cachep);
293}
294
295static unsigned long alloc_handle(struct zs_pool *pool)
296{
297 return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
298 pool->flags & ~__GFP_HIGHMEM);
299}
300
301static void free_handle(struct zs_pool *pool, unsigned long handle)
302{
303 kmem_cache_free(pool->handle_cachep, (void *)handle);
304}
305
306static void record_obj(unsigned long handle, unsigned long obj)
307{
308 *(unsigned long *)handle = obj;
309}
310
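The helpers above are the whole indirection layer: a handle is simply the address of a slab-allocated word, and record_obj() writes the encoded object location into that word. A minimal userspace sketch of the idea (malloc() stands in for kmem_cache_alloc(); the names mirror the helpers above but nothing here is kernel code):

#include <stdio.h>
#include <stdlib.h>

static unsigned long alloc_handle(void)
{
	/* stand-in for kmem_cache_alloc(pool->handle_cachep, ...) */
	return (unsigned long)malloc(sizeof(unsigned long));
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	*(unsigned long *)handle = obj;
}

int main(void)
{
	unsigned long handle = alloc_handle();

	record_obj(handle, 0xabc0UL);	/* object placed at encoded location */
	record_obj(handle, 0xdef0UL);	/* object moved; handle value unchanged */
	printf("handle %#lx -> obj %#lx\n", handle, *(unsigned long *)handle);
	free((void *)handle);
	return 0;
}

Because callers hold only the handle, compaction can relocate the underlying object and just rewrite the slot.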
272/* zpool driver */ 311/* zpool driver */
273 312
274#ifdef CONFIG_ZPOOL 313#ifdef CONFIG_ZPOOL
@@ -346,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = {
346MODULE_ALIAS("zpool-zsmalloc"); 385MODULE_ALIAS("zpool-zsmalloc");
347#endif /* CONFIG_ZPOOL */ 386#endif /* CONFIG_ZPOOL */
348 387
388static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
389{
390 return pages_per_zspage * PAGE_SIZE / size;
391}
392
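For a feel of the numbers, a quick worked example of get_maxobj_per_zspage() (PAGE_SIZE = 4096 assumed; the class sizes below are illustrative, not ones the allocator is guaranteed to create):

#include <stdio.h>

#define PAGE_SIZE 4096

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;
}

int main(void)
{
	/* four pages of 3264-byte chunks: 16384 / 3264 = 5 objects */
	printf("%u\n", get_maxobj_per_zspage(3264, 4));
	/* one page holding a single ~3.5KB chunk: the class->huge case */
	printf("%u\n", get_maxobj_per_zspage(3600, 1));
	return 0;
}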
349/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 393/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
350static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 394static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
351 395
@@ -396,9 +440,182 @@ static int get_size_class_index(int size)
396 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, 440 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
397 ZS_SIZE_CLASS_DELTA); 441 ZS_SIZE_CLASS_DELTA);
398 442
399 return idx; 443 return min(zs_size_classes - 1, idx);
444}
445
446#ifdef CONFIG_ZSMALLOC_STAT
447
448static inline void zs_stat_inc(struct size_class *class,
449 enum zs_stat_type type, unsigned long cnt)
450{
451 class->stats.objs[type] += cnt;
452}
453
454static inline void zs_stat_dec(struct size_class *class,
455 enum zs_stat_type type, unsigned long cnt)
456{
457 class->stats.objs[type] -= cnt;
458}
459
460static inline unsigned long zs_stat_get(struct size_class *class,
461 enum zs_stat_type type)
462{
463 return class->stats.objs[type];
464}
465
466static int __init zs_stat_init(void)
467{
468 if (!debugfs_initialized())
469 return -ENODEV;
470
471 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
472 if (!zs_stat_root)
473 return -ENOMEM;
474
475 return 0;
476}
477
478static void __exit zs_stat_exit(void)
479{
480 debugfs_remove_recursive(zs_stat_root);
481}
482
483static int zs_stats_size_show(struct seq_file *s, void *v)
484{
485 int i;
486 struct zs_pool *pool = s->private;
487 struct size_class *class;
488 int objs_per_zspage;
489 unsigned long class_almost_full, class_almost_empty;
490 unsigned long obj_allocated, obj_used, pages_used;
491 unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
492 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
493
494 seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
495 "class", "size", "almost_full", "almost_empty",
496 "obj_allocated", "obj_used", "pages_used",
497 "pages_per_zspage");
498
499 for (i = 0; i < zs_size_classes; i++) {
500 class = pool->size_class[i];
501
502 if (class->index != i)
503 continue;
504
505 spin_lock(&class->lock);
506 class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
507 class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
508 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
509 obj_used = zs_stat_get(class, OBJ_USED);
510 spin_unlock(&class->lock);
511
512 objs_per_zspage = get_maxobj_per_zspage(class->size,
513 class->pages_per_zspage);
514 pages_used = obj_allocated / objs_per_zspage *
515 class->pages_per_zspage;
516
517 seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
518 i, class->size, class_almost_full, class_almost_empty,
519 obj_allocated, obj_used, pages_used,
520 class->pages_per_zspage);
521
522 total_class_almost_full += class_almost_full;
523 total_class_almost_empty += class_almost_empty;
524 total_objs += obj_allocated;
525 total_used_objs += obj_used;
526 total_pages += pages_used;
527 }
528
529 seq_puts(s, "\n");
530 seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
531 "Total", "", total_class_almost_full,
532 total_class_almost_empty, total_objs,
533 total_used_objs, total_pages);
534
535 return 0;
536}
537
538static int zs_stats_size_open(struct inode *inode, struct file *file)
539{
540 return single_open(file, zs_stats_size_show, inode->i_private);
541}
542
543static const struct file_operations zs_stat_size_ops = {
544 .open = zs_stats_size_open,
545 .read = seq_read,
546 .llseek = seq_lseek,
547 .release = single_release,
548};
549
550static int zs_pool_stat_create(char *name, struct zs_pool *pool)
551{
552 struct dentry *entry;
553
554 if (!zs_stat_root)
555 return -ENODEV;
556
557 entry = debugfs_create_dir(name, zs_stat_root);
558 if (!entry) {
559 pr_warn("debugfs dir <%s> creation failed\n", name);
560 return -ENOMEM;
561 }
562 pool->stat_dentry = entry;
563
564 entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
565 pool->stat_dentry, pool, &zs_stat_size_ops);
566 if (!entry) {
567 pr_warn("%s: debugfs file entry <%s> creation failed\n",
568 name, "classes");
569 return -ENOMEM;
570 }
571
572 return 0;
573}
574
575static void zs_pool_stat_destroy(struct zs_pool *pool)
576{
577 debugfs_remove_recursive(pool->stat_dentry);
578}
579
580#else /* CONFIG_ZSMALLOC_STAT */
581
582static inline void zs_stat_inc(struct size_class *class,
583 enum zs_stat_type type, unsigned long cnt)
584{
585}
586
587static inline void zs_stat_dec(struct size_class *class,
588 enum zs_stat_type type, unsigned long cnt)
589{
590}
591
592static inline unsigned long zs_stat_get(struct size_class *class,
593 enum zs_stat_type type)
594{
595 return 0;
596}
597
598static int __init zs_stat_init(void)
599{
600 return 0;
601}
602
603static void __exit zs_stat_exit(void)
604{
605}
606
607static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
608{
609 return 0;
610}
611
612static inline void zs_pool_stat_destroy(struct zs_pool *pool)
613{
400} 614}
401 615
616#endif
617
618
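With CONFIG_ZSMALLOC_STAT enabled and debugfs mounted at /sys/kernel/debug (the usual location), the per-class counters land in a "classes" file under a directory named after the pool. A small userspace sketch that dumps it; the pool name "zram0" is only an example:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/zsmalloc/zram0/classes";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}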
402/* 619/*
403 * For each size class, zspages are divided into different groups 620 * For each size class, zspages are divided into different groups
404 * depending on how "full" they are. This was done so that we could 621 * depending on how "full" they are. This was done so that we could
@@ -419,7 +636,7 @@ static enum fullness_group get_fullness_group(struct page *page)
419 fg = ZS_EMPTY; 636 fg = ZS_EMPTY;
420 else if (inuse == max_objects) 637 else if (inuse == max_objects)
421 fg = ZS_FULL; 638 fg = ZS_FULL;
422 else if (inuse <= max_objects / fullness_threshold_frac) 639 else if (inuse <= 3 * max_objects / fullness_threshold_frac)
423 fg = ZS_ALMOST_EMPTY; 640 fg = ZS_ALMOST_EMPTY;
424 else 641 else
425 fg = ZS_ALMOST_FULL; 642 fg = ZS_ALMOST_FULL;
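The threshold change above is what makes compaction worthwhile: assuming fullness_threshold_frac is 4 (its value in zsmalloc.c, defined outside this hunk), the ZS_ALMOST_EMPTY cut-off moves from 1/4 to 3/4 of max_objects, so far more partially used zspages become candidates for the migration source list. A tiny worked example:

#include <stdio.h>

int main(void)
{
	int max_objects = 8, fullness_threshold_frac = 4;

	/* old rule: inuse <= 8 / 4  -> almost-empty up to 2 objects in use */
	printf("old cut-off: %d\n", max_objects / fullness_threshold_frac);
	/* new rule: inuse <= 3 * 8 / 4 -> almost-empty up to 6 objects in use */
	printf("new cut-off: %d\n", 3 * max_objects / fullness_threshold_frac);
	return 0;
}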
@@ -448,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
448 list_add_tail(&page->lru, &(*head)->lru); 665 list_add_tail(&page->lru, &(*head)->lru);
449 666
450 *head = page; 667 *head = page;
668 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
669 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
451} 670}
452 671
453/* 672/*
@@ -473,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
473 struct page, lru); 692 struct page, lru);
474 693
475 list_del_init(&page->lru); 694 list_del_init(&page->lru);
695 zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
696 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
476} 697}
477 698
478/* 699/*
@@ -484,11 +705,10 @@ static void remove_zspage(struct page *page, struct size_class *class,
484 * page from the freelist of the old fullness group to that of the new 705 * page from the freelist of the old fullness group to that of the new
485 * fullness group. 706 * fullness group.
486 */ 707 */
487static enum fullness_group fix_fullness_group(struct zs_pool *pool, 708static enum fullness_group fix_fullness_group(struct size_class *class,
488 struct page *page) 709 struct page *page)
489{ 710{
490 int class_idx; 711 int class_idx;
491 struct size_class *class;
492 enum fullness_group currfg, newfg; 712 enum fullness_group currfg, newfg;
493 713
494 BUG_ON(!is_first_page(page)); 714 BUG_ON(!is_first_page(page));
@@ -498,7 +718,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
498 if (newfg == currfg) 718 if (newfg == currfg)
499 goto out; 719 goto out;
500 720
501 class = pool->size_class[class_idx];
502 remove_zspage(page, class, currfg); 721 remove_zspage(page, class, currfg);
503 insert_zspage(page, class, newfg); 722 insert_zspage(page, class, newfg);
504 set_zspage_mapping(page, class_idx, newfg); 723 set_zspage_mapping(page, class_idx, newfg);
@@ -512,7 +731,8 @@ out:
512 * to form a zspage for each size class. This is important 731 * to form a zspage for each size class. This is important
513 * to reduce wastage due to unusable space left at end of 732 * to reduce wastage due to unusable space left at end of
514 * each zspage which is given as: 733 * each zspage which is given as:
515 * wastage = Zp - Zp % size_class 734 * wastage = Zp % class_size
735 * usage = Zp - wastage
516 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... 736 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
517 * 737 *
518 * For example, for size class of 3/8 * PAGE_SIZE, we should 738 * For example, for size class of 3/8 * PAGE_SIZE, we should
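A quick worked example of the corrected wastage formula (PAGE_SIZE = 4096 assumed): for the 3/8 * PAGE_SIZE = 1536-byte class mentioned above, a one-page zspage wastes 4096 % 1536 = 1024 bytes, while a three-page zspage wastes 12288 % 1536 = 0, which is why three pages are grouped for that class.

#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096, class_size = 1536;

	for (int k = 1; k <= 4; k++) {
		unsigned long zp = k * page_size;	/* zspage size */

		printf("k=%d wastage=%lu usage=%lu\n",
		       k, zp % class_size, zp - zp % class_size);
	}
	return 0;
}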
@@ -571,35 +791,50 @@ static struct page *get_next_page(struct page *page)
571 791
572/* 792/*
573 * Encode <page, obj_idx> as a single handle value. 793 * Encode <page, obj_idx> as a single handle value.
574 * On hardware platforms with physical memory starting at 0x0 the pfn 794 * We use the least bit of handle for tagging.
575 * could be 0 so we ensure that the handle will never be 0 by adjusting the
576 * encoded obj_idx value before encoding.
577 */ 795 */
578static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) 796static void *location_to_obj(struct page *page, unsigned long obj_idx)
579{ 797{
580 unsigned long handle; 798 unsigned long obj;
581 799
582 if (!page) { 800 if (!page) {
583 BUG_ON(obj_idx); 801 BUG_ON(obj_idx);
584 return NULL; 802 return NULL;
585 } 803 }
586 804
587 handle = page_to_pfn(page) << OBJ_INDEX_BITS; 805 obj = page_to_pfn(page) << OBJ_INDEX_BITS;
588 handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); 806 obj |= ((obj_idx) & OBJ_INDEX_MASK);
807 obj <<= OBJ_TAG_BITS;
589 808
590 return (void *)handle; 809 return (void *)obj;
591} 810}
592 811
593/* 812/*
594 * Decode <page, obj_idx> pair from the given object handle. We adjust the 813 * Decode <page, obj_idx> pair from the given object handle. We adjust the
595 * decoded obj_idx back to its original value since it was adjusted in 814 * decoded obj_idx back to its original value since it was adjusted in
596 * obj_location_to_handle(). 815 * location_to_obj().
597 */ 816 */
598static void obj_handle_to_location(unsigned long handle, struct page **page, 817static void obj_to_location(unsigned long obj, struct page **page,
599 unsigned long *obj_idx) 818 unsigned long *obj_idx)
600{ 819{
601 *page = pfn_to_page(handle >> OBJ_INDEX_BITS); 820 obj >>= OBJ_TAG_BITS;
602 *obj_idx = (handle & OBJ_INDEX_MASK) - 1; 821 *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
822 *obj_idx = (obj & OBJ_INDEX_MASK);
823}
824
825static unsigned long handle_to_obj(unsigned long handle)
826{
827 return *(unsigned long *)handle;
828}
829
830static unsigned long obj_to_head(struct size_class *class, struct page *page,
831 void *obj)
832{
833 if (class->huge) {
834 VM_BUG_ON(!is_first_page(page));
835 return *(unsigned long *)page_private(page);
836 } else
837 return *(unsigned long *)obj;
603} 838}
604 839
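A self-contained round trip of the encoding used by location_to_obj()/obj_to_location(), written as a userspace sketch that takes a raw pfn instead of a struct page and reuses the 64-bit layout assumed earlier; bit 0 of the encoded value stays clear so it can later be used as a tag bit.

#include <assert.h>
#include <stdio.h>

#define BITS_PER_LONG	64
#define _PFN_BITS	(46 - 12)	/* assumed MAX_PHYSMEM_BITS/PAGE_SHIFT */
#define OBJ_TAG_BITS	1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long location_to_obj(unsigned long pfn, unsigned long obj_idx)
{
	unsigned long obj = (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK);

	return obj << OBJ_TAG_BITS;	/* bit 0 left free for tagging */
}

static void obj_to_location(unsigned long obj, unsigned long *pfn,
			    unsigned long *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*pfn = obj >> OBJ_INDEX_BITS;
	*obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
	unsigned long pfn, idx;

	obj_to_location(location_to_obj(0x12345, 7), &pfn, &idx);
	assert(pfn == 0x12345 && idx == 7);
	printf("pfn=%#lx obj_idx=%lu\n", pfn, idx);
	return 0;
}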
605static unsigned long obj_idx_to_offset(struct page *page, 840static unsigned long obj_idx_to_offset(struct page *page,
@@ -613,6 +848,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
613 return off + obj_idx * class_size; 848 return off + obj_idx * class_size;
614} 849}
615 850
851static inline int trypin_tag(unsigned long handle)
852{
853 unsigned long *ptr = (unsigned long *)handle;
854
855 return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
856}
857
858static void pin_tag(unsigned long handle)
859{
860 while (!trypin_tag(handle));
861}
862
863static void unpin_tag(unsigned long handle)
864{
865 unsigned long *ptr = (unsigned long *)handle;
866
867 clear_bit_unlock(HANDLE_PIN_BIT, ptr);
868}
869
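The pin bit is effectively a one-bit lock on the word stored in the handle slot; bit 0 is free there because the encoded obj was shifted left by OBJ_TAG_BITS. The kernel uses test_and_set_bit_lock()/clear_bit_unlock(); the following userspace approximation with C11 atomics only illustrates the protocol (memory-ordering details differ):

#include <stdatomic.h>
#include <stdio.h>

#define HANDLE_PIN_BIT 0

static int trypin_tag(atomic_ulong *slot)
{
	unsigned long old = atomic_fetch_or(slot, 1UL << HANDLE_PIN_BIT);

	return !(old & (1UL << HANDLE_PIN_BIT));	/* 1 if we took the bit */
}

static void pin_tag(atomic_ulong *slot)
{
	while (!trypin_tag(slot))
		;					/* spin until unpinned */
}

static void unpin_tag(atomic_ulong *slot)
{
	atomic_fetch_and(slot, ~(1UL << HANDLE_PIN_BIT));
}

int main(void)
{
	atomic_ulong slot = 0x2468;	/* encoded obj value, bit 0 clear */

	pin_tag(&slot);
	printf("pinned, slot=%#lx\n", (unsigned long)atomic_load(&slot));
	unpin_tag(&slot);
	printf("unpinned, slot=%#lx\n", (unsigned long)atomic_load(&slot));
	return 0;
}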
616static void reset_page(struct page *page) 870static void reset_page(struct page *page)
617{ 871{
618 clear_bit(PG_private, &page->flags); 872 clear_bit(PG_private, &page->flags);
@@ -674,7 +928,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
674 link = (struct link_free *)vaddr + off / sizeof(*link); 928 link = (struct link_free *)vaddr + off / sizeof(*link);
675 929
676 while ((off += class->size) < PAGE_SIZE) { 930 while ((off += class->size) < PAGE_SIZE) {
677 link->next = obj_location_to_handle(page, i++); 931 link->next = location_to_obj(page, i++);
678 link += class->size / sizeof(*link); 932 link += class->size / sizeof(*link);
679 } 933 }
680 934
@@ -684,7 +938,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
684 * page (if present) 938 * page (if present)
685 */ 939 */
686 next_page = get_next_page(page); 940 next_page = get_next_page(page);
687 link->next = obj_location_to_handle(next_page, 0); 941 link->next = location_to_obj(next_page, 0);
688 kunmap_atomic(vaddr); 942 kunmap_atomic(vaddr);
689 page = next_page; 943 page = next_page;
690 off %= PAGE_SIZE; 944 off %= PAGE_SIZE;
@@ -738,7 +992,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
738 992
739 init_zspage(first_page, class); 993 init_zspage(first_page, class);
740 994
741 first_page->freelist = obj_location_to_handle(first_page, 0); 995 first_page->freelist = location_to_obj(first_page, 0);
742 /* Maximum number of objects we can store in this zspage */ 996 /* Maximum number of objects we can store in this zspage */
743 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; 997 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
744 998
@@ -860,12 +1114,19 @@ static void __zs_unmap_object(struct mapping_area *area,
860{ 1114{
861 int sizes[2]; 1115 int sizes[2];
862 void *addr; 1116 void *addr;
863 char *buf = area->vm_buf; 1117 char *buf;
864 1118
865 /* no write fastpath */ 1119 /* no write fastpath */
866 if (area->vm_mm == ZS_MM_RO) 1120 if (area->vm_mm == ZS_MM_RO)
867 goto out; 1121 goto out;
868 1122
1123 buf = area->vm_buf;
1124 if (!area->huge) {
1125 buf = buf + ZS_HANDLE_SIZE;
1126 size -= ZS_HANDLE_SIZE;
1127 off += ZS_HANDLE_SIZE;
1128 }
1129
869 sizes[0] = PAGE_SIZE - off; 1130 sizes[0] = PAGE_SIZE - off;
870 sizes[1] = size - sizes[0]; 1131 sizes[1] = size - sizes[0];
871 1132
@@ -952,11 +1213,6 @@ static void init_zs_size_classes(void)
952 zs_size_classes = nr; 1213 zs_size_classes = nr;
953} 1214}
954 1215
955static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
956{
957 return pages_per_zspage * PAGE_SIZE / size;
958}
959
960static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) 1216static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
961{ 1217{
962 if (prev->pages_per_zspage != pages_per_zspage) 1218 if (prev->pages_per_zspage != pages_per_zspage)
@@ -969,166 +1225,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
969 return true; 1225 return true;
970} 1226}
971 1227
972#ifdef CONFIG_ZSMALLOC_STAT 1228static bool zspage_full(struct page *page)
973
974static inline void zs_stat_inc(struct size_class *class,
975 enum zs_stat_type type, unsigned long cnt)
976{
977 class->stats.objs[type] += cnt;
978}
979
980static inline void zs_stat_dec(struct size_class *class,
981 enum zs_stat_type type, unsigned long cnt)
982{
983 class->stats.objs[type] -= cnt;
984}
985
986static inline unsigned long zs_stat_get(struct size_class *class,
987 enum zs_stat_type type)
988{
989 return class->stats.objs[type];
990}
991
992static int __init zs_stat_init(void)
993{
994 if (!debugfs_initialized())
995 return -ENODEV;
996
997 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
998 if (!zs_stat_root)
999 return -ENOMEM;
1000
1001 return 0;
1002}
1003
1004static void __exit zs_stat_exit(void)
1005{
1006 debugfs_remove_recursive(zs_stat_root);
1007}
1008
1009static int zs_stats_size_show(struct seq_file *s, void *v)
1010{ 1229{
1011 int i; 1230 BUG_ON(!is_first_page(page));
1012 struct zs_pool *pool = s->private;
1013 struct size_class *class;
1014 int objs_per_zspage;
1015 unsigned long obj_allocated, obj_used, pages_used;
1016 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
1017
1018 seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
1019 "obj_allocated", "obj_used", "pages_used");
1020
1021 for (i = 0; i < zs_size_classes; i++) {
1022 class = pool->size_class[i];
1023
1024 if (class->index != i)
1025 continue;
1026
1027 spin_lock(&class->lock);
1028 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
1029 obj_used = zs_stat_get(class, OBJ_USED);
1030 spin_unlock(&class->lock);
1031
1032 objs_per_zspage = get_maxobj_per_zspage(class->size,
1033 class->pages_per_zspage);
1034 pages_used = obj_allocated / objs_per_zspage *
1035 class->pages_per_zspage;
1036
1037 seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
1038 class->size, obj_allocated, obj_used, pages_used);
1039
1040 total_objs += obj_allocated;
1041 total_used_objs += obj_used;
1042 total_pages += pages_used;
1043 }
1044
1045 seq_puts(s, "\n");
1046 seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
1047 total_objs, total_used_objs, total_pages);
1048
1049 return 0;
1050}
1051
1052static int zs_stats_size_open(struct inode *inode, struct file *file)
1053{
1054 return single_open(file, zs_stats_size_show, inode->i_private);
1055}
1056
1057static const struct file_operations zs_stat_size_ops = {
1058 .open = zs_stats_size_open,
1059 .read = seq_read,
1060 .llseek = seq_lseek,
1061 .release = single_release,
1062};
1063
1064static int zs_pool_stat_create(char *name, struct zs_pool *pool)
1065{
1066 struct dentry *entry;
1067
1068 if (!zs_stat_root)
1069 return -ENODEV;
1070
1071 entry = debugfs_create_dir(name, zs_stat_root);
1072 if (!entry) {
1073 pr_warn("debugfs dir <%s> creation failed\n", name);
1074 return -ENOMEM;
1075 }
1076 pool->stat_dentry = entry;
1077
1078 entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
1079 pool->stat_dentry, pool, &zs_stat_size_ops);
1080 if (!entry) {
1081 pr_warn("%s: debugfs file entry <%s> creation failed\n",
1082 name, "obj_in_classes");
1083 return -ENOMEM;
1084 }
1085
1086 return 0;
1087}
1088
1089static void zs_pool_stat_destroy(struct zs_pool *pool)
1090{
1091 debugfs_remove_recursive(pool->stat_dentry);
1092}
1093
1094#else /* CONFIG_ZSMALLOC_STAT */
1095
1096static inline void zs_stat_inc(struct size_class *class,
1097 enum zs_stat_type type, unsigned long cnt)
1098{
1099}
1100
1101static inline void zs_stat_dec(struct size_class *class,
1102 enum zs_stat_type type, unsigned long cnt)
1103{
1104}
1105
1106static inline unsigned long zs_stat_get(struct size_class *class,
1107 enum zs_stat_type type)
1108{
1109 return 0;
1110}
1111
1112static int __init zs_stat_init(void)
1113{
1114 return 0;
1115}
1116
1117static void __exit zs_stat_exit(void)
1118{
1119}
1120
1121static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
1122{
1123 return 0;
1124}
1125 1231
1126static inline void zs_pool_stat_destroy(struct zs_pool *pool) 1232 return page->inuse == page->objects;
1127{
1128} 1233}
1129 1234
1130#endif
1131
1132unsigned long zs_get_total_pages(struct zs_pool *pool) 1235unsigned long zs_get_total_pages(struct zs_pool *pool)
1133{ 1236{
1134 return atomic_long_read(&pool->pages_allocated); 1237 return atomic_long_read(&pool->pages_allocated);
@@ -1153,13 +1256,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1153 enum zs_mapmode mm) 1256 enum zs_mapmode mm)
1154{ 1257{
1155 struct page *page; 1258 struct page *page;
1156 unsigned long obj_idx, off; 1259 unsigned long obj, obj_idx, off;
1157 1260
1158 unsigned int class_idx; 1261 unsigned int class_idx;
1159 enum fullness_group fg; 1262 enum fullness_group fg;
1160 struct size_class *class; 1263 struct size_class *class;
1161 struct mapping_area *area; 1264 struct mapping_area *area;
1162 struct page *pages[2]; 1265 struct page *pages[2];
1266 void *ret;
1163 1267
1164 BUG_ON(!handle); 1268 BUG_ON(!handle);
1165 1269
@@ -1170,7 +1274,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1170 */ 1274 */
1171 BUG_ON(in_interrupt()); 1275 BUG_ON(in_interrupt());
1172 1276
1173 obj_handle_to_location(handle, &page, &obj_idx); 1277 /* From now on, migration cannot move the object */
1278 pin_tag(handle);
1279
1280 obj = handle_to_obj(handle);
1281 obj_to_location(obj, &page, &obj_idx);
1174 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1282 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1175 class = pool->size_class[class_idx]; 1283 class = pool->size_class[class_idx];
1176 off = obj_idx_to_offset(page, obj_idx, class->size); 1284 off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1180,7 +1288,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1180 if (off + class->size <= PAGE_SIZE) { 1288 if (off + class->size <= PAGE_SIZE) {
1181 /* this object is contained entirely within a page */ 1289 /* this object is contained entirely within a page */
1182 area->vm_addr = kmap_atomic(page); 1290 area->vm_addr = kmap_atomic(page);
1183 return area->vm_addr + off; 1291 ret = area->vm_addr + off;
1292 goto out;
1184 } 1293 }
1185 1294
1186 /* this object spans two pages */ 1295 /* this object spans two pages */
@@ -1188,14 +1297,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1188 pages[1] = get_next_page(page); 1297 pages[1] = get_next_page(page);
1189 BUG_ON(!pages[1]); 1298 BUG_ON(!pages[1]);
1190 1299
1191 return __zs_map_object(area, pages, off, class->size); 1300 ret = __zs_map_object(area, pages, off, class->size);
1301out:
1302 if (!class->huge)
1303 ret += ZS_HANDLE_SIZE;
1304
1305 return ret;
1192} 1306}
1193EXPORT_SYMBOL_GPL(zs_map_object); 1307EXPORT_SYMBOL_GPL(zs_map_object);
1194 1308
1195void zs_unmap_object(struct zs_pool *pool, unsigned long handle) 1309void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1196{ 1310{
1197 struct page *page; 1311 struct page *page;
1198 unsigned long obj_idx, off; 1312 unsigned long obj, obj_idx, off;
1199 1313
1200 unsigned int class_idx; 1314 unsigned int class_idx;
1201 enum fullness_group fg; 1315 enum fullness_group fg;
@@ -1204,7 +1318,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1204 1318
1205 BUG_ON(!handle); 1319 BUG_ON(!handle);
1206 1320
1207 obj_handle_to_location(handle, &page, &obj_idx); 1321 obj = handle_to_obj(handle);
1322 obj_to_location(obj, &page, &obj_idx);
1208 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1323 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1209 class = pool->size_class[class_idx]; 1324 class = pool->size_class[class_idx];
1210 off = obj_idx_to_offset(page, obj_idx, class->size); 1325 off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1222,9 +1337,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1222 __zs_unmap_object(area, pages, off, class->size); 1337 __zs_unmap_object(area, pages, off, class->size);
1223 } 1338 }
1224 put_cpu_var(zs_map_area); 1339 put_cpu_var(zs_map_area);
1340 unpin_tag(handle);
1225} 1341}
1226EXPORT_SYMBOL_GPL(zs_unmap_object); 1342EXPORT_SYMBOL_GPL(zs_unmap_object);
1227 1343
1344static unsigned long obj_malloc(struct page *first_page,
1345 struct size_class *class, unsigned long handle)
1346{
1347 unsigned long obj;
1348 struct link_free *link;
1349
1350 struct page *m_page;
1351 unsigned long m_objidx, m_offset;
1352 void *vaddr;
1353
1354 handle |= OBJ_ALLOCATED_TAG;
1355 obj = (unsigned long)first_page->freelist;
1356 obj_to_location(obj, &m_page, &m_objidx);
1357 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1358
1359 vaddr = kmap_atomic(m_page);
1360 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1361 first_page->freelist = link->next;
1362 if (!class->huge)
1363 /* record handle in the header of allocated chunk */
1364 link->handle = handle;
1365 else
1366 /* record handle in first_page->private */
1367 set_page_private(first_page, handle);
1368 kunmap_atomic(vaddr);
1369 first_page->inuse++;
1370 zs_stat_inc(class, OBJ_USED, 1);
1371
1372 return obj;
1373}
1374
1375
1228/** 1376/**
1229 * zs_malloc - Allocate block of given size from pool. 1377 * zs_malloc - Allocate block of given size from pool.
1230 * @pool: pool to allocate from 1378 * @pool: pool to allocate from
@@ -1236,17 +1384,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object);
1236 */ 1384 */
1237unsigned long zs_malloc(struct zs_pool *pool, size_t size) 1385unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1238{ 1386{
1239 unsigned long obj; 1387 unsigned long handle, obj;
1240 struct link_free *link;
1241 struct size_class *class; 1388 struct size_class *class;
1242 void *vaddr; 1389 struct page *first_page;
1243
1244 struct page *first_page, *m_page;
1245 unsigned long m_objidx, m_offset;
1246 1390
1247 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1391 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
1248 return 0; 1392 return 0;
1249 1393
1394 handle = alloc_handle(pool);
1395 if (!handle)
1396 return 0;
1397
1398 /* extra space in chunk to keep the handle */
1399 size += ZS_HANDLE_SIZE;
1250 class = pool->size_class[get_size_class_index(size)]; 1400 class = pool->size_class[get_size_class_index(size)];
1251 1401
1252 spin_lock(&class->lock); 1402 spin_lock(&class->lock);
@@ -1255,8 +1405,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1255 if (!first_page) { 1405 if (!first_page) {
1256 spin_unlock(&class->lock); 1406 spin_unlock(&class->lock);
1257 first_page = alloc_zspage(class, pool->flags); 1407 first_page = alloc_zspage(class, pool->flags);
1258 if (unlikely(!first_page)) 1408 if (unlikely(!first_page)) {
1409 free_handle(pool, handle);
1259 return 0; 1410 return 0;
1411 }
1260 1412
1261 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1413 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1262 atomic_long_add(class->pages_per_zspage, 1414 atomic_long_add(class->pages_per_zspage,
@@ -1267,73 +1419,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1267 class->size, class->pages_per_zspage)); 1419 class->size, class->pages_per_zspage));
1268 } 1420 }
1269 1421
1270 obj = (unsigned long)first_page->freelist; 1422 obj = obj_malloc(first_page, class, handle);
1271 obj_handle_to_location(obj, &m_page, &m_objidx);
1272 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1273
1274 vaddr = kmap_atomic(m_page);
1275 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1276 first_page->freelist = link->next;
1277 memset(link, POISON_INUSE, sizeof(*link));
1278 kunmap_atomic(vaddr);
1279
1280 first_page->inuse++;
1281 zs_stat_inc(class, OBJ_USED, 1);
1282 /* Now move the zspage to another fullness group, if required */ 1423 /* Now move the zspage to another fullness group, if required */
1283 fix_fullness_group(pool, first_page); 1424 fix_fullness_group(class, first_page);
1425 record_obj(handle, obj);
1284 spin_unlock(&class->lock); 1426 spin_unlock(&class->lock);
1285 1427
1286 return obj; 1428 return handle;
1287} 1429}
1288EXPORT_SYMBOL_GPL(zs_malloc); 1430EXPORT_SYMBOL_GPL(zs_malloc);
1289 1431
1290void zs_free(struct zs_pool *pool, unsigned long obj) 1432static void obj_free(struct zs_pool *pool, struct size_class *class,
1433 unsigned long obj)
1291{ 1434{
1292 struct link_free *link; 1435 struct link_free *link;
1293 struct page *first_page, *f_page; 1436 struct page *first_page, *f_page;
1294 unsigned long f_objidx, f_offset; 1437 unsigned long f_objidx, f_offset;
1295 void *vaddr; 1438 void *vaddr;
1296
1297 int class_idx; 1439 int class_idx;
1298 struct size_class *class;
1299 enum fullness_group fullness; 1440 enum fullness_group fullness;
1300 1441
1301 if (unlikely(!obj)) 1442 BUG_ON(!obj);
1302 return;
1303 1443
1304 obj_handle_to_location(obj, &f_page, &f_objidx); 1444 obj &= ~OBJ_ALLOCATED_TAG;
1445 obj_to_location(obj, &f_page, &f_objidx);
1305 first_page = get_first_page(f_page); 1446 first_page = get_first_page(f_page);
1306 1447
1307 get_zspage_mapping(first_page, &class_idx, &fullness); 1448 get_zspage_mapping(first_page, &class_idx, &fullness);
1308 class = pool->size_class[class_idx];
1309 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); 1449 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
1310 1450
1311 spin_lock(&class->lock); 1451 vaddr = kmap_atomic(f_page);
1312 1452
1313 /* Insert this object in containing zspage's freelist */ 1453 /* Insert this object in containing zspage's freelist */
1314 vaddr = kmap_atomic(f_page);
1315 link = (struct link_free *)(vaddr + f_offset); 1454 link = (struct link_free *)(vaddr + f_offset);
1316 link->next = first_page->freelist; 1455 link->next = first_page->freelist;
1456 if (class->huge)
1457 set_page_private(first_page, 0);
1317 kunmap_atomic(vaddr); 1458 kunmap_atomic(vaddr);
1318 first_page->freelist = (void *)obj; 1459 first_page->freelist = (void *)obj;
1319
1320 first_page->inuse--; 1460 first_page->inuse--;
1321 fullness = fix_fullness_group(pool, first_page);
1322
1323 zs_stat_dec(class, OBJ_USED, 1); 1461 zs_stat_dec(class, OBJ_USED, 1);
1324 if (fullness == ZS_EMPTY) 1462}
1463
1464void zs_free(struct zs_pool *pool, unsigned long handle)
1465{
1466 struct page *first_page, *f_page;
1467 unsigned long obj, f_objidx;
1468 int class_idx;
1469 struct size_class *class;
1470 enum fullness_group fullness;
1471
1472 if (unlikely(!handle))
1473 return;
1474
1475 pin_tag(handle);
1476 obj = handle_to_obj(handle);
1477 obj_to_location(obj, &f_page, &f_objidx);
1478 first_page = get_first_page(f_page);
1479
1480 get_zspage_mapping(first_page, &class_idx, &fullness);
1481 class = pool->size_class[class_idx];
1482
1483 spin_lock(&class->lock);
1484 obj_free(pool, class, obj);
1485 fullness = fix_fullness_group(class, first_page);
1486 if (fullness == ZS_EMPTY) {
1325 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1487 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1326 class->size, class->pages_per_zspage)); 1488 class->size, class->pages_per_zspage));
1327 1489 atomic_long_sub(class->pages_per_zspage,
1490 &pool->pages_allocated);
1491 free_zspage(first_page);
1492 }
1328 spin_unlock(&class->lock); 1493 spin_unlock(&class->lock);
1494 unpin_tag(handle);
1495
1496 free_handle(pool, handle);
1497}
1498EXPORT_SYMBOL_GPL(zs_free);
1499
1500static void zs_object_copy(unsigned long src, unsigned long dst,
1501 struct size_class *class)
1502{
1503 struct page *s_page, *d_page;
1504 unsigned long s_objidx, d_objidx;
1505 unsigned long s_off, d_off;
1506 void *s_addr, *d_addr;
1507 int s_size, d_size, size;
1508 int written = 0;
1509
1510 s_size = d_size = class->size;
1511
1512 obj_to_location(src, &s_page, &s_objidx);
1513 obj_to_location(dst, &d_page, &d_objidx);
1514
1515 s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
1516 d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
1517
1518 if (s_off + class->size > PAGE_SIZE)
1519 s_size = PAGE_SIZE - s_off;
1520
1521 if (d_off + class->size > PAGE_SIZE)
1522 d_size = PAGE_SIZE - d_off;
1523
1524 s_addr = kmap_atomic(s_page);
1525 d_addr = kmap_atomic(d_page);
1526
1527 while (1) {
1528 size = min(s_size, d_size);
1529 memcpy(d_addr + d_off, s_addr + s_off, size);
1530 written += size;
1531
1532 if (written == class->size)
1533 break;
1534
1535 s_off += size;
1536 s_size -= size;
1537 d_off += size;
1538 d_size -= size;
1539
1540 if (s_off >= PAGE_SIZE) {
1541 kunmap_atomic(d_addr);
1542 kunmap_atomic(s_addr);
1543 s_page = get_next_page(s_page);
1544 BUG_ON(!s_page);
1545 s_addr = kmap_atomic(s_page);
1546 d_addr = kmap_atomic(d_page);
1547 s_size = class->size - written;
1548 s_off = 0;
1549 }
1550
1551 if (d_off >= PAGE_SIZE) {
1552 kunmap_atomic(d_addr);
1553 d_page = get_next_page(d_page);
1554 BUG_ON(!d_page);
1555 d_addr = kmap_atomic(d_page);
1556 d_size = class->size - written;
1557 d_off = 0;
1558 }
1559 }
1560
1561 kunmap_atomic(d_addr);
1562 kunmap_atomic(s_addr);
1563}
1564
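Because neither the source nor the destination object is guaranteed to sit inside a single page, zs_object_copy() walks both sides in page-bounded chunks. A userspace model of the same loop, with two fixed 4096-byte "pages" per side standing in for the kmap'd zspage pages (the sizes and offsets are arbitrary test values):

#include <assert.h>
#include <string.h>

#define PAGE_SIZE 4096

static void copy_obj(char src[2][PAGE_SIZE], unsigned long s_off,
		     char dst[2][PAGE_SIZE], unsigned long d_off,
		     int class_size)
{
	int s_page = 0, d_page = 0, written = 0;
	int s_size = class_size, d_size = class_size;

	if (s_off + class_size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;
	if (d_off + class_size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	while (1) {
		int size = s_size < d_size ? s_size : d_size;

		memcpy(&dst[d_page][d_off], &src[s_page][s_off], size);
		written += size;
		if (written == class_size)
			break;

		s_off += size; s_size -= size;
		d_off += size; d_size -= size;

		if (s_off >= PAGE_SIZE) {	/* cross to next source page */
			s_page++;
			s_size = class_size - written;
			s_off = 0;
		}
		if (d_off >= PAGE_SIZE) {	/* cross to next destination page */
			d_page++;
			d_size = class_size - written;
			d_off = 0;
		}
	}
}

int main(void)
{
	static char src[2][PAGE_SIZE], dst[2][PAGE_SIZE];
	int class_size = 1536;
	unsigned long s_off = PAGE_SIZE - 100, d_off = PAGE_SIZE - 700;

	memset(&src[0][s_off], 0x5a, 100);
	memset(&src[1][0], 0x5a, class_size - 100);
	copy_obj(src, s_off, dst, d_off, class_size);
	assert(dst[0][d_off] == 0x5a && dst[1][class_size - 700 - 1] == 0x5a);
	return 0;
}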
1565/*
1566 * Find an allocated object in the zspage, scanning from the given index,
1567 * and return its handle.
1568 */
1569static unsigned long find_alloced_obj(struct page *page, int index,
1570 struct size_class *class)
1571{
1572 unsigned long head;
1573 int offset = 0;
1574 unsigned long handle = 0;
1575 void *addr = kmap_atomic(page);
1576
1577 if (!is_first_page(page))
1578 offset = page->index;
1579 offset += class->size * index;
1580
1581 while (offset < PAGE_SIZE) {
1582 head = obj_to_head(class, page, addr + offset);
1583 if (head & OBJ_ALLOCATED_TAG) {
1584 handle = head & ~OBJ_ALLOCATED_TAG;
1585 if (trypin_tag(handle))
1586 break;
1587 handle = 0;
1588 }
1589
1590 offset += class->size;
1591 index++;
1592 }
1593
1594 kunmap_atomic(addr);
1595 return handle;
1596}
1597
1598struct zs_compact_control {
1599 /* Source page for migration which could be a subpage of zspage. */
1600 struct page *s_page;
1601 /* Destination page for migration, which should be the first page
1602 * of a zspage. */
1603 struct page *d_page;
1604 /* Starting object index within @s_page from which to scan for live
1605 * objects in the subpage. */
1606 int index;
1607 /* how many objects have been migrated */
1608 int nr_migrated;
1609};
1610
1611static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1612 struct zs_compact_control *cc)
1613{
1614 unsigned long used_obj, free_obj;
1615 unsigned long handle;
1616 struct page *s_page = cc->s_page;
1617 struct page *d_page = cc->d_page;
1618 unsigned long index = cc->index;
1619 int nr_migrated = 0;
1620 int ret = 0;
1621
1622 while (1) {
1623 handle = find_alloced_obj(s_page, index, class);
1624 if (!handle) {
1625 s_page = get_next_page(s_page);
1626 if (!s_page)
1627 break;
1628 index = 0;
1629 continue;
1630 }
1631
1632 /* Stop if there is no more space */
1633 if (zspage_full(d_page)) {
1634 unpin_tag(handle);
1635 ret = -ENOMEM;
1636 break;
1637 }
1638
1639 used_obj = handle_to_obj(handle);
1640 free_obj = obj_malloc(d_page, class, handle);
1641 zs_object_copy(used_obj, free_obj, class);
1642 index++;
1643 record_obj(handle, free_obj);
1644 unpin_tag(handle);
1645 obj_free(pool, class, used_obj);
1646 nr_migrated++;
1647 }
1648
1649 /* Remember last position in this iteration */
1650 cc->s_page = s_page;
1651 cc->index = index;
1652 cc->nr_migrated = nr_migrated;
1653
1654 return ret;
1655}
1656
1657static struct page *alloc_target_page(struct size_class *class)
1658{
1659 int i;
1660 struct page *page;
1661
1662 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
1663 page = class->fullness_list[i];
1664 if (page) {
1665 remove_zspage(page, class, i);
1666 break;
1667 }
1668 }
1669
1670 return page;
1671}
1672
1673static void putback_zspage(struct zs_pool *pool, struct size_class *class,
1674 struct page *first_page)
1675{
1676 enum fullness_group fullness;
1677
1678 BUG_ON(!is_first_page(first_page));
1679
1680 fullness = get_fullness_group(first_page);
1681 insert_zspage(first_page, class, fullness);
1682 set_zspage_mapping(first_page, class->index, fullness);
1329 1683
1330 if (fullness == ZS_EMPTY) { 1684 if (fullness == ZS_EMPTY) {
1685 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1686 class->size, class->pages_per_zspage));
1331 atomic_long_sub(class->pages_per_zspage, 1687 atomic_long_sub(class->pages_per_zspage,
1332 &pool->pages_allocated); 1688 &pool->pages_allocated);
1689
1333 free_zspage(first_page); 1690 free_zspage(first_page);
1334 } 1691 }
1335} 1692}
1336EXPORT_SYMBOL_GPL(zs_free); 1693
1694static struct page *isolate_source_page(struct size_class *class)
1695{
1696 struct page *page;
1697
1698 page = class->fullness_list[ZS_ALMOST_EMPTY];
1699 if (page)
1700 remove_zspage(page, class, ZS_ALMOST_EMPTY);
1701
1702 return page;
1703}
1704
1705static unsigned long __zs_compact(struct zs_pool *pool,
1706 struct size_class *class)
1707{
1708 int nr_to_migrate;
1709 struct zs_compact_control cc;
1710 struct page *src_page;
1711 struct page *dst_page = NULL;
1712 unsigned long nr_total_migrated = 0;
1713
1714 spin_lock(&class->lock);
1715 while ((src_page = isolate_source_page(class))) {
1716
1717 BUG_ON(!is_first_page(src_page));
1718
1719 /* The goal is to migrate all live objects in source page */
1720 nr_to_migrate = src_page->inuse;
1721 cc.index = 0;
1722 cc.s_page = src_page;
1723
1724 while ((dst_page = alloc_target_page(class))) {
1725 cc.d_page = dst_page;
1726 /*
1727 * If there is no more space in dst_page, try to
1728 * allocate another zspage.
1729 */
1730 if (!migrate_zspage(pool, class, &cc))
1731 break;
1732
1733 putback_zspage(pool, class, dst_page);
1734 nr_total_migrated += cc.nr_migrated;
1735 nr_to_migrate -= cc.nr_migrated;
1736 }
1737
1738 /* Stop if we couldn't find slot */
1739 if (dst_page == NULL)
1740 break;
1741
1742 putback_zspage(pool, class, dst_page);
1743 putback_zspage(pool, class, src_page);
1744 spin_unlock(&class->lock);
1745 nr_total_migrated += cc.nr_migrated;
1746 cond_resched();
1747 spin_lock(&class->lock);
1748 }
1749
1750 if (src_page)
1751 putback_zspage(pool, class, src_page);
1752
1753 spin_unlock(&class->lock);
1754
1755 return nr_total_migrated;
1756}
1757
1758unsigned long zs_compact(struct zs_pool *pool)
1759{
1760 int i;
1761 unsigned long nr_migrated = 0;
1762 struct size_class *class;
1763
1764 for (i = zs_size_classes - 1; i >= 0; i--) {
1765 class = pool->size_class[i];
1766 if (!class)
1767 continue;
1768 if (class->index != i)
1769 continue;
1770 nr_migrated += __zs_compact(pool, class);
1771 }
1772
1773 return nr_migrated;
1774}
1775EXPORT_SYMBOL_GPL(zs_compact);
1337 1776
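Tying the pieces together, here is a hedged kernel-side usage sketch of the API as it stands after this patch, modeled loosely on how a zram-like caller might drive it (the function body and GFP flags are illustrative; ZS_MM_WO is the write-only mapping mode from include/linux/zsmalloc.h):

#include <linux/zsmalloc.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/errno.h>

static int zsmalloc_demo(void)
{
	struct zs_pool *pool;
	unsigned long handle;
	char *dst;

	pool = zs_create_pool("demo", GFP_NOIO | __GFP_HIGHMEM);
	if (!pool)
		return -ENOMEM;

	handle = zs_malloc(pool, 128);		/* opaque handle, not a pointer */
	if (handle) {
		dst = zs_map_object(pool, handle, ZS_MM_WO);
		memset(dst, 0xaa, 128);		/* object may span two pages */
		zs_unmap_object(pool, handle);
		zs_free(pool, handle);
	}

	zs_compact(pool);	/* returns the number of migrated objects */
	zs_destroy_pool(pool);
	return 0;
}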
1338/** 1777/**
1339 * zs_create_pool - Creates an allocation pool to work from. 1778 * zs_create_pool - Creates an allocation pool to work from.
@@ -1355,20 +1794,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1355 if (!pool) 1794 if (!pool)
1356 return NULL; 1795 return NULL;
1357 1796
1358 pool->name = kstrdup(name, GFP_KERNEL);
1359 if (!pool->name) {
1360 kfree(pool);
1361 return NULL;
1362 }
1363
1364 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 1797 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
1365 GFP_KERNEL); 1798 GFP_KERNEL);
1366 if (!pool->size_class) { 1799 if (!pool->size_class) {
1367 kfree(pool->name);
1368 kfree(pool); 1800 kfree(pool);
1369 return NULL; 1801 return NULL;
1370 } 1802 }
1371 1803
1804 pool->name = kstrdup(name, GFP_KERNEL);
1805 if (!pool->name)
1806 goto err;
1807
1808 if (create_handle_cache(pool))
1809 goto err;
1810
1372 /* 1811 /*
1373 * Iterate reversly, because, size of size_class that we want to use 1812 * Iterate reversly, because, size of size_class that we want to use
1374 * for merging should be larger or equal to current size. 1813 * for merging should be larger or equal to current size.
@@ -1406,6 +1845,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1406 class->size = size; 1845 class->size = size;
1407 class->index = i; 1846 class->index = i;
1408 class->pages_per_zspage = pages_per_zspage; 1847 class->pages_per_zspage = pages_per_zspage;
1848 if (pages_per_zspage == 1 &&
1849 get_maxobj_per_zspage(size, pages_per_zspage) == 1)
1850 class->huge = true;
1409 spin_lock_init(&class->lock); 1851 spin_lock_init(&class->lock);
1410 pool->size_class[i] = class; 1852 pool->size_class[i] = class;
1411 1853
@@ -1450,6 +1892,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1450 kfree(class); 1892 kfree(class);
1451 } 1893 }
1452 1894
1895 destroy_handle_cache(pool);
1453 kfree(pool->size_class); 1896 kfree(pool->size_class);
1454 kfree(pool->name); 1897 kfree(pool->name);
1455 kfree(pool); 1898 kfree(pool);