author     Aaron Lu <aaron.lu@intel.com>                      2016-10-07 20:00:08 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>     2016-10-07 21:46:28 -0400
commit     6fcb52a56ff60d240f06296b12827e7f20d45f63 (patch)
tree       662e3a878be89475dc0bf52a85d5396191e2166d
parent     0f30206bf2a42e278c2cec32e4b722626458c75b (diff)
thp: reduce usage of huge zero page's atomic counter
The global zero page is used to satisfy an anonymous read fault.  If
THP (Transparent HugePage) is enabled, the global huge zero page is used
instead.  The global huge zero page uses an atomic counter for reference
counting and is allocated/freed dynamically according to its counter
value.

CPU time spent on that counter will greatly increase if there are a lot
of processes doing anonymous read faults.  This patch proposes a way to
reduce the accesses to the global counter so that the CPU load can be
reduced accordingly.

To do this, a new mm_struct flag is introduced: MMF_HUGE_ZERO_PAGE.
With this flag, a process only needs to touch the global counter in two
cases:

 1. the first time it uses the global huge zero page;
 2. when mm_users of its mm_struct reaches zero.

Note that right now the huge zero page is eligible to be freed as soon
as its last use goes away.  With this patch, the page will not be
eligible to be freed until the exit of the last process that ever used
it.

And with the use of mm_users, a kthread is not eligible to use the huge
zero page either.  Since no kthread uses the huge zero page today, there
is no difference after applying this patch.  But if that is not desired,
I can change it to when mm_count reaches zero.

Case used for the test on a Haswell EP:

  usemem -n 72 --readonly -j 0x200000 100G

which spawns 72 processes; each mmaps 100G of anonymous space and then
does read-only access to that space sequentially with a step of 2MB.

CPU cycles from perf report for the base commit:
    54.03% usemem [kernel.kallsyms] [k] get_huge_zero_page
CPU cycles from perf report for this commit:
     0.11% usemem [kernel.kallsyms] [k] mm_get_huge_zero_page

Performance (throughput) of the workload for the base commit: 1784430792
Performance (throughput) of the workload for this commit:     4726928591
A 164% increase.

Runtime of the workload for the base commit: 707592 us
Runtime of the workload for this commit:     303970 us
A drop of about 57%.

Link: http://lkml.kernel.org/r/fe51a88f-446a-4622-1363-ad1282d71385@intel.com
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
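[Editor's note] The diff below implements this with test_bit()/test_and_set_bit() on
mm->flags plus the existing huge_zero_refcount.  Purely as an illustration of the
underlying pattern (a per-user flag that amortizes a global atomic refcount), here is
a minimal userspace C sketch; every name in it is hypothetical and it uses C11 atomics
rather than the kernel primitives:

  #include <stdatomic.h>
  #include <stdbool.h>

  atomic_int global_refcount;      /* stands in for huge_zero_refcount */

  struct user_ctx {                /* stands in for mm_struct */
      atomic_bool counted;         /* stands in for the MMF_HUGE_ZERO_PAGE bit */
  };

  /* Take at most one global reference per context, however often it is called. */
  void ctx_get_shared_resource(struct user_ctx *ctx)
  {
      if (atomic_load(&ctx->counted))
          return;                                  /* fast path: no global atomic op */
      atomic_fetch_add(&global_refcount, 1);       /* first use: take one global ref */
      if (atomic_exchange(&ctx->counted, true))
          atomic_fetch_sub(&global_refcount, 1);   /* lost a race with ourselves; undo */
  }

  /* Drop the context's single reference when the context is torn down. */
  void ctx_put_shared_resource(struct user_ctx *ctx)
  {
      if (atomic_load(&ctx->counted))
          atomic_fetch_sub(&global_refcount, 1);
  }

After the first call, every further ctx_get_shared_resource() on the same context is a
plain load of the per-context flag, which is the same fast path mm_get_huge_zero_page()
takes once MMF_HUGE_ZERO_PAGE is set.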
-rw-r--r--  fs/dax.c                   2
-rw-r--r--  include/linux/huge_mm.h    8
-rw-r--r--  include/linux/sched.h      1
-rw-r--r--  kernel/fork.c              1
-rw-r--r--  mm/huge_memory.c          36
-rw-r--r--  mm/swap.c                  4
-rw-r--r--  mm/swap_state.c            4
7 files changed, 34 insertions, 22 deletions
diff --git a/fs/dax.c b/fs/dax.c
index cc025f82ef07..014defd2e744 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1036,7 +1036,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!write && !buffer_mapped(&bh)) {
 		spinlock_t *ptl;
 		pmd_t entry;
-		struct page *zero_page = get_huge_zero_page();
+		struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
 
 		if (unlikely(!zero_page)) {
 			dax_pmd_dbg(&bh, address, "no zero page");
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4fca5263fd42..9b9f65d99873 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -156,8 +156,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return is_huge_zero_page(pmd_page(pmd));
 }
 
-struct page *get_huge_zero_page(void);
-void put_huge_zero_page(void);
+struct page *mm_get_huge_zero_page(struct mm_struct *mm);
+void mm_put_huge_zero_page(struct mm_struct *mm);
 
 #define mk_huge_pmd(page, prot)	pmd_mkhuge(mk_pmd(page, prot))
 
@@ -220,9 +220,9 @@ static inline bool is_huge_zero_page(struct page *page)
 	return false;
 }
 
-static inline void put_huge_zero_page(void)
+static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 {
-	BUILD_BUG();
+	return;
 }
 
 static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6bee6f988912..348f51b0ec92 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -526,6 +526,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
 #define MMF_OOM_SKIP		21	/* mm is of no interest for the OOM killer */
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
+#define MMF_HUGE_ZERO_PAGE	23	/* mm has ever used the global huge zero page */
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 9a8ec66cd4df..6d42242485cb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -854,6 +854,7 @@ static inline void __mmput(struct mm_struct *mm)
 	ksm_exit(mm);
 	khugepaged_exit(mm); /* must run before exit_mmap */
 	exit_mmap(mm);
+	mm_put_huge_zero_page(mm);
 	set_mm_exe_file(mm, NULL);
 	if (!list_empty(&mm->mmlist)) {
 		spin_lock(&mmlist_lock);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a0b0e562407d..12b9f1a39b63 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
@@ -86,7 +86,7 @@ retry:
 	return READ_ONCE(huge_zero_page);
 }
 
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
 {
 	/*
 	 * Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		return READ_ONCE(huge_zero_page);
+
+	if (!get_huge_zero_page())
+		return NULL;
+
+	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+
+	return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+}
+
 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 					struct shrink_control *sc)
 {
@@ -644,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 		pgtable = pte_alloc_one(vma->vm_mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(vma->vm_mm);
 		if (unlikely(!zero_page)) {
 			pte_free(vma->vm_mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
@@ -666,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 			}
 		} else
 			spin_unlock(fe->ptl);
-		if (!set) {
+		if (!set)
 			pte_free(vma->vm_mm, pgtable);
-			put_huge_zero_page();
-		}
 		return ret;
 	}
 	gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -823,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(dst_mm);
 		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
 				zero_page);
 		ret = 0;
@@ -1081,7 +1099,6 @@ alloc:
 	update_mmu_cache_pmd(vma, fe->address, fe->pmd);
 	if (!page) {
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		put_huge_zero_page();
 	} else {
 		VM_BUG_ON_PAGE(!PageHead(page), page);
 		page_remove_rmap(page, true);
@@ -1542,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
-	put_huge_zero_page();
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1565,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-		if (is_huge_zero_pmd(_pmd))
-			put_huge_zero_page();
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);
diff --git a/mm/swap.c b/mm/swap.c
index 75c63bb2a1da..4dcf852e1e6d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold)
 			locked_pgdat = NULL;
 		}
 
-		if (is_huge_zero_page(page)) {
-			put_huge_zero_page();
+		if (is_huge_zero_page(page))
 			continue;
-		}
 
 		page = compound_head(page);
 		if (!put_page_testzero(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 268b8191982b..8679c997eab6 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -254,9 +254,7 @@ static inline void free_swap_cache(struct page *page)
 void free_page_and_swap_cache(struct page *page)
 {
 	free_swap_cache(page);
-	if (is_huge_zero_page(page))
-		put_huge_zero_page();
-	else
+	if (!is_huge_zero_page(page))
 		put_page(page);
 }
 