author     Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>    2012-03-21 19:33:57 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-03-21 20:54:57 -0400
commit     025c5b2451e42c9e8dfdecd6dc84956ce8f321b5 (patch)
tree       423b4ef1a0ce021360304a80f6e0ba902581a3ad
parent     5aaabe831eb527e0d9284f0745d830a755f70393 (diff)
thp: optimize away unnecessary page table locking
Currently, when we check whether we can handle a thp as it is or need to split it into regular-sized pages, we take the page table lock before checking whether the given pmd maps a thp at all. Because of this, when the pmd is not a "huge pmd" we suffer unnecessary lock/unlock overhead. To remove it, this patch introduces an optimized check function and replaces several instances of similar logic with it.

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
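In caller terms, the change boils down to the pattern below. This is only a minimal sketch for orientation: the callback name example_pmd_entry is hypothetical, and the real conversions are in the hunks that follow.

    #include <linux/mm.h>
    #include <linux/huge_mm.h>

    /* Hypothetical pmd-level walker callback illustrating the new helper. */
    static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                                 struct vm_area_struct *vma)
    {
            /*
             * pmd_trans_huge_lock() takes mm->page_table_lock only when *pmd
             * really maps a thp.  It returns 1 with the lock still held if
             * the thp is stable; in every other case the lock is not held
             * on return.
             */
            if (pmd_trans_huge_lock(pmd, vma) == 1) {
                    /* ... handle the whole huge pmd in one go ... */
                    spin_unlock(&vma->vm_mm->page_table_lock);
                    return 0;
            }

            /* Not a (stable) huge pmd: fall back to the pte-by-pte path. */
            return 0;
    }

Previously every caller had to take page_table_lock unconditionally, test pmd_trans_huge()/pmd_trans_splitting() itself, and drop the lock again on the common non-thp path.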
-rw-r--r--   fs/proc/task_mmu.c        |  73
-rw-r--r--   include/linux/huge_mm.h   |  17
-rw-r--r--   mm/huge_memory.c          | 125
3 files changed, 101 insertions(+), 114 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 95264c0ef308..328843de6e9f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -394,20 +394,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         pte_t *pte;
         spinlock_t *ptl;
 
-        spin_lock(&walk->mm->page_table_lock);
-        if (pmd_trans_huge(*pmd)) {
-                if (pmd_trans_splitting(*pmd)) {
-                        spin_unlock(&walk->mm->page_table_lock);
-                        wait_split_huge_page(vma->anon_vma, pmd);
-                } else {
-                        smaps_pte_entry(*(pte_t *)pmd, addr,
-                                        HPAGE_PMD_SIZE, walk);
-                        spin_unlock(&walk->mm->page_table_lock);
-                        mss->anonymous_thp += HPAGE_PMD_SIZE;
-                        return 0;
-                }
-        } else {
-                spin_unlock(&walk->mm->page_table_lock);
+        if (pmd_trans_huge_lock(pmd, vma) == 1) {
+                smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
+                spin_unlock(&walk->mm->page_table_lock);
+                mss->anonymous_thp += HPAGE_PMD_SIZE;
+                return 0;
         }
 
         if (pmd_trans_unstable(pmd))
@@ -705,26 +696,19 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         /* find the first VMA at or above 'addr' */
         vma = find_vma(walk->mm, addr);
         spin_lock(&walk->mm->page_table_lock);
-        if (pmd_trans_huge(*pmd)) {
-                if (pmd_trans_splitting(*pmd)) {
-                        spin_unlock(&walk->mm->page_table_lock);
-                        wait_split_huge_page(vma->anon_vma, pmd);
-                } else {
-                        for (; addr != end; addr += PAGE_SIZE) {
-                                unsigned long offset;
-
-                                offset = (addr & ~PAGEMAP_WALK_MASK) >>
-                                                PAGE_SHIFT;
-                                pfn = thp_pmd_to_pagemap_entry(*pmd, offset);
-                                err = add_to_pagemap(addr, pfn, pm);
-                                if (err)
-                                        break;
-                        }
-                        spin_unlock(&walk->mm->page_table_lock);
-                        return err;
-                }
-        } else {
-                spin_unlock(&walk->mm->page_table_lock);
+        if (pmd_trans_huge_lock(pmd, vma) == 1) {
+                for (; addr != end; addr += PAGE_SIZE) {
+                        unsigned long offset;
+
+                        offset = (addr & ~PAGEMAP_WALK_MASK) >>
+                                        PAGE_SHIFT;
+                        pfn = thp_pmd_to_pagemap_entry(*pmd, offset);
+                        err = add_to_pagemap(addr, pfn, pm);
+                        if (err)
+                                break;
+                }
+                spin_unlock(&walk->mm->page_table_lock);
+                return err;
         }
 
         for (; addr != end; addr += PAGE_SIZE) {
@@ -992,24 +976,17 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
         pte_t *pte;
 
         md = walk->private;
-        spin_lock(&walk->mm->page_table_lock);
-        if (pmd_trans_huge(*pmd)) {
-                if (pmd_trans_splitting(*pmd)) {
-                        spin_unlock(&walk->mm->page_table_lock);
-                        wait_split_huge_page(md->vma->anon_vma, pmd);
-                } else {
-                        pte_t huge_pte = *(pte_t *)pmd;
-                        struct page *page;
-
-                        page = can_gather_numa_stats(huge_pte, md->vma, addr);
-                        if (page)
-                                gather_stats(page, md, pte_dirty(huge_pte),
-                                             HPAGE_PMD_SIZE/PAGE_SIZE);
-                        spin_unlock(&walk->mm->page_table_lock);
-                        return 0;
-                }
-        } else {
-                spin_unlock(&walk->mm->page_table_lock);
+
+        if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+                pte_t huge_pte = *(pte_t *)pmd;
+                struct page *page;
+
+                page = can_gather_numa_stats(huge_pte, md->vma, addr);
+                if (page)
+                        gather_stats(page, md, pte_dirty(huge_pte),
+                                     HPAGE_PMD_SIZE/PAGE_SIZE);
+                spin_unlock(&walk->mm->page_table_lock);
+                return 0;
         }
 
         if (pmd_trans_unstable(pmd))
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1b921299abc4..f56cacb4fec3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -113,6 +113,18 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
                                     unsigned long start,
                                     unsigned long end,
                                     long adjust_next);
+extern int __pmd_trans_huge_lock(pmd_t *pmd,
+                                 struct vm_area_struct *vma);
+/* mmap_sem must be held on entry */
+static inline int pmd_trans_huge_lock(pmd_t *pmd,
+                                      struct vm_area_struct *vma)
+{
+        VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
+        if (pmd_trans_huge(*pmd))
+                return __pmd_trans_huge_lock(pmd, vma);
+        else
+                return 0;
+}
 static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                          unsigned long start,
                                          unsigned long end,
@@ -176,6 +188,11 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                          long adjust_next)
 {
 }
+static inline int pmd_trans_huge_lock(pmd_t *pmd,
+                                      struct vm_area_struct *vma)
+{
+        return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8f7fc394f636..f0e5306eeb55 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,32 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 {
         int ret = 0;
 
-        spin_lock(&tlb->mm->page_table_lock);
-        if (likely(pmd_trans_huge(*pmd))) {
-                if (unlikely(pmd_trans_splitting(*pmd))) {
-                        spin_unlock(&tlb->mm->page_table_lock);
-                        wait_split_huge_page(vma->anon_vma,
-                                             pmd);
-                } else {
-                        struct page *page;
-                        pgtable_t pgtable;
-                        pgtable = get_pmd_huge_pte(tlb->mm);
-                        page = pmd_page(*pmd);
-                        pmd_clear(pmd);
-                        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-                        page_remove_rmap(page);
-                        VM_BUG_ON(page_mapcount(page) < 0);
-                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
-                        VM_BUG_ON(!PageHead(page));
-                        tlb->mm->nr_ptes--;
-                        spin_unlock(&tlb->mm->page_table_lock);
-                        tlb_remove_page(tlb, page);
-                        pte_free(tlb->mm, pgtable);
-                        ret = 1;
-                }
-        } else
-                spin_unlock(&tlb->mm->page_table_lock);
-
+        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+                struct page *page;
+                pgtable_t pgtable;
+                pgtable = get_pmd_huge_pte(tlb->mm);
+                page = pmd_page(*pmd);
+                pmd_clear(pmd);
+                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+                page_remove_rmap(page);
+                VM_BUG_ON(page_mapcount(page) < 0);
+                add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+                VM_BUG_ON(!PageHead(page));
+                tlb->mm->nr_ptes--;
+                spin_unlock(&tlb->mm->page_table_lock);
+                tlb_remove_page(tlb, page);
+                pte_free(tlb->mm, pgtable);
+                ret = 1;
+        }
         return ret;
 }
 
@@ -1066,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
         int ret = 0;
 
-        spin_lock(&vma->vm_mm->page_table_lock);
-        if (likely(pmd_trans_huge(*pmd))) {
-                ret = !pmd_trans_splitting(*pmd);
-                spin_unlock(&vma->vm_mm->page_table_lock);
-                if (unlikely(!ret))
-                        wait_split_huge_page(vma->anon_vma, pmd);
-                else {
-                        /*
-                         * All logical pages in the range are present
-                         * if backed by a huge page.
-                         */
-                        memset(vec, 1, (end - addr) >> PAGE_SHIFT);
-                }
-        } else
-                spin_unlock(&vma->vm_mm->page_table_lock);
+        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+                /*
+                 * All logical pages in the range are present
+                 * if backed by a huge page.
+                 */
+                spin_unlock(&vma->vm_mm->page_table_lock);
+                memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+                ret = 1;
+        }
 
         return ret;
 }
@@ -1110,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                 goto out;
         }
 
-        spin_lock(&mm->page_table_lock);
-        if (likely(pmd_trans_huge(*old_pmd))) {
-                if (pmd_trans_splitting(*old_pmd)) {
-                        spin_unlock(&mm->page_table_lock);
-                        wait_split_huge_page(vma->anon_vma, old_pmd);
-                        ret = -1;
-                } else {
-                        pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
-                        VM_BUG_ON(!pmd_none(*new_pmd));
-                        set_pmd_at(mm, new_addr, new_pmd, pmd);
-                        spin_unlock(&mm->page_table_lock);
-                        ret = 1;
-                }
-        } else {
-                spin_unlock(&mm->page_table_lock);
+        ret = __pmd_trans_huge_lock(old_pmd, vma);
+        if (ret == 1) {
+                pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+                VM_BUG_ON(!pmd_none(*new_pmd));
+                set_pmd_at(mm, new_addr, new_pmd, pmd);
+                spin_unlock(&mm->page_table_lock);
         }
 out:
@@ -1136,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
         struct mm_struct *mm = vma->vm_mm;
         int ret = 0;
 
-        spin_lock(&mm->page_table_lock);
+        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+                pmd_t entry;
+                entry = pmdp_get_and_clear(mm, addr, pmd);
+                entry = pmd_modify(entry, newprot);
+                set_pmd_at(mm, addr, pmd, entry);
+                spin_unlock(&vma->vm_mm->page_table_lock);
+                ret = 1;
+        }
+
+        return ret;
+}
+
+/*
+ * Returns 1 if a given pmd maps a stable (not under splitting) thp.
+ * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ *
+ * Note that if it returns 1, this routine returns without unlocking page
+ * table locks. So callers must unlock them.
+ */
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+{
+        spin_lock(&vma->vm_mm->page_table_lock);
         if (likely(pmd_trans_huge(*pmd))) {
                 if (unlikely(pmd_trans_splitting(*pmd))) {
-                        spin_unlock(&mm->page_table_lock);
+                        spin_unlock(&vma->vm_mm->page_table_lock);
                         wait_split_huge_page(vma->anon_vma, pmd);
+                        return -1;
                 } else {
-                        pmd_t entry;
-
-                        entry = pmdp_get_and_clear(mm, addr, pmd);
-                        entry = pmd_modify(entry, newprot);
-                        set_pmd_at(mm, addr, pmd, entry);
-                        spin_unlock(&vma->vm_mm->page_table_lock);
-                        ret = 1;
+                        /* Thp mapped by 'pmd' is stable, so we can
+                         * handle it as it is. */
+                        return 1;
                 }
-        } else
-                spin_unlock(&vma->vm_mm->page_table_lock);
-
-        return ret;
+        }
+        spin_unlock(&vma->vm_mm->page_table_lock);
+        return 0;
 }
 
 pmd_t *page_check_address_pmd(struct page *page,