author	Andrea Arcangeli <aarcange@redhat.com>	2011-10-31 20:08:30 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-10-31 20:30:48 -0400
commit	37a1c49a91ad55f917a399ef2174b5ebda4283f9 (patch)
tree	d272ab0f51016181493c6792f0cf229a87da9ae3
parent	7b6efc2bc4f19952b25ebf9b236e5ac43cd386c2 (diff)
thp: mremap support and TLB optimization
This adds THP support to mremap (it decreases the number of
split_huge_page() calls).

Here are also some benchmarks with a proggy like this:

===
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>

#define SIZE (5UL*1024*1024*1024)

int main()
{
	static struct timeval oldstamp, newstamp;
	long diffsec;
	char *p, *p2, *p3, *p4;

	if (posix_memalign((void **)&p, 2*1024*1024, SIZE))
		perror("memalign"), exit(1);
	if (posix_memalign((void **)&p2, 2*1024*1024, SIZE))
		perror("memalign"), exit(1);
	if (posix_memalign((void **)&p3, 2*1024*1024, 4096))
		perror("memalign"), exit(1);

	memset(p, 0xff, SIZE);
	memset(p2, 0xff, SIZE);
	memset(p3, 0x77, 4096);

	gettimeofday(&oldstamp, NULL);
	p4 = mremap(p, SIZE, SIZE, MREMAP_FIXED|MREMAP_MAYMOVE, p3);
	gettimeofday(&newstamp, NULL);

	diffsec = newstamp.tv_sec - oldstamp.tv_sec;
	diffsec = newstamp.tv_usec - oldstamp.tv_usec + 1000000 * diffsec;
	printf("usec %ld\n", diffsec);

	if (p4 == MAP_FAILED || p4 != p3)
		perror("mremap"), exit(1);
	if (memcmp(p4, p2, SIZE))
		printf("mremap bug\n"), exit(1);
	printf("ok\n");
	return 0;
}
===

THP on

 Performance counter stats for './largepage13' (3 runs):

     69195836  dTLB-loads          ( +-  3.546% )  (scaled from 50.30%)
        60708  dTLB-load-misses    ( +- 11.776% )  (scaled from 52.62%)
    676266476  dTLB-stores         ( +-  5.654% )  (scaled from 69.54%)
        29856  dTLB-store-misses   ( +-  4.081% )  (scaled from 89.22%)
   1055848782  iTLB-loads          ( +-  4.526% )  (scaled from 80.18%)
         8689  iTLB-load-misses    ( +-  2.987% )  (scaled from 58.20%)

   7.314454164  seconds time elapsed   ( +- 0.023% )

THP off

 Performance counter stats for './largepage13' (3 runs):

   1967379311  dTLB-loads          ( +-  0.506% )  (scaled from 60.59%)
      9238687  dTLB-load-misses    ( +- 22.547% )  (scaled from 61.87%)
   2014239444  dTLB-stores         ( +-  0.692% )  (scaled from 60.40%)
      3312335  dTLB-store-misses   ( +-  7.304% )  (scaled from 67.60%)
   6764372065  iTLB-loads          ( +-  0.925% )  (scaled from 79.00%)
         8202  iTLB-load-misses    ( +-  0.475% )  (scaled from 70.55%)

   9.693655243  seconds time elapsed   ( +- 0.069% )

grep thp /proc/vmstat
thp_fault_alloc 35849
thp_fault_fallback 0
thp_collapse_alloc 3
thp_collapse_alloc_failed 0
thp_split 0

thp_split 0 confirms that no THP was split, despite plenty of hugepages
being allocated.

Measuring only the mremap time (i.e. excluding the three long memsets
and the final memcmp that touches 10GB of memory):

THP on

usec 14824
usec 14862
usec 14859

THP off

usec 256416
usec 255981
usec 255847

With an older kernel, without the mremap optimizations (the patch below
optimizes the non-THP version too):

THP on

usec 392107
usec 390237
usec 404124

THP off

usec 444294
usec 445237
usec 445820

I guess a threaded program that sends more IPIs on a large SMP system
would show an even larger difference.

All debug options are off except DEBUG_VM, to avoid skewing the
results.

The only caveat for a native 2M mremap like the one above: both the
source and the destination address must be 2M aligned, or the hugepmd
can't be moved without a split; but that is a hardware limitation.

[akpm@linux-foundation.org: coding-style nitpicking]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
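To make the alignment requirement concrete, here is a minimal userspace
sketch (not part of this patch): it performs a 2M-aligned mremap and
compares the thp_split counter from /proc/vmstat before and after, which
should stay flat when the huge pmds are moved intact. It assumes 2MB huge
pages with THP enabled; read_thp_split() is illustrative glue, not a
kernel interface.

===
/* Sketch, not part of this patch: a 2M-aligned move should leave
 * thp_split unchanged on a kernel with this patch applied.  Other
 * system activity can also move the counter, so treat the result
 * as indicative only. */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HPAGE (2UL*1024*1024)
#define SIZE (64UL*HPAGE)

/* Illustrative helper: fish the thp_split counter out of /proc/vmstat. */
static long read_thp_split(void)
{
	char line[128];
	long val = -1;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "thp_split %ld", &val) == 1)
			break;
	fclose(f);
	return val;
}

int main(void)
{
	void *src, *dst, *ret;
	long before, after;

	/* Both source and destination are 2M aligned, so the hugepmds
	 * can be relocated without splitting. */
	if (posix_memalign(&src, HPAGE, SIZE) ||
	    posix_memalign(&dst, HPAGE, SIZE))
		perror("posix_memalign"), exit(1);

	memset(src, 0xff, SIZE);	/* fault in (hopefully huge) pages */

	before = read_thp_split();
	ret = mremap(src, SIZE, SIZE, MREMAP_FIXED|MREMAP_MAYMOVE, dst);
	after = read_thp_split();

	if (ret == MAP_FAILED)
		perror("mremap"), exit(1);
	printf("thp_split delta: %ld (0 means no split)\n", after - before);
	return 0;
}
===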
-rw-r--r--	include/linux/huge_mm.h	5
-rw-r--r--	mm/huge_memory.c	45
-rw-r--r--	mm/mremap.c	22
3 files changed, 68 insertions(+), 4 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 48c32ebf65a7..a9ace9c32507 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -22,6 +22,11 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
+extern int move_huge_pmd(struct vm_area_struct *vma,
+			 struct vm_area_struct *new_vma,
+			 unsigned long old_addr,
+			 unsigned long new_addr, unsigned long old_end,
+			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, pgprot_t newprot);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2d1587be269..6b072bdccf81 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1052,6 +1052,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	return ret;
 }
 
+int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+		  unsigned long old_addr,
+		  unsigned long new_addr, unsigned long old_end,
+		  pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	int ret = 0;
+	pmd_t pmd;
+
+	struct mm_struct *mm = vma->vm_mm;
+
+	if ((old_addr & ~HPAGE_PMD_MASK) ||
+	    (new_addr & ~HPAGE_PMD_MASK) ||
+	    old_end - old_addr < HPAGE_PMD_SIZE ||
+	    (new_vma->vm_flags & VM_NOHUGEPAGE))
+		goto out;
+
+	/*
+	 * The destination pmd shouldn't be established, free_pgtables()
+	 * should have released it.
+	 */
+	if (WARN_ON(!pmd_none(*new_pmd))) {
+		VM_BUG_ON(pmd_trans_huge(*new_pmd));
+		goto out;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	if (likely(pmd_trans_huge(*old_pmd))) {
+		if (pmd_trans_splitting(*old_pmd)) {
+			spin_unlock(&mm->page_table_lock);
+			wait_split_huge_page(vma->anon_vma, old_pmd);
+			ret = -1;
+		} else {
+			pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+			VM_BUG_ON(!pmd_none(*new_pmd));
+			set_pmd_at(mm, new_addr, new_pmd, pmd);
+			spin_unlock(&mm->page_table_lock);
+			ret = 1;
+		}
+	} else {
+		spin_unlock(&mm->page_table_lock);
+	}
+out:
+	return ret;
+}
+
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		 unsigned long addr, pgprot_t newprot)
 {
diff --git a/mm/mremap.c b/mm/mremap.c
index a184f3732e1e..d6959cb4df58 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 		return NULL;
 
 	pmd = pmd_offset(pud, addr);
-	split_huge_page_pmd(mm, pmd);
-	if (pmd_none_or_clear_bad(pmd))
+	if (pmd_none(*pmd))
 		return NULL;
 
 	return pmd;
@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 		return NULL;
 
 	VM_BUG_ON(pmd_trans_huge(*pmd));
-	if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
-		return NULL;
 
 	return pmd;
 }
@@ -149,6 +146,23 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
 		if (!new_pmd)
 			break;
+		if (pmd_trans_huge(*old_pmd)) {
+			int err = 0;
+			if (extent == HPAGE_PMD_SIZE)
+				err = move_huge_pmd(vma, new_vma, old_addr,
+						    new_addr, old_end,
+						    old_pmd, new_pmd);
+			if (err > 0) {
+				need_flush = true;
+				continue;
+			} else if (!err) {
+				split_huge_page_pmd(vma->vm_mm, old_pmd);
+			}
+			VM_BUG_ON(pmd_trans_huge(*old_pmd));
+		}
+		if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
+						      new_pmd, new_addr))
+			break;
 		next = (new_addr + PMD_SIZE) & PMD_MASK;
 		if (extent > next - new_addr)
 			extent = next - new_addr;
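Conversely, the split fallback added above in move_page_tables() can be
observed from userspace with a variation of the earlier sketch (again
not part of the patch, same assumptions, same illustrative
read_thp_split() helper): shifting the destination by one 4k page breaks
the 2M alignment, so move_huge_pmd() refuses the move, the hugepmds are
split via split_huge_page_pmd(), and thp_split should increase.

===
/* Sketch, not part of this patch: force the pte-by-pte fallback by
 * misaligning the destination; thp_split in /proc/vmstat should rise
 * (assuming the source pages were actually huge). */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HPAGE (2UL*1024*1024)
#define SIZE (64UL*HPAGE)

/* Illustrative helper, as in the earlier sketch. */
static long read_thp_split(void)
{
	char line[128];
	long val = -1;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "thp_split %ld", &val) == 1)
			break;
	fclose(f);
	return val;
}

int main(void)
{
	char *src, *dst;
	void *ret;
	long before, after;

	/* One extra page so that dst + 4096 .. dst + 4096 + SIZE is valid. */
	if (posix_memalign((void **)&src, HPAGE, SIZE) ||
	    posix_memalign((void **)&dst, HPAGE, SIZE + 4096))
		perror("posix_memalign"), exit(1);

	memset(src, 0xff, SIZE);	/* fault in (hopefully huge) pages */

	before = read_thp_split();
	/* dst + 4096 is page aligned but not 2M aligned: the hugepmds
	 * cannot be moved intact and get split instead. */
	ret = mremap(src, SIZE, SIZE, MREMAP_FIXED|MREMAP_MAYMOVE,
		     dst + 4096);
	after = read_thp_split();

	if (ret == MAP_FAILED)
		perror("mremap"), exit(1);
	printf("thp_split delta: %ld (> 0 means the fallback ran)\n",
	       after - before);
	return 0;
}
===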