author	Chen, Kenneth W <kenneth.w.chen@intel.com>	2006-12-06 23:32:03 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-07 11:39:21 -0500
commit	39dde65c9940c97fcd178a3d2b1c57ed8b7b68aa (patch)
tree	750818d68ac7381f80fec31491e1d1c78df4b9f6
parent	e1dbeda60a7ea9e82a908d93c07308d104d50d79 (diff)
[PATCH] shared page table for hugetlb page
Following up on the shared page table work done by Dave McCracken, this set of patches targets shared page tables for hugetlb memory only.

The shared page table is particularly useful when a large number of independent processes share large shared memory segments.  In the normal page case, the amount of memory saved from the processes' page tables is quite significant.  For hugetlb, the saving on page table memory is not the primary objective (hugetlb itself already cuts down page table overhead significantly); instead, the purpose of sharing page tables for hugetlb is to allow faster TLB refills and less cache pollution on a TLB miss.

With PT sharing, pte entries are shared among hundreds of processes, so the cache consumed by all the page tables is smaller, and in return the application gets a much higher cache hit ratio.  A related effect is that the hardware page walker is more likely to find the pte in cache, which helps reduce TLB miss latency.  These two effects combine to yield higher application performance.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Dave McCracken <dmccr@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
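[Editor's note: the workload the message describes can be pictured with a minimal userspace sketch: many independent processes attach the same large SysV shared memory segment backed by hugetlb pages, which is exactly the case where one set of shared hugetlb pmd pages pays off.  The segment size, process count, and IPC_PRIVATE key below are hypothetical values chosen purely for illustration; they are not taken from the patch.]

/*
 * Illustrative only: NPROC processes attach one hugetlb-backed shm
 * segment -- the workload where shared hugetlb page tables help.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/wait.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000		/* from linux/shm.h */
#endif

#define SEG_SIZE (1UL << 30)		/* 1GB segment (hypothetical) */
#define NPROC    8			/* stand-in for "hundreds" */

int main(void)
{
	/* SHM_HUGETLB requests hugetlb backing; huge pages must be reserved */
	int id = shmget(IPC_PRIVATE, SEG_SIZE, IPC_CREAT | SHM_HUGETLB | 0600);
	if (id < 0) {
		perror("shmget");
		return 1;
	}
	for (int i = 0; i < NPROC; i++) {
		if (fork() == 0) {
			char *p = shmat(id, NULL, 0);
			if (p == (void *)-1) {
				perror("shmat");
				_exit(1);
			}
			memset(p, 0, SEG_SIZE);	/* fault in every huge page */
			shmdt(p);
			_exit(0);
		}
	}
	while (wait(NULL) > 0)
		;			/* reap children */
	shmctl(id, IPC_RMID, NULL);
	return 0;
}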
-rw-r--r--	arch/i386/mm/hugetlbpage.c	112
-rw-r--r--	arch/ia64/mm/hugetlbpage.c	5
-rw-r--r--	arch/powerpc/mm/hugetlbpage.c	5
-rw-r--r--	arch/sh/mm/hugetlbpage.c	5
-rw-r--r--	arch/sh64/mm/hugetlbpage.c	5
-rw-r--r--	arch/sparc64/mm/hugetlbpage.c	5
-rw-r--r--	include/linux/hugetlb.h	1
-rw-r--r--	mm/hugetlb.c	7
8 files changed, 144 insertions(+), 1 deletion(-)
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 1719a8141f81..34728e4afe48 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -17,6 +17,113 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+				struct vm_area_struct *vma,
+				unsigned long addr, pgoff_t idx)
+{
+	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+				svma->vm_start;
+	unsigned long sbase = saddr & PUD_MASK;
+	unsigned long s_end = sbase + PUD_SIZE;
+
+	/*
+	 * match the virtual addresses, permission and the alignment of the
+	 * page table page.
+	 */
+	if (pmd_index(addr) != pmd_index(saddr) ||
+	    vma->vm_flags != svma->vm_flags ||
+	    sbase < svma->vm_start || svma->vm_end < s_end)
+		return 0;
+
+	return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long base = addr & PUD_MASK;
+	unsigned long end = base + PUD_SIZE;
+
+	/*
+	 * check on proper vm_flags and page table alignment
+	 */
+	if (vma->vm_flags & VM_MAYSHARE &&
+	    vma->vm_start <= base && end <= vma->vm_end)
+		return 1;
+	return 0;
+}
+
+/*
+ * search for a shareable pmd page for hugetlb.
+ */
+static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+	struct vm_area_struct *vma = find_vma(mm, addr);
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+			vma->vm_pgoff;
+	struct prio_tree_iter iter;
+	struct vm_area_struct *svma;
+	unsigned long saddr;
+	pte_t *spte = NULL;
+
+	if (!vma_shareable(vma, addr))
+		return;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+		if (svma == vma)
+			continue;
+
+		saddr = page_table_shareable(svma, vma, addr, idx);
+		if (saddr) {
+			spte = huge_pte_offset(svma->vm_mm, saddr);
+			if (spte) {
+				get_page(virt_to_page(spte));
+				break;
+			}
+		}
+	}
+
+	if (!spte)
+		goto out;
+
+	spin_lock(&mm->page_table_lock);
+	if (pud_none(*pud))
+		pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
+	else
+		put_page(virt_to_page(spte));
+	spin_unlock(&mm->page_table_lock);
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count.  If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *	    0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	pgd_t *pgd = pgd_offset(mm, *addr);
+	pud_t *pud = pud_offset(pgd, *addr);
+
+	BUG_ON(page_count(virt_to_page(ptep)) == 0);
+	if (page_count(virt_to_page(ptep)) == 1)
+		return 0;
+
+	pud_clear(pud);
+	put_page(virt_to_page(ptep));
+	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+	return 1;
+}
+
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -25,8 +132,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
-	if (pud)
+	if (pud) {
+		if (pud_none(*pud))
+			huge_pmd_share(mm, addr, pud);
 		pte = (pte_t *) pmd_alloc(mm, pud, addr);
+	}
 	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
 	return pte;
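[Editor's note: a worked example of the arithmetic in page_table_shareable() above, before the stub versions for the other architectures.  On i386 with PAE (the configuration where hugetlb pmd sharing applies), one pmd page maps PUD_SIZE = 512 entries x 2MB = 1GB of virtual space, so sharing requires both VMAs to fully cover the same PUD_MASK-aligned 1GB window at matching file offsets.  The userspace sketch below re-derives saddr, the address in the candidate VMA that maps the same file page as the faulting address; all addresses are hypothetical.]

/*
 * Userspace re-derivation of page_table_shareable()'s address check.
 * Constants mirror i386 PAE; addresses are made up for illustration.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21			/* 2MB huge pages under PAE */
#define PUD_SHIFT	30			/* one pmd page maps 1GB */
#define PUD_SIZE	(1UL << PUD_SHIFT)
#define PUD_MASK	(~(PUD_SIZE - 1))
#define PTRS_PER_PMD	512

/* slot of addr within its pmd page, as in the kernel's pmd_index() */
static unsigned long pmd_index(unsigned long addr)
{
	return (addr >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}

int main(void)
{
	/* hypothetical: vma maps the file at 0x80000000, svma at 0x40000000 */
	unsigned long vma_start = 0x80000000UL, vma_pgoff = 0;
	unsigned long svma_start = 0x40000000UL, svma_pgoff = 0;
	unsigned long addr = 0x80400000UL;	/* faulting address in vma */

	/* file offset (in pages) corresponding to the faulting address */
	unsigned long idx = ((addr - vma_start) >> PAGE_SHIFT) + vma_pgoff;
	/* address in svma mapping the same file page: saddr in the patch */
	unsigned long saddr = ((idx - svma_pgoff) << PAGE_SHIFT) + svma_start;
	unsigned long sbase = saddr & PUD_MASK;
	unsigned long s_end = sbase + PUD_SIZE;

	printf("saddr = %#lx\n", saddr);	/* 0x40400000 */
	printf("range = [%#lx, %#lx)\n", sbase, s_end);
	printf("same pmd slot: %s\n",
	       pmd_index(addr) == pmd_index(saddr) ? "yes" : "no");
	return 0;
}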
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index f3a9585e98a8..0c7e94edc20e 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -64,6 +64,11 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 #define mk_pte_huge(entry)	{ pte_val(entry) |= _PAGE_P; }
 
 /*
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 506d89768d45..424a8f57e155 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -146,6 +146,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return hugepte_offset(hpdp, addr);
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 329059d6b54a..cf2c2ee35a37 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -63,6 +63,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c
index 187cf01750b8..4b455f611146 100644
--- a/arch/sh64/mm/hugetlbpage.c
+++ b/arch/sh64/mm/hugetlbpage.c
@@ -53,6 +53,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry)
 {
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 53b9b1f528e5..33fd0b265e70 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -235,6 +235,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ace64e57e17f..a60995afe334 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -35,6 +35,7 @@ extern int sysctl_hugetlb_shm_group;
 
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			int write);
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f7355bf2f285..9244971b6791 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -386,6 +386,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		if (!ptep)
 			continue;
 
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
+
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
@@ -658,11 +661,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
+	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
 		if (!pte_none(*ptep)) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -671,6 +677,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 
 	flush_tlb_range(vma, start, end);
 }
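[Editor's note: one subtlety in the two call sites above.  When huge_pmd_unshare() succeeds, an entire shared pmd page has been detached at once, so the caller must not keep clearing ptes inside that range.  The function therefore rewinds *addr to one huge page before the next PUD boundary, and the caller's own address += HPAGE_SIZE step then resumes the scan exactly at the end of the unshared region.  A small userspace sketch of that arithmetic, assuming i386 PAE geometry (2MB huge pages, 512 entries per page table page); the starting address is hypothetical.]

/*
 * Userspace illustration of the *addr rewind in huge_pmd_unshare().
 */
#include <stdio.h>

#define HPAGE_SIZE	(1UL << 21)	/* 2MB huge page (PAE) */
#define PTRS_PER_PTE	512		/* HPAGE_SIZE * PTRS_PER_PTE == 1GB == PUD_SIZE */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long address = 0x80400000UL;	/* hypothetical loop position */

	/* what huge_pmd_unshare() does to *addr on success */
	address = ALIGN(address, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
	/* the caller's for-loop increment then skips the shared range */
	address += HPAGE_SIZE;

	printf("loop resumes at %#lx\n", address);	/* 0xc0000000 */
	return 0;
}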