aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
authorAneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>2013-04-28 05:37:30 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2013-04-30 01:59:56 -0400
commite2b3d202d1dba8f3546ed28224ce485bc50010be (patch)
tree7062737220b109cb1312ae9a6220761dd23e3ed5 /arch/powerpc/mm
parentcf9427b85e90bb1ff90e2397ff419691d983c68b (diff)
powerpc: Switch 16GB and 16MB explicit hugepages to a different page table format
We will be switching PMD_SHIFT to 24 bits to facilitate THP implementation. With PMD_SHIFT set to 24, we now have 16MB huge pages allocated at PGD level. That means with a 32-bit process we cannot allocate normal pages at all, because we cover the entire address space with one pgd entry. Fix this by switching to a new page table format for hugepages. With the new page table format for 16GB and 16MB hugepages we won't allocate hugepage directory. Instead we encode the PTE information directly at the directory level. This forces 16MB hugepage at PMD level. This will also make the page table walk much simpler later when we add the THP support. With the new table format we have 4 cases for pgds and pmds: (1) invalid (all zeroes) (2) pointer to next table, as normal; bottom 6 bits == 0 (3) leaf pte for huge page, bottom two bits != 00 (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Acked-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/gup.c18
-rw-r--r--arch/powerpc/mm/hugetlbpage.c176
2 files changed, 164 insertions, 30 deletions
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index d7efdbf640c7..4b921affa495 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -68,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
68 next = pmd_addr_end(addr, end); 68 next = pmd_addr_end(addr, end);
69 if (pmd_none(pmd)) 69 if (pmd_none(pmd))
70 return 0; 70 return 0;
71 if (is_hugepd(pmdp)) { 71 if (pmd_huge(pmd)) {
72 if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
73 write, pages, nr))
74 return 0;
75 } else if (is_hugepd(pmdp)) {
72 if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT, 76 if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
73 addr, next, write, pages, nr)) 77 addr, next, write, pages, nr))
74 return 0; 78 return 0;
@@ -92,7 +96,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
92 next = pud_addr_end(addr, end); 96 next = pud_addr_end(addr, end);
93 if (pud_none(pud)) 97 if (pud_none(pud))
94 return 0; 98 return 0;
95 if (is_hugepd(pudp)) { 99 if (pud_huge(pud)) {
100 if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
101 write, pages, nr))
102 return 0;
103 } else if (is_hugepd(pudp)) {
96 if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT, 104 if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
97 addr, next, write, pages, nr)) 105 addr, next, write, pages, nr))
98 return 0; 106 return 0;
@@ -153,7 +161,11 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
153 next = pgd_addr_end(addr, end); 161 next = pgd_addr_end(addr, end);
154 if (pgd_none(pgd)) 162 if (pgd_none(pgd))
155 goto slow; 163 goto slow;
156 if (is_hugepd(pgdp)) { 164 if (pgd_huge(pgd)) {
165 if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
166 write, pages, &nr))
167 goto slow;
168 } else if (is_hugepd(pgdp)) {
157 if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, 169 if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
158 addr, next, write, pages, &nr)) 170 addr, next, write, pages, &nr))
159 goto slow; 171 goto slow;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index b4e2f24a9b8f..237c8e5f2640 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -50,11 +50,69 @@ static unsigned nr_gpages;
50 50
51#define hugepd_none(hpd) ((hpd).pd == 0) 51#define hugepd_none(hpd) ((hpd).pd == 0)
52 52
53#ifdef CONFIG_PPC_BOOK3S_64
54/*
55 * At this point we do the placement change only for BOOK3S 64. This would
56 * possibly work on other subarchs.
57 */
58
59/*
60 * We have PGD_INDEX_SIZE = 12 and PTE_INDEX_SIZE = 8, so that we can have
61 * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
62 */
63int pmd_huge(pmd_t pmd)
64{
65 /*
66 * leaf pte for huge page, bottom two bits != 00
67 */
68 return ((pmd_val(pmd) & 0x3) != 0x0);
69}
70
71int pud_huge(pud_t pud)
72{
73 /*
74 * leaf pte for huge page, bottom two bits != 00
75 */
76 return ((pud_val(pud) & 0x3) != 0x0);
77}
78
79int pgd_huge(pgd_t pgd)
80{
81 /*
82 * leaf pte for huge page, bottom two bits != 00
83 */
84 return ((pgd_val(pgd) & 0x3) != 0x0);
85}
86#else
87int pmd_huge(pmd_t pmd)
88{
89 return 0;
90}
91
92int pud_huge(pud_t pud)
93{
94 return 0;
95}
96
97int pgd_huge(pgd_t pgd)
98{
99 return 0;
100}
101#endif
102
103/*
104 * We have 4 cases for pgds and pmds:
105 * (1) invalid (all zeroes)
106 * (2) pointer to next table, as normal; bottom 6 bits == 0
107 * (3) leaf pte for huge page, bottom two bits != 00
108 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
109 */
53pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) 110pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
54{ 111{
55 pgd_t *pg; 112 pgd_t *pg;
56 pud_t *pu; 113 pud_t *pu;
57 pmd_t *pm; 114 pmd_t *pm;
115 pte_t *ret_pte;
58 hugepd_t *hpdp = NULL; 116 hugepd_t *hpdp = NULL;
59 unsigned pdshift = PGDIR_SHIFT; 117 unsigned pdshift = PGDIR_SHIFT;
60 118
@@ -62,30 +120,43 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
62 *shift = 0; 120 *shift = 0;
63 121
64 pg = pgdir + pgd_index(ea); 122 pg = pgdir + pgd_index(ea);
65 if (is_hugepd(pg)) { 123
124 if (pgd_huge(*pg)) {
125 ret_pte = (pte_t *) pg;
126 goto out;
127 } else if (is_hugepd(pg))
66 hpdp = (hugepd_t *)pg; 128 hpdp = (hugepd_t *)pg;
67 } else if (!pgd_none(*pg)) { 129 else if (!pgd_none(*pg)) {
68 pdshift = PUD_SHIFT; 130 pdshift = PUD_SHIFT;
69 pu = pud_offset(pg, ea); 131 pu = pud_offset(pg, ea);
70 if (is_hugepd(pu)) 132
133 if (pud_huge(*pu)) {
134 ret_pte = (pte_t *) pu;
135 goto out;
136 } else if (is_hugepd(pu))
71 hpdp = (hugepd_t *)pu; 137 hpdp = (hugepd_t *)pu;
72 else if (!pud_none(*pu)) { 138 else if (!pud_none(*pu)) {
73 pdshift = PMD_SHIFT; 139 pdshift = PMD_SHIFT;
74 pm = pmd_offset(pu, ea); 140 pm = pmd_offset(pu, ea);
75 if (is_hugepd(pm)) 141
142 if (pmd_huge(*pm)) {
143 ret_pte = (pte_t *) pm;
144 goto out;
145 } else if (is_hugepd(pm))
76 hpdp = (hugepd_t *)pm; 146 hpdp = (hugepd_t *)pm;
77 else if (!pmd_none(*pm)) { 147 else if (!pmd_none(*pm))
78 return pte_offset_kernel(pm, ea); 148 return pte_offset_kernel(pm, ea);
79 }
80 } 149 }
81 } 150 }
82
83 if (!hpdp) 151 if (!hpdp)
84 return NULL; 152 return NULL;
85 153
154 ret_pte = hugepte_offset(hpdp, ea, pdshift);
155 pdshift = hugepd_shift(*hpdp);
156out:
86 if (shift) 157 if (shift)
87 *shift = hugepd_shift(*hpdp); 158 *shift = pdshift;
88 return hugepte_offset(hpdp, ea, pdshift); 159 return ret_pte;
89} 160}
90EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); 161EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
91 162
@@ -165,6 +236,61 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
165#define HUGEPD_PUD_SHIFT PMD_SHIFT 236#define HUGEPD_PUD_SHIFT PMD_SHIFT
166#endif 237#endif
167 238
239#ifdef CONFIG_PPC_BOOK3S_64
240/*
241 * At this point we do the placement change only for BOOK3S 64. This would
242 * possibly work on other subarchs.
243 */
244pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
245{
246 pgd_t *pg;
247 pud_t *pu;
248 pmd_t *pm;
249 hugepd_t *hpdp = NULL;
250 unsigned pshift = __ffs(sz);
251 unsigned pdshift = PGDIR_SHIFT;
252
253 addr &= ~(sz-1);
254 pg = pgd_offset(mm, addr);
255
256 if (pshift == PGDIR_SHIFT)
257 /* 16GB huge page */
258 return (pte_t *) pg;
259 else if (pshift > PUD_SHIFT)
260 /*
261 * We need to use hugepd table
262 */
263 hpdp = (hugepd_t *)pg;
264 else {
265 pdshift = PUD_SHIFT;
266 pu = pud_alloc(mm, pg, addr);
267 if (pshift == PUD_SHIFT)
268 return (pte_t *)pu;
269 else if (pshift > PMD_SHIFT)
270 hpdp = (hugepd_t *)pu;
271 else {
272 pdshift = PMD_SHIFT;
273 pm = pmd_alloc(mm, pu, addr);
274 if (pshift == PMD_SHIFT)
275 /* 16MB hugepage */
276 return (pte_t *)pm;
277 else
278 hpdp = (hugepd_t *)pm;
279 }
280 }
281 if (!hpdp)
282 return NULL;
283
284 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
285
286 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
287 return NULL;
288
289 return hugepte_offset(hpdp, addr, pdshift);
290}
291
292#else
293
168pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) 294pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
169{ 295{
170 pgd_t *pg; 296 pgd_t *pg;
@@ -202,6 +328,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
202 328
203 return hugepte_offset(hpdp, addr, pdshift); 329 return hugepte_offset(hpdp, addr, pdshift);
204} 330}
331#endif
205 332
206#ifdef CONFIG_PPC_FSL_BOOK3E 333#ifdef CONFIG_PPC_FSL_BOOK3E
207/* Build list of addresses of gigantic pages. This function is used in early 334/* Build list of addresses of gigantic pages. This function is used in early
@@ -465,7 +592,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
465 do { 592 do {
466 pmd = pmd_offset(pud, addr); 593 pmd = pmd_offset(pud, addr);
467 next = pmd_addr_end(addr, end); 594 next = pmd_addr_end(addr, end);
468 if (pmd_none(*pmd)) 595 if (pmd_none_or_clear_bad(pmd))
469 continue; 596 continue;
470#ifdef CONFIG_PPC_FSL_BOOK3E 597#ifdef CONFIG_PPC_FSL_BOOK3E
471 /* 598 /*
@@ -618,16 +745,6 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
618 return page; 745 return page;
619} 746}
620 747
621int pmd_huge(pmd_t pmd)
622{
623 return 0;
624}
625
626int pud_huge(pud_t pud)
627{
628 return 0;
629}
630
631struct page * 748struct page *
632follow_huge_pmd(struct mm_struct *mm, unsigned long address, 749follow_huge_pmd(struct mm_struct *mm, unsigned long address,
633 pmd_t *pmd, int write) 750 pmd_t *pmd, int write)
@@ -636,8 +753,8 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
636 return NULL; 753 return NULL;
637} 754}
638 755
639static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, 756int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
640 unsigned long end, int write, struct page **pages, int *nr) 757 unsigned long end, int write, struct page **pages, int *nr)
641{ 758{
642 unsigned long mask; 759 unsigned long mask;
643 unsigned long pte_end; 760 unsigned long pte_end;
@@ -873,11 +990,16 @@ static int __init hugetlbpage_init(void)
873 pdshift = PUD_SHIFT; 990 pdshift = PUD_SHIFT;
874 else 991 else
875 pdshift = PGDIR_SHIFT; 992 pdshift = PGDIR_SHIFT;
876 993 /*
877 pgtable_cache_add(pdshift - shift, NULL); 994 * if we have pdshift and shift value same, we don't
878 if (!PGT_CACHE(pdshift - shift)) 995 * use pgt cache for hugepd.
879 panic("hugetlbpage_init(): could not create " 996 */
880 "pgtable cache for %d bit pagesize\n", shift); 997 if (pdshift != shift) {
998 pgtable_cache_add(pdshift - shift, NULL);
999 if (!PGT_CACHE(pdshift - shift))
1000 panic("hugetlbpage_init(): could not create "
1001 "pgtable cache for %d bit pagesize\n", shift);
1002 }
881 } 1003 }
882 1004
883 /* Set default large page size. Currently, we pick 16M or 1M 1005 /* Set default large page size. Currently, we pick 16M or 1M