diff options
author | Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | 2013-04-28 05:37:30 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2013-04-30 01:59:56 -0400 |
commit | e2b3d202d1dba8f3546ed28224ce485bc50010be (patch) | |
tree | 7062737220b109cb1312ae9a6220761dd23e3ed5 /arch/powerpc/mm | |
parent | cf9427b85e90bb1ff90e2397ff419691d983c68b (diff) |
powerpc: Switch 16GB and 16MB explicit hugepages to a different page table format
We will be switching PMD_SHIFT to 24 bits to facilitate THP implementation.
With PMD_SHIFT set to 24, we now have 16MB huge pages allocated at PGD level.
That means with a 32-bit process we cannot allocate normal pages at
all, because we cover the entire address space with one pgd entry. Fix this
by switching to a new page table format for hugepages. With the new page table
format for 16GB and 16MB hugepages we won't allocate a hugepage directory. Instead
we encode the PTE information directly at the directory level. This forces 16MB
hugepage at PMD level. This will also make the page table walk much simpler later
when we add the THP support.
With the new table format we have 4 cases for pgds and pmds:
(1) invalid (all zeroes)
(2) pointer to next table, as normal; bottom 6 bits == 0
(3) leaf pte for huge page, bottom two bits != 00
(4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r-- | arch/powerpc/mm/gup.c | 18 | ||||
-rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 176 |
2 files changed, 164 insertions, 30 deletions
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index d7efdbf640c7..4b921affa495 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c | |||
@@ -68,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
68 | next = pmd_addr_end(addr, end); | 68 | next = pmd_addr_end(addr, end); |
69 | if (pmd_none(pmd)) | 69 | if (pmd_none(pmd)) |
70 | return 0; | 70 | return 0; |
71 | if (is_hugepd(pmdp)) { | 71 | if (pmd_huge(pmd)) { |
72 | if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next, | ||
73 | write, pages, nr)) | ||
74 | return 0; | ||
75 | } else if (is_hugepd(pmdp)) { | ||
72 | if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT, | 76 | if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT, |
73 | addr, next, write, pages, nr)) | 77 | addr, next, write, pages, nr)) |
74 | return 0; | 78 | return 0; |
@@ -92,7 +96,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, | |||
92 | next = pud_addr_end(addr, end); | 96 | next = pud_addr_end(addr, end); |
93 | if (pud_none(pud)) | 97 | if (pud_none(pud)) |
94 | return 0; | 98 | return 0; |
95 | if (is_hugepd(pudp)) { | 99 | if (pud_huge(pud)) { |
100 | if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next, | ||
101 | write, pages, nr)) | ||
102 | return 0; | ||
103 | } else if (is_hugepd(pudp)) { | ||
96 | if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT, | 104 | if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT, |
97 | addr, next, write, pages, nr)) | 105 | addr, next, write, pages, nr)) |
98 | return 0; | 106 | return 0; |
@@ -153,7 +161,11 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
153 | next = pgd_addr_end(addr, end); | 161 | next = pgd_addr_end(addr, end); |
154 | if (pgd_none(pgd)) | 162 | if (pgd_none(pgd)) |
155 | goto slow; | 163 | goto slow; |
156 | if (is_hugepd(pgdp)) { | 164 | if (pgd_huge(pgd)) { |
165 | if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next, | ||
166 | write, pages, &nr)) | ||
167 | goto slow; | ||
168 | } else if (is_hugepd(pgdp)) { | ||
157 | if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, | 169 | if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, |
158 | addr, next, write, pages, &nr)) | 170 | addr, next, write, pages, &nr)) |
159 | goto slow; | 171 | goto slow; |
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b4e2f24a9b8f..237c8e5f2640 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c | |||
@@ -50,11 +50,69 @@ static unsigned nr_gpages; | |||
50 | 50 | ||
51 | #define hugepd_none(hpd) ((hpd).pd == 0) | 51 | #define hugepd_none(hpd) ((hpd).pd == 0) |
52 | 52 | ||
53 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
54 | /* | ||
55 | * At this point we do the placement change only for BOOK3S 64. This would | ||
56 | * possibly work on other subarchs. | ||
57 | */ | ||
58 | |||
59 | /* | ||
60 | * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have | ||
61 | * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; | ||
62 | */ | ||
63 | int pmd_huge(pmd_t pmd) | ||
64 | { | ||
65 | /* | ||
66 | * leaf pte for huge page, bottom two bits != 00 | ||
67 | */ | ||
68 | return ((pmd_val(pmd) & 0x3) != 0x0); | ||
69 | } | ||
70 | |||
71 | int pud_huge(pud_t pud) | ||
72 | { | ||
73 | /* | ||
74 | * leaf pte for huge page, bottom two bits != 00 | ||
75 | */ | ||
76 | return ((pud_val(pud) & 0x3) != 0x0); | ||
77 | } | ||
78 | |||
79 | int pgd_huge(pgd_t pgd) | ||
80 | { | ||
81 | /* | ||
82 | * leaf pte for huge page, bottom two bits != 00 | ||
83 | */ | ||
84 | return ((pgd_val(pgd) & 0x3) != 0x0); | ||
85 | } | ||
86 | #else | ||
87 | int pmd_huge(pmd_t pmd) | ||
88 | { | ||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | int pud_huge(pud_t pud) | ||
93 | { | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | int pgd_huge(pgd_t pgd) | ||
98 | { | ||
99 | return 0; | ||
100 | } | ||
101 | #endif | ||
102 | |||
103 | /* | ||
104 | * We have 4 cases for pgds and pmds: | ||
105 | * (1) invalid (all zeroes) | ||
106 | * (2) pointer to next table, as normal; bottom 6 bits == 0 | ||
107 | * (3) leaf pte for huge page, bottom two bits != 00 | ||
108 | * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table | ||
109 | */ | ||
53 | pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) | 110 | pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) |
54 | { | 111 | { |
55 | pgd_t *pg; | 112 | pgd_t *pg; |
56 | pud_t *pu; | 113 | pud_t *pu; |
57 | pmd_t *pm; | 114 | pmd_t *pm; |
115 | pte_t *ret_pte; | ||
58 | hugepd_t *hpdp = NULL; | 116 | hugepd_t *hpdp = NULL; |
59 | unsigned pdshift = PGDIR_SHIFT; | 117 | unsigned pdshift = PGDIR_SHIFT; |
60 | 118 | ||
@@ -62,30 +120,43 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift | |||
62 | *shift = 0; | 120 | *shift = 0; |
63 | 121 | ||
64 | pg = pgdir + pgd_index(ea); | 122 | pg = pgdir + pgd_index(ea); |
65 | if (is_hugepd(pg)) { | 123 | |
124 | if (pgd_huge(*pg)) { | ||
125 | ret_pte = (pte_t *) pg; | ||
126 | goto out; | ||
127 | } else if (is_hugepd(pg)) | ||
66 | hpdp = (hugepd_t *)pg; | 128 | hpdp = (hugepd_t *)pg; |
67 | } else if (!pgd_none(*pg)) { | 129 | else if (!pgd_none(*pg)) { |
68 | pdshift = PUD_SHIFT; | 130 | pdshift = PUD_SHIFT; |
69 | pu = pud_offset(pg, ea); | 131 | pu = pud_offset(pg, ea); |
70 | if (is_hugepd(pu)) | 132 | |
133 | if (pud_huge(*pu)) { | ||
134 | ret_pte = (pte_t *) pu; | ||
135 | goto out; | ||
136 | } else if (is_hugepd(pu)) | ||
71 | hpdp = (hugepd_t *)pu; | 137 | hpdp = (hugepd_t *)pu; |
72 | else if (!pud_none(*pu)) { | 138 | else if (!pud_none(*pu)) { |
73 | pdshift = PMD_SHIFT; | 139 | pdshift = PMD_SHIFT; |
74 | pm = pmd_offset(pu, ea); | 140 | pm = pmd_offset(pu, ea); |
75 | if (is_hugepd(pm)) | 141 | |
142 | if (pmd_huge(*pm)) { | ||
143 | ret_pte = (pte_t *) pm; | ||
144 | goto out; | ||
145 | } else if (is_hugepd(pm)) | ||
76 | hpdp = (hugepd_t *)pm; | 146 | hpdp = (hugepd_t *)pm; |
77 | else if (!pmd_none(*pm)) { | 147 | else if (!pmd_none(*pm)) |
78 | return pte_offset_kernel(pm, ea); | 148 | return pte_offset_kernel(pm, ea); |
79 | } | ||
80 | } | 149 | } |
81 | } | 150 | } |
82 | |||
83 | if (!hpdp) | 151 | if (!hpdp) |
84 | return NULL; | 152 | return NULL; |
85 | 153 | ||
154 | ret_pte = hugepte_offset(hpdp, ea, pdshift); | ||
155 | pdshift = hugepd_shift(*hpdp); | ||
156 | out: | ||
86 | if (shift) | 157 | if (shift) |
87 | *shift = hugepd_shift(*hpdp); | 158 | *shift = pdshift; |
88 | return hugepte_offset(hpdp, ea, pdshift); | 159 | return ret_pte; |
89 | } | 160 | } |
90 | EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); | 161 | EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); |
91 | 162 | ||
@@ -165,6 +236,61 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, | |||
165 | #define HUGEPD_PUD_SHIFT PMD_SHIFT | 236 | #define HUGEPD_PUD_SHIFT PMD_SHIFT |
166 | #endif | 237 | #endif |
167 | 238 | ||
239 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
240 | /* | ||
241 | * At this point we do the placement change only for BOOK3S 64. This would | ||
242 | * possibly work on other subarchs. | ||
243 | */ | ||
244 | pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) | ||
245 | { | ||
246 | pgd_t *pg; | ||
247 | pud_t *pu; | ||
248 | pmd_t *pm; | ||
249 | hugepd_t *hpdp = NULL; | ||
250 | unsigned pshift = __ffs(sz); | ||
251 | unsigned pdshift = PGDIR_SHIFT; | ||
252 | |||
253 | addr &= ~(sz-1); | ||
254 | pg = pgd_offset(mm, addr); | ||
255 | |||
256 | if (pshift == PGDIR_SHIFT) | ||
257 | /* 16GB huge page */ | ||
258 | return (pte_t *) pg; | ||
259 | else if (pshift > PUD_SHIFT) | ||
260 | /* | ||
261 | * We need to use hugepd table | ||
262 | */ | ||
263 | hpdp = (hugepd_t *)pg; | ||
264 | else { | ||
265 | pdshift = PUD_SHIFT; | ||
266 | pu = pud_alloc(mm, pg, addr); | ||
267 | if (pshift == PUD_SHIFT) | ||
268 | return (pte_t *)pu; | ||
269 | else if (pshift > PMD_SHIFT) | ||
270 | hpdp = (hugepd_t *)pu; | ||
271 | else { | ||
272 | pdshift = PMD_SHIFT; | ||
273 | pm = pmd_alloc(mm, pu, addr); | ||
274 | if (pshift == PMD_SHIFT) | ||
275 | /* 16MB hugepage */ | ||
276 | return (pte_t *)pm; | ||
277 | else | ||
278 | hpdp = (hugepd_t *)pm; | ||
279 | } | ||
280 | } | ||
281 | if (!hpdp) | ||
282 | return NULL; | ||
283 | |||
284 | BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); | ||
285 | |||
286 | if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) | ||
287 | return NULL; | ||
288 | |||
289 | return hugepte_offset(hpdp, addr, pdshift); | ||
290 | } | ||
291 | |||
292 | #else | ||
293 | |||
168 | pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) | 294 | pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) |
169 | { | 295 | { |
170 | pgd_t *pg; | 296 | pgd_t *pg; |
@@ -202,6 +328,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz | |||
202 | 328 | ||
203 | return hugepte_offset(hpdp, addr, pdshift); | 329 | return hugepte_offset(hpdp, addr, pdshift); |
204 | } | 330 | } |
331 | #endif | ||
205 | 332 | ||
206 | #ifdef CONFIG_PPC_FSL_BOOK3E | 333 | #ifdef CONFIG_PPC_FSL_BOOK3E |
207 | /* Build list of addresses of gigantic pages. This function is used in early | 334 | /* Build list of addresses of gigantic pages. This function is used in early |
@@ -465,7 +592,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
465 | do { | 592 | do { |
466 | pmd = pmd_offset(pud, addr); | 593 | pmd = pmd_offset(pud, addr); |
467 | next = pmd_addr_end(addr, end); | 594 | next = pmd_addr_end(addr, end); |
468 | if (pmd_none(*pmd)) | 595 | if (pmd_none_or_clear_bad(pmd)) |
469 | continue; | 596 | continue; |
470 | #ifdef CONFIG_PPC_FSL_BOOK3E | 597 | #ifdef CONFIG_PPC_FSL_BOOK3E |
471 | /* | 598 | /* |
@@ -618,16 +745,6 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | |||
618 | return page; | 745 | return page; |
619 | } | 746 | } |
620 | 747 | ||
621 | int pmd_huge(pmd_t pmd) | ||
622 | { | ||
623 | return 0; | ||
624 | } | ||
625 | |||
626 | int pud_huge(pud_t pud) | ||
627 | { | ||
628 | return 0; | ||
629 | } | ||
630 | |||
631 | struct page * | 748 | struct page * |
632 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 749 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
633 | pmd_t *pmd, int write) | 750 | pmd_t *pmd, int write) |
@@ -636,8 +753,8 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
636 | return NULL; | 753 | return NULL; |
637 | } | 754 | } |
638 | 755 | ||
639 | static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | 756 | int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, |
640 | unsigned long end, int write, struct page **pages, int *nr) | 757 | unsigned long end, int write, struct page **pages, int *nr) |
641 | { | 758 | { |
642 | unsigned long mask; | 759 | unsigned long mask; |
643 | unsigned long pte_end; | 760 | unsigned long pte_end; |
@@ -873,11 +990,16 @@ static int __init hugetlbpage_init(void) | |||
873 | pdshift = PUD_SHIFT; | 990 | pdshift = PUD_SHIFT; |
874 | else | 991 | else |
875 | pdshift = PGDIR_SHIFT; | 992 | pdshift = PGDIR_SHIFT; |
876 | 993 | /* | |
877 | pgtable_cache_add(pdshift - shift, NULL); | 994 | * if we have pdshift and shift value same, we don't |
878 | if (!PGT_CACHE(pdshift - shift)) | 995 | * use pgt cache for hugepd. |
879 | panic("hugetlbpage_init(): could not create " | 996 | */ |
880 | "pgtable cache for %d bit pagesize\n", shift); | 997 | if (pdshift != shift) { |
998 | pgtable_cache_add(pdshift - shift, NULL); | ||
999 | if (!PGT_CACHE(pdshift - shift)) | ||
1000 | panic("hugetlbpage_init(): could not create " | ||
1001 | "pgtable cache for %d bit pagesize\n", shift); | ||
1002 | } | ||
881 | } | 1003 | } |
882 | 1004 | ||
883 | /* Set default large page size. Currently, we pick 16M or 1M | 1005 | /* Set default large page size. Currently, we pick 16M or 1M |