author     David Gibson <david@gibson.dropbear.id.au>     2006-04-28 01:02:51 -0400
committer  Paul Mackerras <paulus@samba.org>              2006-04-28 01:02:51 -0400
commit     f10a04c034c7285a1b15dfa4a83d3e56578e34e8 (patch)
tree       ee6bd0c670b6606017cbd88b56a1247ff241e00e /arch/powerpc/mm/hugetlbpage.c
parent     37e53db8aa233c65142d63b496277bf5be9c0ade (diff)
[PATCH] powerpc: Fix pagetable bloat for hugepages
At present, ARCH=powerpc kernels can waste considerable space in
pagetables when making large hugepage mappings.  Hugepage PTEs go in
PMD pages, but each PMD page maps 256M and so contains only 16
hugepage PTEs (128 bytes of data) while taking up a 1024-byte
allocation.  With CONFIG_PPC_64K_PAGES enabled (64k base page size)
the situation is worse: hugepage PTEs then sit at the PTE page level
(also mapping 256M), so we store 16 hugepage PTEs in a 64k allocation.

The PowerPC MMU already requires that any 256M region be either all
hugepage or all normal pages.  Thus, with some care, we can use a
separate allocation for the hugepage PTE tables and allocate only the
128 bytes actually needed.
Signed-off-by: Paul Mackerras <paulus@samba.org>
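
For a concrete sense of the numbers quoted above, here is a small stand-alone C sketch that reproduces the sizing arithmetic the patch introduces (PTRS_PER_HUGEPTE and HUGEPTE_TABLE_SIZE). The shift constants are assumptions for a ppc64 kernel of this era (16M hugepages, 256M covered per hugepage directory entry), not values read out of this tree; only the formulas mirror the patch.

/*
 * Stand-alone sketch of the hugepage PTE table sizing described above.
 * HPAGE_SHIFT/PUD_SHIFT/PMD_SHIFT are assumed values, not taken from
 * this file; only the PTRS_PER_HUGEPTE / HUGEPTE_TABLE_SIZE formulas
 * mirror the patch.
 */
#include <stdio.h>

#define HPAGE_SHIFT 24   /* assumed: 16M hugepages */
#define PUD_SHIFT   28   /* assumed: 256M per PUD entry (4K base pages) */
#define PMD_SHIFT   28   /* assumed: 256M per PMD entry (64K base pages) */
#define PTE_SIZE    8    /* sizeof(pte_t) on ppc64 */

static void report(const char *config, unsigned int hugepte_index_size)
{
        unsigned long ptrs_per_hugepte = 1UL << hugepte_index_size;
        unsigned long hugepte_table_size =
                (unsigned long)PTE_SIZE << hugepte_index_size;

        printf("%s: %lu hugepage PTEs, %lu-byte table\n",
               config, ptrs_per_hugepte, hugepte_table_size);
}

int main(void)
{
        /* 4K base pages: hugepage PTEs hang off the PUD level */
        report("CONFIG_PPC_64K_PAGES=n", PUD_SHIFT - HPAGE_SHIFT);
        /* 64K base pages: hugepage PTEs hang off the PMD level */
        report("CONFIG_PPC_64K_PAGES=y", PMD_SHIFT - HPAGE_SHIFT);
        return 0;
}

Both configurations come out at 16 PTEs and a 128-byte table, which is the saving over the 1024-byte (4K base pages) and 64k (64K base pages) allocations quoted in the message above.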
Diffstat (limited to 'arch/powerpc/mm/hugetlbpage.c')
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c  295
1 file changed, 259 insertions(+), 36 deletions(-)
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7370f9f33e29..266b8b2ceac9 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -30,13 +30,66 @@
 #define NUM_LOW_AREAS    (0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS   (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 
+#ifdef CONFIG_PPC_64K_PAGES
+#define HUGEPTE_INDEX_SIZE   (PMD_SHIFT-HPAGE_SHIFT)
+#else
+#define HUGEPTE_INDEX_SIZE   (PUD_SHIFT-HPAGE_SHIFT)
+#endif
+#define PTRS_PER_HUGEPTE     (1 << HUGEPTE_INDEX_SIZE)
+#define HUGEPTE_TABLE_SIZE   (sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
+
+#define HUGEPD_SHIFT         (HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SIZE          (1UL << HUGEPD_SHIFT)
+#define HUGEPD_MASK          (~(HUGEPD_SIZE-1))
+
+#define huge_pgtable_cache   (pgtable_cache[HUGEPTE_CACHE_NUM])
+
+/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
+ * will choke on pointers to hugepte tables, which is handy for
+ * catching screwups early. */
+#define HUGEPD_OK            0x1
+
+typedef struct { unsigned long pd; } hugepd_t;
+
+#define hugepd_none(hpd)     ((hpd).pd == 0)
+
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+        BUG_ON(!(hpd.pd & HUGEPD_OK));
+        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+{
+        unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+        pte_t *dir = hugepd_page(*hpdp);
+
+        return dir + idx;
+}
+
+static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
+                           unsigned long address)
+{
+        pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+                                      GFP_KERNEL|__GFP_REPEAT);
+
+        if (! new)
+                return -ENOMEM;
+
+        spin_lock(&mm->page_table_lock);
+        if (!hugepd_none(*hpdp))
+                kmem_cache_free(huge_pgtable_cache, new);
+        else
+                hpdp->pd = (unsigned long)new | HUGEPD_OK;
+        spin_unlock(&mm->page_table_lock);
+        return 0;
+}
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
         pgd_t *pg;
         pud_t *pu;
-        pmd_t *pm;
-        pte_t *pt;
 
         BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -46,26 +99,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
         if (!pgd_none(*pg)) {
                 pu = pud_offset(pg, addr);
                 if (!pud_none(*pu)) {
-                        pm = pmd_offset(pu, addr);
 #ifdef CONFIG_PPC_64K_PAGES
-                        /* Currently, we use the normal PTE offset within full
-                         * size PTE pages, thus our huge PTEs are scattered in
-                         * the PTE page and we do waste some. We may change
-                         * that in the future, but the current mecanism keeps
-                         * things much simpler
-                         */
-                        if (!pmd_none(*pm)) {
-                                /* Note: pte_offset_* are all equivalent on
-                                 * ppc64 as we don't have HIGHMEM
-                                 */
-                                pt = pte_offset_kernel(pm, addr);
-                                return pt;
-                        }
-#else /* CONFIG_PPC_64K_PAGES */
-                        /* On 4k pages, we put huge PTEs in the PMD page */
-                        pt = (pte_t *)pm;
-                        return pt;
-#endif /* CONFIG_PPC_64K_PAGES */
+                        pmd_t *pm;
+                        pm = pmd_offset(pu, addr);
+                        if (!pmd_none(*pm))
+                                return hugepte_offset((hugepd_t *)pm, addr);
+#else
+                        return hugepte_offset((hugepd_t *)pu, addr);
+#endif
                 }
         }
 
@@ -76,8 +117,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
         pgd_t *pg;
         pud_t *pu;
-        pmd_t *pm;
-        pte_t *pt;
+        hugepd_t *hpdp = NULL;
 
         BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -87,23 +127,182 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
         pu = pud_alloc(mm, pg, addr);
 
         if (pu) {
+#ifdef CONFIG_PPC_64K_PAGES
+                pmd_t *pm;
                 pm = pmd_alloc(mm, pu, addr);
-                if (pm) {
+                if (pm)
+                        hpdp = (hugepd_t *)pm;
+#else
+                hpdp = (hugepd_t *)pu;
+#endif
+        }
+
+        if (! hpdp)
+                return NULL;
+
+        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+                return NULL;
+
+        return hugepte_offset(hpdp, addr);
+}
+
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+{
+        pte_t *hugepte = hugepd_page(*hpdp);
+
+        hpdp->pd = 0;
+        tlb->need_flush = 1;
+        pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+                                                 HUGEPTE_TABLE_SIZE-1));
+}
+
 #ifdef CONFIG_PPC_64K_PAGES
-                /* See comment in huge_pte_offset.  Note that if we ever
-                 * want to put the page size in the PMD, we would have
-                 * to open code our own pte_alloc* function in order
-                 * to populate and set the size atomically
-                 */
-                pt = pte_alloc_map(mm, pm, addr);
-#else /* CONFIG_PPC_64K_PAGES */
-                pt = (pte_t *)pm;
-#endif /* CONFIG_PPC_64K_PAGES */
-                return pt;
-        }
+static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+                                   unsigned long addr, unsigned long end,
+                                   unsigned long floor, unsigned long ceiling)
+{
+        pmd_t *pmd;
+        unsigned long next;
+        unsigned long start;
+
+        start = addr;
+        pmd = pmd_offset(pud, addr);
+        do {
+                next = pmd_addr_end(addr, end);
+                if (pmd_none(*pmd))
+                        continue;
+                free_hugepte_range(tlb, (hugepd_t *)pmd);
+        } while (pmd++, addr = next, addr != end);
+
+        start &= PUD_MASK;
+        if (start < floor)
+                return;
+        if (ceiling) {
+                ceiling &= PUD_MASK;
+                if (!ceiling)
+                        return;
         }
+        if (end - 1 > ceiling - 1)
+                return;
 
-        return NULL;
+        pmd = pmd_offset(pud, start);
+        pud_clear(pud);
+        pmd_free_tlb(tlb, pmd);
+}
+#endif
+
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+                                   unsigned long addr, unsigned long end,
+                                   unsigned long floor, unsigned long ceiling)
+{
+        pud_t *pud;
+        unsigned long next;
+        unsigned long start;
+
+        start = addr;
+        pud = pud_offset(pgd, addr);
+        do {
+                next = pud_addr_end(addr, end);
+#ifdef CONFIG_PPC_64K_PAGES
+                if (pud_none_or_clear_bad(pud))
+                        continue;
+                hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+#else
+                if (pud_none(*pud))
+                        continue;
+                free_hugepte_range(tlb, (hugepd_t *)pud);
+#endif
+        } while (pud++, addr = next, addr != end);
+
+        start &= PGDIR_MASK;
+        if (start < floor)
+                return;
+        if (ceiling) {
+                ceiling &= PGDIR_MASK;
+                if (!ceiling)
+                        return;
+        }
+        if (end - 1 > ceiling - 1)
+                return;
+
+        pud = pud_offset(pgd, start);
+        pgd_clear(pgd);
+        pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+                            unsigned long addr, unsigned long end,
+                            unsigned long floor, unsigned long ceiling)
+{
+        pgd_t *pgd;
+        unsigned long next;
+        unsigned long start;
+
+        /*
+         * Comments below take from the normal free_pgd_range().  They
+         * apply here too.  The tests against HUGEPD_MASK below are
+         * essential, because we *don't* test for this at the bottom
+         * level.  Without them we'll attempt to free a hugepte table
+         * when we unmap just part of it, even if there are other
+         * active mappings using it.
+         *
+         * The next few lines have given us lots of grief...
+         *
+         * Why are we testing HUGEPD* at this top level?  Because
+         * often there will be no work to do at all, and we'd prefer
+         * not to go all the way down to the bottom just to discover
+         * that.
+         *
+         * Why all these "- 1"s?  Because 0 represents both the bottom
+         * of the address space and the top of it (using -1 for the
+         * top wouldn't help much: the masks would do the wrong thing).
+         * The rule is that addr 0 and floor 0 refer to the bottom of
+         * the address space, but end 0 and ceiling 0 refer to the top
+         * Comparisons need to use "end - 1" and "ceiling - 1" (though
+         * that end 0 case should be mythical).
+         *
+         * Wherever addr is brought up or ceiling brought down, we
+         * must be careful to reject "the opposite 0" before it
+         * confuses the subsequent tests.  But what about where end is
+         * brought down by HUGEPD_SIZE below? no, end can't go down to
+         * 0 there.
+         *
+         * Whereas we round start (addr) and ceiling down, by different
+         * masks at different levels, in order to test whether a table
+         * now has no other vmas using it, so can be freed, we don't
+         * bother to round floor or end up - the tests don't need that.
+         */
+
+        addr &= HUGEPD_MASK;
+        if (addr < floor) {
+                addr += HUGEPD_SIZE;
+                if (!addr)
+                        return;
+        }
+        if (ceiling) {
+                ceiling &= HUGEPD_MASK;
+                if (!ceiling)
+                        return;
+        }
+        if (end - 1 > ceiling - 1)
+                end -= HUGEPD_SIZE;
+        if (addr > end - 1)
+                return;
+
+        start = addr;
+        pgd = pgd_offset((*tlb)->mm, addr);
+        do {
+                BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
+                next = pgd_addr_end(addr, end);
+                if (pgd_none_or_clear_bad(pgd))
+                        continue;
+                hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+        } while (pgd++, addr = next, addr != end);
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -841,3 +1040,27 @@ repeat:
 out:
         return err;
 }
+
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+        memset(addr, 0, kmem_cache_size(cache));
+}
+
+static int __init hugetlbpage_init(void)
+{
+        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+                return -ENODEV;
+
+        huge_pgtable_cache = kmem_cache_create("hugepte_cache",
+                                               HUGEPTE_TABLE_SIZE,
+                                               HUGEPTE_TABLE_SIZE,
+                                               SLAB_HWCACHE_ALIGN |
+                                               SLAB_MUST_HWCACHE_ALIGN,
+                                               zero_ctor, NULL);
+        if (! huge_pgtable_cache)
+                panic("hugetlbpage_init(): could not create hugepte cache\n");
+
+        return 0;
+}
+
+module_init(hugetlbpage_init);
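
The central trick in the patch is the hugepd_t tagged pointer: the address of a small hugepage PTE table is stored in a PMD/PUD slot with its low bit set (HUGEPD_OK), so pmd_bad()/pud_bad() trip on it and generic page-table walkers cannot mistake it for a normal table. Below is a stand-alone user-space model of that encoding and of the hugepte_offset() index calculation; the malloc-backed table and the shift constants are stand-ins for the kernel's kmem_cache allocation and config-derived values, not kernel APIs.

/*
 * User-space model of the patch's hugepd_t encoding and lookup.
 * HPAGE_SHIFT and HUGEPTE_INDEX_SIZE are assumed (16M hugepages,
 * 16-entry tables); pte_t is modelled as a plain unsigned long.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef unsigned long pte_t;                    /* stand-in for the kernel type */
typedef struct { unsigned long pd; } hugepd_t;  /* as defined in the patch */

#define HUGEPD_OK           0x1UL
#define HPAGE_SHIFT         24   /* assumed: 16M hugepages */
#define HUGEPTE_INDEX_SIZE  4    /* assumed: 16 hugepage PTEs per table */
#define PTRS_PER_HUGEPTE    (1UL << HUGEPTE_INDEX_SIZE)

static int hugepd_none(hugepd_t hpd)
{
        return hpd.pd == 0;
}

static pte_t *hugepd_page(hugepd_t hpd)
{
        assert(hpd.pd & HUGEPD_OK);  /* plays the role of the kernel's BUG_ON() */
        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
        unsigned long idx = (addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE - 1);

        return hugepd_page(*hpdp) + idx;
}

int main(void)
{
        hugepd_t hpd = { 0 };
        pte_t *table;

        assert(hugepd_none(hpd));  /* empty slot: no hugepte table yet */

        /* stand-in for kmem_cache_alloc(huge_pgtable_cache, ...) */
        table = calloc(PTRS_PER_HUGEPTE, sizeof(pte_t));
        if (!table)
                return 1;
        hpd.pd = (uintptr_t)table | HUGEPD_OK;  /* publish the tagged pointer */

        /* 0x13000000 is hugepage number 0x13, so it lands in slot 0x13 & 0xf == 3 */
        printf("slot %ld\n", (long)(hugepte_offset(&hpd, 0x13000000UL) - table));

        free(table);
        return 0;
}

free_hugepte_range() in the patch does the inverse: it strips the tag via hugepd_page(), zeroes the slot, and hands the 128-byte table back to the hugepte slab cache set up in hugetlbpage_init().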