Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r-- | arch/powerpc/mm/Makefile | 5
-rw-r--r-- | arch/powerpc/mm/fault.c | 8
-rw-r--r-- | arch/powerpc/mm/fsl_booke_mmu.c | 132
-rw-r--r-- | arch/powerpc/mm/gup.c | 149
-rw-r--r-- | arch/powerpc/mm/hash_utils_64.c | 46
-rw-r--r-- | arch/powerpc/mm/hugetlbpage-hash64.c | 139
-rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 792
-rw-r--r-- | arch/powerpc/mm/init_64.c | 76
-rw-r--r-- | arch/powerpc/mm/mem.c | 17
-rw-r--r-- | arch/powerpc/mm/mmu_context_hash64.c | 26
-rw-r--r-- | arch/powerpc/mm/mmu_decl.h | 11
-rw-r--r-- | arch/powerpc/mm/pgtable.c | 25
-rw-r--r-- | arch/powerpc/mm/subpage-prot.c | 15
-rw-r--r-- | arch/powerpc/mm/tlb_hash64.c | 8
14 files changed, 681 insertions, 768 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 6fb8fc8d2fea..ce68708bbad5 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -28,7 +28,10 @@ obj-$(CONFIG_44x) += 44x_mmu.o | |||
28 | obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o | 28 | obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o |
29 | obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o | 29 | obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o |
30 | obj-$(CONFIG_PPC_MM_SLICES) += slice.o | 30 | obj-$(CONFIG_PPC_MM_SLICES) += slice.o |
31 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o | 31 | ifeq ($(CONFIG_HUGETLB_PAGE),y) |
32 | obj-y += hugetlbpage.o | ||
33 | obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o | ||
34 | endif | ||
32 | obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o | 35 | obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o |
33 | obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o | 36 | obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o |
34 | obj-$(CONFIG_HIGHMEM) += highmem.o | 37 | obj-$(CONFIG_HIGHMEM) += highmem.o |
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index e7dae82c1285..26fb6b990b0a 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -40,7 +40,7 @@ | |||
40 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
41 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
42 | #include <asm/siginfo.h> | 42 | #include <asm/siginfo.h> |
43 | 43 | #include <mm/mmu_decl.h> | |
44 | 44 | ||
45 | #ifdef CONFIG_KPROBES | 45 | #ifdef CONFIG_KPROBES |
46 | static inline int notify_page_fault(struct pt_regs *regs) | 46 | static inline int notify_page_fault(struct pt_regs *regs) |
@@ -246,6 +246,12 @@ good_area: | |||
246 | goto bad_area; | 246 | goto bad_area; |
247 | #endif /* CONFIG_6xx */ | 247 | #endif /* CONFIG_6xx */ |
248 | #if defined(CONFIG_8xx) | 248 | #if defined(CONFIG_8xx) |
249 | /* The 8xx sometimes needs to load invalid/non-present TLB entries. | ||
250 | * These must be invalidated separately, as the Linux mm doesn't. | ||
251 | */ | ||
252 | if (error_code & 0x40000000) /* no translation? */ | ||
253 | _tlbil_va(address, 0, 0, 0); | ||
254 | |||
249 | /* The MPC8xx seems to always set 0x80000000, which is | 255 | /* The MPC8xx seems to always set 0x80000000, which is |
250 | * "undefined". Of those that can be set, this is the only | 256 | * "undefined". Of those that can be set, this is the only |
251 | * one which seems bad. | 257 | * one which seems bad. |
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index dc93e95b256e..fcfcb6e976c7 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -54,26 +54,35 @@ | |||
54 | 54 | ||
55 | #include "mmu_decl.h" | 55 | #include "mmu_decl.h" |
56 | 56 | ||
57 | extern void loadcam_entry(unsigned int index); | ||
58 | unsigned int tlbcam_index; | 57 | unsigned int tlbcam_index; |
59 | static unsigned long cam[CONFIG_LOWMEM_CAM_NUM]; | ||
60 | 58 | ||
61 | #define NUM_TLBCAMS (16) | 59 | #define NUM_TLBCAMS (64) |
62 | 60 | ||
63 | #if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) | 61 | #if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) |
64 | #error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" | 62 | #error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" |
65 | #endif | 63 | #endif |
66 | 64 | ||
67 | struct tlbcam TLBCAM[NUM_TLBCAMS]; | 65 | struct tlbcam { |
66 | u32 MAS0; | ||
67 | u32 MAS1; | ||
68 | unsigned long MAS2; | ||
69 | u32 MAS3; | ||
70 | u32 MAS7; | ||
71 | } TLBCAM[NUM_TLBCAMS]; | ||
68 | 72 | ||
69 | struct tlbcamrange { | 73 | struct tlbcamrange { |
70 | unsigned long start; | 74 | unsigned long start; |
71 | unsigned long limit; | 75 | unsigned long limit; |
72 | phys_addr_t phys; | 76 | phys_addr_t phys; |
73 | } tlbcam_addrs[NUM_TLBCAMS]; | 77 | } tlbcam_addrs[NUM_TLBCAMS]; |
74 | 78 | ||
75 | extern unsigned int tlbcam_index; | 79 | extern unsigned int tlbcam_index; |
76 | 80 | ||
81 | unsigned long tlbcam_sz(int idx) | ||
82 | { | ||
83 | return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; | ||
84 | } | ||
85 | |||
77 | /* | 86 | /* |
78 | * Return PA for this VA if it is mapped by a CAM, or 0 | 87 | * Return PA for this VA if it is mapped by a CAM, or 0 |
79 | */ | 88 | */ |
@@ -94,23 +103,36 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa) | |||
94 | int b; | 103 | int b; |
95 | for (b = 0; b < tlbcam_index; ++b) | 104 | for (b = 0; b < tlbcam_index; ++b) |
96 | if (pa >= tlbcam_addrs[b].phys | 105 | if (pa >= tlbcam_addrs[b].phys |
97 | && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) | 106 | && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) |
98 | +tlbcam_addrs[b].phys) | 107 | +tlbcam_addrs[b].phys) |
99 | return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); | 108 | return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); |
100 | return 0; | 109 | return 0; |
101 | } | 110 | } |
102 | 111 | ||
112 | void loadcam_entry(int idx) | ||
113 | { | ||
114 | mtspr(SPRN_MAS0, TLBCAM[idx].MAS0); | ||
115 | mtspr(SPRN_MAS1, TLBCAM[idx].MAS1); | ||
116 | mtspr(SPRN_MAS2, TLBCAM[idx].MAS2); | ||
117 | mtspr(SPRN_MAS3, TLBCAM[idx].MAS3); | ||
118 | |||
119 | if (cur_cpu_spec->cpu_features & MMU_FTR_BIG_PHYS) | ||
120 | mtspr(SPRN_MAS7, TLBCAM[idx].MAS7); | ||
121 | |||
122 | asm volatile("isync;tlbwe;isync" : : : "memory"); | ||
123 | } | ||
124 | |||
103 | /* | 125 | /* |
104 | * Set up one of the I/D BAT (block address translation) register pairs. | 126 | * Set up one of the I/D BAT (block address translation) register pairs. |
105 | * The parameters are not checked; in particular size must be a power | 127 | * The parameters are not checked; in particular size must be a power |
106 | * of 4 between 4k and 256M. | 128 | * of 4 between 4k and 256M. |
107 | */ | 129 | */ |
108 | void settlbcam(int index, unsigned long virt, phys_addr_t phys, | 130 | static void settlbcam(int index, unsigned long virt, phys_addr_t phys, |
109 | unsigned int size, int flags, unsigned int pid) | 131 | unsigned long size, unsigned long flags, unsigned int pid) |
110 | { | 132 | { |
111 | unsigned int tsize, lz; | 133 | unsigned int tsize, lz; |
112 | 134 | ||
113 | asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size)); | 135 | asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (size)); |
114 | tsize = 21 - lz; | 136 | tsize = 21 - lz; |
115 | 137 | ||
116 | #ifdef CONFIG_SMP | 138 | #ifdef CONFIG_SMP |
@@ -128,8 +150,10 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys, | |||
128 | TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; | 150 | TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; |
129 | TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; | 151 | TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; |
130 | 152 | ||
131 | TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR; | 153 | TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; |
132 | TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); | 154 | TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); |
155 | if (cur_cpu_spec->cpu_features & MMU_FTR_BIG_PHYS) | ||
156 | TLBCAM[index].MAS7 = (u64)phys >> 32; | ||
133 | 157 | ||
134 | #ifndef CONFIG_KGDB /* want user access for breakpoints */ | 158 | #ifndef CONFIG_KGDB /* want user access for breakpoints */ |
135 | if (flags & _PAGE_USER) { | 159 | if (flags & _PAGE_USER) { |
@@ -148,27 +172,44 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys, | |||
148 | loadcam_entry(index); | 172 | loadcam_entry(index); |
149 | } | 173 | } |
150 | 174 | ||
151 | void invalidate_tlbcam_entry(int index) | 175 | unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx) |
152 | { | ||
153 | TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index); | ||
154 | TLBCAM[index].MAS1 = ~MAS1_VALID; | ||
155 | |||
156 | loadcam_entry(index); | ||
157 | } | ||
158 | |||
159 | unsigned long __init mmu_mapin_ram(void) | ||
160 | { | 176 | { |
177 | int i; | ||
161 | unsigned long virt = PAGE_OFFSET; | 178 | unsigned long virt = PAGE_OFFSET; |
162 | phys_addr_t phys = memstart_addr; | 179 | phys_addr_t phys = memstart_addr; |
180 | unsigned long amount_mapped = 0; | ||
181 | unsigned long max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xf; | ||
182 | |||
183 | /* Convert (4^max) kB to (2^max) bytes */ | ||
184 | max_cam = max_cam * 2 + 10; | ||
163 | 185 | ||
164 | while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) { | 186 | /* Calculate CAM values */ |
165 | settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0); | 187 | for (i = 0; ram && i < max_cam_idx; i++) { |
166 | virt += cam[tlbcam_index]; | 188 | unsigned int camsize = __ilog2(ram) & ~1U; |
167 | phys += cam[tlbcam_index]; | 189 | unsigned int align = __ffs(virt | phys) & ~1U; |
168 | tlbcam_index++; | 190 | unsigned long cam_sz; |
191 | |||
192 | if (camsize > align) | ||
193 | camsize = align; | ||
194 | if (camsize > max_cam) | ||
195 | camsize = max_cam; | ||
196 | |||
197 | cam_sz = 1UL << camsize; | ||
198 | settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0); | ||
199 | |||
200 | ram -= cam_sz; | ||
201 | amount_mapped += cam_sz; | ||
202 | virt += cam_sz; | ||
203 | phys += cam_sz; | ||
169 | } | 204 | } |
205 | tlbcam_index = i; | ||
206 | |||
207 | return amount_mapped; | ||
208 | } | ||
170 | 209 | ||
171 | return virt - PAGE_OFFSET; | 210 | unsigned long __init mmu_mapin_ram(void) |
211 | { | ||
212 | return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; | ||
172 | } | 213 | } |
173 | 214 | ||
174 | /* | 215 | /* |
@@ -179,46 +220,21 @@ void __init MMU_init_hw(void) | |||
179 | flush_instruction_cache(); | 220 | flush_instruction_cache(); |
180 | } | 221 | } |
181 | 222 | ||
182 | void __init | 223 | void __init adjust_total_lowmem(void) |
183 | adjust_total_lowmem(void) | ||
184 | { | 224 | { |
185 | phys_addr_t ram; | 225 | unsigned long ram; |
186 | unsigned int max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xff; | ||
187 | char buf[ARRAY_SIZE(cam) * 5 + 1], *p = buf; | ||
188 | int i; | 226 | int i; |
189 | unsigned long virt = PAGE_OFFSET & 0xffffffffUL; | ||
190 | unsigned long phys = memstart_addr & 0xffffffffUL; | ||
191 | |||
192 | /* Convert (4^max) kB to (2^max) bytes */ | ||
193 | max_cam = max_cam * 2 + 10; | ||
194 | 227 | ||
195 | /* adjust lowmem size to __max_low_memory */ | 228 | /* adjust lowmem size to __max_low_memory */ |
196 | ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); | 229 | ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); |
197 | 230 | ||
198 | /* Calculate CAM values */ | 231 | __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM); |
199 | __max_low_memory = 0; | ||
200 | for (i = 0; ram && i < ARRAY_SIZE(cam); i++) { | ||
201 | unsigned int camsize = __ilog2(ram) & ~1U; | ||
202 | unsigned int align = __ffs(virt | phys) & ~1U; | ||
203 | 232 | ||
204 | if (camsize > align) | 233 | pr_info("Memory CAM mapping: "); |
205 | camsize = align; | 234 | for (i = 0; i < tlbcam_index - 1; i++) |
206 | if (camsize > max_cam) | 235 | pr_cont("%lu/", tlbcam_sz(i) >> 20); |
207 | camsize = max_cam; | 236 | pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, |
208 | |||
209 | cam[i] = 1UL << camsize; | ||
210 | ram -= cam[i]; | ||
211 | __max_low_memory += cam[i]; | ||
212 | virt += cam[i]; | ||
213 | phys += cam[i]; | ||
214 | |||
215 | p += sprintf(p, "%lu/", cam[i] >> 20); | ||
216 | } | ||
217 | for (; i < ARRAY_SIZE(cam); i++) | ||
218 | p += sprintf(p, "0/"); | ||
219 | p[-1] = '\0'; | ||
220 | |||
221 | pr_info("Memory CAM mapping: %s Mb, residual: %dMb\n", buf, | ||
222 | (unsigned int)((total_lowmem - __max_low_memory) >> 20)); | 237 | (unsigned int)((total_lowmem - __max_low_memory) >> 20)); |
238 | |||
223 | __initial_memory_limit_addr = memstart_addr + __max_low_memory; | 239 | __initial_memory_limit_addr = memstart_addr + __max_low_memory; |
224 | } | 240 | } |
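As a side note on the sizing loop in map_mem_in_cams() above: each CAM entry must be an even power of two no larger than the remaining RAM, no larger than the alignment of the current virtual/physical address, and no larger than the biggest size the core reports. The standalone sketch below walks that arithmetic with assumed values (768 MB of lowmem at PAGE_OFFSET 0xc0000000, a 256 MB maximum CAM size, three CAM entries); the numbers are illustrative only and not taken from the patch.

#include <stdio.h>

/* Minimal stand-ins for the kernel's __ilog2() and __ffs() helpers. */
static unsigned int ilog2_ul(unsigned long v)
{
	unsigned int r = 0;
	while (v >>= 1)
		r++;
	return r;
}

static unsigned int ffs_ul(unsigned long v)
{
	unsigned int r = 0;
	while (!(v & 1)) {
		v >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned long ram  = 768UL << 20;	/* assumed lowmem to cover */
	unsigned long virt = 0xc0000000UL;	/* assumed PAGE_OFFSET */
	unsigned long phys = 0;			/* assumed memstart_addr */
	unsigned int max_cam = 28;		/* assumed: 2^28 = 256 MB max entry */
	int i;

	for (i = 0; ram && i < 3; i++) {
		/* CAM sizes must be an even power of two (4K, 16K, 64K, ...) */
		unsigned int camsize = ilog2_ul(ram) & ~1U;
		unsigned int align = ffs_ul(virt | phys) & ~1U;
		unsigned long cam_sz;

		if (camsize > align)
			camsize = align;
		if (camsize > max_cam)
			camsize = max_cam;

		cam_sz = 1UL << camsize;
		printf("CAM %d: %lu MB at 0x%08lx\n", i, cam_sz >> 20, virt);

		ram  -= cam_sz;
		virt += cam_sz;
		phys += cam_sz;
	}
	return 0;
}

With these inputs the loop produces three 256 MB entries covering all 768 MB, which is why the alignment term matters: once virt and phys advance by 256 MB they stay 256 MB aligned, so later entries are not forced smaller.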
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index bc122a120bf0..d7efdbf640c7 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | |||
55 | return 1; | 55 | return 1; |
56 | } | 56 | } |
57 | 57 | ||
58 | #ifdef CONFIG_HUGETLB_PAGE | ||
59 | static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate, | ||
60 | unsigned long *addr, unsigned long end, | ||
61 | int write, struct page **pages, int *nr) | ||
62 | { | ||
63 | unsigned long mask; | ||
64 | unsigned long pte_end; | ||
65 | struct page *head, *page; | ||
66 | pte_t pte; | ||
67 | int refs; | ||
68 | |||
69 | pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate); | ||
70 | if (pte_end < end) | ||
71 | end = pte_end; | ||
72 | |||
73 | pte = *ptep; | ||
74 | mask = _PAGE_PRESENT|_PAGE_USER; | ||
75 | if (write) | ||
76 | mask |= _PAGE_RW; | ||
77 | if ((pte_val(pte) & mask) != mask) | ||
78 | return 0; | ||
79 | /* hugepages are never "special" */ | ||
80 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
81 | |||
82 | refs = 0; | ||
83 | head = pte_page(pte); | ||
84 | page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT); | ||
85 | do { | ||
86 | VM_BUG_ON(compound_head(page) != head); | ||
87 | pages[*nr] = page; | ||
88 | (*nr)++; | ||
89 | page++; | ||
90 | refs++; | ||
91 | } while (*addr += PAGE_SIZE, *addr != end); | ||
92 | |||
93 | if (!page_cache_add_speculative(head, refs)) { | ||
94 | *nr -= refs; | ||
95 | return 0; | ||
96 | } | ||
97 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | ||
98 | /* Could be optimized better */ | ||
99 | while (*nr) { | ||
100 | put_page(page); | ||
101 | (*nr)--; | ||
102 | } | ||
103 | } | ||
104 | |||
105 | return 1; | ||
106 | } | ||
107 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
108 | |||
109 | static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | 58 | static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, |
110 | int write, struct page **pages, int *nr) | 59 | int write, struct page **pages, int *nr) |
111 | { | 60 | { |
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
119 | next = pmd_addr_end(addr, end); | 68 | next = pmd_addr_end(addr, end); |
120 | if (pmd_none(pmd)) | 69 | if (pmd_none(pmd)) |
121 | return 0; | 70 | return 0; |
122 | if (!gup_pte_range(pmd, addr, next, write, pages, nr)) | 71 | if (is_hugepd(pmdp)) { |
72 | if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT, | ||
73 | addr, next, write, pages, nr)) | ||
74 | return 0; | ||
75 | } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) | ||
123 | return 0; | 76 | return 0; |
124 | } while (pmdp++, addr = next, addr != end); | 77 | } while (pmdp++, addr = next, addr != end); |
125 | 78 | ||
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, | |||
139 | next = pud_addr_end(addr, end); | 92 | next = pud_addr_end(addr, end); |
140 | if (pud_none(pud)) | 93 | if (pud_none(pud)) |
141 | return 0; | 94 | return 0; |
142 | if (!gup_pmd_range(pud, addr, next, write, pages, nr)) | 95 | if (is_hugepd(pudp)) { |
96 | if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT, | ||
97 | addr, next, write, pages, nr)) | ||
98 | return 0; | ||
99 | } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) | ||
143 | return 0; | 100 | return 0; |
144 | } while (pudp++, addr = next, addr != end); | 101 | } while (pudp++, addr = next, addr != end); |
145 | 102 | ||
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
154 | unsigned long next; | 111 | unsigned long next; |
155 | pgd_t *pgdp; | 112 | pgd_t *pgdp; |
156 | int nr = 0; | 113 | int nr = 0; |
157 | #ifdef CONFIG_PPC64 | ||
158 | unsigned int shift; | ||
159 | int psize; | ||
160 | #endif | ||
161 | 114 | ||
162 | pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); | 115 | pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); |
163 | 116 | ||
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
172 | 125 | ||
173 | pr_devel(" aligned: %lx .. %lx\n", start, end); | 126 | pr_devel(" aligned: %lx .. %lx\n", start, end); |
174 | 127 | ||
175 | #ifdef CONFIG_HUGETLB_PAGE | ||
176 | /* We bail out on slice boundary crossing when hugetlb is | ||
177 | * enabled in order to not have to deal with two different | ||
178 | * page table formats | ||
179 | */ | ||
180 | if (addr < SLICE_LOW_TOP) { | ||
181 | if (end > SLICE_LOW_TOP) | ||
182 | goto slow_irqon; | ||
183 | |||
184 | if (unlikely(GET_LOW_SLICE_INDEX(addr) != | ||
185 | GET_LOW_SLICE_INDEX(end - 1))) | ||
186 | goto slow_irqon; | ||
187 | } else { | ||
188 | if (unlikely(GET_HIGH_SLICE_INDEX(addr) != | ||
189 | GET_HIGH_SLICE_INDEX(end - 1))) | ||
190 | goto slow_irqon; | ||
191 | } | ||
192 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
193 | |||
194 | /* | 128 | /* |
195 | * XXX: batch / limit 'nr', to avoid large irq off latency | 129 | * XXX: batch / limit 'nr', to avoid large irq off latency |
196 | * needs some instrumenting to determine the common sizes used by | 130 | * needs some instrumenting to determine the common sizes used by |
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
210 | */ | 144 | */ |
211 | local_irq_disable(); | 145 | local_irq_disable(); |
212 | 146 | ||
213 | #ifdef CONFIG_PPC64 | 147 | pgdp = pgd_offset(mm, addr); |
214 | /* Those bits are related to hugetlbfs implementation and only exist | 148 | do { |
215 | * on 64-bit for now | 149 | pgd_t pgd = *pgdp; |
216 | */ | 150 | |
217 | psize = get_slice_psize(mm, addr); | 151 | pr_devel(" %016lx: normal pgd %p\n", addr, |
218 | shift = mmu_psize_defs[psize].shift; | 152 | (void *)pgd_val(pgd)); |
219 | #endif /* CONFIG_PPC64 */ | 153 | next = pgd_addr_end(addr, end); |
220 | 154 | if (pgd_none(pgd)) | |
221 | #ifdef CONFIG_HUGETLB_PAGE | 155 | goto slow; |
222 | if (unlikely(mmu_huge_psizes[psize])) { | 156 | if (is_hugepd(pgdp)) { |
223 | pte_t *ptep; | 157 | if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, |
224 | unsigned long a = addr; | 158 | addr, next, write, pages, &nr)) |
225 | unsigned long sz = ((1UL) << shift); | ||
226 | struct hstate *hstate = size_to_hstate(sz); | ||
227 | |||
228 | BUG_ON(!hstate); | ||
229 | /* | ||
230 | * XXX: could be optimized to avoid hstate | ||
231 | * lookup entirely (just use shift) | ||
232 | */ | ||
233 | |||
234 | do { | ||
235 | VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift); | ||
236 | ptep = huge_pte_offset(mm, a); | ||
237 | pr_devel(" %016lx: huge ptep %p\n", a, ptep); | ||
238 | if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages, | ||
239 | &nr)) | ||
240 | goto slow; | ||
241 | } while (a != end); | ||
242 | } else | ||
243 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
244 | { | ||
245 | pgdp = pgd_offset(mm, addr); | ||
246 | do { | ||
247 | pgd_t pgd = *pgdp; | ||
248 | |||
249 | #ifdef CONFIG_PPC64 | ||
250 | VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift); | ||
251 | #endif | ||
252 | pr_devel(" %016lx: normal pgd %p\n", addr, | ||
253 | (void *)pgd_val(pgd)); | ||
254 | next = pgd_addr_end(addr, end); | ||
255 | if (pgd_none(pgd)) | ||
256 | goto slow; | ||
257 | if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) | ||
258 | goto slow; | 159 | goto slow; |
259 | } while (pgdp++, addr = next, addr != end); | 160 | } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) |
260 | } | 161 | goto slow; |
162 | } while (pgdp++, addr = next, addr != end); | ||
163 | |||
261 | local_irq_enable(); | 164 | local_irq_enable(); |
262 | 165 | ||
263 | VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); | 166 | VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); |
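For context, a hedged usage sketch of the interface being reworked here, not part of the patch: a caller pins user pages through the lockless fast path and drops the references when done. The buffer pointer and page count are made up for illustration.

/*
 * Hedged usage sketch only: pin up to 16 user pages covering 'buf'
 * for writing via the lockless fast path, then release them again.
 */
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/pagemap.h>

static int pin_example(void __user *buf)
{
	struct page *pages[16];
	int i, got;

	got = get_user_pages_fast((unsigned long)buf, 16, 1 /* write */, pages);
	if (got <= 0)
		return got ? got : -EFAULT;

	/* ... use the pinned pages, e.g. to set up DMA ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);
	return 0;
}

The point of the patch is that such a caller no longer falls back to the slow path merely because the range crosses a slice boundary; each page-table level now dispatches to gup_hugepd() when it meets a hugepd pointer.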
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1ade7eb6ae00..50f867d657df 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -92,6 +92,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; | |||
92 | struct hash_pte *htab_address; | 92 | struct hash_pte *htab_address; |
93 | unsigned long htab_size_bytes; | 93 | unsigned long htab_size_bytes; |
94 | unsigned long htab_hash_mask; | 94 | unsigned long htab_hash_mask; |
95 | EXPORT_SYMBOL_GPL(htab_hash_mask); | ||
95 | int mmu_linear_psize = MMU_PAGE_4K; | 96 | int mmu_linear_psize = MMU_PAGE_4K; |
96 | int mmu_virtual_psize = MMU_PAGE_4K; | 97 | int mmu_virtual_psize = MMU_PAGE_4K; |
97 | int mmu_vmalloc_psize = MMU_PAGE_4K; | 98 | int mmu_vmalloc_psize = MMU_PAGE_4K; |
@@ -102,6 +103,7 @@ int mmu_io_psize = MMU_PAGE_4K; | |||
102 | int mmu_kernel_ssize = MMU_SEGSIZE_256M; | 103 | int mmu_kernel_ssize = MMU_SEGSIZE_256M; |
103 | int mmu_highuser_ssize = MMU_SEGSIZE_256M; | 104 | int mmu_highuser_ssize = MMU_SEGSIZE_256M; |
104 | u16 mmu_slb_size = 64; | 105 | u16 mmu_slb_size = 64; |
106 | EXPORT_SYMBOL_GPL(mmu_slb_size); | ||
105 | #ifdef CONFIG_HUGETLB_PAGE | 107 | #ifdef CONFIG_HUGETLB_PAGE |
106 | unsigned int HPAGE_SHIFT; | 108 | unsigned int HPAGE_SHIFT; |
107 | #endif | 109 | #endif |
@@ -481,16 +483,6 @@ static void __init htab_init_page_sizes(void) | |||
481 | #ifdef CONFIG_HUGETLB_PAGE | 483 | #ifdef CONFIG_HUGETLB_PAGE |
482 | /* Reserve 16G huge page memory sections for huge pages */ | 484 | /* Reserve 16G huge page memory sections for huge pages */ |
483 | of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); | 485 | of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); |
484 | |||
485 | /* Set default large page size. Currently, we pick 16M or 1M depending | ||
486 | * on what is available | ||
487 | */ | ||
488 | if (mmu_psize_defs[MMU_PAGE_16M].shift) | ||
489 | HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; | ||
490 | /* With 4k/4level pagetables, we can't (for now) cope with a | ||
491 | * huge page size < PMD_SIZE */ | ||
492 | else if (mmu_psize_defs[MMU_PAGE_1M].shift) | ||
493 | HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; | ||
494 | #endif /* CONFIG_HUGETLB_PAGE */ | 486 | #endif /* CONFIG_HUGETLB_PAGE */ |
495 | } | 487 | } |
496 | 488 | ||
@@ -785,7 +777,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) | |||
785 | /* page is dirty */ | 777 | /* page is dirty */ |
786 | if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { | 778 | if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { |
787 | if (trap == 0x400) { | 779 | if (trap == 0x400) { |
788 | __flush_dcache_icache(page_address(page)); | 780 | flush_dcache_icache_page(page); |
789 | set_bit(PG_arch_1, &page->flags); | 781 | set_bit(PG_arch_1, &page->flags); |
790 | } else | 782 | } else |
791 | pp |= HPTE_R_N; | 783 | pp |= HPTE_R_N; |
@@ -843,9 +835,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) | |||
843 | * Result is 0: full permissions, _PAGE_RW: read-only, | 835 | * Result is 0: full permissions, _PAGE_RW: read-only, |
844 | * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. | 836 | * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. |
845 | */ | 837 | */ |
846 | static int subpage_protection(pgd_t *pgdir, unsigned long ea) | 838 | static int subpage_protection(struct mm_struct *mm, unsigned long ea) |
847 | { | 839 | { |
848 | struct subpage_prot_table *spt = pgd_subpage_prot(pgdir); | 840 | struct subpage_prot_table *spt = &mm->context.spt; |
849 | u32 spp = 0; | 841 | u32 spp = 0; |
850 | u32 **sbpm, *sbpp; | 842 | u32 **sbpm, *sbpp; |
851 | 843 | ||
@@ -873,7 +865,7 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea) | |||
873 | } | 865 | } |
874 | 866 | ||
875 | #else /* CONFIG_PPC_SUBPAGE_PROT */ | 867 | #else /* CONFIG_PPC_SUBPAGE_PROT */ |
876 | static inline int subpage_protection(pgd_t *pgdir, unsigned long ea) | 868 | static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) |
877 | { | 869 | { |
878 | return 0; | 870 | return 0; |
879 | } | 871 | } |
@@ -891,6 +883,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) | |||
891 | unsigned long vsid; | 883 | unsigned long vsid; |
892 | struct mm_struct *mm; | 884 | struct mm_struct *mm; |
893 | pte_t *ptep; | 885 | pte_t *ptep; |
886 | unsigned hugeshift; | ||
894 | const struct cpumask *tmp; | 887 | const struct cpumask *tmp; |
895 | int rc, user_region = 0, local = 0; | 888 | int rc, user_region = 0, local = 0; |
896 | int psize, ssize; | 889 | int psize, ssize; |
@@ -943,30 +936,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) | |||
943 | if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) | 936 | if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) |
944 | local = 1; | 937 | local = 1; |
945 | 938 | ||
946 | #ifdef CONFIG_HUGETLB_PAGE | ||
947 | /* Handle hugepage regions */ | ||
948 | if (HPAGE_SHIFT && mmu_huge_psizes[psize]) { | ||
949 | DBG_LOW(" -> huge page !\n"); | ||
950 | return hash_huge_page(mm, access, ea, vsid, local, trap); | ||
951 | } | ||
952 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
953 | |||
954 | #ifndef CONFIG_PPC_64K_PAGES | 939 | #ifndef CONFIG_PPC_64K_PAGES |
955 | /* If we use 4K pages and our psize is not 4K, then we are hitting | 940 | /* If we use 4K pages and our psize is not 4K, then we might |
956 | * a special driver mapping, we need to align the address before | 941 | * be hitting a special driver mapping, and need to align the |
957 | * we fetch the PTE | 942 | * address before we fetch the PTE. |
943 | * | ||
944 | * It could also be a hugepage mapping, in which case this is | ||
945 | * not necessary, but it's not harmful, either. | ||
958 | */ | 946 | */ |
959 | if (psize != MMU_PAGE_4K) | 947 | if (psize != MMU_PAGE_4K) |
960 | ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); | 948 | ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); |
961 | #endif /* CONFIG_PPC_64K_PAGES */ | 949 | #endif /* CONFIG_PPC_64K_PAGES */ |
962 | 950 | ||
963 | /* Get PTE and page size from page tables */ | 951 | /* Get PTE and page size from page tables */ |
964 | ptep = find_linux_pte(pgdir, ea); | 952 | ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); |
965 | if (ptep == NULL || !pte_present(*ptep)) { | 953 | if (ptep == NULL || !pte_present(*ptep)) { |
966 | DBG_LOW(" no PTE !\n"); | 954 | DBG_LOW(" no PTE !\n"); |
967 | return 1; | 955 | return 1; |
968 | } | 956 | } |
969 | 957 | ||
958 | #ifdef CONFIG_HUGETLB_PAGE | ||
959 | if (hugeshift) | ||
960 | return __hash_page_huge(ea, access, vsid, ptep, trap, local, | ||
961 | ssize, hugeshift, psize); | ||
962 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
963 | |||
970 | #ifndef CONFIG_PPC_64K_PAGES | 964 | #ifndef CONFIG_PPC_64K_PAGES |
971 | DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); | 965 | DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); |
972 | #else | 966 | #else |
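The comment added above notes that the pre-lookup alignment is harmless for hugepage mappings. The masking itself is easy to see with an assumed address: for a non-4K psize the effective address is rounded down to the start of its (here 16M) page before the PTE is fetched. The values below are illustrative only.

#include <stdio.h>

int main(void)
{
	/* assumed faulting address inside a 16M (shift 24) mapping */
	unsigned long ea = 0xd000080001234567UL;
	unsigned int shift = 24;	/* mmu_psize_defs[psize].shift */

	ea &= ~((1UL << shift) - 1);
	printf("aligned ea = %#lx\n", ea);	/* 0xd000080001000000 */
	return 0;
}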
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
new file mode 100644
index 000000000000..199539882f92
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -0,0 +1,139 @@ | |||
1 | /* | ||
2 | * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) | ||
3 | * | ||
4 | * Copyright (C) 2003 David Gibson, IBM Corporation. | ||
5 | * | ||
6 | * Based on the IA-32 version: | ||
7 | * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> | ||
8 | */ | ||
9 | |||
10 | #include <linux/mm.h> | ||
11 | #include <linux/hugetlb.h> | ||
12 | #include <asm/pgtable.h> | ||
13 | #include <asm/pgalloc.h> | ||
14 | #include <asm/cacheflush.h> | ||
15 | #include <asm/machdep.h> | ||
16 | |||
17 | int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, | ||
18 | pte_t *ptep, unsigned long trap, int local, int ssize, | ||
19 | unsigned int shift, unsigned int mmu_psize) | ||
20 | { | ||
21 | unsigned long old_pte, new_pte; | ||
22 | unsigned long va, rflags, pa, sz; | ||
23 | long slot; | ||
24 | int err = 1; | ||
25 | |||
26 | BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); | ||
27 | |||
28 | /* Search the Linux page table for a match with va */ | ||
29 | va = hpt_va(ea, vsid, ssize); | ||
30 | |||
31 | /* | ||
32 | * Check the user's access rights to the page. If access should be | ||
33 | * prevented then send the problem up to do_page_fault. | ||
34 | */ | ||
35 | if (unlikely(access & ~pte_val(*ptep))) | ||
36 | goto out; | ||
37 | /* | ||
38 | * At this point, we have a pte (old_pte) which can be used to build | ||
39 | * or update an HPTE. There are 2 cases: | ||
40 | * | ||
41 | * 1. There is a valid (present) pte with no associated HPTE (this is | ||
42 | * the most common case) | ||
43 | * 2. There is a valid (present) pte with an associated HPTE. The | ||
44 | * current values of the pp bits in the HPTE prevent access | ||
45 | * because we are doing software DIRTY bit management and the | ||
46 | * page is currently not DIRTY. | ||
47 | */ | ||
48 | |||
49 | |||
50 | do { | ||
51 | old_pte = pte_val(*ptep); | ||
52 | if (old_pte & _PAGE_BUSY) | ||
53 | goto out; | ||
54 | new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; | ||
55 | } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, | ||
56 | old_pte, new_pte)); | ||
57 | |||
58 | rflags = 0x2 | (!(new_pte & _PAGE_RW)); | ||
59 | /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ | ||
60 | rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); | ||
61 | sz = ((1UL) << shift); | ||
62 | if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) | ||
63 | /* No CPU has hugepages but lacks no execute, so we | ||
64 | * don't need to worry about that case */ | ||
65 | rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); | ||
66 | |||
67 | /* Check if pte already has an hpte (case 2) */ | ||
68 | if (unlikely(old_pte & _PAGE_HASHPTE)) { | ||
69 | /* There MIGHT be an HPTE for this pte */ | ||
70 | unsigned long hash, slot; | ||
71 | |||
72 | hash = hpt_hash(va, shift, ssize); | ||
73 | if (old_pte & _PAGE_F_SECOND) | ||
74 | hash = ~hash; | ||
75 | slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; | ||
76 | slot += (old_pte & _PAGE_F_GIX) >> 12; | ||
77 | |||
78 | if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize, | ||
79 | ssize, local) == -1) | ||
80 | old_pte &= ~_PAGE_HPTEFLAGS; | ||
81 | } | ||
82 | |||
83 | if (likely(!(old_pte & _PAGE_HASHPTE))) { | ||
84 | unsigned long hash = hpt_hash(va, shift, ssize); | ||
85 | unsigned long hpte_group; | ||
86 | |||
87 | pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; | ||
88 | |||
89 | repeat: | ||
90 | hpte_group = ((hash & htab_hash_mask) * | ||
91 | HPTES_PER_GROUP) & ~0x7UL; | ||
92 | |||
93 | /* clear HPTE slot information in new PTE */ | ||
94 | #ifdef CONFIG_PPC_64K_PAGES | ||
95 | new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; | ||
96 | #else | ||
97 | new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; | ||
98 | #endif | ||
99 | /* Add in WIMG bits */ | ||
100 | rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | | ||
101 | _PAGE_COHERENT | _PAGE_GUARDED)); | ||
102 | |||
103 | /* Insert into the hash table, primary slot */ | ||
104 | slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, | ||
105 | mmu_psize, ssize); | ||
106 | |||
107 | /* Primary is full, try the secondary */ | ||
108 | if (unlikely(slot == -1)) { | ||
109 | hpte_group = ((~hash & htab_hash_mask) * | ||
110 | HPTES_PER_GROUP) & ~0x7UL; | ||
111 | slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, | ||
112 | HPTE_V_SECONDARY, | ||
113 | mmu_psize, ssize); | ||
114 | if (slot == -1) { | ||
115 | if (mftb() & 0x1) | ||
116 | hpte_group = ((hash & htab_hash_mask) * | ||
117 | HPTES_PER_GROUP)&~0x7UL; | ||
118 | |||
119 | ppc_md.hpte_remove(hpte_group); | ||
120 | goto repeat; | ||
121 | } | ||
122 | } | ||
123 | |||
124 | if (unlikely(slot == -2)) | ||
125 | panic("hash_huge_page: pte_insert failed\n"); | ||
126 | |||
127 | new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * No need to use ldarx/stdcx here | ||
132 | */ | ||
133 | *ptep = __pte(new_pte & ~_PAGE_BUSY); | ||
134 | |||
135 | err = 0; | ||
136 | |||
137 | out: | ||
138 | return err; | ||
139 | } | ||
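One detail worth calling out in the insertion loop above is how the primary and secondary PTEG indices are derived from the hash value, and how the returned slot is folded back into the Linux PTE. The toy program below reproduces just that arithmetic; the hash value, htab_hash_mask and the _PAGE_F_SECOND/_PAGE_F_GIX bit positions are assumptions for illustration, while the group and slot formulas come from the code above.

#include <stdio.h>

#define HPTES_PER_GROUP	8
#define PAGE_F_GIX	0x7000UL	/* assumed encoding, bits 12-14 */
#define PAGE_F_SECOND	0x8000UL	/* assumed encoding, bit 15 */

int main(void)
{
	unsigned long htab_hash_mask = 0xfffffUL;	/* assumed table size */
	unsigned long hash = 0x123456789UL;		/* assumed hpt_hash() result */
	unsigned long primary, secondary, slot, new_pte = 0;

	primary   = (( hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
	secondary = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

	/* say hpte_insert() placed the entry in slot 5 of the secondary group */
	slot = 0x8 | 5;
	new_pte |= (slot << 12) & (PAGE_F_SECOND | PAGE_F_GIX);

	printf("primary PTEG %#lx, secondary PTEG %#lx, pte flags %#lx\n",
	       primary, secondary, new_pte);
	return 0;
}

This mirrors the update path as well: on a later fault the code above rebuilds the slot from the PTE with (old_pte & _PAGE_F_GIX) >> 12 and flips the hash when _PAGE_F_SECOND is set.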
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 90df6ffe3a43..123f7070238a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -7,29 +7,17 @@ | |||
7 | * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> | 7 | * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/fs.h> | ||
12 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <linux/io.h> | ||
13 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
14 | #include <linux/pagemap.h> | 13 | #include <asm/pgtable.h> |
15 | #include <linux/slab.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <linux/sysctl.h> | ||
18 | #include <asm/mman.h> | ||
19 | #include <asm/pgalloc.h> | 14 | #include <asm/pgalloc.h> |
20 | #include <asm/tlb.h> | 15 | #include <asm/tlb.h> |
21 | #include <asm/tlbflush.h> | ||
22 | #include <asm/mmu_context.h> | ||
23 | #include <asm/machdep.h> | ||
24 | #include <asm/cputable.h> | ||
25 | #include <asm/spu.h> | ||
26 | 16 | ||
27 | #define PAGE_SHIFT_64K 16 | 17 | #define PAGE_SHIFT_64K 16 |
28 | #define PAGE_SHIFT_16M 24 | 18 | #define PAGE_SHIFT_16M 24 |
29 | #define PAGE_SHIFT_16G 34 | 19 | #define PAGE_SHIFT_16G 34 |
30 | 20 | ||
31 | #define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) | ||
32 | #define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) | ||
33 | #define MAX_NUMBER_GPAGES 1024 | 21 | #define MAX_NUMBER_GPAGES 1024 |
34 | 22 | ||
35 | /* Tracks the 16G pages after the device tree is scanned and before the | 23 | /* Tracks the 16G pages after the device tree is scanned and before the |
@@ -37,53 +25,17 @@ | |||
37 | static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; | 25 | static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; |
38 | static unsigned nr_gpages; | 26 | static unsigned nr_gpages; |
39 | 27 | ||
40 | /* Array of valid huge page sizes - non-zero value(hugepte_shift) is | ||
41 | * stored for the huge page sizes that are valid. | ||
42 | */ | ||
43 | unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ | ||
44 | |||
45 | #define hugepte_shift mmu_huge_psizes | ||
46 | #define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize]) | ||
47 | #define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize]) | ||
48 | |||
49 | #define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \ | ||
50 | + hugepte_shift[psize]) | ||
51 | #define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize)) | ||
52 | #define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1)) | ||
53 | |||
54 | /* Subtract one from array size because we don't need a cache for 4K since | ||
55 | * is not a huge page size */ | ||
56 | #define HUGE_PGTABLE_INDEX(psize) (HUGEPTE_CACHE_NUM + psize - 1) | ||
57 | #define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) | ||
58 | |||
59 | static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { | ||
60 | [MMU_PAGE_64K] = "hugepte_cache_64K", | ||
61 | [MMU_PAGE_1M] = "hugepte_cache_1M", | ||
62 | [MMU_PAGE_16M] = "hugepte_cache_16M", | ||
63 | [MMU_PAGE_16G] = "hugepte_cache_16G", | ||
64 | }; | ||
65 | |||
66 | /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() | 28 | /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() |
67 | * will choke on pointers to hugepte tables, which is handy for | 29 | * will choke on pointers to hugepte tables, which is handy for |
68 | * catching screwups early. */ | 30 | * catching screwups early. */ |
69 | #define HUGEPD_OK 0x1 | ||
70 | |||
71 | typedef struct { unsigned long pd; } hugepd_t; | ||
72 | |||
73 | #define hugepd_none(hpd) ((hpd).pd == 0) | ||
74 | 31 | ||
75 | static inline int shift_to_mmu_psize(unsigned int shift) | 32 | static inline int shift_to_mmu_psize(unsigned int shift) |
76 | { | 33 | { |
77 | switch (shift) { | 34 | int psize; |
78 | #ifndef CONFIG_PPC_64K_PAGES | 35 | |
79 | case PAGE_SHIFT_64K: | 36 | for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) |
80 | return MMU_PAGE_64K; | 37 | if (mmu_psize_defs[psize].shift == shift) |
81 | #endif | 38 | return psize; |
82 | case PAGE_SHIFT_16M: | ||
83 | return MMU_PAGE_16M; | ||
84 | case PAGE_SHIFT_16G: | ||
85 | return MMU_PAGE_16G; | ||
86 | } | ||
87 | return -1; | 39 | return -1; |
88 | } | 40 | } |
89 | 41 | ||
@@ -94,71 +46,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) | |||
94 | BUG(); | 46 | BUG(); |
95 | } | 47 | } |
96 | 48 | ||
49 | #define hugepd_none(hpd) ((hpd).pd == 0) | ||
50 | |||
97 | static inline pte_t *hugepd_page(hugepd_t hpd) | 51 | static inline pte_t *hugepd_page(hugepd_t hpd) |
98 | { | 52 | { |
99 | BUG_ON(!(hpd.pd & HUGEPD_OK)); | 53 | BUG_ON(!hugepd_ok(hpd)); |
100 | return (pte_t *)(hpd.pd & ~HUGEPD_OK); | 54 | return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000); |
55 | } | ||
56 | |||
57 | static inline unsigned int hugepd_shift(hugepd_t hpd) | ||
58 | { | ||
59 | return hpd.pd & HUGEPD_SHIFT_MASK; | ||
101 | } | 60 | } |
102 | 61 | ||
103 | static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, | 62 | static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift) |
104 | struct hstate *hstate) | ||
105 | { | 63 | { |
106 | unsigned int shift = huge_page_shift(hstate); | 64 | unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); |
107 | int psize = shift_to_mmu_psize(shift); | ||
108 | unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1)); | ||
109 | pte_t *dir = hugepd_page(*hpdp); | 65 | pte_t *dir = hugepd_page(*hpdp); |
110 | 66 | ||
111 | return dir + idx; | 67 | return dir + idx; |
112 | } | 68 | } |
113 | 69 | ||
70 | pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) | ||
71 | { | ||
72 | pgd_t *pg; | ||
73 | pud_t *pu; | ||
74 | pmd_t *pm; | ||
75 | hugepd_t *hpdp = NULL; | ||
76 | unsigned pdshift = PGDIR_SHIFT; | ||
77 | |||
78 | if (shift) | ||
79 | *shift = 0; | ||
80 | |||
81 | pg = pgdir + pgd_index(ea); | ||
82 | if (is_hugepd(pg)) { | ||
83 | hpdp = (hugepd_t *)pg; | ||
84 | } else if (!pgd_none(*pg)) { | ||
85 | pdshift = PUD_SHIFT; | ||
86 | pu = pud_offset(pg, ea); | ||
87 | if (is_hugepd(pu)) | ||
88 | hpdp = (hugepd_t *)pu; | ||
89 | else if (!pud_none(*pu)) { | ||
90 | pdshift = PMD_SHIFT; | ||
91 | pm = pmd_offset(pu, ea); | ||
92 | if (is_hugepd(pm)) | ||
93 | hpdp = (hugepd_t *)pm; | ||
94 | else if (!pmd_none(*pm)) { | ||
95 | return pte_offset_map(pm, ea); | ||
96 | } | ||
97 | } | ||
98 | } | ||
99 | |||
100 | if (!hpdp) | ||
101 | return NULL; | ||
102 | |||
103 | if (shift) | ||
104 | *shift = hugepd_shift(*hpdp); | ||
105 | return hugepte_offset(hpdp, ea, pdshift); | ||
106 | } | ||
107 | |||
108 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | ||
109 | { | ||
110 | return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); | ||
111 | } | ||
112 | |||
114 | static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, | 113 | static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, |
115 | unsigned long address, unsigned int psize) | 114 | unsigned long address, unsigned pdshift, unsigned pshift) |
116 | { | 115 | { |
117 | pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], | 116 | pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift), |
118 | GFP_KERNEL|__GFP_REPEAT); | 117 | GFP_KERNEL|__GFP_REPEAT); |
118 | |||
119 | BUG_ON(pshift > HUGEPD_SHIFT_MASK); | ||
120 | BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); | ||
119 | 121 | ||
120 | if (! new) | 122 | if (! new) |
121 | return -ENOMEM; | 123 | return -ENOMEM; |
122 | 124 | ||
123 | spin_lock(&mm->page_table_lock); | 125 | spin_lock(&mm->page_table_lock); |
124 | if (!hugepd_none(*hpdp)) | 126 | if (!hugepd_none(*hpdp)) |
125 | kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new); | 127 | kmem_cache_free(PGT_CACHE(pdshift - pshift), new); |
126 | else | 128 | else |
127 | hpdp->pd = (unsigned long)new | HUGEPD_OK; | 129 | hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift; |
128 | spin_unlock(&mm->page_table_lock); | 130 | spin_unlock(&mm->page_table_lock); |
129 | return 0; | 131 | return 0; |
130 | } | 132 | } |
131 | 133 | ||
132 | 134 | pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) | |
133 | static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate) | ||
134 | { | ||
135 | if (huge_page_shift(hstate) < PUD_SHIFT) | ||
136 | return pud_offset(pgd, addr); | ||
137 | else | ||
138 | return (pud_t *) pgd; | ||
139 | } | ||
140 | static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, | ||
141 | struct hstate *hstate) | ||
142 | { | 135 | { |
143 | if (huge_page_shift(hstate) < PUD_SHIFT) | 136 | pgd_t *pg; |
144 | return pud_alloc(mm, pgd, addr); | 137 | pud_t *pu; |
145 | else | 138 | pmd_t *pm; |
146 | return (pud_t *) pgd; | 139 | hugepd_t *hpdp = NULL; |
147 | } | 140 | unsigned pshift = __ffs(sz); |
148 | static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) | 141 | unsigned pdshift = PGDIR_SHIFT; |
149 | { | 142 | |
150 | if (huge_page_shift(hstate) < PMD_SHIFT) | 143 | addr &= ~(sz-1); |
151 | return pmd_offset(pud, addr); | 144 | |
152 | else | 145 | pg = pgd_offset(mm, addr); |
153 | return (pmd_t *) pud; | 146 | if (pshift >= PUD_SHIFT) { |
154 | } | 147 | hpdp = (hugepd_t *)pg; |
155 | static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, | 148 | } else { |
156 | struct hstate *hstate) | 149 | pdshift = PUD_SHIFT; |
157 | { | 150 | pu = pud_alloc(mm, pg, addr); |
158 | if (huge_page_shift(hstate) < PMD_SHIFT) | 151 | if (pshift >= PMD_SHIFT) { |
159 | return pmd_alloc(mm, pud, addr); | 152 | hpdp = (hugepd_t *)pu; |
160 | else | 153 | } else { |
161 | return (pmd_t *) pud; | 154 | pdshift = PMD_SHIFT; |
155 | pm = pmd_alloc(mm, pu, addr); | ||
156 | hpdp = (hugepd_t *)pm; | ||
157 | } | ||
158 | } | ||
159 | |||
160 | if (!hpdp) | ||
161 | return NULL; | ||
162 | |||
163 | BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); | ||
164 | |||
165 | if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) | ||
166 | return NULL; | ||
167 | |||
168 | return hugepte_offset(hpdp, addr, pdshift); | ||
162 | } | 169 | } |
163 | 170 | ||
164 | /* Build list of addresses of gigantic pages. This function is used in early | 171 | /* Build list of addresses of gigantic pages. This function is used in early |
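The hugepd encoding used by __hugepte_alloc(), hugepd_page() and hugepd_shift() above packs the kernel-linear address of the hugepte table and the huge page shift into one word: the 0x8000000000000000 top bit of the pointer is dropped (marking the entry as a hugepd rather than a normal page table pointer) and the shift is stored in the low bits, which the table's alignment leaves free, as the BUG_ON against HUGEPD_SHIFT_MASK checks. The round-trip below uses an assumed pointer value and assumes HUGEPD_SHIFT_MASK covers the low six bits; only the packing scheme itself comes from the patch.

#include <stdio.h>

#define TOP_BIT		0x8000000000000000UL
#define KERNEL_BASE	0xc000000000000000UL
#define SHIFT_MASK	0x3fUL	/* assumed value of HUGEPD_SHIFT_MASK */

int main(void)
{
	/* assumed kmem_cache allocation, at least 64-byte aligned */
	unsigned long hugepte_table = 0xc000000012345000UL;
	unsigned long pshift = 24;	/* 16M huge pages */

	/* encode, as __hugepte_alloc() does */
	unsigned long pd = (hugepte_table & ~TOP_BIT) | pshift;

	/* decode, as hugepd_page() and hugepd_shift() do */
	unsigned long table = (pd & ~SHIFT_MASK) | KERNEL_BASE;
	unsigned long shift = pd & SHIFT_MASK;

	printf("pd=%#lx table=%#lx shift=%lu\n", pd, table, shift);
	return 0;
}

Running this recovers the original table address and shift, which is why find_linux_pte_or_hugepte() can stop its walk as soon as it sees a hugepd at any level.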
@@ -192,94 +199,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate) | |||
192 | return 1; | 199 | return 1; |
193 | } | 200 | } |
194 | 201 | ||
195 | |||
196 | /* Modelled after find_linux_pte() */ | ||
197 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | ||
198 | { | ||
199 | pgd_t *pg; | ||
200 | pud_t *pu; | ||
201 | pmd_t *pm; | ||
202 | |||
203 | unsigned int psize; | ||
204 | unsigned int shift; | ||
205 | unsigned long sz; | ||
206 | struct hstate *hstate; | ||
207 | psize = get_slice_psize(mm, addr); | ||
208 | shift = mmu_psize_to_shift(psize); | ||
209 | sz = ((1UL) << shift); | ||
210 | hstate = size_to_hstate(sz); | ||
211 | |||
212 | addr &= hstate->mask; | ||
213 | |||
214 | pg = pgd_offset(mm, addr); | ||
215 | if (!pgd_none(*pg)) { | ||
216 | pu = hpud_offset(pg, addr, hstate); | ||
217 | if (!pud_none(*pu)) { | ||
218 | pm = hpmd_offset(pu, addr, hstate); | ||
219 | if (!pmd_none(*pm)) | ||
220 | return hugepte_offset((hugepd_t *)pm, addr, | ||
221 | hstate); | ||
222 | } | ||
223 | } | ||
224 | |||
225 | return NULL; | ||
226 | } | ||
227 | |||
228 | pte_t *huge_pte_alloc(struct mm_struct *mm, | ||
229 | unsigned long addr, unsigned long sz) | ||
230 | { | ||
231 | pgd_t *pg; | ||
232 | pud_t *pu; | ||
233 | pmd_t *pm; | ||
234 | hugepd_t *hpdp = NULL; | ||
235 | struct hstate *hstate; | ||
236 | unsigned int psize; | ||
237 | hstate = size_to_hstate(sz); | ||
238 | |||
239 | psize = get_slice_psize(mm, addr); | ||
240 | BUG_ON(!mmu_huge_psizes[psize]); | ||
241 | |||
242 | addr &= hstate->mask; | ||
243 | |||
244 | pg = pgd_offset(mm, addr); | ||
245 | pu = hpud_alloc(mm, pg, addr, hstate); | ||
246 | |||
247 | if (pu) { | ||
248 | pm = hpmd_alloc(mm, pu, addr, hstate); | ||
249 | if (pm) | ||
250 | hpdp = (hugepd_t *)pm; | ||
251 | } | ||
252 | |||
253 | if (! hpdp) | ||
254 | return NULL; | ||
255 | |||
256 | if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize)) | ||
257 | return NULL; | ||
258 | |||
259 | return hugepte_offset(hpdp, addr, hstate); | ||
260 | } | ||
261 | |||
262 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | 202 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) |
263 | { | 203 | { |
264 | return 0; | 204 | return 0; |
265 | } | 205 | } |
266 | 206 | ||
267 | static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, | 207 | static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, |
268 | unsigned int psize) | 208 | unsigned long start, unsigned long end, |
209 | unsigned long floor, unsigned long ceiling) | ||
269 | { | 210 | { |
270 | pte_t *hugepte = hugepd_page(*hpdp); | 211 | pte_t *hugepte = hugepd_page(*hpdp); |
212 | unsigned shift = hugepd_shift(*hpdp); | ||
213 | unsigned long pdmask = ~((1UL << pdshift) - 1); | ||
214 | |||
215 | start &= pdmask; | ||
216 | if (start < floor) | ||
217 | return; | ||
218 | if (ceiling) { | ||
219 | ceiling &= pdmask; | ||
220 | if (! ceiling) | ||
221 | return; | ||
222 | } | ||
223 | if (end - 1 > ceiling - 1) | ||
224 | return; | ||
271 | 225 | ||
272 | hpdp->pd = 0; | 226 | hpdp->pd = 0; |
273 | tlb->need_flush = 1; | 227 | tlb->need_flush = 1; |
274 | pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, | 228 | pgtable_free_tlb(tlb, hugepte, pdshift - shift); |
275 | HUGEPTE_CACHE_NUM+psize-1, | ||
276 | PGF_CACHENUM_MASK)); | ||
277 | } | 229 | } |
278 | 230 | ||
279 | static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | 231 | static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, |
280 | unsigned long addr, unsigned long end, | 232 | unsigned long addr, unsigned long end, |
281 | unsigned long floor, unsigned long ceiling, | 233 | unsigned long floor, unsigned long ceiling) |
282 | unsigned int psize) | ||
283 | { | 234 | { |
284 | pmd_t *pmd; | 235 | pmd_t *pmd; |
285 | unsigned long next; | 236 | unsigned long next; |
@@ -291,7 +242,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
291 | next = pmd_addr_end(addr, end); | 242 | next = pmd_addr_end(addr, end); |
292 | if (pmd_none(*pmd)) | 243 | if (pmd_none(*pmd)) |
293 | continue; | 244 | continue; |
294 | free_hugepte_range(tlb, (hugepd_t *)pmd, psize); | 245 | free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, |
246 | addr, next, floor, ceiling); | ||
295 | } while (pmd++, addr = next, addr != end); | 247 | } while (pmd++, addr = next, addr != end); |
296 | 248 | ||
297 | start &= PUD_MASK; | 249 | start &= PUD_MASK; |
@@ -317,23 +269,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
317 | pud_t *pud; | 269 | pud_t *pud; |
318 | unsigned long next; | 270 | unsigned long next; |
319 | unsigned long start; | 271 | unsigned long start; |
320 | unsigned int shift; | ||
321 | unsigned int psize = get_slice_psize(tlb->mm, addr); | ||
322 | shift = mmu_psize_to_shift(psize); | ||
323 | 272 | ||
324 | start = addr; | 273 | start = addr; |
325 | pud = pud_offset(pgd, addr); | 274 | pud = pud_offset(pgd, addr); |
326 | do { | 275 | do { |
327 | next = pud_addr_end(addr, end); | 276 | next = pud_addr_end(addr, end); |
328 | if (shift < PMD_SHIFT) { | 277 | if (!is_hugepd(pud)) { |
329 | if (pud_none_or_clear_bad(pud)) | 278 | if (pud_none_or_clear_bad(pud)) |
330 | continue; | 279 | continue; |
331 | hugetlb_free_pmd_range(tlb, pud, addr, next, floor, | 280 | hugetlb_free_pmd_range(tlb, pud, addr, next, floor, |
332 | ceiling, psize); | 281 | ceiling); |
333 | } else { | 282 | } else { |
334 | if (pud_none(*pud)) | 283 | free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, |
335 | continue; | 284 | addr, next, floor, ceiling); |
336 | free_hugepte_range(tlb, (hugepd_t *)pud, psize); | ||
337 | } | 285 | } |
338 | } while (pud++, addr = next, addr != end); | 286 | } while (pud++, addr = next, addr != end); |
339 | 287 | ||
@@ -364,121 +312,56 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, | |||
364 | { | 312 | { |
365 | pgd_t *pgd; | 313 | pgd_t *pgd; |
366 | unsigned long next; | 314 | unsigned long next; |
367 | unsigned long start; | ||
368 | 315 | ||
369 | /* | 316 | /* |
370 | * Comments below take from the normal free_pgd_range(). They | 317 | * Because there are a number of different possible pagetable |
371 | * apply here too. The tests against HUGEPD_MASK below are | 318 | * layouts for hugepage ranges, we limit knowledge of how |
372 | * essential, because we *don't* test for this at the bottom | 319 | * things should be laid out to the allocation path |
373 | * level. Without them we'll attempt to free a hugepte table | 320 | * (huge_pte_alloc(), above). Everything else works out the |
374 | * when we unmap just part of it, even if there are other | 321 | * structure as it goes from information in the hugepd |
375 | * active mappings using it. | 322 | * pointers. That means that we can't here use the |
376 | * | 323 | * optimization used in the normal page free_pgd_range(), of |
377 | * The next few lines have given us lots of grief... | 324 | * checking whether we're actually covering a large enough |
378 | * | 325 | * range to have to do anything at the top level of the walk |
379 | * Why are we testing HUGEPD* at this top level? Because | 326 | * instead of at the bottom. |
380 | * often there will be no work to do at all, and we'd prefer | ||
381 | * not to go all the way down to the bottom just to discover | ||
382 | * that. | ||
383 | * | ||
384 | * Why all these "- 1"s? Because 0 represents both the bottom | ||
385 | * of the address space and the top of it (using -1 for the | ||
386 | * top wouldn't help much: the masks would do the wrong thing). | ||
387 | * The rule is that addr 0 and floor 0 refer to the bottom of | ||
388 | * the address space, but end 0 and ceiling 0 refer to the top | ||
389 | * Comparisons need to use "end - 1" and "ceiling - 1" (though | ||
390 | * that end 0 case should be mythical). | ||
391 | * | 327 | * |
392 | * Wherever addr is brought up or ceiling brought down, we | 328 | * To make sense of this, you should probably go read the big |
393 | * must be careful to reject "the opposite 0" before it | 329 | * block comment at the top of the normal free_pgd_range(), |
394 | * confuses the subsequent tests. But what about where end is | 330 | * too. |
395 | * brought down by HUGEPD_SIZE below? no, end can't go down to | ||
396 | * 0 there. | ||
397 | * | ||
398 | * Whereas we round start (addr) and ceiling down, by different | ||
399 | * masks at different levels, in order to test whether a table | ||
400 | * now has no other vmas using it, so can be freed, we don't | ||
401 | * bother to round floor or end up - the tests don't need that. | ||
402 | */ | 331 | */ |
403 | unsigned int psize = get_slice_psize(tlb->mm, addr); | ||
404 | |||
405 | addr &= HUGEPD_MASK(psize); | ||
406 | if (addr < floor) { | ||
407 | addr += HUGEPD_SIZE(psize); | ||
408 | if (!addr) | ||
409 | return; | ||
410 | } | ||
411 | if (ceiling) { | ||
412 | ceiling &= HUGEPD_MASK(psize); | ||
413 | if (!ceiling) | ||
414 | return; | ||
415 | } | ||
416 | if (end - 1 > ceiling - 1) | ||
417 | end -= HUGEPD_SIZE(psize); | ||
418 | if (addr > end - 1) | ||
419 | return; | ||
420 | 332 | ||
421 | start = addr; | ||
422 | pgd = pgd_offset(tlb->mm, addr); | 333 | pgd = pgd_offset(tlb->mm, addr); |
423 | do { | 334 | do { |
424 | psize = get_slice_psize(tlb->mm, addr); | ||
425 | BUG_ON(!mmu_huge_psizes[psize]); | ||
426 | next = pgd_addr_end(addr, end); | 335 | next = pgd_addr_end(addr, end); |
427 | if (mmu_psize_to_shift(psize) < PUD_SHIFT) { | 336 | if (!is_hugepd(pgd)) { |
428 | if (pgd_none_or_clear_bad(pgd)) | 337 | if (pgd_none_or_clear_bad(pgd)) |
429 | continue; | 338 | continue; |
430 | hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); | 339 | hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); |
431 | } else { | 340 | } else { |
432 | if (pgd_none(*pgd)) | 341 | free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, |
433 | continue; | 342 | addr, next, floor, ceiling); |
434 | free_hugepte_range(tlb, (hugepd_t *)pgd, psize); | ||
435 | } | 343 | } |
436 | } while (pgd++, addr = next, addr != end); | 344 | } while (pgd++, addr = next, addr != end); |
437 | } | 345 | } |
438 | 346 | ||
439 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | ||
440 | pte_t *ptep, pte_t pte) | ||
441 | { | ||
442 | if (pte_present(*ptep)) { | ||
443 | /* We open-code pte_clear because we need to pass the right | ||
444 | * argument to hpte_need_flush (huge / !huge). Might not be | ||
445 | * necessary anymore if we make hpte_need_flush() get the | ||
446 | * page size from the slices | ||
447 | */ | ||
448 | unsigned int psize = get_slice_psize(mm, addr); | ||
449 | unsigned int shift = mmu_psize_to_shift(psize); | ||
450 | unsigned long sz = ((1UL) << shift); | ||
451 | struct hstate *hstate = size_to_hstate(sz); | ||
452 | pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1); | ||
453 | } | ||
454 | *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); | ||
455 | } | ||
456 | |||
457 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, | ||
458 | pte_t *ptep) | ||
459 | { | ||
460 | unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1); | ||
461 | return __pte(old); | ||
462 | } | ||
463 | |||
464 | struct page * | 347 | struct page * |
465 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | 348 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) |
466 | { | 349 | { |
467 | pte_t *ptep; | 350 | pte_t *ptep; |
468 | struct page *page; | 351 | struct page *page; |
469 | unsigned int mmu_psize = get_slice_psize(mm, address); | 352 | unsigned shift; |
353 | unsigned long mask; | ||
354 | |||
355 | ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); | ||
470 | 356 | ||
471 | /* Verify it is a huge page else bail. */ | 357 | /* Verify it is a huge page else bail. */ |
472 | if (!mmu_huge_psizes[mmu_psize]) | 358 | if (!ptep || !shift) |
473 | return ERR_PTR(-EINVAL); | 359 | return ERR_PTR(-EINVAL); |
474 | 360 | ||
475 | ptep = huge_pte_offset(mm, address); | 361 | mask = (1UL << shift) - 1; |
476 | page = pte_page(*ptep); | 362 | page = pte_page(*ptep); |
477 | if (page) { | 363 | if (page) |
478 | unsigned int shift = mmu_psize_to_shift(mmu_psize); | 364 | page += (address & mask) / PAGE_SIZE; |
479 | unsigned long sz = ((1UL) << shift); | ||
480 | page += (address % sz) / PAGE_SIZE; | ||
481 | } | ||
482 | 365 | ||
483 | return page; | 366 | return page; |
484 | } | 367 | } |
@@ -501,6 +384,82 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
501 | return NULL; | 384 | return NULL; |
502 | } | 385 | } |
503 | 386 | ||
387 | static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | ||
388 | unsigned long end, int write, struct page **pages, int *nr) | ||
389 | { | ||
390 | unsigned long mask; | ||
391 | unsigned long pte_end; | ||
392 | struct page *head, *page; | ||
393 | pte_t pte; | ||
394 | int refs; | ||
395 | |||
396 | pte_end = (addr + sz) & ~(sz-1); | ||
397 | if (pte_end < end) | ||
398 | end = pte_end; | ||
399 | |||
400 | pte = *ptep; | ||
401 | mask = _PAGE_PRESENT | _PAGE_USER; | ||
402 | if (write) | ||
403 | mask |= _PAGE_RW; | ||
404 | |||
405 | if ((pte_val(pte) & mask) != mask) | ||
406 | return 0; | ||
407 | |||
408 | /* hugepages are never "special" */ | ||
409 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
410 | |||
411 | refs = 0; | ||
412 | head = pte_page(pte); | ||
413 | |||
414 | page = head + ((addr & (sz-1)) >> PAGE_SHIFT); | ||
415 | do { | ||
416 | VM_BUG_ON(compound_head(page) != head); | ||
417 | pages[*nr] = page; | ||
418 | (*nr)++; | ||
419 | page++; | ||
420 | refs++; | ||
421 | } while (addr += PAGE_SIZE, addr != end); | ||
422 | |||
423 | if (!page_cache_add_speculative(head, refs)) { | ||
424 | *nr -= refs; | ||
425 | return 0; | ||
426 | } | ||
427 | |||
428 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | ||
429 | /* Could be optimized better */ | ||
430 | while (*nr) { | ||
431 | put_page(page); | ||
432 | (*nr)--; | ||
433 | } | ||
434 | } | ||
435 | |||
436 | return 1; | ||
437 | } | ||
438 | |||
439 | static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, | ||
440 | unsigned long sz) | ||
441 | { | ||
442 | unsigned long __boundary = (addr + sz) & ~(sz-1); | ||
443 | return (__boundary - 1 < end - 1) ? __boundary : end; | ||
444 | } | ||
445 | |||
446 | int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, | ||
447 | unsigned long addr, unsigned long end, | ||
448 | int write, struct page **pages, int *nr) | ||
449 | { | ||
450 | pte_t *ptep; | ||
451 | unsigned long sz = 1UL << hugepd_shift(*hugepd); | ||
452 | unsigned long next; | ||
453 | |||
454 | ptep = hugepte_offset(hugepd, addr, pdshift); | ||
455 | do { | ||
456 | next = hugepte_addr_end(addr, end, sz); | ||
457 | if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) | ||
458 | return 0; | ||
459 | } while (ptep++, addr = next, addr != end); | ||
460 | |||
461 | return 1; | ||
462 | } | ||
504 | 463 | ||
505 | unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | 464 | unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, |
506 | unsigned long len, unsigned long pgoff, | 465 | unsigned long len, unsigned long pgoff, |
@@ -509,8 +468,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | |||
509 | struct hstate *hstate = hstate_file(file); | 468 | struct hstate *hstate = hstate_file(file); |
510 | int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); | 469 | int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); |
511 | 470 | ||
512 | if (!mmu_huge_psizes[mmu_psize]) | ||
513 | return -EINVAL; | ||
514 | return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); | 471 | return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); |
515 | } | 472 | } |
516 | 473 | ||
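The gup_hugepd() helper added above walks a huge-page directory by repeatedly clamping to the next size-aligned boundary via hugepte_addr_end(). A standalone sketch of that chunking, using a hypothetical address range and an assumed 16M huge page size:

#include <stdio.h>

/* Mirror of hugepte_addr_end(): advance to the next sz-aligned
 * boundary, but never past the overall end of the range. */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long boundary = (addr + sz) & ~(sz - 1);
	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	unsigned long sz = 1UL << 24;		/* assume 16M huge pages */
	unsigned long addr = 0x01800000UL;	/* hypothetical range */
	unsigned long end  = 0x05000000UL;
	unsigned long next;

	do {
		next = hugepte_addr_end(addr, end, sz);
		printf("chunk: 0x%08lx - 0x%08lx\n", addr, next);
	} while (addr = next, addr != end);

	return 0;
}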
@@ -521,229 +478,46 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) | |||
521 | return 1UL << mmu_psize_to_shift(psize); | 478 | return 1UL << mmu_psize_to_shift(psize); |
522 | } | 479 | } |
523 | 480 | ||
524 | /* | 481 | static int __init add_huge_page_size(unsigned long long size) |
525 | * Called by asm hashtable.S for doing lazy icache flush | ||
526 | */ | ||
527 | static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, | ||
528 | pte_t pte, int trap, unsigned long sz) | ||
529 | { | 482 | { |
530 | struct page *page; | 483 | int shift = __ffs(size); |
531 | int i; | 484 | int mmu_psize; |
532 | |||
533 | if (!pfn_valid(pte_pfn(pte))) | ||
534 | return rflags; | ||
535 | |||
536 | page = pte_page(pte); | ||
537 | |||
538 | /* page is dirty */ | ||
539 | if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { | ||
540 | if (trap == 0x400) { | ||
541 | for (i = 0; i < (sz / PAGE_SIZE); i++) | ||
542 | __flush_dcache_icache(page_address(page+i)); | ||
543 | set_bit(PG_arch_1, &page->flags); | ||
544 | } else { | ||
545 | rflags |= HPTE_R_N; | ||
546 | } | ||
547 | } | ||
548 | return rflags; | ||
549 | } | ||
550 | 485 | ||
551 | int hash_huge_page(struct mm_struct *mm, unsigned long access, | 486 | /* Check that it is a page size supported by the hardware and |
552 | unsigned long ea, unsigned long vsid, int local, | 487 | * that it fits within pagetable and slice limits. */ |
553 | unsigned long trap) | 488 | if (!is_power_of_2(size) |
554 | { | 489 | || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) |
555 | pte_t *ptep; | 490 | return -EINVAL; |
556 | unsigned long old_pte, new_pte; | ||
557 | unsigned long va, rflags, pa, sz; | ||
558 | long slot; | ||
559 | int err = 1; | ||
560 | int ssize = user_segment_size(ea); | ||
561 | unsigned int mmu_psize; | ||
562 | int shift; | ||
563 | mmu_psize = get_slice_psize(mm, ea); | ||
564 | |||
565 | if (!mmu_huge_psizes[mmu_psize]) | ||
566 | goto out; | ||
567 | ptep = huge_pte_offset(mm, ea); | ||
568 | |||
569 | /* Search the Linux page table for a match with va */ | ||
570 | va = hpt_va(ea, vsid, ssize); | ||
571 | 491 | ||
572 | /* | 492 | if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) |
573 | * If no pte found or not present, send the problem up to | 493 | return -EINVAL; |
574 | * do_page_fault | ||
575 | */ | ||
576 | if (unlikely(!ptep || pte_none(*ptep))) | ||
577 | goto out; | ||
578 | 494 | ||
579 | /* | 495 | #ifdef CONFIG_SPU_FS_64K_LS |
580 | * Check the user's access rights to the page. If access should be | 496 | /* Disable support for 64K huge pages when 64K SPU local store |
581 | * prevented then send the problem up to do_page_fault. | 497 | * support is enabled as the current implementation conflicts. |
582 | */ | 498 | */ |
583 | if (unlikely(access & ~pte_val(*ptep))) | 499 | if (shift == PAGE_SHIFT_64K) |
584 | goto out; | 500 | return -EINVAL; |
585 | /* | 501 | #endif /* CONFIG_SPU_FS_64K_LS */ |
586 | * At this point, we have a pte (old_pte) which can be used to build | ||
587 | * or update an HPTE. There are 2 cases: | ||
588 | * | ||
589 | * 1. There is a valid (present) pte with no associated HPTE (this is | ||
590 | * the most common case) | ||
591 | * 2. There is a valid (present) pte with an associated HPTE. The | ||
592 | * current values of the pp bits in the HPTE prevent access | ||
593 | * because we are doing software DIRTY bit management and the | ||
594 | * page is currently not DIRTY. | ||
595 | */ | ||
596 | |||
597 | |||
598 | do { | ||
599 | old_pte = pte_val(*ptep); | ||
600 | if (old_pte & _PAGE_BUSY) | ||
601 | goto out; | ||
602 | new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; | ||
603 | } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, | ||
604 | old_pte, new_pte)); | ||
605 | |||
606 | rflags = 0x2 | (!(new_pte & _PAGE_RW)); | ||
607 | /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ | ||
608 | rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); | ||
609 | shift = mmu_psize_to_shift(mmu_psize); | ||
610 | sz = ((1UL) << shift); | ||
611 | if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) | ||
612 | /* No CPU has hugepages but lacks no execute, so we | ||
613 | * don't need to worry about that case */ | ||
614 | rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte), | ||
615 | trap, sz); | ||
616 | |||
617 | /* Check if pte already has an hpte (case 2) */ | ||
618 | if (unlikely(old_pte & _PAGE_HASHPTE)) { | ||
619 | /* There MIGHT be an HPTE for this pte */ | ||
620 | unsigned long hash, slot; | ||
621 | |||
622 | hash = hpt_hash(va, shift, ssize); | ||
623 | if (old_pte & _PAGE_F_SECOND) | ||
624 | hash = ~hash; | ||
625 | slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; | ||
626 | slot += (old_pte & _PAGE_F_GIX) >> 12; | ||
627 | |||
628 | if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize, | ||
629 | ssize, local) == -1) | ||
630 | old_pte &= ~_PAGE_HPTEFLAGS; | ||
631 | } | ||
632 | |||
633 | if (likely(!(old_pte & _PAGE_HASHPTE))) { | ||
634 | unsigned long hash = hpt_hash(va, shift, ssize); | ||
635 | unsigned long hpte_group; | ||
636 | |||
637 | pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; | ||
638 | |||
639 | repeat: | ||
640 | hpte_group = ((hash & htab_hash_mask) * | ||
641 | HPTES_PER_GROUP) & ~0x7UL; | ||
642 | |||
643 | /* clear HPTE slot informations in new PTE */ | ||
644 | #ifdef CONFIG_PPC_64K_PAGES | ||
645 | new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; | ||
646 | #else | ||
647 | new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; | ||
648 | #endif | ||
649 | /* Add in WIMG bits */ | ||
650 | rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | | ||
651 | _PAGE_COHERENT | _PAGE_GUARDED)); | ||
652 | |||
653 | /* Insert into the hash table, primary slot */ | ||
654 | slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, | ||
655 | mmu_psize, ssize); | ||
656 | |||
657 | /* Primary is full, try the secondary */ | ||
658 | if (unlikely(slot == -1)) { | ||
659 | hpte_group = ((~hash & htab_hash_mask) * | ||
660 | HPTES_PER_GROUP) & ~0x7UL; | ||
661 | slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, | ||
662 | HPTE_V_SECONDARY, | ||
663 | mmu_psize, ssize); | ||
664 | if (slot == -1) { | ||
665 | if (mftb() & 0x1) | ||
666 | hpte_group = ((hash & htab_hash_mask) * | ||
667 | HPTES_PER_GROUP)&~0x7UL; | ||
668 | |||
669 | ppc_md.hpte_remove(hpte_group); | ||
670 | goto repeat; | ||
671 | } | ||
672 | } | ||
673 | |||
674 | if (unlikely(slot == -2)) | ||
675 | panic("hash_huge_page: pte_insert failed\n"); | ||
676 | |||
677 | new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); | ||
678 | } | ||
679 | 502 | ||
680 | /* | 503 | BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); |
681 | * No need to use ldarx/stdcx here | ||
682 | */ | ||
683 | *ptep = __pte(new_pte & ~_PAGE_BUSY); | ||
684 | 504 | ||
685 | err = 0; | 505 | /* Return if huge page size has already been setup */ |
506 | if (size_to_hstate(size)) | ||
507 | return 0; | ||
686 | 508 | ||
687 | out: | 509 | hugetlb_add_hstate(shift - PAGE_SHIFT); |
688 | return err; | ||
689 | } | ||
690 | 510 | ||
691 | static void __init set_huge_psize(int psize) | 511 | return 0; |
692 | { | ||
693 | /* Check that it is a page size supported by the hardware and | ||
694 | * that it fits within pagetable limits. */ | ||
695 | if (mmu_psize_defs[psize].shift && | ||
696 | mmu_psize_defs[psize].shift < SID_SHIFT_1T && | ||
697 | (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT || | ||
698 | mmu_psize_defs[psize].shift == PAGE_SHIFT_64K || | ||
699 | mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) { | ||
700 | /* Return if huge page size has already been setup or is the | ||
701 | * same as the base page size. */ | ||
702 | if (mmu_huge_psizes[psize] || | ||
703 | mmu_psize_defs[psize].shift == PAGE_SHIFT) | ||
704 | return; | ||
705 | if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL)) | ||
706 | return; | ||
707 | hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); | ||
708 | |||
709 | switch (mmu_psize_defs[psize].shift) { | ||
710 | case PAGE_SHIFT_64K: | ||
711 | /* We only allow 64k hpages with 4k base page, | ||
712 | * which was checked above, and always put them | ||
713 | * at the PMD */ | ||
714 | hugepte_shift[psize] = PMD_SHIFT; | ||
715 | break; | ||
716 | case PAGE_SHIFT_16M: | ||
717 | /* 16M pages can be at two different levels | ||
718 | * of pagestables based on base page size */ | ||
719 | if (PAGE_SHIFT == PAGE_SHIFT_64K) | ||
720 | hugepte_shift[psize] = PMD_SHIFT; | ||
721 | else /* 4k base page */ | ||
722 | hugepte_shift[psize] = PUD_SHIFT; | ||
723 | break; | ||
724 | case PAGE_SHIFT_16G: | ||
725 | /* 16G pages are always at PGD level */ | ||
726 | hugepte_shift[psize] = PGDIR_SHIFT; | ||
727 | break; | ||
728 | } | ||
729 | hugepte_shift[psize] -= mmu_psize_defs[psize].shift; | ||
730 | } else | ||
731 | hugepte_shift[psize] = 0; | ||
732 | } | 512 | } |
733 | 513 | ||
734 | static int __init hugepage_setup_sz(char *str) | 514 | static int __init hugepage_setup_sz(char *str) |
735 | { | 515 | { |
736 | unsigned long long size; | 516 | unsigned long long size; |
737 | int mmu_psize; | ||
738 | int shift; | ||
739 | 517 | ||
740 | size = memparse(str, &str); | 518 | size = memparse(str, &str); |
741 | 519 | ||
742 | shift = __ffs(size); | 520 | if (add_huge_page_size(size) != 0) |
743 | mmu_psize = shift_to_mmu_psize(shift); | ||
744 | if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift) | ||
745 | set_huge_psize(mmu_psize); | ||
746 | else | ||
747 | printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); | 521 | printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); |
748 | 522 | ||
749 | return 1; | 523 | return 1; |
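add_huge_page_size() only registers an hstate for sizes that are a power of two and whose shift lies above the base page and within the slice limit. A rough user-space illustration of those checks, with illustrative PAGE_SHIFT and SLICE_HIGH_SHIFT values and __builtin_ctzll() standing in for __ffs():

#include <stdio.h>

#define PAGE_SHIFT       12	/* illustrative 4K base page */
#define SLICE_HIGH_SHIFT 40	/* illustrative upper bound */

static int is_power_of_2(unsigned long long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

/* Roughly the sanity checks add_huge_page_size() applies before
 * registering an hstate: power of two, larger than the base page,
 * and small enough for the slice mechanism. */
static int huge_size_ok(unsigned long long size)
{
	int shift;

	if (!is_power_of_2(size))
		return 0;
	shift = __builtin_ctzll(size);	/* stand-in for __ffs() */
	return shift > PAGE_SHIFT && shift <= SLICE_HIGH_SHIFT;
}

int main(void)
{
	printf("16M ok? %d\n", huge_size_ok(16ULL << 20));
	printf("5M  ok? %d\n", huge_size_ok(5ULL << 20));
	return 0;
}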
@@ -752,41 +526,55 @@ __setup("hugepagesz=", hugepage_setup_sz); | |||
752 | 526 | ||
753 | static int __init hugetlbpage_init(void) | 527 | static int __init hugetlbpage_init(void) |
754 | { | 528 | { |
755 | unsigned int psize; | 529 | int psize; |
756 | 530 | ||
757 | if (!cpu_has_feature(CPU_FTR_16M_PAGE)) | 531 | if (!cpu_has_feature(CPU_FTR_16M_PAGE)) |
758 | return -ENODEV; | 532 | return -ENODEV; |
759 | 533 | ||
760 | /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE | 534 | for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { |
761 | * and adjust PTE_NONCACHE_NUM if the number of supported huge page | 535 | unsigned shift; |
762 | * sizes changes. | 536 | unsigned pdshift; |
763 | */ | ||
764 | set_huge_psize(MMU_PAGE_16M); | ||
765 | set_huge_psize(MMU_PAGE_16G); | ||
766 | 537 | ||
767 | /* Temporarily disable support for 64K huge pages when 64K SPU local | 538 | if (!mmu_psize_defs[psize].shift) |
768 | * store support is enabled as the current implementation conflicts. | 539 | continue; |
769 | */ | ||
770 | #ifndef CONFIG_SPU_FS_64K_LS | ||
771 | set_huge_psize(MMU_PAGE_64K); | ||
772 | #endif | ||
773 | 540 | ||
774 | for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { | 541 | shift = mmu_psize_to_shift(psize); |
775 | if (mmu_huge_psizes[psize]) { | 542 | |
776 | pgtable_cache[HUGE_PGTABLE_INDEX(psize)] = | 543 | if (add_huge_page_size(1ULL << shift) < 0) |
777 | kmem_cache_create( | 544 | continue; |
778 | HUGEPTE_CACHE_NAME(psize), | 545 | |
779 | HUGEPTE_TABLE_SIZE(psize), | 546 | if (shift < PMD_SHIFT) |
780 | HUGEPTE_TABLE_SIZE(psize), | 547 | pdshift = PMD_SHIFT; |
781 | 0, | 548 | else if (shift < PUD_SHIFT) |
782 | NULL); | 549 | pdshift = PUD_SHIFT; |
783 | if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)]) | 550 | else |
784 | panic("hugetlbpage_init(): could not create %s"\ | 551 | pdshift = PGDIR_SHIFT; |
785 | "\n", HUGEPTE_CACHE_NAME(psize)); | 552 | |
786 | } | 553 | pgtable_cache_add(pdshift - shift, NULL); |
554 | if (!PGT_CACHE(pdshift - shift)) | ||
555 | panic("hugetlbpage_init(): could not create " | ||
556 | "pgtable cache for %d bit pagesize\n", shift); | ||
787 | } | 557 | } |
788 | 558 | ||
559 | /* Set default large page size. Currently, we pick 16M or 1M | ||
560 | * depending on what is available | ||
561 | */ | ||
562 | if (mmu_psize_defs[MMU_PAGE_16M].shift) | ||
563 | HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; | ||
564 | else if (mmu_psize_defs[MMU_PAGE_1M].shift) | ||
565 | HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; | ||
566 | |||
789 | return 0; | 567 | return 0; |
790 | } | 568 | } |
791 | 569 | ||
792 | module_init(hugetlbpage_init); | 570 | module_init(hugetlbpage_init); |
571 | |||
572 | void flush_dcache_icache_hugepage(struct page *page) | ||
573 | { | ||
574 | int i; | ||
575 | |||
576 | BUG_ON(!PageCompound(page)); | ||
577 | |||
578 | for (i = 0; i < (1UL << compound_order(page)); i++) | ||
579 | __flush_dcache_icache(page_address(page+i)); | ||
580 | } | ||
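For each supported size, hugetlbpage_init() now picks the lowest pagetable level large enough to hold the huge page and sizes the hugepd cache from the difference of the two shifts. A small sketch of that selection, using illustrative PMD/PUD/PGDIR shift values rather than the real header constants:

#include <stdio.h>

/* Illustrative pagetable level shifts for a 64-bit, 4K-page layout;
 * the real values come from the kernel's pgtable headers. */
#define PMD_SHIFT   21
#define PUD_SHIFT   30
#define PGDIR_SHIFT 39

/* Same decision hugetlbpage_init() makes: a huge page lives at the
 * lowest pagetable level big enough to contain it, and the cache
 * index is the number of entry bits (pdshift - shift). */
static unsigned pick_pdshift(unsigned shift)
{
	if (shift < PMD_SHIFT)
		return PMD_SHIFT;
	else if (shift < PUD_SHIFT)
		return PUD_SHIFT;
	return PGDIR_SHIFT;
}

int main(void)
{
	unsigned shifts[] = { 16, 24, 34 };	/* 64K, 16M, 16G */

	for (int i = 0; i < 3; i++) {
		unsigned pdshift = pick_pdshift(shifts[i]);
		printf("shift %u -> pdshift %u, cache index %u\n",
		       shifts[i], pdshift, pdshift - shifts[i]);
	}
	return 0;
}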
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 335c578b9cc3..776f28d02b6b 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/module.h> | 41 | #include <linux/module.h> |
42 | #include <linux/poison.h> | 42 | #include <linux/poison.h> |
43 | #include <linux/lmb.h> | 43 | #include <linux/lmb.h> |
44 | #include <linux/hugetlb.h> | ||
44 | 45 | ||
45 | #include <asm/pgalloc.h> | 46 | #include <asm/pgalloc.h> |
46 | #include <asm/page.h> | 47 | #include <asm/page.h> |
@@ -119,30 +120,63 @@ static void pmd_ctor(void *addr) | |||
119 | memset(addr, 0, PMD_TABLE_SIZE); | 120 | memset(addr, 0, PMD_TABLE_SIZE); |
120 | } | 121 | } |
121 | 122 | ||
122 | static const unsigned int pgtable_cache_size[2] = { | 123 | struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; |
123 | PGD_TABLE_SIZE, PMD_TABLE_SIZE | 124 | |
124 | }; | 125 | /* |
125 | static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { | 126 | * Create a kmem_cache() for pagetables. This is not used for PTE |
126 | #ifdef CONFIG_PPC_64K_PAGES | 127 | * pages - they're linked to struct page, come from the normal free |
127 | "pgd_cache", "pmd_cache", | 128 | * pages pool and have a different entry size (see real_pte_t) to |
128 | #else | 129 | * everything else. Caches created by this function are used for all |
129 | "pgd_cache", "pud_pmd_cache", | 130 | * the higher level pagetables, and for hugepage pagetables. |
130 | #endif /* CONFIG_PPC_64K_PAGES */ | 131 | */ |
131 | }; | 132 | void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) |
132 | 133 | { | |
133 | #ifdef CONFIG_HUGETLB_PAGE | 134 | char *name; |
134 | /* Hugepages need an extra cache per hugepagesize, initialized in | 135 | unsigned long table_size = sizeof(void *) << shift; |
135 | * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT | 136 | unsigned long align = table_size; |
136 | * is not compile time constant. */ | 137 | |
137 | struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; | 138 | /* When batching pgtable pointers for RCU freeing, we store |
138 | #else | 139 | * the index size in the low bits. Table alignment must be |
139 | struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; | 140 | * big enough to fit it. |
140 | #endif | 141 | * |
142 | * Likewise, hugepage pagetable pointers contain a (different) | ||
143 | * shift value in the low bits. All tables must be aligned so | ||
144 | * as to leave enough 0 bits in the address to contain it. */ | ||
145 | unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, | ||
146 | HUGEPD_SHIFT_MASK + 1); | ||
147 | struct kmem_cache *new; | ||
148 | |||
149 | /* It would be nice if this was a BUILD_BUG_ON(), but at the | ||
150 | * moment, gcc doesn't seem to recognize is_power_of_2 as a | ||
151 | * constant expression, so, so much for that. */ | ||
152 | BUG_ON(!is_power_of_2(minalign)); | ||
153 | BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE)); | ||
154 | |||
155 | if (PGT_CACHE(shift)) | ||
156 | return; /* Already have a cache of this size */ | ||
157 | |||
158 | align = max_t(unsigned long, align, minalign); | ||
159 | name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); | ||
160 | new = kmem_cache_create(name, table_size, align, 0, ctor); | ||
161 | PGT_CACHE(shift) = new; | ||
162 | |||
163 | pr_debug("Allocated pgtable cache for order %d\n", shift); | ||
164 | } | ||
165 | |||
141 | 166 | ||
142 | void pgtable_cache_init(void) | 167 | void pgtable_cache_init(void) |
143 | { | 168 | { |
144 | pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); | 169 | pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); |
145 | pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); | 170 | pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor); |
171 | if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE)) | ||
172 | panic("Couldn't allocate pgtable caches"); | ||
173 | |||
174 | /* In all current configs, when the PUD index exists it's the | ||
175 | * same size as either the pgd or pmd index. Verify that the | ||
176 | * initialization above has also created a PUD cache. This | ||
177 | * will need re-examination if we add new possibilities for | ||
178 | * the pagetable layout. */ | ||
179 | BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE)); | ||
146 | } | 180 | } |
147 | 181 | ||
148 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 182 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
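pgtable_cache_add() keeps every cache aligned at least to the larger of MAX_PGTABLE_INDEX_SIZE + 1 and HUGEPD_SHIFT_MASK + 1 so that an index size or hugepd shift can later be tucked into a pointer's low bits. A quick user-space sketch of that sizing, with illustrative mask values standing in for the real header constants:

#include <stdio.h>

/* Illustrative constants; the real ones come from the powerpc headers. */
#define MAX_PGTABLE_INDEX_SIZE 0xf
#define HUGEPD_SHIFT_MASK      0x3f

/* pgtable_cache_add() sizes a table as sizeof(void *) << shift and
 * never lets the alignment drop below the largest value that must be
 * recoverable from a pointer's low bits. */
int main(void)
{
	unsigned long minalign = (HUGEPD_SHIFT_MASK + 1 > MAX_PGTABLE_INDEX_SIZE + 1)
				 ? HUGEPD_SHIFT_MASK + 1 : MAX_PGTABLE_INDEX_SIZE + 1;

	for (unsigned shift = 1; shift <= 12; shift += 3) {
		unsigned long table_size = sizeof(void *) << shift;
		unsigned long align = table_size > minalign ? table_size : minalign;

		printf("shift %2u: table %6lu bytes, align %6lu\n",
		       shift, table_size, align);
	}
	return 0;
}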
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 59736317bf0e..b9b152558f9c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/pagemap.h> | 32 | #include <linux/pagemap.h> |
33 | #include <linux/suspend.h> | 33 | #include <linux/suspend.h> |
34 | #include <linux/lmb.h> | 34 | #include <linux/lmb.h> |
35 | #include <linux/hugetlb.h> | ||
35 | 36 | ||
36 | #include <asm/pgalloc.h> | 37 | #include <asm/pgalloc.h> |
37 | #include <asm/prom.h> | 38 | #include <asm/prom.h> |
@@ -417,18 +418,26 @@ EXPORT_SYMBOL(flush_dcache_page); | |||
417 | 418 | ||
418 | void flush_dcache_icache_page(struct page *page) | 419 | void flush_dcache_icache_page(struct page *page) |
419 | { | 420 | { |
421 | #ifdef CONFIG_HUGETLB_PAGE | ||
422 | if (PageCompound(page)) { | ||
423 | flush_dcache_icache_hugepage(page); | ||
424 | return; | ||
425 | } | ||
426 | #endif | ||
420 | #ifdef CONFIG_BOOKE | 427 | #ifdef CONFIG_BOOKE |
421 | void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); | 428 | { |
422 | __flush_dcache_icache(start); | 429 | void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); |
423 | kunmap_atomic(start, KM_PPC_SYNC_ICACHE); | 430 | __flush_dcache_icache(start); |
431 | kunmap_atomic(start, KM_PPC_SYNC_ICACHE); | ||
432 | } | ||
424 | #elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) | 433 | #elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) |
425 | /* On 8xx there is no need to kmap since highmem is not supported */ | 434 | /* On 8xx there is no need to kmap since highmem is not supported */ |
426 | __flush_dcache_icache(page_address(page)); | 435 | __flush_dcache_icache(page_address(page)); |
427 | #else | 436 | #else |
428 | __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); | 437 | __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); |
429 | #endif | 438 | #endif |
430 | |||
431 | } | 439 | } |
440 | |||
432 | void clear_user_page(void *page, unsigned long vaddr, struct page *pg) | 441 | void clear_user_page(void *page, unsigned long vaddr, struct page *pg) |
433 | { | 442 | { |
434 | clear_page(page); | 443 | clear_page(page); |
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index dbeb86ac90cd..b910d37aea1a 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/spinlock.h> | 19 | #include <linux/spinlock.h> |
20 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
21 | #include <linux/module.h> | ||
21 | 22 | ||
22 | #include <asm/mmu_context.h> | 23 | #include <asm/mmu_context.h> |
23 | 24 | ||
@@ -32,7 +33,7 @@ static DEFINE_IDR(mmu_context_idr); | |||
32 | #define NO_CONTEXT 0 | 33 | #define NO_CONTEXT 0 |
33 | #define MAX_CONTEXT ((1UL << 19) - 1) | 34 | #define MAX_CONTEXT ((1UL << 19) - 1) |
34 | 35 | ||
35 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | 36 | int __init_new_context(void) |
36 | { | 37 | { |
37 | int index; | 38 | int index; |
38 | int err; | 39 | int err; |
@@ -57,22 +58,41 @@ again: | |||
57 | return -ENOMEM; | 58 | return -ENOMEM; |
58 | } | 59 | } |
59 | 60 | ||
61 | return index; | ||
62 | } | ||
63 | EXPORT_SYMBOL_GPL(__init_new_context); | ||
64 | |||
65 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | ||
66 | { | ||
67 | int index; | ||
68 | |||
69 | index = __init_new_context(); | ||
70 | if (index < 0) | ||
71 | return index; | ||
72 | |||
60 | /* The old code would re-promote on fork, we don't do that | 73 | /* The old code would re-promote on fork, we don't do that |
61 | * when using slices as it could cause problem promoting slices | 74 | * when using slices as it could cause problem promoting slices |
62 | * that have been forced down to 4K | 75 | * that have been forced down to 4K |
63 | */ | 76 | */ |
64 | if (slice_mm_new_context(mm)) | 77 | if (slice_mm_new_context(mm)) |
65 | slice_set_user_psize(mm, mmu_virtual_psize); | 78 | slice_set_user_psize(mm, mmu_virtual_psize); |
79 | subpage_prot_init_new_context(mm); | ||
66 | mm->context.id = index; | 80 | mm->context.id = index; |
67 | 81 | ||
68 | return 0; | 82 | return 0; |
69 | } | 83 | } |
70 | 84 | ||
71 | void destroy_context(struct mm_struct *mm) | 85 | void __destroy_context(int context_id) |
72 | { | 86 | { |
73 | spin_lock(&mmu_context_lock); | 87 | spin_lock(&mmu_context_lock); |
74 | idr_remove(&mmu_context_idr, mm->context.id); | 88 | idr_remove(&mmu_context_idr, context_id); |
75 | spin_unlock(&mmu_context_lock); | 89 | spin_unlock(&mmu_context_lock); |
90 | } | ||
91 | EXPORT_SYMBOL_GPL(__destroy_context); | ||
76 | 92 | ||
93 | void destroy_context(struct mm_struct *mm) | ||
94 | { | ||
95 | __destroy_context(mm->context.id); | ||
96 | subpage_prot_free(mm); | ||
77 | mm->context.id = NO_CONTEXT; | 97 | mm->context.id = NO_CONTEXT; |
78 | } | 98 | } |
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d2e5321d5ea6..e27a990af42d 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h | |||
@@ -98,21 +98,10 @@ extern void _tlbia(void); | |||
98 | 98 | ||
99 | #ifdef CONFIG_PPC32 | 99 | #ifdef CONFIG_PPC32 |
100 | 100 | ||
101 | struct tlbcam { | ||
102 | u32 MAS0; | ||
103 | u32 MAS1; | ||
104 | u32 MAS2; | ||
105 | u32 MAS3; | ||
106 | u32 MAS7; | ||
107 | }; | ||
108 | |||
109 | extern void mapin_ram(void); | 101 | extern void mapin_ram(void); |
110 | extern int map_page(unsigned long va, phys_addr_t pa, int flags); | 102 | extern int map_page(unsigned long va, phys_addr_t pa, int flags); |
111 | extern void setbat(int index, unsigned long virt, phys_addr_t phys, | 103 | extern void setbat(int index, unsigned long virt, phys_addr_t phys, |
112 | unsigned int size, int flags); | 104 | unsigned int size, int flags); |
113 | extern void settlbcam(int index, unsigned long virt, phys_addr_t phys, | ||
114 | unsigned int size, int flags, unsigned int pid); | ||
115 | extern void invalidate_tlbcam_entry(int index); | ||
116 | 105 | ||
117 | extern int __map_without_bats; | 106 | extern int __map_without_bats; |
118 | extern unsigned long ioremap_base; | 107 | extern unsigned long ioremap_base; |
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 53040931de32..99df697c601a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c | |||
@@ -49,12 +49,12 @@ struct pte_freelist_batch | |||
49 | { | 49 | { |
50 | struct rcu_head rcu; | 50 | struct rcu_head rcu; |
51 | unsigned int index; | 51 | unsigned int index; |
52 | pgtable_free_t tables[0]; | 52 | unsigned long tables[0]; |
53 | }; | 53 | }; |
54 | 54 | ||
55 | #define PTE_FREELIST_SIZE \ | 55 | #define PTE_FREELIST_SIZE \ |
56 | ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ | 56 | ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ |
57 | / sizeof(pgtable_free_t)) | 57 | / sizeof(unsigned long)) |
58 | 58 | ||
59 | static void pte_free_smp_sync(void *arg) | 59 | static void pte_free_smp_sync(void *arg) |
60 | { | 60 | { |
@@ -64,13 +64,13 @@ static void pte_free_smp_sync(void *arg) | |||
64 | /* This is only called when we are critically out of memory | 64 | /* This is only called when we are critically out of memory |
65 | * (and fail to get a page in pte_free_tlb). | 65 | * (and fail to get a page in pte_free_tlb). |
66 | */ | 66 | */ |
67 | static void pgtable_free_now(pgtable_free_t pgf) | 67 | static void pgtable_free_now(void *table, unsigned shift) |
68 | { | 68 | { |
69 | pte_freelist_forced_free++; | 69 | pte_freelist_forced_free++; |
70 | 70 | ||
71 | smp_call_function(pte_free_smp_sync, NULL, 1); | 71 | smp_call_function(pte_free_smp_sync, NULL, 1); |
72 | 72 | ||
73 | pgtable_free(pgf); | 73 | pgtable_free(table, shift); |
74 | } | 74 | } |
75 | 75 | ||
76 | static void pte_free_rcu_callback(struct rcu_head *head) | 76 | static void pte_free_rcu_callback(struct rcu_head *head) |
@@ -79,8 +79,12 @@ static void pte_free_rcu_callback(struct rcu_head *head) | |||
79 | container_of(head, struct pte_freelist_batch, rcu); | 79 | container_of(head, struct pte_freelist_batch, rcu); |
80 | unsigned int i; | 80 | unsigned int i; |
81 | 81 | ||
82 | for (i = 0; i < batch->index; i++) | 82 | for (i = 0; i < batch->index; i++) { |
83 | pgtable_free(batch->tables[i]); | 83 | void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE); |
84 | unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE; | ||
85 | |||
86 | pgtable_free(table, shift); | ||
87 | } | ||
84 | 88 | ||
85 | free_page((unsigned long)batch); | 89 | free_page((unsigned long)batch); |
86 | } | 90 | } |
@@ -91,25 +95,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch) | |||
91 | call_rcu(&batch->rcu, pte_free_rcu_callback); | 95 | call_rcu(&batch->rcu, pte_free_rcu_callback); |
92 | } | 96 | } |
93 | 97 | ||
94 | void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) | 98 | void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift) |
95 | { | 99 | { |
96 | /* This is safe since tlb_gather_mmu has disabled preemption */ | 100 | /* This is safe since tlb_gather_mmu has disabled preemption */ |
97 | struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); | 101 | struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); |
102 | unsigned long pgf; | ||
98 | 103 | ||
99 | if (atomic_read(&tlb->mm->mm_users) < 2 || | 104 | if (atomic_read(&tlb->mm->mm_users) < 2 || |
100 | cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ | 105 | cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ |
101 | pgtable_free(pgf); | 106 | pgtable_free(table, shift); |
102 | return; | 107 | return; |
103 | } | 108 | } |
104 | 109 | ||
105 | if (*batchp == NULL) { | 110 | if (*batchp == NULL) { |
106 | *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); | 111 | *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); |
107 | if (*batchp == NULL) { | 112 | if (*batchp == NULL) { |
108 | pgtable_free_now(pgf); | 113 | pgtable_free_now(table, shift); |
109 | return; | 114 | return; |
110 | } | 115 | } |
111 | (*batchp)->index = 0; | 116 | (*batchp)->index = 0; |
112 | } | 117 | } |
118 | BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); | ||
119 | pgf = (unsigned long)table | shift; | ||
113 | (*batchp)->tables[(*batchp)->index++] = pgf; | 120 | (*batchp)->tables[(*batchp)->index++] = pgf; |
114 | if ((*batchp)->index == PTE_FREELIST_SIZE) { | 121 | if ((*batchp)->index == PTE_FREELIST_SIZE) { |
115 | pte_free_submit(*batchp); | 122 | pte_free_submit(*batchp); |
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 4cafc0c33d0a..a040b81e93bd 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c | |||
@@ -24,9 +24,9 @@ | |||
24 | * Also makes sure that the subpage_prot_table structure is | 24 | * Also makes sure that the subpage_prot_table structure is |
25 | * reinitialized for the next user. | 25 | * reinitialized for the next user. |
26 | */ | 26 | */ |
27 | void subpage_prot_free(pgd_t *pgd) | 27 | void subpage_prot_free(struct mm_struct *mm) |
28 | { | 28 | { |
29 | struct subpage_prot_table *spt = pgd_subpage_prot(pgd); | 29 | struct subpage_prot_table *spt = &mm->context.spt; |
30 | unsigned long i, j, addr; | 30 | unsigned long i, j, addr; |
31 | u32 **p; | 31 | u32 **p; |
32 | 32 | ||
@@ -51,6 +51,13 @@ void subpage_prot_free(pgd_t *pgd) | |||
51 | spt->maxaddr = 0; | 51 | spt->maxaddr = 0; |
52 | } | 52 | } |
53 | 53 | ||
54 | void subpage_prot_init_new_context(struct mm_struct *mm) | ||
55 | { | ||
56 | struct subpage_prot_table *spt = &mm->context.spt; | ||
57 | |||
58 | memset(spt, 0, sizeof(*spt)); | ||
59 | } | ||
60 | |||
54 | static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, | 61 | static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, |
55 | int npages) | 62 | int npages) |
56 | { | 63 | { |
@@ -87,7 +94,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, | |||
87 | static void subpage_prot_clear(unsigned long addr, unsigned long len) | 94 | static void subpage_prot_clear(unsigned long addr, unsigned long len) |
88 | { | 95 | { |
89 | struct mm_struct *mm = current->mm; | 96 | struct mm_struct *mm = current->mm; |
90 | struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); | 97 | struct subpage_prot_table *spt = &mm->context.spt; |
91 | u32 **spm, *spp; | 98 | u32 **spm, *spp; |
92 | int i, nw; | 99 | int i, nw; |
93 | unsigned long next, limit; | 100 | unsigned long next, limit; |
@@ -136,7 +143,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) | |||
136 | long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) | 143 | long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) |
137 | { | 144 | { |
138 | struct mm_struct *mm = current->mm; | 145 | struct mm_struct *mm = current->mm; |
139 | struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); | 146 | struct subpage_prot_table *spt = &mm->context.spt; |
140 | u32 **spm, *spp; | 147 | u32 **spm, *spp; |
141 | int i, nw; | 148 | int i, nw; |
142 | unsigned long next, limit; | 149 | unsigned long next, limit; |
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 2b2f35f6985e..282d9306361f 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c | |||
@@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, | |||
53 | 53 | ||
54 | i = batch->index; | 54 | i = batch->index; |
55 | 55 | ||
56 | /* We mask the address for the base page size. Huge pages will | ||
57 | * have applied their own masking already | ||
58 | */ | ||
59 | addr &= PAGE_MASK; | ||
60 | |||
61 | /* Get page size (maybe move back to caller). | 56 | /* Get page size (maybe move back to caller). |
62 | * | 57 | * |
63 | * NOTE: when using special 64K mappings in 4K environment like | 58 | * NOTE: when using special 64K mappings in 4K environment like |
@@ -75,6 +70,9 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, | |||
75 | } else | 70 | } else |
76 | psize = pte_pagesize_index(mm, addr, pte); | 71 | psize = pte_pagesize_index(mm, addr, pte); |
77 | 72 | ||
73 | /* Mask the address for the correct page size */ | ||
74 | addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1); | ||
75 | |||
78 | /* Build full vaddr */ | 76 | /* Build full vaddr */ |
79 | if (!is_kernel_addr(addr)) { | 77 | if (!is_kernel_addr(addr)) { |
80 | ssize = user_segment_size(addr); | 78 | ssize = user_segment_size(addr); |
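With the unconditional PAGE_MASK line gone, hpte_need_flush() now masks the address with the shift of whatever page size it actually resolved. A tiny demonstration of the difference for a 4K versus a 16M mapping, using a hypothetical effective address:

#include <stdio.h>

int main(void)
{
	/* Illustrative shifts: 4K base page vs. a 16M huge page. */
	unsigned shifts[] = { 12, 24 };
	unsigned long addr = 0x0000000012f45678UL;	/* hypothetical EA */

	/* Same operation as the new "addr &= ~((1UL << shift) - 1)",
	 * applied once per page size for comparison. */
	for (int i = 0; i < 2; i++) {
		unsigned long masked = addr & ~((1UL << shifts[i]) - 1);
		printf("shift %2u: %#018lx -> %#018lx\n",
		       shifts[i], addr, masked);
	}
	return 0;
}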