aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/kernel/head_32.S34
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/mm/init_32.c84
-rw-r--r--arch/x86/mm/init_64.c110
-rw-r--r--arch/x86/mm/ioremap.c19
-rw-r--r--arch/x86/mm/pageattr-test.c9
-rw-r--r--arch/x86/mm/pageattr.c459
-rw-r--r--arch/x86/mm/pat.c132
8 files changed, 555 insertions, 296 deletions
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index a7010c3a377a..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -196,9 +192,9 @@ default_entry:
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start) 193 movl %edi, pa(init_pg_tables_start)
198 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
199 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
20010: 19610:
201 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
202 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
203 /* Upper half already zero */ 199 /* Upper half already zero */
204 addl $8,%edx 200 addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
215 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
216 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
217 */ 213 */
218 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
219 cmpl %ebp,%eax 215 cmpl %ebp,%eax
220 jb 10b 216 jb 10b
2211: 2171:
@@ -224,7 +220,7 @@ default_entry:
224 movl %eax, pa(max_pfn_mapped) 220 movl %eax, pa(max_pfn_mapped)
225 221
226 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
228 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
229#else /* Not PAE */ 225#else /* Not PAE */
230 226
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
233 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start) 230 movl %edi, pa(init_pg_tables_start)
235 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
236 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23710: 23310:
238 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
239 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
240 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
241 addl $4,%edx 237 addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
250 * the attribute bits 246 * the attribute bits
251 */ 247 */
252 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
253 cmpl %ebp,%eax 249 cmpl %ebp,%eax
254 jb 10b 250 jb 10b
255 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 253 movl %eax, pa(max_pfn_mapped)
258 254
259 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
262#endif 258#endif
263 jmp 3f 259 jmp 3f
@@ -634,19 +630,19 @@ ENTRY(empty_zero_page)
634 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
635 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
636ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
637 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
638# if KPMDS == 3 634# if KPMDS == 3
639 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
640 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
641 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
642# elif KPMDS == 2 638# elif KPMDS == 2
643 .long 0,0 639 .long 0,0
644 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
645 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
646# elif KPMDS == 1 642# elif KPMDS == 1
647 .long 0,0 643 .long 0,0
648 .long 0,0 644 .long 0,0
649 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
650# else 646# else
651# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
652# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index db3280afe886..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
110 movq %rdi, %rax 110 movq %rdi, %rax
111 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
112 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
113 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
114 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
115 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
116ident_complete: 116ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
374 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
375 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
376 */ 376 */
377 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
378 378
379NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
380 /* 380 /*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 6b9a9358b330..c3789bb19308 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -195,11 +195,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
195 pgd_t *pgd; 195 pgd_t *pgd;
196 pmd_t *pmd; 196 pmd_t *pmd;
197 pte_t *pte; 197 pte_t *pte;
198 unsigned pages_2m = 0, pages_4k = 0; 198 unsigned pages_2m, pages_4k;
199 int mapping_iter;
200
201 /*
202 * First iteration will setup identity mapping using large/small pages
203 * based on use_pse, with other attributes same as set by
204 * the early code in head_32.S
205 *
206 * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
207 * as desired for the kernel identity mapping.
208 *
209 * This two pass mechanism conforms to the TLB app note which says:
210 *
211 * "Software should not write to a paging-structure entry in a way
212 * that would change, for any linear address, both the page size
213 * and either the page frame or attributes."
214 */
215 mapping_iter = 1;
199 216
200 if (!cpu_has_pse) 217 if (!cpu_has_pse)
201 use_pse = 0; 218 use_pse = 0;
202 219
220repeat:
221 pages_2m = pages_4k = 0;
203 pfn = start_pfn; 222 pfn = start_pfn;
204 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); 223 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
205 pgd = pgd_base + pgd_idx; 224 pgd = pgd_base + pgd_idx;
@@ -225,6 +244,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
225 if (use_pse) { 244 if (use_pse) {
226 unsigned int addr2; 245 unsigned int addr2;
227 pgprot_t prot = PAGE_KERNEL_LARGE; 246 pgprot_t prot = PAGE_KERNEL_LARGE;
247 /*
248 * first pass will use the same initial
249 * identity mapping attribute + _PAGE_PSE.
250 */
251 pgprot_t init_prot =
252 __pgprot(PTE_IDENT_ATTR |
253 _PAGE_PSE);
228 254
229 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + 255 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
230 PAGE_OFFSET + PAGE_SIZE-1; 256 PAGE_OFFSET + PAGE_SIZE-1;
@@ -234,7 +260,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
234 prot = PAGE_KERNEL_LARGE_EXEC; 260 prot = PAGE_KERNEL_LARGE_EXEC;
235 261
236 pages_2m++; 262 pages_2m++;
237 set_pmd(pmd, pfn_pmd(pfn, prot)); 263 if (mapping_iter == 1)
264 set_pmd(pmd, pfn_pmd(pfn, init_prot));
265 else
266 set_pmd(pmd, pfn_pmd(pfn, prot));
238 267
239 pfn += PTRS_PER_PTE; 268 pfn += PTRS_PER_PTE;
240 continue; 269 continue;
@@ -246,17 +275,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
246 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; 275 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
247 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 276 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
248 pgprot_t prot = PAGE_KERNEL; 277 pgprot_t prot = PAGE_KERNEL;
278 /*
279 * first pass will use the same initial
280 * identity mapping attribute.
281 */
282 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
249 283
250 if (is_kernel_text(addr)) 284 if (is_kernel_text(addr))
251 prot = PAGE_KERNEL_EXEC; 285 prot = PAGE_KERNEL_EXEC;
252 286
253 pages_4k++; 287 pages_4k++;
254 set_pte(pte, pfn_pte(pfn, prot)); 288 if (mapping_iter == 1)
289 set_pte(pte, pfn_pte(pfn, init_prot));
290 else
291 set_pte(pte, pfn_pte(pfn, prot));
255 } 292 }
256 } 293 }
257 } 294 }
258 update_page_count(PG_LEVEL_2M, pages_2m); 295 if (mapping_iter == 1) {
259 update_page_count(PG_LEVEL_4K, pages_4k); 296 /*
297 * update direct mapping page count only in the first
298 * iteration.
299 */
300 update_page_count(PG_LEVEL_2M, pages_2m);
301 update_page_count(PG_LEVEL_4K, pages_4k);
302
303 /*
304 * local global flush tlb, which will flush the previous
305 * mappings present in both small and large page TLB's.
306 */
307 __flush_tlb_all();
308
309 /*
310 * Second iteration will set the actual desired PTE attributes.
311 */
312 mapping_iter = 2;
313 goto repeat;
314 }
260} 315}
261 316
262/* 317/*
@@ -719,7 +774,7 @@ void __init setup_bootmem_allocator(void)
719 after_init_bootmem = 1; 774 after_init_bootmem = 1;
720} 775}
721 776
722static void __init find_early_table_space(unsigned long end) 777static void __init find_early_table_space(unsigned long end, int use_pse)
723{ 778{
724 unsigned long puds, pmds, ptes, tables, start; 779 unsigned long puds, pmds, ptes, tables, start;
725 780
@@ -729,7 +784,7 @@ static void __init find_early_table_space(unsigned long end)
729 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 784 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
730 tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); 785 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
731 786
732 if (cpu_has_pse) { 787 if (use_pse) {
733 unsigned long extra; 788 unsigned long extra;
734 789
735 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 790 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -769,12 +824,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
769 pgd_t *pgd_base = swapper_pg_dir; 824 pgd_t *pgd_base = swapper_pg_dir;
770 unsigned long start_pfn, end_pfn; 825 unsigned long start_pfn, end_pfn;
771 unsigned long big_page_start; 826 unsigned long big_page_start;
827#ifdef CONFIG_DEBUG_PAGEALLOC
828 /*
829 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
830 * This will simplify cpa(), which otherwise needs to support splitting
831 * large pages into small in interrupt context, etc.
832 */
833 int use_pse = 0;
834#else
835 int use_pse = cpu_has_pse;
836#endif
772 837
773 /* 838 /*
774 * Find space for the kernel direct mapping tables. 839 * Find space for the kernel direct mapping tables.
775 */ 840 */
776 if (!after_init_bootmem) 841 if (!after_init_bootmem)
777 find_early_table_space(end); 842 find_early_table_space(end, use_pse);
778 843
779#ifdef CONFIG_X86_PAE 844#ifdef CONFIG_X86_PAE
780 set_nx(); 845 set_nx();
@@ -820,7 +885,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
820 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); 885 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
821 if (start_pfn < end_pfn) 886 if (start_pfn < end_pfn)
822 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 887 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
823 cpu_has_pse); 888 use_pse);
824 889
825 /* tail is not big page alignment ? */ 890 /* tail is not big page alignment ? */
826 start_pfn = end_pfn; 891 start_pfn = end_pfn;
@@ -983,7 +1048,6 @@ void __init mem_init(void)
983 if (boot_cpu_data.wp_works_ok < 0) 1048 if (boot_cpu_data.wp_works_ok < 0)
984 test_wp_bit(); 1049 test_wp_bit();
985 1050
986 cpa_init();
987 save_pg_dir(); 1051 save_pg_dir();
988 zap_low_mappings(); 1052 zap_low_mappings();
989} 1053}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 770536ebf7e9..fb30486c82f7 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -271,7 +271,8 @@ static __ref void unmap_low_page(void *adr)
271} 271}
272 272
273static unsigned long __meminit 273static unsigned long __meminit
274phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) 274phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
275 pgprot_t prot)
275{ 276{
276 unsigned pages = 0; 277 unsigned pages = 0;
277 unsigned long last_map_addr = end; 278 unsigned long last_map_addr = end;
@@ -289,36 +290,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
289 break; 290 break;
290 } 291 }
291 292
293 /*
294 * We will re-use the existing mapping.
295 * Xen for example has some special requirements, like mapping
296 * pagetable pages as RO. So assume someone who pre-setup
297 * these mappings are more intelligent.
298 */
292 if (pte_val(*pte)) 299 if (pte_val(*pte))
293 continue; 300 continue;
294 301
295 if (0) 302 if (0)
296 printk(" pte=%p addr=%lx pte=%016lx\n", 303 printk(" pte=%p addr=%lx pte=%016lx\n",
297 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); 304 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
298 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
299 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
300 pages++; 305 pages++;
306 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
307 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
301 } 308 }
309
302 update_page_count(PG_LEVEL_4K, pages); 310 update_page_count(PG_LEVEL_4K, pages);
303 311
304 return last_map_addr; 312 return last_map_addr;
305} 313}
306 314
307static unsigned long __meminit 315static unsigned long __meminit
308phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) 316phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
317 pgprot_t prot)
309{ 318{
310 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); 319 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
311 320
312 return phys_pte_init(pte, address, end); 321 return phys_pte_init(pte, address, end, prot);
313} 322}
314 323
315static unsigned long __meminit 324static unsigned long __meminit
316phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 325phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
317 unsigned long page_size_mask) 326 unsigned long page_size_mask, pgprot_t prot)
318{ 327{
319 unsigned long pages = 0; 328 unsigned long pages = 0;
320 unsigned long last_map_addr = end; 329 unsigned long last_map_addr = end;
321 unsigned long start = address;
322 330
323 int i = pmd_index(address); 331 int i = pmd_index(address);
324 332
@@ -326,6 +334,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
326 unsigned long pte_phys; 334 unsigned long pte_phys;
327 pmd_t *pmd = pmd_page + pmd_index(address); 335 pmd_t *pmd = pmd_page + pmd_index(address);
328 pte_t *pte; 336 pte_t *pte;
337 pgprot_t new_prot = prot;
329 338
330 if (address >= end) { 339 if (address >= end) {
331 if (!after_bootmem) { 340 if (!after_bootmem) {
@@ -339,27 +348,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
339 if (!pmd_large(*pmd)) { 348 if (!pmd_large(*pmd)) {
340 spin_lock(&init_mm.page_table_lock); 349 spin_lock(&init_mm.page_table_lock);
341 last_map_addr = phys_pte_update(pmd, address, 350 last_map_addr = phys_pte_update(pmd, address,
342 end); 351 end, prot);
343 spin_unlock(&init_mm.page_table_lock); 352 spin_unlock(&init_mm.page_table_lock);
353 continue;
344 } 354 }
345 /* Count entries we're using from level2_ident_pgt */ 355 /*
346 if (start == 0) 356 * If we are ok with PG_LEVEL_2M mapping, then we will
347 pages++; 357 * use the existing mapping,
348 continue; 358 *
359 * Otherwise, we will split the large page mapping but
360 * use the same existing protection bits except for
361 * large page, so that we don't violate Intel's TLB
362 * Application note (317080) which says, while changing
363 * the page sizes, new and old translations should
364 * not differ with respect to page frame and
365 * attributes.
366 */
367 if (page_size_mask & (1 << PG_LEVEL_2M))
368 continue;
369 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
349 } 370 }
350 371
351 if (page_size_mask & (1<<PG_LEVEL_2M)) { 372 if (page_size_mask & (1<<PG_LEVEL_2M)) {
352 pages++; 373 pages++;
353 spin_lock(&init_mm.page_table_lock); 374 spin_lock(&init_mm.page_table_lock);
354 set_pte((pte_t *)pmd, 375 set_pte((pte_t *)pmd,
355 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 376 pfn_pte(address >> PAGE_SHIFT,
377 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
356 spin_unlock(&init_mm.page_table_lock); 378 spin_unlock(&init_mm.page_table_lock);
357 last_map_addr = (address & PMD_MASK) + PMD_SIZE; 379 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
358 continue; 380 continue;
359 } 381 }
360 382
361 pte = alloc_low_page(&pte_phys); 383 pte = alloc_low_page(&pte_phys);
362 last_map_addr = phys_pte_init(pte, address, end); 384 last_map_addr = phys_pte_init(pte, address, end, new_prot);
363 unmap_low_page(pte); 385 unmap_low_page(pte);
364 386
365 spin_lock(&init_mm.page_table_lock); 387 spin_lock(&init_mm.page_table_lock);
@@ -372,12 +394,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
372 394
373static unsigned long __meminit 395static unsigned long __meminit
374phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, 396phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
375 unsigned long page_size_mask) 397 unsigned long page_size_mask, pgprot_t prot)
376{ 398{
377 pmd_t *pmd = pmd_offset(pud, 0); 399 pmd_t *pmd = pmd_offset(pud, 0);
378 unsigned long last_map_addr; 400 unsigned long last_map_addr;
379 401
380 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); 402 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
381 __flush_tlb_all(); 403 __flush_tlb_all();
382 return last_map_addr; 404 return last_map_addr;
383} 405}
@@ -394,6 +416,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
394 unsigned long pmd_phys; 416 unsigned long pmd_phys;
395 pud_t *pud = pud_page + pud_index(addr); 417 pud_t *pud = pud_page + pud_index(addr);
396 pmd_t *pmd; 418 pmd_t *pmd;
419 pgprot_t prot = PAGE_KERNEL;
397 420
398 if (addr >= end) 421 if (addr >= end)
399 break; 422 break;
@@ -405,10 +428,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
405 } 428 }
406 429
407 if (pud_val(*pud)) { 430 if (pud_val(*pud)) {
408 if (!pud_large(*pud)) 431 if (!pud_large(*pud)) {
409 last_map_addr = phys_pmd_update(pud, addr, end, 432 last_map_addr = phys_pmd_update(pud, addr, end,
410 page_size_mask); 433 page_size_mask, prot);
411 continue; 434 continue;
435 }
436 /*
437 * If we are ok with PG_LEVEL_1G mapping, then we will
438 * use the existing mapping.
439 *
440 * Otherwise, we will split the gbpage mapping but use
441 * the same existing protection bits except for large
442 * page, so that we don't violate Intel's TLB
443 * Application note (317080) which says, while changing
444 * the page sizes, new and old translations should
445 * not differ with respect to page frame and
446 * attributes.
447 */
448 if (page_size_mask & (1 << PG_LEVEL_1G))
449 continue;
450 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
412 } 451 }
413 452
414 if (page_size_mask & (1<<PG_LEVEL_1G)) { 453 if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -422,7 +461,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
422 } 461 }
423 462
424 pmd = alloc_low_page(&pmd_phys); 463 pmd = alloc_low_page(&pmd_phys);
425 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); 464 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
465 prot);
426 unmap_low_page(pmd); 466 unmap_low_page(pmd);
427 467
428 spin_lock(&init_mm.page_table_lock); 468 spin_lock(&init_mm.page_table_lock);
@@ -430,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
430 spin_unlock(&init_mm.page_table_lock); 470 spin_unlock(&init_mm.page_table_lock);
431 } 471 }
432 __flush_tlb_all(); 472 __flush_tlb_all();
473
433 update_page_count(PG_LEVEL_1G, pages); 474 update_page_count(PG_LEVEL_1G, pages);
434 475
435 return last_map_addr; 476 return last_map_addr;
@@ -446,13 +487,14 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
446 return phys_pud_init(pud, addr, end, page_size_mask); 487 return phys_pud_init(pud, addr, end, page_size_mask);
447} 488}
448 489
449static void __init find_early_table_space(unsigned long end) 490static void __init find_early_table_space(unsigned long end, int use_pse,
491 int use_gbpages)
450{ 492{
451 unsigned long puds, pmds, ptes, tables, start; 493 unsigned long puds, pmds, ptes, tables, start;
452 494
453 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 495 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
454 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); 496 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
455 if (direct_gbpages) { 497 if (use_gbpages) {
456 unsigned long extra; 498 unsigned long extra;
457 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); 499 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
458 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; 500 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
@@ -460,7 +502,7 @@ static void __init find_early_table_space(unsigned long end)
460 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 502 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
461 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); 503 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
462 504
463 if (cpu_has_pse) { 505 if (use_pse) {
464 unsigned long extra; 506 unsigned long extra;
465 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 507 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
466 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; 508 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -528,6 +570,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
528 pgd_populate(&init_mm, pgd, __va(pud_phys)); 570 pgd_populate(&init_mm, pgd, __va(pud_phys));
529 spin_unlock(&init_mm.page_table_lock); 571 spin_unlock(&init_mm.page_table_lock);
530 } 572 }
573 __flush_tlb_all();
531 574
532 return last_map_addr; 575 return last_map_addr;
533} 576}
@@ -571,6 +614,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
571 614
572 struct map_range mr[NR_RANGE_MR]; 615 struct map_range mr[NR_RANGE_MR];
573 int nr_range, i; 616 int nr_range, i;
617 int use_pse, use_gbpages;
574 618
575 printk(KERN_INFO "init_memory_mapping\n"); 619 printk(KERN_INFO "init_memory_mapping\n");
576 620
@@ -584,9 +628,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
584 if (!after_bootmem) 628 if (!after_bootmem)
585 init_gbpages(); 629 init_gbpages();
586 630
587 if (direct_gbpages) 631#ifdef CONFIG_DEBUG_PAGEALLOC
632 /*
633 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
634 * This will simplify cpa(), which otherwise needs to support splitting
635 * large pages into small in interrupt context, etc.
636 */
637 use_pse = use_gbpages = 0;
638#else
639 use_pse = cpu_has_pse;
640 use_gbpages = direct_gbpages;
641#endif
642
643 if (use_gbpages)
588 page_size_mask |= 1 << PG_LEVEL_1G; 644 page_size_mask |= 1 << PG_LEVEL_1G;
589 if (cpu_has_pse) 645 if (use_pse)
590 page_size_mask |= 1 << PG_LEVEL_2M; 646 page_size_mask |= 1 << PG_LEVEL_2M;
591 647
592 memset(mr, 0, sizeof(mr)); 648 memset(mr, 0, sizeof(mr));
@@ -647,7 +703,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
647 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 703 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
648 704
649 if (!after_bootmem) 705 if (!after_bootmem)
650 find_early_table_space(end); 706 find_early_table_space(end, use_pse, use_gbpages);
651 707
652 for (i = 0; i < nr_range; i++) 708 for (i = 0; i < nr_range; i++)
653 last_map_addr = kernel_physical_mapping_init( 709 last_map_addr = kernel_physical_mapping_init(
@@ -806,8 +862,6 @@ void __init mem_init(void)
806 reservedpages << (PAGE_SHIFT-10), 862 reservedpages << (PAGE_SHIFT-10),
807 datasize >> 10, 863 datasize >> 10,
808 initsize >> 10); 864 initsize >> 10);
809
810 cpa_init();
811} 865}
812 866
813void free_init_pages(char *what, unsigned long begin, unsigned long end) 867void free_init_pages(char *what, unsigned long begin, unsigned long end)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index cac6da54203b..6ab3196d12b4 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -83,6 +83,25 @@ int page_is_ram(unsigned long pagenr)
83 return 0; 83 return 0;
84} 84}
85 85
86int pagerange_is_ram(unsigned long start, unsigned long end)
87{
88 int ram_page = 0, not_rampage = 0;
89 unsigned long page_nr;
90
91 for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
92 ++page_nr) {
93 if (page_is_ram(page_nr))
94 ram_page = 1;
95 else
96 not_rampage = 1;
97
98 if (ram_page == not_rampage)
99 return -1;
100 }
101
102 return ram_page;
103}
104
86/* 105/*
87 * Fix up the linear direct mapping of the kernel to avoid cache attribute 106 * Fix up the linear direct mapping of the kernel to avoid cache attribute
88 * conflicts. 107 * conflicts.
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d4aa503caaa2..e1d106909218 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -32,7 +32,7 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) 35#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
36 36
37static int pte_testbit(pte_t pte) 37static int pte_testbit(pte_t pte)
38{ 38{
@@ -118,6 +118,7 @@ static int pageattr_test(void)
118 unsigned int level; 118 unsigned int level;
119 int i, k; 119 int i, k;
120 int err; 120 int err;
121 unsigned long test_addr;
121 122
122 if (print) 123 if (print)
123 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
@@ -172,7 +173,8 @@ static int pageattr_test(void)
172 continue; 173 continue;
173 } 174 }
174 175
175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); 176 test_addr = addr[i];
177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
176 if (err < 0) { 178 if (err < 0) {
177 printk(KERN_ERR "CPA %d failed %d\n", i, err); 179 printk(KERN_ERR "CPA %d failed %d\n", i, err);
178 failed++; 180 failed++;
@@ -204,7 +206,8 @@ static int pageattr_test(void)
204 failed++; 206 failed++;
205 continue; 207 continue;
206 } 208 }
207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); 209 test_addr = addr[i];
210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
208 if (err < 0) { 211 if (err < 0) {
209 printk(KERN_ERR "CPA reverting failed: %d\n", err); 212 printk(KERN_ERR "CPA reverting failed: %d\n", err);
210 failed++; 213 failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 898fad617abe..a9ec89c3fbca 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,27 @@
25 * The current flushing context - we pass it instead of 5 arguments: 25 * The current flushing context - we pass it instead of 5 arguments:
26 */ 26 */
27struct cpa_data { 27struct cpa_data {
28 unsigned long vaddr; 28 unsigned long *vaddr;
29 pgprot_t mask_set; 29 pgprot_t mask_set;
30 pgprot_t mask_clr; 30 pgprot_t mask_clr;
31 int numpages; 31 int numpages;
32 int flushtlb; 32 int flags;
33 unsigned long pfn; 33 unsigned long pfn;
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35 int curpage;
35}; 36};
36 37
38/*
39 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
40 * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
41 * entries change the page attribute in parallel to some other cpu
42 * splitting a large page entry along with changing the attribute.
43 */
44static DEFINE_SPINLOCK(cpa_lock);
45
46#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2
48
37#ifdef CONFIG_PROC_FS 49#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM]; 50static unsigned long direct_pages_count[PG_LEVEL_NUM];
39 51
@@ -190,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
190 } 202 }
191} 203}
192 204
205static void cpa_flush_array(unsigned long *start, int numpages, int cache)
206{
207 unsigned int i, level;
208 unsigned long *addr;
209
210 BUG_ON(irqs_disabled());
211
212 on_each_cpu(__cpa_flush_range, NULL, 1);
213
214 if (!cache)
215 return;
216
217 /* 4M threshold */
218 if (numpages >= 1024) {
219 if (boot_cpu_data.x86_model >= 4)
220 wbinvd();
221 return;
222 }
223 /*
224 * We only need to flush on one CPU,
225 * clflush is a MESI-coherent instruction that
226 * will cause all other CPUs to flush the same
227 * cachelines:
228 */
229 for (i = 0, addr = start; i < numpages; i++, addr++) {
230 pte_t *pte = lookup_address(*addr, &level);
231
232 /*
233 * Only flush present addresses:
234 */
235 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
236 clflush_cache_range((void *) *addr, PAGE_SIZE);
237 }
238}
239
193/* 240/*
194 * Certain areas of memory on x86 require very specific protection flags, 241 * Certain areas of memory on x86 require very specific protection flags,
195 * for example the BIOS area or kernel text. Callers don't always get this 242 * for example the BIOS area or kernel text. Callers don't always get this
@@ -398,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
398 */ 445 */
399 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); 446 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
400 __set_pmd_pte(kpte, address, new_pte); 447 __set_pmd_pte(kpte, address, new_pte);
401 cpa->flushtlb = 1; 448 cpa->flags |= CPA_FLUSHTLB;
402 do_split = 0; 449 do_split = 0;
403 } 450 }
404 451
@@ -408,84 +455,6 @@ out_unlock:
408 return do_split; 455 return do_split;
409} 456}
410 457
411static LIST_HEAD(page_pool);
412static unsigned long pool_size, pool_pages, pool_low;
413static unsigned long pool_used, pool_failed;
414
415static void cpa_fill_pool(struct page **ret)
416{
417 gfp_t gfp = GFP_KERNEL;
418 unsigned long flags;
419 struct page *p;
420
421 /*
422 * Avoid recursion (on debug-pagealloc) and also signal
423 * our priority to get to these pagetables:
424 */
425 if (current->flags & PF_MEMALLOC)
426 return;
427 current->flags |= PF_MEMALLOC;
428
429 /*
430 * Allocate atomically from atomic contexts:
431 */
432 if (in_atomic() || irqs_disabled() || debug_pagealloc)
433 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
434
435 while (pool_pages < pool_size || (ret && !*ret)) {
436 p = alloc_pages(gfp, 0);
437 if (!p) {
438 pool_failed++;
439 break;
440 }
441 /*
442 * If the call site needs a page right now, provide it:
443 */
444 if (ret && !*ret) {
445 *ret = p;
446 continue;
447 }
448 spin_lock_irqsave(&pgd_lock, flags);
449 list_add(&p->lru, &page_pool);
450 pool_pages++;
451 spin_unlock_irqrestore(&pgd_lock, flags);
452 }
453
454 current->flags &= ~PF_MEMALLOC;
455}
456
457#define SHIFT_MB (20 - PAGE_SHIFT)
458#define ROUND_MB_GB ((1 << 10) - 1)
459#define SHIFT_MB_GB 10
460#define POOL_PAGES_PER_GB 16
461
462void __init cpa_init(void)
463{
464 struct sysinfo si;
465 unsigned long gb;
466
467 si_meminfo(&si);
468 /*
469 * Calculate the number of pool pages:
470 *
471 * Convert totalram (nr of pages) to MiB and round to the next
472 * GiB. Shift MiB to Gib and multiply the result by
473 * POOL_PAGES_PER_GB:
474 */
475 if (debug_pagealloc) {
476 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
477 pool_size = POOL_PAGES_PER_GB * gb;
478 } else {
479 pool_size = 1;
480 }
481 pool_low = pool_size;
482
483 cpa_fill_pool(NULL);
484 printk(KERN_DEBUG
485 "CPA: page pool initialized %lu of %lu pages preallocated\n",
486 pool_pages, pool_size);
487}
488
489static int split_large_page(pte_t *kpte, unsigned long address) 458static int split_large_page(pte_t *kpte, unsigned long address)
490{ 459{
491 unsigned long flags, pfn, pfninc = 1; 460 unsigned long flags, pfn, pfninc = 1;
@@ -494,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
494 pgprot_t ref_prot; 463 pgprot_t ref_prot;
495 struct page *base; 464 struct page *base;
496 465
497 /* 466 if (!debug_pagealloc)
498 * Get a page from the pool. The pool list is protected by the 467 spin_unlock(&cpa_lock);
499 * pgd_lock, which we have to take anyway for the split 468 base = alloc_pages(GFP_KERNEL, 0);
500 * operation: 469 if (!debug_pagealloc)
501 */ 470 spin_lock(&cpa_lock);
502 spin_lock_irqsave(&pgd_lock, flags); 471 if (!base)
503 if (list_empty(&page_pool)) { 472 return -ENOMEM;
504 spin_unlock_irqrestore(&pgd_lock, flags);
505 base = NULL;
506 cpa_fill_pool(&base);
507 if (!base)
508 return -ENOMEM;
509 spin_lock_irqsave(&pgd_lock, flags);
510 } else {
511 base = list_first_entry(&page_pool, struct page, lru);
512 list_del(&base->lru);
513 pool_pages--;
514
515 if (pool_pages < pool_low)
516 pool_low = pool_pages;
517 }
518 473
474 spin_lock_irqsave(&pgd_lock, flags);
519 /* 475 /*
520 * Check for races, another CPU might have split this page 476 * Check for races, another CPU might have split this page
521 * up for us already: 477 * up for us already:
@@ -572,11 +528,8 @@ out_unlock:
572 * If we dropped out via the lookup_address check under 528 * If we dropped out via the lookup_address check under
573 * pgd_lock then stick the page back into the pool: 529 * pgd_lock then stick the page back into the pool:
574 */ 530 */
575 if (base) { 531 if (base)
576 list_add(&base->lru, &page_pool); 532 __free_page(base);
577 pool_pages++;
578 } else
579 pool_used++;
580 spin_unlock_irqrestore(&pgd_lock, flags); 533 spin_unlock_irqrestore(&pgd_lock, flags);
581 534
582 return 0; 535 return 0;
@@ -584,11 +537,16 @@ out_unlock:
584 537
585static int __change_page_attr(struct cpa_data *cpa, int primary) 538static int __change_page_attr(struct cpa_data *cpa, int primary)
586{ 539{
587 unsigned long address = cpa->vaddr; 540 unsigned long address;
588 int do_split, err; 541 int do_split, err;
589 unsigned int level; 542 unsigned int level;
590 pte_t *kpte, old_pte; 543 pte_t *kpte, old_pte;
591 544
545 if (cpa->flags & CPA_ARRAY)
546 address = cpa->vaddr[cpa->curpage];
547 else
548 address = *cpa->vaddr;
549
592repeat: 550repeat:
593 kpte = lookup_address(address, &level); 551 kpte = lookup_address(address, &level);
594 if (!kpte) 552 if (!kpte)
@@ -600,7 +558,7 @@ repeat:
600 return 0; 558 return 0;
601 WARN(1, KERN_WARNING "CPA: called for zero pte. " 559 WARN(1, KERN_WARNING "CPA: called for zero pte. "
602 "vaddr = %lx cpa->vaddr = %lx\n", address, 560 "vaddr = %lx cpa->vaddr = %lx\n", address,
603 cpa->vaddr); 561 *cpa->vaddr);
604 return -EINVAL; 562 return -EINVAL;
605 } 563 }
606 564
@@ -626,7 +584,7 @@ repeat:
626 */ 584 */
627 if (pte_val(old_pte) != pte_val(new_pte)) { 585 if (pte_val(old_pte) != pte_val(new_pte)) {
628 set_pte_atomic(kpte, new_pte); 586 set_pte_atomic(kpte, new_pte);
629 cpa->flushtlb = 1; 587 cpa->flags |= CPA_FLUSHTLB;
630 } 588 }
631 cpa->numpages = 1; 589 cpa->numpages = 1;
632 return 0; 590 return 0;
@@ -650,7 +608,25 @@ repeat:
650 */ 608 */
651 err = split_large_page(kpte, address); 609 err = split_large_page(kpte, address);
652 if (!err) { 610 if (!err) {
653 cpa->flushtlb = 1; 611 /*
612 * Do a global flush tlb after splitting the large page
613 * and before we do the actual change page attribute in the PTE.
614 *
615 * With out this, we violate the TLB application note, that says
616 * "The TLBs may contain both ordinary and large-page
617 * translations for a 4-KByte range of linear addresses. This
618 * may occur if software modifies the paging structures so that
619 * the page size used for the address range changes. If the two
620 * translations differ with respect to page frame or attributes
621 * (e.g., permissions), processor behavior is undefined and may
622 * be implementation-specific."
623 *
624 * We do this global tlb flush inside the cpa_lock, so that we
625 * don't allow any other cpu, with stale tlb entries change the
626 * page attribute in parallel, that also falls into the
627 * just split large page entry.
628 */
629 flush_tlb_all();
654 goto repeat; 630 goto repeat;
655 } 631 }
656 632
@@ -663,6 +639,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
663{ 639{
664 struct cpa_data alias_cpa; 640 struct cpa_data alias_cpa;
665 int ret = 0; 641 int ret = 0;
642 unsigned long temp_cpa_vaddr, vaddr;
666 643
667 if (cpa->pfn >= max_pfn_mapped) 644 if (cpa->pfn >= max_pfn_mapped)
668 return 0; 645 return 0;
@@ -675,16 +652,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
675 * No need to redo, when the primary call touched the direct 652 * No need to redo, when the primary call touched the direct
676 * mapping already: 653 * mapping already:
677 */ 654 */
678 if (!(within(cpa->vaddr, PAGE_OFFSET, 655 if (cpa->flags & CPA_ARRAY)
656 vaddr = cpa->vaddr[cpa->curpage];
657 else
658 vaddr = *cpa->vaddr;
659
660 if (!(within(vaddr, PAGE_OFFSET,
679 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) 661 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
680#ifdef CONFIG_X86_64 662#ifdef CONFIG_X86_64
681 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), 663 || within(vaddr, PAGE_OFFSET + (1UL<<32),
682 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) 664 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
683#endif 665#endif
684 )) { 666 )) {
685 667
686 alias_cpa = *cpa; 668 alias_cpa = *cpa;
687 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 669 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
670 alias_cpa.vaddr = &temp_cpa_vaddr;
671 alias_cpa.flags &= ~CPA_ARRAY;
672
688 673
689 ret = __change_page_attr_set_clr(&alias_cpa, 0); 674 ret = __change_page_attr_set_clr(&alias_cpa, 0);
690 } 675 }
@@ -696,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
696 * No need to redo, when the primary call touched the high 681 * No need to redo, when the primary call touched the high
697 * mapping already: 682 * mapping already:
698 */ 683 */
699 if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) 684 if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
700 return 0; 685 return 0;
701 686
702 /* 687 /*
@@ -707,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
707 return 0; 692 return 0;
708 693
709 alias_cpa = *cpa; 694 alias_cpa = *cpa;
710 alias_cpa.vaddr = 695 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
711 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 696 alias_cpa.vaddr = &temp_cpa_vaddr;
697 alias_cpa.flags &= ~CPA_ARRAY;
712 698
713 /* 699 /*
714 * The high mapping range is imprecise, so ignore the return value. 700 * The high mapping range is imprecise, so ignore the return value.
@@ -728,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
728 * preservation check. 714 * preservation check.
729 */ 715 */
730 cpa->numpages = numpages; 716 cpa->numpages = numpages;
717 /* for array changes, we can't use large page */
718 if (cpa->flags & CPA_ARRAY)
719 cpa->numpages = 1;
731 720
721 if (!debug_pagealloc)
722 spin_lock(&cpa_lock);
732 ret = __change_page_attr(cpa, checkalias); 723 ret = __change_page_attr(cpa, checkalias);
724 if (!debug_pagealloc)
725 spin_unlock(&cpa_lock);
733 if (ret) 726 if (ret)
734 return ret; 727 return ret;
735 728
@@ -746,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
746 */ 739 */
747 BUG_ON(cpa->numpages > numpages); 740 BUG_ON(cpa->numpages > numpages);
748 numpages -= cpa->numpages; 741 numpages -= cpa->numpages;
749 cpa->vaddr += cpa->numpages * PAGE_SIZE; 742 if (cpa->flags & CPA_ARRAY)
743 cpa->curpage++;
744 else
745 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
746
750 } 747 }
751 return 0; 748 return 0;
752} 749}
@@ -757,9 +754,9 @@ static inline int cache_attr(pgprot_t attr)
757 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); 754 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
758} 755}
759 756
760static int change_page_attr_set_clr(unsigned long addr, int numpages, 757static int change_page_attr_set_clr(unsigned long *addr, int numpages,
761 pgprot_t mask_set, pgprot_t mask_clr, 758 pgprot_t mask_set, pgprot_t mask_clr,
762 int force_split) 759 int force_split, int array)
763{ 760{
764 struct cpa_data cpa; 761 struct cpa_data cpa;
765 int ret, cache, checkalias; 762 int ret, cache, checkalias;
@@ -774,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
774 return 0; 771 return 0;
775 772
776 /* Ensure we are PAGE_SIZE aligned */ 773 /* Ensure we are PAGE_SIZE aligned */
777 if (addr & ~PAGE_MASK) { 774 if (!array) {
778 addr &= PAGE_MASK; 775 if (*addr & ~PAGE_MASK) {
779 /* 776 *addr &= PAGE_MASK;
780 * People should not be passing in unaligned addresses: 777 /*
781 */ 778 * People should not be passing in unaligned addresses:
782 WARN_ON_ONCE(1); 779 */
780 WARN_ON_ONCE(1);
781 }
782 } else {
783 int i;
784 for (i = 0; i < numpages; i++) {
785 if (addr[i] & ~PAGE_MASK) {
786 addr[i] &= PAGE_MASK;
787 WARN_ON_ONCE(1);
788 }
789 }
783 } 790 }
784 791
792 /* Must avoid aliasing mappings in the highmem code */
793 kmap_flush_unused();
794
785 cpa.vaddr = addr; 795 cpa.vaddr = addr;
786 cpa.numpages = numpages; 796 cpa.numpages = numpages;
787 cpa.mask_set = mask_set; 797 cpa.mask_set = mask_set;
788 cpa.mask_clr = mask_clr; 798 cpa.mask_clr = mask_clr;
789 cpa.flushtlb = 0; 799 cpa.flags = 0;
800 cpa.curpage = 0;
790 cpa.force_split = force_split; 801 cpa.force_split = force_split;
791 802
803 if (array)
804 cpa.flags |= CPA_ARRAY;
805
792 /* No alias checking for _NX bit modifications */ 806 /* No alias checking for _NX bit modifications */
793 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 807 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
794 808
@@ -797,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
797 /* 811 /*
798 * Check whether we really changed something: 812 * Check whether we really changed something:
799 */ 813 */
800 if (!cpa.flushtlb) 814 if (!(cpa.flags & CPA_FLUSHTLB))
801 goto out; 815 goto out;
802 816
803 /* 817 /*
@@ -812,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
812 * error case we fall back to cpa_flush_all (which uses 826 * error case we fall back to cpa_flush_all (which uses
813 * wbindv): 827 * wbindv):
814 */ 828 */
815 if (!ret && cpu_has_clflush) 829 if (!ret && cpu_has_clflush) {
816 cpa_flush_range(addr, numpages, cache); 830 if (cpa.flags & CPA_ARRAY)
817 else 831 cpa_flush_array(addr, numpages, cache);
832 else
833 cpa_flush_range(*addr, numpages, cache);
834 } else
818 cpa_flush_all(cache); 835 cpa_flush_all(cache);
819 836
820out: 837out:
821 cpa_fill_pool(NULL);
822
823 return ret; 838 return ret;
824} 839}
825 840
826static inline int change_page_attr_set(unsigned long addr, int numpages, 841static inline int change_page_attr_set(unsigned long *addr, int numpages,
827 pgprot_t mask) 842 pgprot_t mask, int array)
828{ 843{
829 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); 844 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
845 array);
830} 846}
831 847
832static inline int change_page_attr_clear(unsigned long addr, int numpages, 848static inline int change_page_attr_clear(unsigned long *addr, int numpages,
833 pgprot_t mask) 849 pgprot_t mask, int array)
834{ 850{
835 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); 851 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
852 array);
836} 853}
837 854
838int _set_memory_uc(unsigned long addr, int numpages) 855int _set_memory_uc(unsigned long addr, int numpages)
@@ -840,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
840 /* 857 /*
841 * for now UC MINUS. see comments in ioremap_nocache() 858 * for now UC MINUS. see comments in ioremap_nocache()
842 */ 859 */
843 return change_page_attr_set(addr, numpages, 860 return change_page_attr_set(&addr, numpages,
844 __pgprot(_PAGE_CACHE_UC_MINUS)); 861 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
845} 862}
846 863
847int set_memory_uc(unsigned long addr, int numpages) 864int set_memory_uc(unsigned long addr, int numpages)
@@ -857,10 +874,48 @@ int set_memory_uc(unsigned long addr, int numpages)
857} 874}
858EXPORT_SYMBOL(set_memory_uc); 875EXPORT_SYMBOL(set_memory_uc);
859 876
877int set_memory_array_uc(unsigned long *addr, int addrinarray)
878{
879 unsigned long start;
880 unsigned long end;
881 int i;
882 /*
883 * for now UC MINUS. see comments in ioremap_nocache()
884 */
885 for (i = 0; i < addrinarray; i++) {
886 start = __pa(addr[i]);
887 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
888 if (end != __pa(addr[i + 1]))
889 break;
890 i++;
891 }
892 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
893 goto out;
894 }
895
896 return change_page_attr_set(addr, addrinarray,
897 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
898out:
899 for (i = 0; i < addrinarray; i++) {
900 unsigned long tmp = __pa(addr[i]);
901
902 if (tmp == start)
903 break;
904 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
905 if (end != __pa(addr[i + 1]))
906 break;
907 i++;
908 }
909 free_memtype(tmp, end);
910 }
911 return -EINVAL;
912}
913EXPORT_SYMBOL(set_memory_array_uc);
914
860int _set_memory_wc(unsigned long addr, int numpages) 915int _set_memory_wc(unsigned long addr, int numpages)
861{ 916{
862 return change_page_attr_set(addr, numpages, 917 return change_page_attr_set(&addr, numpages,
863 __pgprot(_PAGE_CACHE_WC)); 918 __pgprot(_PAGE_CACHE_WC), 0);
864} 919}
865 920
866int set_memory_wc(unsigned long addr, int numpages) 921int set_memory_wc(unsigned long addr, int numpages)
@@ -878,8 +933,8 @@ EXPORT_SYMBOL(set_memory_wc);
878 933
879int _set_memory_wb(unsigned long addr, int numpages) 934int _set_memory_wb(unsigned long addr, int numpages)
880{ 935{
881 return change_page_attr_clear(addr, numpages, 936 return change_page_attr_clear(&addr, numpages,
882 __pgprot(_PAGE_CACHE_MASK)); 937 __pgprot(_PAGE_CACHE_MASK), 0);
883} 938}
884 939
885int set_memory_wb(unsigned long addr, int numpages) 940int set_memory_wb(unsigned long addr, int numpages)
@@ -890,39 +945,59 @@ int set_memory_wb(unsigned long addr, int numpages)
890} 945}
891EXPORT_SYMBOL(set_memory_wb); 946EXPORT_SYMBOL(set_memory_wb);
892 947
948int set_memory_array_wb(unsigned long *addr, int addrinarray)
949{
950 int i;
951
952 for (i = 0; i < addrinarray; i++) {
953 unsigned long start = __pa(addr[i]);
954 unsigned long end;
955
956 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
957 if (end != __pa(addr[i + 1]))
958 break;
959 i++;
960 }
961 free_memtype(start, end);
962 }
963 return change_page_attr_clear(addr, addrinarray,
964 __pgprot(_PAGE_CACHE_MASK), 1);
965}
966EXPORT_SYMBOL(set_memory_array_wb);
967
893int set_memory_x(unsigned long addr, int numpages) 968int set_memory_x(unsigned long addr, int numpages)
894{ 969{
895 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); 970 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
896} 971}
897EXPORT_SYMBOL(set_memory_x); 972EXPORT_SYMBOL(set_memory_x);
898 973
899int set_memory_nx(unsigned long addr, int numpages) 974int set_memory_nx(unsigned long addr, int numpages)
900{ 975{
901 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); 976 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
902} 977}
903EXPORT_SYMBOL(set_memory_nx); 978EXPORT_SYMBOL(set_memory_nx);
904 979
905int set_memory_ro(unsigned long addr, int numpages) 980int set_memory_ro(unsigned long addr, int numpages)
906{ 981{
907 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); 982 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
908} 983}
909EXPORT_SYMBOL_GPL(set_memory_ro); 984EXPORT_SYMBOL_GPL(set_memory_ro);
910 985
911int set_memory_rw(unsigned long addr, int numpages) 986int set_memory_rw(unsigned long addr, int numpages)
912{ 987{
913 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); 988 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
914} 989}
915EXPORT_SYMBOL_GPL(set_memory_rw); 990EXPORT_SYMBOL_GPL(set_memory_rw);
916 991
917int set_memory_np(unsigned long addr, int numpages) 992int set_memory_np(unsigned long addr, int numpages)
918{ 993{
919 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); 994 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
920} 995}
921 996
922int set_memory_4k(unsigned long addr, int numpages) 997int set_memory_4k(unsigned long addr, int numpages)
923{ 998{
924 return change_page_attr_set_clr(addr, numpages, __pgprot(0), 999 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
925 __pgprot(0), 1); 1000 __pgprot(0), 1, 0);
926} 1001}
927 1002
928int set_pages_uc(struct page *page, int numpages) 1003int set_pages_uc(struct page *page, int numpages)
@@ -975,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages)
975 1050
976static int __set_pages_p(struct page *page, int numpages) 1051static int __set_pages_p(struct page *page, int numpages)
977{ 1052{
978 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1053 unsigned long tempaddr = (unsigned long) page_address(page);
1054 struct cpa_data cpa = { .vaddr = &tempaddr,
979 .numpages = numpages, 1055 .numpages = numpages,
980 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1056 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
981 .mask_clr = __pgprot(0)}; 1057 .mask_clr = __pgprot(0),
1058 .flags = 0};
982 1059
983 return __change_page_attr_set_clr(&cpa, 1); 1060 /*
1061 * No alias checking needed for setting present flag. otherwise,
1062 * we may need to break large pages for 64-bit kernel text
1063 * mappings (this adds to complexity if we want to do this from
1064 * atomic context especially). Let's keep it simple!
1065 */
1066 return __change_page_attr_set_clr(&cpa, 0);
984} 1067}
985 1068
986static int __set_pages_np(struct page *page, int numpages) 1069static int __set_pages_np(struct page *page, int numpages)
987{ 1070{
988 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1071 unsigned long tempaddr = (unsigned long) page_address(page);
1072 struct cpa_data cpa = { .vaddr = &tempaddr,
989 .numpages = numpages, 1073 .numpages = numpages,
990 .mask_set = __pgprot(0), 1074 .mask_set = __pgprot(0),
991 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; 1075 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1076 .flags = 0};
992 1077
993 return __change_page_attr_set_clr(&cpa, 1); 1078 /*
1079 * No alias checking needed for setting not present flag. otherwise,
1080 * we may need to break large pages for 64-bit kernel text
1081 * mappings (this adds to complexity if we want to do this from
1082 * atomic context especially). Let's keep it simple!
1083 */
1084 return __change_page_attr_set_clr(&cpa, 0);
994} 1085}
995 1086
996void kernel_map_pages(struct page *page, int numpages, int enable) 1087void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1010,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1010 1101
1011 /* 1102 /*
1012 * The return value is ignored as the calls cannot fail. 1103 * The return value is ignored as the calls cannot fail.
1013 * Large pages are kept enabled at boot time, and are 1104 * Large pages for identity mappings are not used at boot time
1014 * split up quickly with DEBUG_PAGEALLOC. If a splitup 1105 * and hence no memory allocations during large page split.
1015 * fails here (due to temporary memory shortage) no damage
1016 * is done because we just keep the largepage intact up
1017 * to the next attempt when it will likely be split up:
1018 */ 1106 */
1019 if (enable) 1107 if (enable)
1020 __set_pages_p(page, numpages); 1108 __set_pages_p(page, numpages);
@@ -1026,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1026 * but that can deadlock->flush only current cpu: 1114 * but that can deadlock->flush only current cpu:
1027 */ 1115 */
1028 __flush_tlb_all(); 1116 __flush_tlb_all();
1029
1030 /*
1031 * Try to refill the page pool here. We can do this only after
1032 * the tlb flush.
1033 */
1034 cpa_fill_pool(NULL);
1035} 1117}
1036 1118
1037#ifdef CONFIG_DEBUG_FS
1038static int dpa_show(struct seq_file *m, void *v)
1039{
1040 seq_puts(m, "DEBUG_PAGEALLOC\n");
1041 seq_printf(m, "pool_size : %lu\n", pool_size);
1042 seq_printf(m, "pool_pages : %lu\n", pool_pages);
1043 seq_printf(m, "pool_low : %lu\n", pool_low);
1044 seq_printf(m, "pool_used : %lu\n", pool_used);
1045 seq_printf(m, "pool_failed : %lu\n", pool_failed);
1046
1047 return 0;
1048}
1049
1050static int dpa_open(struct inode *inode, struct file *filp)
1051{
1052 return single_open(filp, dpa_show, NULL);
1053}
1054
1055static const struct file_operations dpa_fops = {
1056 .open = dpa_open,
1057 .read = seq_read,
1058 .llseek = seq_lseek,
1059 .release = single_release,
1060};
1061
1062static int __init debug_pagealloc_proc_init(void)
1063{
1064 struct dentry *de;
1065
1066 de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1067 &dpa_fops);
1068 if (!de)
1069 return -ENOMEM;
1070
1071 return 0;
1072}
1073__initcall(debug_pagealloc_proc_init);
1074#endif
1075
1076#ifdef CONFIG_HIBERNATION 1119#ifdef CONFIG_HIBERNATION
1077 1120
1078bool kernel_page_present(struct page *page) 1121bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2a50e0fa64a5..738fd0f24958 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,24 +7,24 @@
7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
8 */ 8 */
9 9
10#include <linux/mm.h> 10#include <linux/seq_file.h>
11#include <linux/bootmem.h>
12#include <linux/debugfs.h>
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/mm.h>
13#include <linux/fs.h> 16#include <linux/fs.h>
14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
17 17
18#include <asm/msr.h> 18#include <asm/cacheflush.h>
19#include <asm/tlbflush.h>
20#include <asm/processor.h> 19#include <asm/processor.h>
21#include <asm/page.h> 20#include <asm/tlbflush.h>
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
23#include <asm/pat.h>
24#include <asm/e820.h>
25#include <asm/cacheflush.h>
26#include <asm/fcntl.h> 22#include <asm/fcntl.h>
23#include <asm/e820.h>
27#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/page.h>
26#include <asm/msr.h>
27#include <asm/pat.h>
28#include <asm/io.h> 28#include <asm/io.h>
29 29
30#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
@@ -46,6 +46,7 @@ early_param("nopat", nopat);
46 46
47 47
48static int debug_enable; 48static int debug_enable;
49
49static int __init pat_debug_setup(char *str) 50static int __init pat_debug_setup(char *str)
50{ 51{
51 debug_enable = 1; 52 debug_enable = 1;
@@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags)
145 */ 146 */
146 147
147struct memtype { 148struct memtype {
148 u64 start; 149 u64 start;
149 u64 end; 150 u64 end;
150 unsigned long type; 151 unsigned long type;
151 struct list_head nd; 152 struct list_head nd;
152}; 153};
153 154
154static LIST_HEAD(memtype_list); 155static LIST_HEAD(memtype_list);
155static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 156static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
156 157
157/* 158/*
158 * Does intersection of PAT memory type and MTRR memory type and returns 159 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
180 return req_type; 181 return req_type;
181} 182}
182 183
183static int chk_conflict(struct memtype *new, struct memtype *entry, 184static int
184 unsigned long *type) 185chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
185{ 186{
186 if (new->type != entry->type) { 187 if (new->type != entry->type) {
187 if (type) { 188 if (type) {
@@ -211,6 +212,66 @@ static struct memtype *cached_entry;
211static u64 cached_start; 212static u64 cached_start;
212 213
213/* 214/*
215 * For RAM pages, mark the pages as non WB memory type using
216 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
217 * set_memory_wc() on a RAM page at a time before marking it as WB again.
218 * This is ok, because only one driver will be owning the page and
219 * doing set_memory_*() calls.
220 *
221 * For now, we use PageNonWB to track that the RAM page is being mapped
222 * as non WB. In future, we will have to use one more flag
223 * (or some other mechanism in page_struct) to distinguish between
224 * UC and WC mapping.
225 */
226static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
227 unsigned long *new_type)
228{
229 struct page *page;
230 u64 pfn, end_pfn;
231
232 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
233 page = pfn_to_page(pfn);
234 if (page_mapped(page) || PageNonWB(page))
235 goto out;
236
237 SetPageNonWB(page);
238 }
239 return 0;
240
241out:
242 end_pfn = pfn;
243 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
244 page = pfn_to_page(pfn);
245 ClearPageNonWB(page);
246 }
247
248 return -EINVAL;
249}
250
251static int free_ram_pages_type(u64 start, u64 end)
252{
253 struct page *page;
254 u64 pfn, end_pfn;
255
256 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
257 page = pfn_to_page(pfn);
258 if (page_mapped(page) || !PageNonWB(page))
259 goto out;
260
261 ClearPageNonWB(page);
262 }
263 return 0;
264
265out:
266 end_pfn = pfn;
267 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
268 page = pfn_to_page(pfn);
269 SetPageNonWB(page);
270 }
271 return -EINVAL;
272}
273
274/*
214 * req_type typically has one of the: 275 * req_type typically has one of the:
215 * - _PAGE_CACHE_WB 276 * - _PAGE_CACHE_WB
216 * - _PAGE_CACHE_WC 277 * - _PAGE_CACHE_WC
@@ -226,14 +287,15 @@ static u64 cached_start;
226 * it will return a negative return value. 287 * it will return a negative return value.
227 */ 288 */
228int reserve_memtype(u64 start, u64 end, unsigned long req_type, 289int reserve_memtype(u64 start, u64 end, unsigned long req_type,
229 unsigned long *new_type) 290 unsigned long *new_type)
230{ 291{
231 struct memtype *new, *entry; 292 struct memtype *new, *entry;
232 unsigned long actual_type; 293 unsigned long actual_type;
233 struct list_head *where; 294 struct list_head *where;
295 int is_range_ram;
234 int err = 0; 296 int err = 0;
235 297
236 BUG_ON(start >= end); /* end is exclusive */ 298 BUG_ON(start >= end); /* end is exclusive */
237 299
238 if (!pat_enabled) { 300 if (!pat_enabled) {
239 /* This is identical to page table setting without PAT */ 301 /* This is identical to page table setting without PAT */
@@ -266,17 +328,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
266 actual_type = _PAGE_CACHE_WB; 328 actual_type = _PAGE_CACHE_WB;
267 else 329 else
268 actual_type = _PAGE_CACHE_UC_MINUS; 330 actual_type = _PAGE_CACHE_UC_MINUS;
269 } else 331 } else {
270 actual_type = pat_x_mtrr_type(start, end, 332 actual_type = pat_x_mtrr_type(start, end,
271 req_type & _PAGE_CACHE_MASK); 333 req_type & _PAGE_CACHE_MASK);
334 }
335
336 is_range_ram = pagerange_is_ram(start, end);
337 if (is_range_ram == 1)
338 return reserve_ram_pages_type(start, end, req_type, new_type);
339 else if (is_range_ram < 0)
340 return -EINVAL;
272 341
273 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
274 if (!new) 343 if (!new)
275 return -ENOMEM; 344 return -ENOMEM;
276 345
277 new->start = start; 346 new->start = start;
278 new->end = end; 347 new->end = end;
279 new->type = actual_type; 348 new->type = actual_type;
280 349
281 if (new_type) 350 if (new_type)
282 *new_type = actual_type; 351 *new_type = actual_type;
@@ -335,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
335 start, end, cattr_name(new->type), cattr_name(req_type)); 404 start, end, cattr_name(new->type), cattr_name(req_type));
336 kfree(new); 405 kfree(new);
337 spin_unlock(&memtype_lock); 406 spin_unlock(&memtype_lock);
407
338 return err; 408 return err;
339 } 409 }
340 410
@@ -358,6 +428,7 @@ int free_memtype(u64 start, u64 end)
358{ 428{
359 struct memtype *entry; 429 struct memtype *entry;
360 int err = -EINVAL; 430 int err = -EINVAL;
431 int is_range_ram;
361 432
362 if (!pat_enabled) 433 if (!pat_enabled)
363 return 0; 434 return 0;
@@ -366,6 +437,12 @@ int free_memtype(u64 start, u64 end)
366 if (is_ISA_range(start, end - 1)) 437 if (is_ISA_range(start, end - 1))
367 return 0; 438 return 0;
368 439
440 is_range_ram = pagerange_is_ram(start, end);
441 if (is_range_ram == 1)
442 return free_ram_pages_type(start, end);
443 else if (is_range_ram < 0)
444 return -EINVAL;
445
369 spin_lock(&memtype_lock); 446 spin_lock(&memtype_lock);
370 list_for_each_entry(entry, &memtype_list, nd) { 447 list_for_each_entry(entry, &memtype_list, nd) {
371 if (entry->start == start && entry->end == end) { 448 if (entry->start == start && entry->end == end) {
@@ -386,6 +463,7 @@ int free_memtype(u64 start, u64 end)
386 } 463 }
387 464
388 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); 465 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
466
389 return err; 467 return err;
390} 468}
391 469
@@ -492,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
492 570
493void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) 571void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
494{ 572{
573 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
495 u64 addr = (u64)pfn << PAGE_SHIFT; 574 u64 addr = (u64)pfn << PAGE_SHIFT;
496 unsigned long flags; 575 unsigned long flags;
497 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
498 576
499 reserve_memtype(addr, addr + size, want_flags, &flags); 577 reserve_memtype(addr, addr + size, want_flags, &flags);
500 if (flags != want_flags) { 578 if (flags != want_flags) {
@@ -514,7 +592,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
514 free_memtype(addr, addr + size); 592 free_memtype(addr, addr + size);
515} 593}
516 594
517#if defined(CONFIG_DEBUG_FS) 595#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
518 596
519/* get Nth element of the linked list */ 597/* get Nth element of the linked list */
520static struct memtype *memtype_get_idx(loff_t pos) 598static struct memtype *memtype_get_idx(loff_t pos)
@@ -537,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
537 } 615 }
538 spin_unlock(&memtype_lock); 616 spin_unlock(&memtype_lock);
539 kfree(print_entry); 617 kfree(print_entry);
618
540 return NULL; 619 return NULL;
541} 620}
542 621
@@ -567,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
567 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), 646 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
568 print_entry->start, print_entry->end); 647 print_entry->start, print_entry->end);
569 kfree(print_entry); 648 kfree(print_entry);
649
570 return 0; 650 return 0;
571} 651}
572 652
@@ -598,4 +678,4 @@ static int __init pat_memtype_list_init(void)
598 678
599late_initcall(pat_memtype_list_init); 679late_initcall(pat_memtype_list_init);
600 680
601#endif /* CONFIG_DEBUG_FS */ 681#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */