author     Benjamin Herrenschmidt <benh@kernel.crashing.org>   2007-05-08 02:27:27 -0400
committer  Paul Mackerras <paulus@samba.org>                   2007-05-09 02:35:00 -0400
commit     d0f13e3c20b6fb73ccb467bdca97fa7cf5a574cd (patch)
tree       a2de01a21dbb28449893102742e6b516a519c03e
parent     16f1c746755836aa823658000493cdab8ce7b098 (diff)
[POWERPC] Introduce address space "slices"
The basic issue is to be able to do what hugetlbfs does but with different page
sizes for some other special filesystems; more specifically, my need is:

 - Huge pages

 - SPE local store mappings using 64K pages on a 4K base page size kernel on
   Cell

 - Some special 4K segments in 64K-page kernels for mapping a dodgy type of
   powerpc-specific infiniband hardware that requires 4K MMU mappings for
   various reasons I won't explain here.

The main issues are:

 - To maintain/keep track of the page size per "segment" (as we can only have
   one page size per segment on powerpc, which are 256MB divisions of the
   address space).

 - To make sure special mappings stay within their allotted "segments"
   (including MAP_FIXED crap)

 - To make sure everybody else doesn't mmap/brk/grow_stack into a "segment"
   that is used for a special mapping

Some of the necessary mechanisms to handle that were present in the hugetlbfs
code, but mostly in ways not suitable for anything else.

The patch relies on some changes to the generic get_unmapped_area() that just
got merged.  It still hijacks hugetlb callbacks here or there as the generic
code hasn't been entirely cleaned up yet, but that shouldn't be a problem.

So what is a slice?  Well, I re-used the mechanism formerly used by our
hugetlbfs implementation, which divides the address space into "meta-segments"
that I called "slices".  The division is done using 256MB slices below 4G and
1T slices above, so the address space is currently divided into 16 "low"
slices and 16 "high" slices.  (Special case: high slice 0 is the area between
4G and 1T.)

Doing so significantly simplifies the tracking of segments and avoids having
to keep track of all the 256MB segments in the address space.

While I used the "concepts" of hugetlbfs, I mostly re-implemented everything
in a more generic way and "ported" hugetlbfs to it.

Slices can have an associated page size, which is encoded in the mmu context
and used by the SLB miss handler to set the segment sizes.  The hash code
currently doesn't care, it has a specific check for hugepages, though I might
add a mechanism to provide per-slice hash mapping functions in the future.

The slice code provides a pair of "generic" get_unmapped_area() functions
(bottom-up and top-down) that should work with any slice size.  There is some
trickiness here, so I would appreciate people having a look at the
implementation of these and letting me know if I got something wrong.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
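As a concrete illustration of the 16 low / 16 high split described in the
message above, here is a minimal standalone sketch that mirrors the
GET_LOW_SLICE_INDEX()/GET_HIGH_SLICE_INDEX() arithmetic the patch adds to
page_64.h; addr_to_slice() and the sample addresses are purely illustrative,
not part of the patch.

#include <stdio.h>

#define SLICE_LOW_SHIFT		28			/* 256MB slices below 4G */
#define SLICE_HIGH_SHIFT	40			/* 1T slices above 4G */
#define SLICE_LOW_TOP		0x100000000ull		/* the 4G boundary */

/* Map an effective address to its slice index, mirroring the patch's
 * GET_LOW_SLICE_INDEX()/GET_HIGH_SLICE_INDEX(); addr_to_slice() itself
 * is a made-up demo helper. */
static unsigned int addr_to_slice(unsigned long long addr, int *is_high)
{
	if (addr < SLICE_LOW_TOP) {
		*is_high = 0;
		return addr >> SLICE_LOW_SHIFT;		/* low slices 0..15 */
	}
	*is_high = 1;
	return addr >> SLICE_HIGH_SHIFT;		/* high slices; slice 0 covers 4G..1T */
}

int main(void)
{
	unsigned long long samples[] = { 0x10000000ull, 0xf0000000ull,
					 0x100000000ull, 0x20000000000ull };
	int i, high;

	for (i = 0; i < 4; i++) {
		unsigned int s = addr_to_slice(samples[i], &high);
		printf("ea=0x%llx -> %s slice %u\n",
		       samples[i], high ? "high" : "low", s);
	}
	return 0;
}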
-rw-r--r--  arch/powerpc/Kconfig                     |    5
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c        |   16
-rw-r--r--  arch/powerpc/mm/Makefile                 |    1
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c          |   20
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c            |  548
-rw-r--r--  arch/powerpc/mm/mmu_context_64.c         |   10
-rw-r--r--  arch/powerpc/mm/slb.c                    |   11
-rw-r--r--  arch/powerpc/mm/slb_low.S                |   52
-rw-r--r--  arch/powerpc/mm/slice.c                  |  633
-rw-r--r--  arch/powerpc/platforms/cell/spu_base.c   |    9
-rw-r--r--  include/asm-powerpc/mmu-hash64.h         |   11
-rw-r--r--  include/asm-powerpc/paca.h               |    2
-rw-r--r--  include/asm-powerpc/page_64.h            |   86
13 files changed, 769 insertions, 635 deletions
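Before the per-file diffs, a short standalone sketch of the page size
bookkeeping the patch introduces: the mmu context gains two u64 fields
(low_slices_psize and high_slices_psize) holding one 4-bit page size index per
slice, and get_slice_psize()/slice_convert() in the new slice.c read and
rewrite one nibble at a time. The read_psize()/write_psize() helpers below are
demo names, not kernel API.

#include <stdio.h>

/* One 4-bit page size index per slice, 16 slices per u64, the same layout
 * that get_slice_psize() and slice_convert() operate on in the patch. */
static unsigned int read_psize(unsigned long long psizes, unsigned int index)
{
	return (psizes >> (index * 4)) & 0xf;
}

static unsigned long long write_psize(unsigned long long psizes,
				      unsigned int index, unsigned int psize)
{
	psizes &= ~(0xfull << (index * 4));		/* clear the old nibble */
	return psizes | ((unsigned long long)psize << (index * 4));
}

int main(void)
{
	unsigned long long low = 0;			/* all 16 low slices at psize index 0 */

	low = write_psize(low, 3, 4);			/* pretend psize index 4 for slice 3 */
	printf("mask=%016llx slice3=%u slice2=%u\n",
	       low, read_psize(low, 3), read_psize(low, 2));
	return 0;
}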
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 808d2ef80e2f..5226f701634e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -352,6 +352,11 @@ config PPC_STD_MMU_32
352 def_bool y 352 def_bool y
353 depends on PPC_STD_MMU && PPC32 353 depends on PPC_STD_MMU && PPC32
354 354
355config PPC_MM_SLICES
356 bool
357 default y if HUGETLB_PAGE
358 default n
359
355config VIRT_CPU_ACCOUNTING 360config VIRT_CPU_ACCOUNTING
356 bool "Deterministic task and CPU time accounting" 361 bool "Deterministic task and CPU time accounting"
357 depends on PPC64 362 depends on PPC64
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8f48560b7ee2..d6803fb7b28b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -122,12 +122,18 @@ int main(void)
122 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); 122 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
123 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); 123 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
124 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); 124 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
125 DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, context.sllp));
126 DEFINE(PACAVMALLOCSLLP, offsetof(struct paca_struct, vmalloc_sllp)); 125 DEFINE(PACAVMALLOCSLLP, offsetof(struct paca_struct, vmalloc_sllp));
127#ifdef CONFIG_HUGETLB_PAGE 126#ifdef CONFIG_PPC_MM_SLICES
128 DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas)); 127 DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct,
129 DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas)); 128 context.low_slices_psize));
130#endif /* CONFIG_HUGETLB_PAGE */ 129 DEFINE(PACAHIGHSLICEPSIZE, offsetof(struct paca_struct,
130 context.high_slices_psize));
131 DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
132 DEFINE(MMUPSIZESLLP, offsetof(struct mmu_psize_def, sllp));
133#else
134 DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, context.sllp));
135
136#endif /* CONFIG_PPC_MM_SLICES */
131 DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen)); 137 DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen));
132 DEFINE(PACA_EXMC, offsetof(struct paca_struct, exmc)); 138 DEFINE(PACA_EXMC, offsetof(struct paca_struct, exmc));
133 DEFINE(PACA_EXSLB, offsetof(struct paca_struct, exslb)); 139 DEFINE(PACA_EXSLB, offsetof(struct paca_struct, exslb));
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 38a81967ca07..4f839c6a9768 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -18,4 +18,5 @@ obj-$(CONFIG_40x) += 4xx_mmu.o
18obj-$(CONFIG_44x) += 44x_mmu.o 18obj-$(CONFIG_44x) += 44x_mmu.o
19obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o 19obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o
20obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o 20obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
21obj-$(CONFIG_PPC_MM_SLICES) += slice.o
21obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 22obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 71092c2f65cd..5610ffb14211 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -51,6 +51,7 @@
51#include <asm/cputable.h> 51#include <asm/cputable.h>
52#include <asm/abs_addr.h> 52#include <asm/abs_addr.h>
53#include <asm/sections.h> 53#include <asm/sections.h>
54#include <asm/spu.h>
54 55
55#ifdef DEBUG 56#ifdef DEBUG
56#define DBG(fmt...) udbg_printf(fmt) 57#define DBG(fmt...) udbg_printf(fmt)
@@ -601,8 +602,13 @@ static void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
601{ 602{
602 if (mm->context.user_psize == MMU_PAGE_4K) 603 if (mm->context.user_psize == MMU_PAGE_4K)
603 return; 604 return;
605#ifdef CONFIG_PPC_MM_SLICES
606 slice_set_user_psize(mm, MMU_PAGE_4K);
607#else /* CONFIG_PPC_MM_SLICES */
604 mm->context.user_psize = MMU_PAGE_4K; 608 mm->context.user_psize = MMU_PAGE_4K;
605 mm->context.sllp = SLB_VSID_USER | mmu_psize_defs[MMU_PAGE_4K].sllp; 609 mm->context.sllp = SLB_VSID_USER | mmu_psize_defs[MMU_PAGE_4K].sllp;
610#endif /* CONFIG_PPC_MM_SLICES */
611
606#ifdef CONFIG_SPE_BASE 612#ifdef CONFIG_SPE_BASE
607 spu_flush_all_slbs(mm); 613 spu_flush_all_slbs(mm);
608#endif 614#endif
@@ -670,11 +676,14 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
670 if (user_region && cpus_equal(mm->cpu_vm_mask, tmp)) 676 if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
671 local = 1; 677 local = 1;
672 678
679#ifdef CONFIG_HUGETLB_PAGE
673 /* Handle hugepage regions */ 680 /* Handle hugepage regions */
674 if (unlikely(in_hugepage_area(mm->context, ea))) { 681 if (HPAGE_SHIFT &&
682 unlikely(get_slice_psize(mm, ea) == mmu_huge_psize)) {
675 DBG_LOW(" -> huge page !\n"); 683 DBG_LOW(" -> huge page !\n");
676 return hash_huge_page(mm, access, ea, vsid, local, trap); 684 return hash_huge_page(mm, access, ea, vsid, local, trap);
677 } 685 }
686#endif /* CONFIG_HUGETLB_PAGE */
678 687
679 /* Get PTE and page size from page tables */ 688 /* Get PTE and page size from page tables */
680 ptep = find_linux_pte(pgdir, ea); 689 ptep = find_linux_pte(pgdir, ea);
@@ -770,10 +779,13 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
770 unsigned long flags; 779 unsigned long flags;
771 int local = 0; 780 int local = 0;
772 781
773 /* We don't want huge pages prefaulted for now 782 BUG_ON(REGION_ID(ea) != USER_REGION_ID);
774 */ 783
775 if (unlikely(in_hugepage_area(mm->context, ea))) 784#ifdef CONFIG_PPC_MM_SLICES
785 /* We only prefault standard pages for now */
786 if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize));
776 return; 787 return;
788#endif
777 789
778 DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," 790 DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
779 " trap=%lx\n", mm, mm->pgd, ea, access, trap); 791 " trap=%lx\n", mm, mm->pgd, ea, access, trap);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index fb959264c104..92a1b16fb7e3 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -91,7 +91,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
91 pgd_t *pg; 91 pgd_t *pg;
92 pud_t *pu; 92 pud_t *pu;
93 93
94 BUG_ON(! in_hugepage_area(mm->context, addr)); 94 BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
95 95
96 addr &= HPAGE_MASK; 96 addr &= HPAGE_MASK;
97 97
@@ -119,7 +119,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
119 pud_t *pu; 119 pud_t *pu;
120 hugepd_t *hpdp = NULL; 120 hugepd_t *hpdp = NULL;
121 121
122 BUG_ON(! in_hugepage_area(mm->context, addr)); 122 BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
123 123
124 addr &= HPAGE_MASK; 124 addr &= HPAGE_MASK;
125 125
@@ -302,7 +302,7 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb,
302 start = addr; 302 start = addr;
303 pgd = pgd_offset((*tlb)->mm, addr); 303 pgd = pgd_offset((*tlb)->mm, addr);
304 do { 304 do {
305 BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr)); 305 BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
306 next = pgd_addr_end(addr, end); 306 next = pgd_addr_end(addr, end);
307 if (pgd_none_or_clear_bad(pgd)) 307 if (pgd_none_or_clear_bad(pgd))
308 continue; 308 continue;
@@ -331,203 +331,13 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
331 return __pte(old); 331 return __pte(old);
332} 332}
333 333
334struct slb_flush_info {
335 struct mm_struct *mm;
336 u16 newareas;
337};
338
339static void flush_low_segments(void *parm)
340{
341 struct slb_flush_info *fi = parm;
342 unsigned long i;
343
344 BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);
345
346 if (current->active_mm != fi->mm)
347 return;
348
349 /* Only need to do anything if this CPU is working in the same
350 * mm as the one which has changed */
351
352 /* update the paca copy of the context struct */
353 get_paca()->context = current->active_mm->context;
354
355 asm volatile("isync" : : : "memory");
356 for (i = 0; i < NUM_LOW_AREAS; i++) {
357 if (! (fi->newareas & (1U << i)))
358 continue;
359 asm volatile("slbie %0"
360 : : "r" ((i << SID_SHIFT) | SLBIE_C));
361 }
362 asm volatile("isync" : : : "memory");
363}
364
365static void flush_high_segments(void *parm)
366{
367 struct slb_flush_info *fi = parm;
368 unsigned long i, j;
369
370
371 BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);
372
373 if (current->active_mm != fi->mm)
374 return;
375
376 /* Only need to do anything if this CPU is working in the same
377 * mm as the one which has changed */
378
379 /* update the paca copy of the context struct */
380 get_paca()->context = current->active_mm->context;
381
382 asm volatile("isync" : : : "memory");
383 for (i = 0; i < NUM_HIGH_AREAS; i++) {
384 if (! (fi->newareas & (1U << i)))
385 continue;
386 for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
387 asm volatile("slbie %0"
388 :: "r" (((i << HTLB_AREA_SHIFT)
389 + (j << SID_SHIFT)) | SLBIE_C));
390 }
391 asm volatile("isync" : : : "memory");
392}
393
394static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
395{
396 unsigned long start = area << SID_SHIFT;
397 unsigned long end = (area+1) << SID_SHIFT;
398 struct vm_area_struct *vma;
399
400 BUG_ON(area >= NUM_LOW_AREAS);
401
402 /* Check no VMAs are in the region */
403 vma = find_vma(mm, start);
404 if (vma && (vma->vm_start < end))
405 return -EBUSY;
406
407 return 0;
408}
409
410static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
411{
412 unsigned long start = area << HTLB_AREA_SHIFT;
413 unsigned long end = (area+1) << HTLB_AREA_SHIFT;
414 struct vm_area_struct *vma;
415
416 BUG_ON(area >= NUM_HIGH_AREAS);
417
418 /* Hack, so that each addresses is controlled by exactly one
419 * of the high or low area bitmaps, the first high area starts
420 * at 4GB, not 0 */
421 if (start == 0)
422 start = 0x100000000UL;
423
424 /* Check no VMAs are in the region */
425 vma = find_vma(mm, start);
426 if (vma && (vma->vm_start < end))
427 return -EBUSY;
428
429 return 0;
430}
431
432static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
433{
434 unsigned long i;
435 struct slb_flush_info fi;
436
437 BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
438 BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
439
440 newareas &= ~(mm->context.low_htlb_areas);
441 if (! newareas)
442 return 0; /* The segments we want are already open */
443
444 for (i = 0; i < NUM_LOW_AREAS; i++)
445 if ((1 << i) & newareas)
446 if (prepare_low_area_for_htlb(mm, i) != 0)
447 return -EBUSY;
448
449 mm->context.low_htlb_areas |= newareas;
450
451 /* the context change must make it to memory before the flush,
452 * so that further SLB misses do the right thing. */
453 mb();
454
455 fi.mm = mm;
456 fi.newareas = newareas;
457 on_each_cpu(flush_low_segments, &fi, 0, 1);
458
459 return 0;
460}
461
462static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
463{
464 struct slb_flush_info fi;
465 unsigned long i;
466
467 BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
468 BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
469 != NUM_HIGH_AREAS);
470
471 newareas &= ~(mm->context.high_htlb_areas);
472 if (! newareas)
473 return 0; /* The areas we want are already open */
474
475 for (i = 0; i < NUM_HIGH_AREAS; i++)
476 if ((1 << i) & newareas)
477 if (prepare_high_area_for_htlb(mm, i) != 0)
478 return -EBUSY;
479
480 mm->context.high_htlb_areas |= newareas;
481
482 /* the context change must make it to memory before the flush,
483 * so that further SLB misses do the right thing. */
484 mb();
485
486 fi.mm = mm;
487 fi.newareas = newareas;
488 on_each_cpu(flush_high_segments, &fi, 0, 1);
489
490 return 0;
491}
492
493int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff)
494{
495 int err = 0;
496
497 if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT))
498 return -EINVAL;
499 if (len & ~HPAGE_MASK)
500 return -EINVAL;
501 if (addr & ~HPAGE_MASK)
502 return -EINVAL;
503
504 if (addr < 0x100000000UL)
505 err = open_low_hpage_areas(current->mm,
506 LOW_ESID_MASK(addr, len));
507 if ((addr + len) > 0x100000000UL)
508 err = open_high_hpage_areas(current->mm,
509 HTLB_AREA_MASK(addr, len));
510#ifdef CONFIG_SPE_BASE
511 spu_flush_all_slbs(current->mm);
512#endif
513 if (err) {
514 printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
515 " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
516 addr, len,
517 LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
518 return err;
519 }
520
521 return 0;
522}
523
524struct page * 334struct page *
525follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 335follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
526{ 336{
527 pte_t *ptep; 337 pte_t *ptep;
528 struct page *page; 338 struct page *page;
529 339
530 if (! in_hugepage_area(mm->context, address)) 340 if (get_slice_psize(mm, address) != mmu_huge_psize)
531 return ERR_PTR(-EINVAL); 341 return ERR_PTR(-EINVAL);
532 342
533 ptep = huge_pte_offset(mm, address); 343 ptep = huge_pte_offset(mm, address);
@@ -551,359 +361,13 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
551 return NULL; 361 return NULL;
552} 362}
553 363
554/* Because we have an exclusive hugepage region which lies within the
555 * normal user address space, we have to take special measures to make
556 * non-huge mmap()s evade the hugepage reserved regions. */
557unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
558 unsigned long len, unsigned long pgoff,
559 unsigned long flags)
560{
561 struct mm_struct *mm = current->mm;
562 struct vm_area_struct *vma;
563 unsigned long start_addr;
564
565 if (len > TASK_SIZE)
566 return -ENOMEM;
567
568 /* handle fixed mapping: prevent overlap with huge pages */
569 if (flags & MAP_FIXED) {
570 if (is_hugepage_only_range(mm, addr, len))
571 return -EINVAL;
572 return addr;
573 }
574
575 if (addr) {
576 addr = PAGE_ALIGN(addr);
577 vma = find_vma(mm, addr);
578 if (((TASK_SIZE - len) >= addr)
579 && (!vma || (addr+len) <= vma->vm_start)
580 && !is_hugepage_only_range(mm, addr,len))
581 return addr;
582 }
583 if (len > mm->cached_hole_size) {
584 start_addr = addr = mm->free_area_cache;
585 } else {
586 start_addr = addr = TASK_UNMAPPED_BASE;
587 mm->cached_hole_size = 0;
588 }
589
590full_search:
591 vma = find_vma(mm, addr);
592 while (TASK_SIZE - len >= addr) {
593 BUG_ON(vma && (addr >= vma->vm_end));
594
595 if (touches_hugepage_low_range(mm, addr, len)) {
596 addr = ALIGN(addr+1, 1<<SID_SHIFT);
597 vma = find_vma(mm, addr);
598 continue;
599 }
600 if (touches_hugepage_high_range(mm, addr, len)) {
601 addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
602 vma = find_vma(mm, addr);
603 continue;
604 }
605 if (!vma || addr + len <= vma->vm_start) {
606 /*
607 * Remember the place where we stopped the search:
608 */
609 mm->free_area_cache = addr + len;
610 return addr;
611 }
612 if (addr + mm->cached_hole_size < vma->vm_start)
613 mm->cached_hole_size = vma->vm_start - addr;
614 addr = vma->vm_end;
615 vma = vma->vm_next;
616 }
617
618 /* Make sure we didn't miss any holes */
619 if (start_addr != TASK_UNMAPPED_BASE) {
620 start_addr = addr = TASK_UNMAPPED_BASE;
621 mm->cached_hole_size = 0;
622 goto full_search;
623 }
624 return -ENOMEM;
625}
626
627/*
628 * This mmap-allocator allocates new areas top-down from below the
629 * stack's low limit (the base):
630 *
631 * Because we have an exclusive hugepage region which lies within the
632 * normal user address space, we have to take special measures to make
633 * non-huge mmap()s evade the hugepage reserved regions.
634 */
635unsigned long
636arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
637 const unsigned long len, const unsigned long pgoff,
638 const unsigned long flags)
639{
640 struct vm_area_struct *vma, *prev_vma;
641 struct mm_struct *mm = current->mm;
642 unsigned long base = mm->mmap_base, addr = addr0;
643 unsigned long largest_hole = mm->cached_hole_size;
644 int first_time = 1;
645
646 /* requested length too big for entire address space */
647 if (len > TASK_SIZE)
648 return -ENOMEM;
649
650 /* handle fixed mapping: prevent overlap with huge pages */
651 if (flags & MAP_FIXED) {
652 if (is_hugepage_only_range(mm, addr, len))
653 return -EINVAL;
654 return addr;
655 }
656
657 /* dont allow allocations above current base */
658 if (mm->free_area_cache > base)
659 mm->free_area_cache = base;
660
661 /* requesting a specific address */
662 if (addr) {
663 addr = PAGE_ALIGN(addr);
664 vma = find_vma(mm, addr);
665 if (TASK_SIZE - len >= addr &&
666 (!vma || addr + len <= vma->vm_start)
667 && !is_hugepage_only_range(mm, addr,len))
668 return addr;
669 }
670
671 if (len <= largest_hole) {
672 largest_hole = 0;
673 mm->free_area_cache = base;
674 }
675try_again:
676 /* make sure it can fit in the remaining address space */
677 if (mm->free_area_cache < len)
678 goto fail;
679
680 /* either no address requested or cant fit in requested address hole */
681 addr = (mm->free_area_cache - len) & PAGE_MASK;
682 do {
683hugepage_recheck:
684 if (touches_hugepage_low_range(mm, addr, len)) {
685 addr = (addr & ((~0) << SID_SHIFT)) - len;
686 goto hugepage_recheck;
687 } else if (touches_hugepage_high_range(mm, addr, len)) {
688 addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
689 goto hugepage_recheck;
690 }
691
692 /*
693 * Lookup failure means no vma is above this address,
694 * i.e. return with success:
695 */
696 if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
697 return addr;
698
699 /*
700 * new region fits between prev_vma->vm_end and
701 * vma->vm_start, use it:
702 */
703 if (addr+len <= vma->vm_start &&
704 (!prev_vma || (addr >= prev_vma->vm_end))) {
705 /* remember the address as a hint for next time */
706 mm->cached_hole_size = largest_hole;
707 return (mm->free_area_cache = addr);
708 } else {
709 /* pull free_area_cache down to the first hole */
710 if (mm->free_area_cache == vma->vm_end) {
711 mm->free_area_cache = vma->vm_start;
712 mm->cached_hole_size = largest_hole;
713 }
714 }
715
716 /* remember the largest hole we saw so far */
717 if (addr + largest_hole < vma->vm_start)
718 largest_hole = vma->vm_start - addr;
719
720 /* try just below the current vma->vm_start */
721 addr = vma->vm_start-len;
722 } while (len <= vma->vm_start);
723
724fail:
725 /*
726 * if hint left us with no space for the requested
727 * mapping then try again:
728 */
729 if (first_time) {
730 mm->free_area_cache = base;
731 largest_hole = 0;
732 first_time = 0;
733 goto try_again;
734 }
735 /*
736 * A failed mmap() very likely causes application failure,
737 * so fall back to the bottom-up function here. This scenario
738 * can happen with large stack limits and large mmap()
739 * allocations.
740 */
741 mm->free_area_cache = TASK_UNMAPPED_BASE;
742 mm->cached_hole_size = ~0UL;
743 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
744 /*
745 * Restore the topdown base:
746 */
747 mm->free_area_cache = base;
748 mm->cached_hole_size = ~0UL;
749
750 return addr;
751}
752
753static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
754{
755 struct vm_area_struct *vma;
756
757 vma = find_vma(current->mm, addr);
758 if (TASK_SIZE - len >= addr &&
759 (!vma || ((addr + len) <= vma->vm_start)))
760 return 0;
761
762 return -ENOMEM;
763}
764
765static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
766{
767 unsigned long addr = 0;
768 struct vm_area_struct *vma;
769
770 vma = find_vma(current->mm, addr);
771 while (addr + len <= 0x100000000UL) {
772 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
773
774 if (! __within_hugepage_low_range(addr, len, segmask)) {
775 addr = ALIGN(addr+1, 1<<SID_SHIFT);
776 vma = find_vma(current->mm, addr);
777 continue;
778 }
779
780 if (!vma || (addr + len) <= vma->vm_start)
781 return addr;
782 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
783 /* Depending on segmask this might not be a confirmed
784 * hugepage region, so the ALIGN could have skipped
785 * some VMAs */
786 vma = find_vma(current->mm, addr);
787 }
788
789 return -ENOMEM;
790}
791
792static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
793{
794 unsigned long addr = 0x100000000UL;
795 struct vm_area_struct *vma;
796
797 vma = find_vma(current->mm, addr);
798 while (addr + len <= TASK_SIZE_USER64) {
799 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
800
801 if (! __within_hugepage_high_range(addr, len, areamask)) {
802 addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
803 vma = find_vma(current->mm, addr);
804 continue;
805 }
806
807 if (!vma || (addr + len) <= vma->vm_start)
808 return addr;
809 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
810 /* Depending on segmask this might not be a confirmed
811 * hugepage region, so the ALIGN could have skipped
812 * some VMAs */
813 vma = find_vma(current->mm, addr);
814 }
815
816 return -ENOMEM;
817}
818 364
819unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 365unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
820 unsigned long len, unsigned long pgoff, 366 unsigned long len, unsigned long pgoff,
821 unsigned long flags) 367 unsigned long flags)
822{ 368{
823 int lastshift; 369 return slice_get_unmapped_area(addr, len, flags,
824 u16 areamask, curareas; 370 mmu_huge_psize, 1, 0);
825
826 if (HPAGE_SHIFT == 0)
827 return -EINVAL;
828 if (len & ~HPAGE_MASK)
829 return -EINVAL;
830 if (len > TASK_SIZE)
831 return -ENOMEM;
832
833 if (!cpu_has_feature(CPU_FTR_16M_PAGE))
834 return -EINVAL;
835
836 /* Paranoia, caller should have dealt with this */
837 BUG_ON((addr + len) < addr);
838
839 /* Handle MAP_FIXED */
840 if (flags & MAP_FIXED) {
841 if (prepare_hugepage_range(addr, len, pgoff))
842 return -EINVAL;
843 return addr;
844 }
845
846 if (test_thread_flag(TIF_32BIT)) {
847 curareas = current->mm->context.low_htlb_areas;
848
849 /* First see if we can use the hint address */
850 if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
851 areamask = LOW_ESID_MASK(addr, len);
852 if (open_low_hpage_areas(current->mm, areamask) == 0)
853 return addr;
854 }
855
856 /* Next see if we can map in the existing low areas */
857 addr = htlb_get_low_area(len, curareas);
858 if (addr != -ENOMEM)
859 return addr;
860
861 /* Finally go looking for areas to open */
862 lastshift = 0;
863 for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
864 ! lastshift; areamask >>=1) {
865 if (areamask & 1)
866 lastshift = 1;
867
868 addr = htlb_get_low_area(len, curareas | areamask);
869 if ((addr != -ENOMEM)
870 && open_low_hpage_areas(current->mm, areamask) == 0)
871 return addr;
872 }
873 } else {
874 curareas = current->mm->context.high_htlb_areas;
875
876 /* First see if we can use the hint address */
877 /* We discourage 64-bit processes from doing hugepage
878 * mappings below 4GB (must use MAP_FIXED) */
879 if ((addr >= 0x100000000UL)
880 && (htlb_check_hinted_area(addr, len) == 0)) {
881 areamask = HTLB_AREA_MASK(addr, len);
882 if (open_high_hpage_areas(current->mm, areamask) == 0)
883 return addr;
884 }
885
886 /* Next see if we can map in the existing high areas */
887 addr = htlb_get_high_area(len, curareas);
888 if (addr != -ENOMEM)
889 return addr;
890
891 /* Finally go looking for areas to open */
892 lastshift = 0;
893 for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
894 ! lastshift; areamask >>=1) {
895 if (areamask & 1)
896 lastshift = 1;
897
898 addr = htlb_get_high_area(len, curareas | areamask);
899 if ((addr != -ENOMEM)
900 && open_high_hpage_areas(current->mm, areamask) == 0)
901 return addr;
902 }
903 }
904 printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
905 " enough areas\n");
906 return -ENOMEM;
907} 371}
908 372
909/* 373/*
diff --git a/arch/powerpc/mm/mmu_context_64.c b/arch/powerpc/mm/mmu_context_64.c
index 90a06ac02d5e..7a78cdc0515a 100644
--- a/arch/powerpc/mm/mmu_context_64.c
+++ b/arch/powerpc/mm/mmu_context_64.c
@@ -28,6 +28,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
28{ 28{
29 int index; 29 int index;
30 int err; 30 int err;
31 int new_context = (mm->context.id == 0);
31 32
32again: 33again:
33 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) 34 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
@@ -50,9 +51,18 @@ again:
50 } 51 }
51 52
52 mm->context.id = index; 53 mm->context.id = index;
54#ifdef CONFIG_PPC_MM_SLICES
55 /* The old code would re-promote on fork, we don't do that
56 * when using slices as it could cause problem promoting slices
57 * that have been forced down to 4K
58 */
59 if (new_context)
60 slice_set_user_psize(mm, mmu_virtual_psize);
61#else
53 mm->context.user_psize = mmu_virtual_psize; 62 mm->context.user_psize = mmu_virtual_psize;
54 mm->context.sllp = SLB_VSID_USER | 63 mm->context.sllp = SLB_VSID_USER |
55 mmu_psize_defs[mmu_virtual_psize].sllp; 64 mmu_psize_defs[mmu_virtual_psize].sllp;
65#endif
56 66
57 return 0; 67 return 0;
58} 68}
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 224e960650a0..304375a73574 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -198,12 +198,6 @@ void slb_initialize(void)
198 static int slb_encoding_inited; 198 static int slb_encoding_inited;
199 extern unsigned int *slb_miss_kernel_load_linear; 199 extern unsigned int *slb_miss_kernel_load_linear;
200 extern unsigned int *slb_miss_kernel_load_io; 200 extern unsigned int *slb_miss_kernel_load_io;
201#ifdef CONFIG_HUGETLB_PAGE
202 extern unsigned int *slb_miss_user_load_huge;
203 unsigned long huge_llp;
204
205 huge_llp = mmu_psize_defs[mmu_huge_psize].sllp;
206#endif
207 201
208 /* Prepare our SLB miss handler based on our page size */ 202 /* Prepare our SLB miss handler based on our page size */
209 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; 203 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
@@ -220,11 +214,6 @@ void slb_initialize(void)
220 214
221 DBG("SLB: linear LLP = %04x\n", linear_llp); 215 DBG("SLB: linear LLP = %04x\n", linear_llp);
222 DBG("SLB: io LLP = %04x\n", io_llp); 216 DBG("SLB: io LLP = %04x\n", io_llp);
223#ifdef CONFIG_HUGETLB_PAGE
224 patch_slb_encoding(slb_miss_user_load_huge,
225 SLB_VSID_USER | huge_llp);
226 DBG("SLB: huge LLP = %04x\n", huge_llp);
227#endif
228 } 217 }
229 218
230 get_paca()->stab_rr = SLB_NUM_BOLTED; 219 get_paca()->stab_rr = SLB_NUM_BOLTED;
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index b10e4707d7c1..cd1a93d4948c 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -82,31 +82,45 @@ _GLOBAL(slb_miss_kernel_load_io)
82 srdi. r9,r10,USER_ESID_BITS 82 srdi. r9,r10,USER_ESID_BITS
83 bne- 8f /* invalid ea bits set */ 83 bne- 8f /* invalid ea bits set */
84 84
85 /* Figure out if the segment contains huge pages */ 85
86#ifdef CONFIG_HUGETLB_PAGE 86 /* when using slices, we extract the psize off the slice bitmaps
87BEGIN_FTR_SECTION 87 * and then we need to get the sllp encoding off the mmu_psize_defs
88 b 1f 88 * array.
89END_FTR_SECTION_IFCLR(CPU_FTR_16M_PAGE) 89 *
90 * XXX This is a bit inefficient especially for the normal case,
91 * so we should try to implement a fast path for the standard page
92 * size using the old sllp value so we avoid the array. We cannot
93 * really do dynamic patching unfortunately as processes might flip
94 * between 4k and 64k standard page size
95 */
96#ifdef CONFIG_PPC_MM_SLICES
90 cmpldi r10,16 97 cmpldi r10,16
91 98
92 lhz r9,PACALOWHTLBAREAS(r13) 99 /* Get the slice index * 4 in r11 and matching slice size mask in r9 */
93 mr r11,r10 100 ld r9,PACALOWSLICESPSIZE(r13)
101 sldi r11,r10,2
94 blt 5f 102 blt 5f
103 ld r9,PACAHIGHSLICEPSIZE(r13)
104 srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2)
105 andi. r11,r11,0x3c
95 106
96 lhz r9,PACAHIGHHTLBAREAS(r13) 1075: /* Extract the psize and multiply to get an array offset */
97 srdi r11,r10,(HTLB_AREA_SHIFT-SID_SHIFT) 108 srd r9,r9,r11
98 109 andi. r9,r9,0xf
995: srd r9,r9,r11 110 mulli r9,r9,MMUPSIZEDEFSIZE
100 andi. r9,r9,1
101 beq 1f
102_GLOBAL(slb_miss_user_load_huge)
103 li r11,0
104 b 2f
1051:
106#endif /* CONFIG_HUGETLB_PAGE */
107 111
112 /* Now get to the array and obtain the sllp
113 */
114 ld r11,PACATOC(r13)
115 ld r11,mmu_psize_defs@got(r11)
116 add r11,r11,r9
117 ld r11,MMUPSIZESLLP(r11)
118 ori r11,r11,SLB_VSID_USER
119#else
120 /* paca context sllp already contains the SLB_VSID_USER bits */
108 lhz r11,PACACONTEXTSLLP(r13) 121 lhz r11,PACACONTEXTSLLP(r13)
1092: 122#endif /* CONFIG_PPC_MM_SLICES */
123
110 ld r9,PACACONTEXTID(r13) 124 ld r9,PACACONTEXTID(r13)
111 rldimi r10,r9,USER_ESID_BITS,0 125 rldimi r10,r9,USER_ESID_BITS,0
112 b slb_finish_load 126 b slb_finish_load
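In C terms, the new CONFIG_PPC_MM_SLICES path in the SLB miss handler above
picks the faulting slice's 4-bit page size index and then fetches the matching
segment encoding (sllp) from the mmu_psize_defs[] array. The standalone sketch
below mimics that lookup; the structure layout, table contents and
SLB_VSID_USER value are mock values for the demo, while the real definitions
live in mmu-hash64.h and slb_low.S.

#include <stdio.h>

#define SLB_VSID_USER	0x400ULL			/* mock value, demo only */

struct mmu_psize_def {					/* only the field the demo needs */
	unsigned long long sllp;
};

static struct mmu_psize_def mmu_psize_defs[16] = {
	[2] = { .sllp = 0x000 },			/* pretend index 2 = 4K pages  */
	[4] = { .sllp = 0x110 },			/* pretend index 4 = 64K pages */
};

/* Pick the 4-bit psize of a slice, then look up its SLB encoding and OR in
 * the user bits; a C-level equivalent of the new assembly path. */
static unsigned long long sllp_for_slice(unsigned long long slices_psize,
					 unsigned int slice_index)
{
	unsigned int psize = (slices_psize >> (slice_index * 4)) & 0xf;

	return mmu_psize_defs[psize].sllp | SLB_VSID_USER;
}

int main(void)
{
	unsigned long long low_slices_psize = 0x2222222222224222ULL;	/* slice 3 uses psize index 4 */

	printf("slice 3 sllp=0x%llx, slice 0 sllp=0x%llx\n",
	       sllp_for_slice(low_slices_psize, 3),
	       sllp_for_slice(low_slices_psize, 0));
	return 0;
}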
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
new file mode 100644
index 000000000000..f833dba2a028
--- /dev/null
+++ b/arch/powerpc/mm/slice.c
@@ -0,0 +1,633 @@
1/*
2 * address space "slices" (meta-segments) support
3 *
4 * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation.
5 *
6 * Based on hugetlb implementation
7 *
8 * Copyright (C) 2003 David Gibson, IBM Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25#undef DEBUG
26
27#include <linux/kernel.h>
28#include <linux/mm.h>
29#include <linux/pagemap.h>
30#include <linux/err.h>
31#include <linux/spinlock.h>
32#include <linux/module.h>
33#include <asm/mman.h>
34#include <asm/mmu.h>
35#include <asm/spu.h>
36
37static spinlock_t slice_convert_lock = SPIN_LOCK_UNLOCKED;
38
39
40#ifdef DEBUG
41int _slice_debug = 1;
42
43static void slice_print_mask(const char *label, struct slice_mask mask)
44{
45 char *p, buf[16 + 3 + 16 + 1];
46 int i;
47
48 if (!_slice_debug)
49 return;
50 p = buf;
51 for (i = 0; i < SLICE_NUM_LOW; i++)
52 *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
53 *(p++) = ' ';
54 *(p++) = '-';
55 *(p++) = ' ';
56 for (i = 0; i < SLICE_NUM_HIGH; i++)
57 *(p++) = (mask.high_slices & (1 << i)) ? '1' : '0';
58 *(p++) = 0;
59
60 printk(KERN_DEBUG "%s:%s\n", label, buf);
61}
62
63#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0)
64
65#else
66
67static void slice_print_mask(const char *label, struct slice_mask mask) {}
68#define slice_dbg(fmt...)
69
70#endif
71
72static struct slice_mask slice_range_to_mask(unsigned long start,
73 unsigned long len)
74{
75 unsigned long end = start + len - 1;
76 struct slice_mask ret = { 0, 0 };
77
78 if (start < SLICE_LOW_TOP) {
79 unsigned long mend = min(end, SLICE_LOW_TOP);
80 unsigned long mstart = min(start, SLICE_LOW_TOP);
81
82 ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
83 - (1u << GET_LOW_SLICE_INDEX(mstart));
84 }
85
86 if ((start + len) > SLICE_LOW_TOP)
87 ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1))
88 - (1u << GET_HIGH_SLICE_INDEX(start));
89
90 return ret;
91}
92
93static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
94 unsigned long len)
95{
96 struct vm_area_struct *vma;
97
98 if ((mm->task_size - len) < addr)
99 return 0;
100 vma = find_vma(mm, addr);
101 return (!vma || (addr + len) <= vma->vm_start);
102}
103
104static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice)
105{
106 return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT,
107 1ul << SLICE_LOW_SHIFT);
108}
109
110static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
111{
112 unsigned long start = slice << SLICE_HIGH_SHIFT;
113 unsigned long end = start + (1ul << SLICE_HIGH_SHIFT);
114
115 /* Hack, so that each addresses is controlled by exactly one
116 * of the high or low area bitmaps, the first high area starts
117 * at 4GB, not 0 */
118 if (start == 0)
119 start = SLICE_LOW_TOP;
120
121 return !slice_area_is_free(mm, start, end - start);
122}
123
124static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
125{
126 struct slice_mask ret = { 0, 0 };
127 unsigned long i;
128
129 for (i = 0; i < SLICE_NUM_LOW; i++)
130 if (!slice_low_has_vma(mm, i))
131 ret.low_slices |= 1u << i;
132
133 if (mm->task_size <= SLICE_LOW_TOP)
134 return ret;
135
136 for (i = 0; i < SLICE_NUM_HIGH; i++)
137 if (!slice_high_has_vma(mm, i))
138 ret.high_slices |= 1u << i;
139
140 return ret;
141}
142
143static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
144{
145 struct slice_mask ret = { 0, 0 };
146 unsigned long i;
147 u64 psizes;
148
149 psizes = mm->context.low_slices_psize;
150 for (i = 0; i < SLICE_NUM_LOW; i++)
151 if (((psizes >> (i * 4)) & 0xf) == psize)
152 ret.low_slices |= 1u << i;
153
154 psizes = mm->context.high_slices_psize;
155 for (i = 0; i < SLICE_NUM_HIGH; i++)
156 if (((psizes >> (i * 4)) & 0xf) == psize)
157 ret.high_slices |= 1u << i;
158
159 return ret;
160}
161
162static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
163{
164 return (mask.low_slices & available.low_slices) == mask.low_slices &&
165 (mask.high_slices & available.high_slices) == mask.high_slices;
166}
167
168static void slice_flush_segments(void *parm)
169{
170 struct mm_struct *mm = parm;
171 unsigned long flags;
172
173 if (mm != current->active_mm)
174 return;
175
176 /* update the paca copy of the context struct */
177 get_paca()->context = current->active_mm->context;
178
179 local_irq_save(flags);
180 slb_flush_and_rebolt();
181 local_irq_restore(flags);
182}
183
184static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
185{
186 /* Write the new slice psize bits */
187 u64 lpsizes, hpsizes;
188 unsigned long i, flags;
189
190 slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
191 slice_print_mask(" mask", mask);
192
193 /* We need to use a spinlock here to protect against
194 * concurrent 64k -> 4k demotion ...
195 */
196 spin_lock_irqsave(&slice_convert_lock, flags);
197
198 lpsizes = mm->context.low_slices_psize;
199 for (i = 0; i < SLICE_NUM_LOW; i++)
200 if (mask.low_slices & (1u << i))
201 lpsizes = (lpsizes & ~(0xful << (i * 4))) |
202 (((unsigned long)psize) << (i * 4));
203
204 hpsizes = mm->context.high_slices_psize;
205 for (i = 0; i < SLICE_NUM_HIGH; i++)
206 if (mask.high_slices & (1u << i))
207 hpsizes = (hpsizes & ~(0xful << (i * 4))) |
208 (((unsigned long)psize) << (i * 4));
209
210 mm->context.low_slices_psize = lpsizes;
211 mm->context.high_slices_psize = hpsizes;
212
213 slice_dbg(" lsps=%lx, hsps=%lx\n",
214 mm->context.low_slices_psize,
215 mm->context.high_slices_psize);
216
217 spin_unlock_irqrestore(&slice_convert_lock, flags);
218 mb();
219
220 /* XXX this is sub-optimal but will do for now */
221 on_each_cpu(slice_flush_segments, mm, 0, 1);
222#ifdef CONFIG_SPU_BASE
223 spu_flush_all_slbs(mm);
224#endif
225}
226
227static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
228 unsigned long len,
229 struct slice_mask available,
230 int psize, int use_cache)
231{
232 struct vm_area_struct *vma;
233 unsigned long start_addr, addr;
234 struct slice_mask mask;
235 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
236
237 if (use_cache) {
238 if (len <= mm->cached_hole_size) {
239 start_addr = addr = TASK_UNMAPPED_BASE;
240 mm->cached_hole_size = 0;
241 } else
242 start_addr = addr = mm->free_area_cache;
243 } else
244 start_addr = addr = TASK_UNMAPPED_BASE;
245
246full_search:
247 for (;;) {
248 addr = _ALIGN_UP(addr, 1ul << pshift);
249 if ((TASK_SIZE - len) < addr)
250 break;
251 vma = find_vma(mm, addr);
252 BUG_ON(vma && (addr >= vma->vm_end));
253
254 mask = slice_range_to_mask(addr, len);
255 if (!slice_check_fit(mask, available)) {
256 if (addr < SLICE_LOW_TOP)
257 addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT);
258 else
259 addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT);
260 continue;
261 }
262 if (!vma || addr + len <= vma->vm_start) {
263 /*
264 * Remember the place where we stopped the search:
265 */
266 if (use_cache)
267 mm->free_area_cache = addr + len;
268 return addr;
269 }
270 if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
271 mm->cached_hole_size = vma->vm_start - addr;
272 addr = vma->vm_end;
273 }
274
275 /* Make sure we didn't miss any holes */
276 if (use_cache && start_addr != TASK_UNMAPPED_BASE) {
277 start_addr = addr = TASK_UNMAPPED_BASE;
278 mm->cached_hole_size = 0;
279 goto full_search;
280 }
281 return -ENOMEM;
282}
283
284static unsigned long slice_find_area_topdown(struct mm_struct *mm,
285 unsigned long len,
286 struct slice_mask available,
287 int psize, int use_cache)
288{
289 struct vm_area_struct *vma;
290 unsigned long addr;
291 struct slice_mask mask;
292 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
293
294 /* check if free_area_cache is useful for us */
295 if (use_cache) {
296 if (len <= mm->cached_hole_size) {
297 mm->cached_hole_size = 0;
298 mm->free_area_cache = mm->mmap_base;
299 }
300
301 /* either no address requested or can't fit in requested
302 * address hole
303 */
304 addr = mm->free_area_cache;
305
306 /* make sure it can fit in the remaining address space */
307 if (addr > len) {
308 addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
309 mask = slice_range_to_mask(addr, len);
310 if (slice_check_fit(mask, available) &&
311 slice_area_is_free(mm, addr, len))
312 /* remember the address as a hint for
313 * next time
314 */
315 return (mm->free_area_cache = addr);
316 }
317 }
318
319 addr = mm->mmap_base;
320 while (addr > len) {
321 /* Go down by chunk size */
322 addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
323
324 /* Check for hit with different page size */
325 mask = slice_range_to_mask(addr, len);
326 if (!slice_check_fit(mask, available)) {
327 if (addr < SLICE_LOW_TOP)
328 addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
329 else if (addr < (1ul << SLICE_HIGH_SHIFT))
330 addr = SLICE_LOW_TOP;
331 else
332 addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
333 continue;
334 }
335
336 /*
337 * Lookup failure means no vma is above this address,
338 * else if new region fits below vma->vm_start,
339 * return with success:
340 */
341 vma = find_vma(mm, addr);
342 if (!vma || (addr + len) <= vma->vm_start) {
343 /* remember the address as a hint for next time */
344 if (use_cache)
345 mm->free_area_cache = addr;
346 return addr;
347 }
348
349 /* remember the largest hole we saw so far */
350 if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
351 mm->cached_hole_size = vma->vm_start - addr;
352
353 /* try just below the current vma->vm_start */
354 addr = vma->vm_start;
355 }
356
357 /*
358 * A failed mmap() very likely causes application failure,
359 * so fall back to the bottom-up function here. This scenario
360 * can happen with large stack limits and large mmap()
361 * allocations.
362 */
363 addr = slice_find_area_bottomup(mm, len, available, psize, 0);
364
365 /*
366 * Restore the topdown base:
367 */
368 if (use_cache) {
369 mm->free_area_cache = mm->mmap_base;
370 mm->cached_hole_size = ~0UL;
371 }
372
373 return addr;
374}
375
376
377static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
378 struct slice_mask mask, int psize,
379 int topdown, int use_cache)
380{
381 if (topdown)
382 return slice_find_area_topdown(mm, len, mask, psize, use_cache);
383 else
384 return slice_find_area_bottomup(mm, len, mask, psize, use_cache);
385}
386
387unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
388 unsigned long flags, unsigned int psize,
389 int topdown, int use_cache)
390{
391 struct slice_mask mask;
392 struct slice_mask good_mask;
393 struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
394 int pmask_set = 0;
395 int fixed = (flags & MAP_FIXED);
396 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
397 struct mm_struct *mm = current->mm;
398
399 /* Sanity checks */
400 BUG_ON(mm->task_size == 0);
401
402 slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
403 slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n",
404 addr, len, flags, topdown, use_cache);
405
406 if (len > mm->task_size)
407 return -ENOMEM;
408 if (fixed && (addr & ((1ul << pshift) - 1)))
409 return -EINVAL;
410 if (fixed && addr > (mm->task_size - len))
411 return -EINVAL;
412
413 /* If hint, make sure it matches our alignment restrictions */
414 if (!fixed && addr) {
415 addr = _ALIGN_UP(addr, 1ul << pshift);
416 slice_dbg(" aligned addr=%lx\n", addr);
417 }
418
419 /* First makeup a "good" mask of slices that have the right size
420 * already
421 */
422 good_mask = slice_mask_for_size(mm, psize);
423 slice_print_mask(" good_mask", good_mask);
424
425 /* First check hint if it's valid or if we have MAP_FIXED */
426 if ((addr != 0 || fixed) && (mm->task_size - len) >= addr) {
427
428 /* Don't bother with hint if it overlaps a VMA */
429 if (!fixed && !slice_area_is_free(mm, addr, len))
430 goto search;
431
432 /* Build a mask for the requested range */
433 mask = slice_range_to_mask(addr, len);
434 slice_print_mask(" mask", mask);
435
436 /* Check if we fit in the good mask. If we do, we just return,
437 * nothing else to do
438 */
439 if (slice_check_fit(mask, good_mask)) {
440 slice_dbg(" fits good !\n");
441 return addr;
442 }
443
444 /* We don't fit in the good mask, check what other slices are
445 * empty and thus can be converted
446 */
447 potential_mask = slice_mask_for_free(mm);
448 potential_mask.low_slices |= good_mask.low_slices;
449 potential_mask.high_slices |= good_mask.high_slices;
450 pmask_set = 1;
451 slice_print_mask(" potential", potential_mask);
452 if (slice_check_fit(mask, potential_mask)) {
453 slice_dbg(" fits potential !\n");
454 goto convert;
455 }
456 }
457
458 /* If we have MAP_FIXED and failed the above step, then error out */
459 if (fixed)
460 return -EBUSY;
461
462 search:
463 slice_dbg(" search...\n");
464
465 /* Now let's see if we can find something in the existing slices
466 * for that size
467 */
468 addr = slice_find_area(mm, len, good_mask, psize, topdown, use_cache);
469 if (addr != -ENOMEM) {
470 /* Found within the good mask, we don't have to setup,
471 * we thus return directly
472 */
473 slice_dbg(" found area at 0x%lx\n", addr);
474 return addr;
475 }
476
477 /* Won't fit, check what can be converted */
478 if (!pmask_set) {
479 potential_mask = slice_mask_for_free(mm);
480 potential_mask.low_slices |= good_mask.low_slices;
481 potential_mask.high_slices |= good_mask.high_slices;
482 pmask_set = 1;
483 slice_print_mask(" potential", potential_mask);
484 }
485
486 /* Now let's see if we can find something in the existing slices
487 * for that size
488 */
489 addr = slice_find_area(mm, len, potential_mask, psize, topdown,
490 use_cache);
491 if (addr == -ENOMEM)
492 return -ENOMEM;
493
494 mask = slice_range_to_mask(addr, len);
495 slice_dbg(" found potential area at 0x%lx\n", addr);
496 slice_print_mask(" mask", mask);
497
498 convert:
499 slice_convert(mm, mask, psize);
500 return addr;
501
502}
503EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
504
505unsigned long arch_get_unmapped_area(struct file *filp,
506 unsigned long addr,
507 unsigned long len,
508 unsigned long pgoff,
509 unsigned long flags)
510{
511 return slice_get_unmapped_area(addr, len, flags,
512 current->mm->context.user_psize,
513 0, 1);
514}
515
516unsigned long arch_get_unmapped_area_topdown(struct file *filp,
517 const unsigned long addr0,
518 const unsigned long len,
519 const unsigned long pgoff,
520 const unsigned long flags)
521{
522 return slice_get_unmapped_area(addr0, len, flags,
523 current->mm->context.user_psize,
524 1, 1);
525}
526
527unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
528{
529 u64 psizes;
530 int index;
531
532 if (addr < SLICE_LOW_TOP) {
533 psizes = mm->context.low_slices_psize;
534 index = GET_LOW_SLICE_INDEX(addr);
535 } else {
536 psizes = mm->context.high_slices_psize;
537 index = GET_HIGH_SLICE_INDEX(addr);
538 }
539
540 return (psizes >> (index * 4)) & 0xf;
541}
542EXPORT_SYMBOL_GPL(get_slice_psize);
543
544/*
545 * This is called by hash_page when it needs to do a lazy conversion of
546 * an address space from real 64K pages to combo 4K pages (typically
547 * when hitting a non cacheable mapping on a processor or hypervisor
548 * that won't allow them for 64K pages).
549 *
550 * This is also called in init_new_context() to change back the user
551 * psize from whatever the parent context had it set to
552 *
553 * This function will only change the content of the {low,high)_slice_psize
554 * masks, it will not flush SLBs as this shall be handled lazily by the
555 * caller.
556 */
557void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
558{
559 unsigned long flags, lpsizes, hpsizes;
560 unsigned int old_psize;
561 int i;
562
563 slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize);
564
565 spin_lock_irqsave(&slice_convert_lock, flags);
566
567 old_psize = mm->context.user_psize;
568 slice_dbg(" old_psize=%d\n", old_psize);
569 if (old_psize == psize)
570 goto bail;
571
572 mm->context.user_psize = psize;
573 wmb();
574
575 lpsizes = mm->context.low_slices_psize;
576 for (i = 0; i < SLICE_NUM_LOW; i++)
577 if (((lpsizes >> (i * 4)) & 0xf) == old_psize)
578 lpsizes = (lpsizes & ~(0xful << (i * 4))) |
579 (((unsigned long)psize) << (i * 4));
580
581 hpsizes = mm->context.high_slices_psize;
582 for (i = 0; i < SLICE_NUM_HIGH; i++)
583 if (((hpsizes >> (i * 4)) & 0xf) == old_psize)
584 hpsizes = (hpsizes & ~(0xful << (i * 4))) |
585 (((unsigned long)psize) << (i * 4));
586
587 mm->context.low_slices_psize = lpsizes;
588 mm->context.high_slices_psize = hpsizes;
589
590 slice_dbg(" lsps=%lx, hsps=%lx\n",
591 mm->context.low_slices_psize,
592 mm->context.high_slices_psize);
593
594 bail:
595 spin_unlock_irqrestore(&slice_convert_lock, flags);
596}
597
598/*
599 * is_hugepage_only_range() is used by generic code to verify wether
600 * a normal mmap mapping (non hugetlbfs) is valid on a given area.
601 *
602 * until the generic code provides a more generic hook and/or starts
603 * calling arch get_unmapped_area for MAP_FIXED (which our implementation
604 * here knows how to deal with), we hijack it to keep standard mappings
605 * away from us.
606 *
607 * because of that generic code limitation, MAP_FIXED mapping cannot
608 * "convert" back a slice with no VMAs to the standard page size, only
609 * get_unmapped_area() can. It would be possible to fix it here but I
610 * prefer working on fixing the generic code instead.
611 *
612 * WARNING: This will not work if hugetlbfs isn't enabled since the
613 * generic code will redefine that function as 0 in that. This is ok
614 * for now as we only use slices with hugetlbfs enabled. This should
615 * be fixed as the generic code gets fixed.
616 */
617int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
618 unsigned long len)
619{
620 struct slice_mask mask, available;
621
622 mask = slice_range_to_mask(addr, len);
623 available = slice_mask_for_size(mm, mm->context.user_psize);
624
625#if 0 /* too verbose */
626 slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n",
627 mm, addr, len);
628 slice_print_mask(" mask", mask);
629 slice_print_mask(" available", available);
630#endif
631 return !slice_check_fit(mask, available);
632}
633
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index fec51525252e..a7f5a7653c62 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -144,12 +144,11 @@ static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
144 144
145 switch(REGION_ID(ea)) { 145 switch(REGION_ID(ea)) {
146 case USER_REGION_ID: 146 case USER_REGION_ID:
147#ifdef CONFIG_HUGETLB_PAGE 147#ifdef CONFIG_PPC_MM_SLICES
148 if (in_hugepage_area(mm->context, ea)) 148 psize = get_slice_psize(mm, ea);
149 psize = mmu_huge_psize; 149#else
150 else 150 psize = mm->context.user_psize;
151#endif 151#endif
152 psize = mm->context.user_psize;
153 vsid = (get_vsid(mm->context.id, ea) << SLB_VSID_SHIFT) | 152 vsid = (get_vsid(mm->context.id, ea) << SLB_VSID_SHIFT) |
154 SLB_VSID_USER; 153 SLB_VSID_USER;
155 break; 154 break;
diff --git a/include/asm-powerpc/mmu-hash64.h b/include/asm-powerpc/mmu-hash64.h
index 6739457d8bc0..e2ca55bcfe0b 100644
--- a/include/asm-powerpc/mmu-hash64.h
+++ b/include/asm-powerpc/mmu-hash64.h
@@ -350,10 +350,13 @@ typedef unsigned long mm_context_id_t;
350 350
351typedef struct { 351typedef struct {
352 mm_context_id_t id; 352 mm_context_id_t id;
353 u16 user_psize; /* page size index */ 353 u16 user_psize; /* page size index */
354 u16 sllp; /* SLB entry page size encoding */ 354
355#ifdef CONFIG_HUGETLB_PAGE 355#ifdef CONFIG_PPC_MM_SLICES
356 u16 low_htlb_areas, high_htlb_areas; 356 u64 low_slices_psize; /* SLB page size encodings */
357 u64 high_slices_psize; /* 4 bits per slice for now */
358#else
359 u16 sllp; /* SLB page size encoding */
357#endif 360#endif
358 unsigned long vdso_base; 361 unsigned long vdso_base;
359} mm_context_t; 362} mm_context_t;
diff --git a/include/asm-powerpc/paca.h b/include/asm-powerpc/paca.h
index cf95274f735e..c6a5b1735666 100644
--- a/include/asm-powerpc/paca.h
+++ b/include/asm-powerpc/paca.h
@@ -83,8 +83,8 @@ struct paca_struct {
83 83
84 mm_context_t context; 84 mm_context_t context;
85 u16 vmalloc_sllp; 85 u16 vmalloc_sllp;
86 u16 slb_cache[SLB_CACHE_ENTRIES];
87 u16 slb_cache_ptr; 86 u16 slb_cache_ptr;
87 u16 slb_cache[SLB_CACHE_ENTRIES];
88 88
89 /* 89 /*
90 * then miscellaneous read-write fields 90 * then miscellaneous read-write fields
diff --git a/include/asm-powerpc/page_64.h b/include/asm-powerpc/page_64.h
index eab779c21995..3448a3d4bc64 100644
--- a/include/asm-powerpc/page_64.h
+++ b/include/asm-powerpc/page_64.h
@@ -88,57 +88,55 @@ extern unsigned int HPAGE_SHIFT;
88 88
89#endif /* __ASSEMBLY__ */ 89#endif /* __ASSEMBLY__ */
90 90
91#ifdef CONFIG_HUGETLB_PAGE 91#ifdef CONFIG_PPC_MM_SLICES
92 92
93#define HTLB_AREA_SHIFT 40 93#define SLICE_LOW_SHIFT 28
94#define HTLB_AREA_SIZE (1UL << HTLB_AREA_SHIFT) 94#define SLICE_HIGH_SHIFT 40
95#define GET_HTLB_AREA(x) ((x) >> HTLB_AREA_SHIFT)
96 95
97#define LOW_ESID_MASK(addr, len) \ 96#define SLICE_LOW_TOP (0x100000000ul)
98 (((1U << (GET_ESID(min((addr)+(len)-1, 0x100000000UL))+1)) \ 97#define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT)
99 - (1U << GET_ESID(min((addr), 0x100000000UL)))) & 0xffff) 98#define SLICE_NUM_HIGH (PGTABLE_RANGE >> SLICE_HIGH_SHIFT)
100#define HTLB_AREA_MASK(addr, len) (((1U << (GET_HTLB_AREA(addr+len-1)+1)) \
101 - (1U << GET_HTLB_AREA(addr))) & 0xffff)
102 99
103#define ARCH_HAS_HUGEPAGE_ONLY_RANGE 100#define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT)
104#define ARCH_HAS_HUGETLB_FREE_PGD_RANGE 101#define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT)
105#define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
106#define ARCH_HAS_SETCLEAR_HUGE_PTE
107 102
108#define touches_hugepage_low_range(mm, addr, len) \ 103#ifndef __ASSEMBLY__
109 (((addr) < 0x100000000UL) \ 104
110 && (LOW_ESID_MASK((addr), (len)) & (mm)->context.low_htlb_areas)) 105struct slice_mask {
111#define touches_hugepage_high_range(mm, addr, len) \ 106 u16 low_slices;
112 ((((addr) + (len)) > 0x100000000UL) \ 107 u16 high_slices;
113 && (HTLB_AREA_MASK((addr), (len)) & (mm)->context.high_htlb_areas)) 108};
114 109
115#define __within_hugepage_low_range(addr, len, segmask) \ 110struct mm_struct;
116 ( (((addr)+(len)) <= 0x100000000UL) \
117 && ((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask)))
118#define within_hugepage_low_range(addr, len) \
119 __within_hugepage_low_range((addr), (len), \
120 current->mm->context.low_htlb_areas)
121#define __within_hugepage_high_range(addr, len, zonemask) \
122 ( ((addr) >= 0x100000000UL) \
123 && ((HTLB_AREA_MASK((addr), (len)) | (zonemask)) == (zonemask)))
124#define within_hugepage_high_range(addr, len) \
125 __within_hugepage_high_range((addr), (len), \
126 current->mm->context.high_htlb_areas)
127
128#define is_hugepage_only_range(mm, addr, len) \
129 (touches_hugepage_high_range((mm), (addr), (len)) || \
130 touches_hugepage_low_range((mm), (addr), (len)))
131#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
132 111
133#define in_hugepage_area(context, addr) \ 112extern unsigned long slice_get_unmapped_area(unsigned long addr,
134 (cpu_has_feature(CPU_FTR_16M_PAGE) && \ 113 unsigned long len,
135 ( ( (addr) >= 0x100000000UL) \ 114 unsigned long flags,
136 ? ((1 << GET_HTLB_AREA(addr)) & (context).high_htlb_areas) \ 115 unsigned int psize,
137 : ((1 << GET_ESID(addr)) & (context).low_htlb_areas) ) ) 116 int topdown,
117 int use_cache);
138 118
139#else /* !CONFIG_HUGETLB_PAGE */ 119extern unsigned int get_slice_psize(struct mm_struct *mm,
120 unsigned long addr);
140 121
141#define in_hugepage_area(mm, addr) 0 122extern void slice_init_context(struct mm_struct *mm, unsigned int psize);
123extern void slice_set_user_psize(struct mm_struct *mm, unsigned int psize);
124
125#define ARCH_HAS_HUGEPAGE_ONLY_RANGE
126extern int is_hugepage_only_range(struct mm_struct *m,
127 unsigned long addr,
128 unsigned long len);
129
130#endif /* __ASSEMBLY__ */
131#else
132#define slice_init()
133#endif /* CONFIG_PPC_MM_SLICES */
134
135#ifdef CONFIG_HUGETLB_PAGE
136
137#define ARCH_HAS_HUGETLB_FREE_PGD_RANGE
138#define ARCH_HAS_SETCLEAR_HUGE_PTE
139#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
142 140
143#endif /* !CONFIG_HUGETLB_PAGE */ 141#endif /* !CONFIG_HUGETLB_PAGE */
144 142