-rw-r--r--   arch/sparc64/kernel/tsb.S          |  71
-rw-r--r--   arch/sparc64/mm/fault.c            |   8
-rw-r--r--   arch/sparc64/mm/init.c             |   7
-rw-r--r--   arch/sparc64/mm/tsb.c              | 185
-rw-r--r--   include/asm-sparc64/mmu_context.h  |  50
5 files changed, 203 insertions(+), 118 deletions(-)
diff --git a/arch/sparc64/kernel/tsb.S b/arch/sparc64/kernel/tsb.S
index d738910153f6..1b154c863628 100644
--- a/arch/sparc64/kernel/tsb.S
+++ b/arch/sparc64/kernel/tsb.S
@@ -34,8 +34,9 @@ tsb_miss_itlb:
 	ldxa		[%g4] ASI_IMMU, %g4
 
 	/* At this point we have:
-	 * %g4 -- missing virtual address
 	 * %g1 -- TSB entry address
+	 * %g3 -- FAULT_CODE_{D,I}TLB
+	 * %g4 -- missing virtual address
 	 * %g6 -- TAG TARGET (vaddr >> 22)
 	 */
 tsb_miss_page_table_walk:
@@ -45,6 +46,12 @@ tsb_miss_page_table_walk:
 tsb_miss_page_table_walk_sun4v_fastpath:
 	USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault)
 
+	/* At this point we have:
+	 * %g1 -- TSB entry address
+	 * %g3 -- FAULT_CODE_{D,I}TLB
+	 * %g5 -- physical address of PTE in Linux page tables
+	 * %g6 -- TAG TARGET (vaddr >> 22)
+	 */
 tsb_reload:
 	TSB_LOCK_TAG(%g1, %g2, %g7)
 
@@ -199,6 +206,7 @@ __tsb_insert:
 	wrpr		%o5, %pstate
 	retl
 	 nop
+	.size		__tsb_insert, .-__tsb_insert
 
 /* Flush the given TSB entry if it has the matching
  * tag.
@@ -208,6 +216,7 @@ __tsb_insert:
  */
 	.align		32
 	.globl		tsb_flush
+	.type		tsb_flush,#function
 tsb_flush:
 	sethi		%hi(TSB_TAG_LOCK_HIGH), %g2
 1:	TSB_LOAD_TAG(%o0, %g1)
@@ -225,6 +234,7 @@ tsb_flush:
 	 nop
 2:	retl
 	 TSB_MEMBAR
+	.size		tsb_flush, .-tsb_flush
 
 /* Reload MMU related context switch state at
  * schedule() time.
@@ -241,6 +251,7 @@ tsb_flush:
  */
 	.align		32
 	.globl		__tsb_context_switch
+	.type		__tsb_context_switch,#function
 __tsb_context_switch:
 	rdpr		%pstate, %o5
 	wrpr		%o5, PSTATE_IE, %pstate
@@ -302,3 +313,61 @@ __tsb_context_switch:
 
 	retl
 	 nop
+	.size		__tsb_context_switch, .-__tsb_context_switch
+
+#define TSB_PASS_BITS	((1 << TSB_TAG_LOCK_BIT) | \
+			 (1 << TSB_TAG_INVALID_BIT))
+
+	.align		32
+	.globl		copy_tsb
+	.type		copy_tsb,#function
+copy_tsb:		/* %o0=old_tsb_base, %o1=old_tsb_size
+			 * %o2=new_tsb_base, %o3=new_tsb_size
+			 */
+	sethi		%uhi(TSB_PASS_BITS), %g7
+	srlx		%o3, 4, %o3
+	add		%o0, %o1, %g1	/* end of old tsb */
+	sllx		%g7, 32, %g7
+	sub		%o3, 1, %o3	/* %o3 == new tsb hash mask */
+
+661:	prefetcha	[%o0] ASI_N, #one_read
+	.section	.tsb_phys_patch, "ax"
+	.word		661b
+	prefetcha	[%o0] ASI_PHYS_USE_EC, #one_read
+	.previous
+
+90:	andcc		%o0, (64 - 1), %g0
+	bne		1f
+	 add		%o0, 64, %o5
+
+661:	prefetcha	[%o5] ASI_N, #one_read
+	.section	.tsb_phys_patch, "ax"
+	.word		661b
+	prefetcha	[%o5] ASI_PHYS_USE_EC, #one_read
+	.previous
+
+1:	TSB_LOAD_QUAD(%o0, %g2)		/* %g2/%g3 == TSB entry */
+	andcc		%g2, %g7, %g0	/* LOCK or INVALID set? */
+	bne,pn		%xcc, 80f	/* Skip it */
+	 sllx		%g2, 22, %o4	/* TAG --> VADDR */
+
+	/* This can definitely be computed faster... */
+	srlx		%o0, 4, %o5	/* Build index */
+	and		%o5, 511, %o5	/* Mask index */
+	sllx		%o5, PAGE_SHIFT, %o5 /* Put into vaddr position */
+	or		%o4, %o5, %o4	/* Full VADDR. */
+	srlx		%o4, PAGE_SHIFT, %o4 /* Shift down to create index */
+	and		%o4, %o3, %o4	/* Mask with new_tsb_nents-1 */
+	sllx		%o4, 4, %o4	/* Shift back up into tsb ent offset */
+	TSB_STORE(%o2 + %o4, %g2)	/* Store TAG */
+	add		%o4, 0x8, %o4	/* Advance to TTE */
+	TSB_STORE(%o2 + %o4, %g3)	/* Store TTE */
+
+80:	add		%o0, 16, %o0
+	cmp		%o0, %g1
+	bne,pt		%xcc, 90b
+	 nop
+
+	retl
+	 TSB_MEMBAR
+	.size		copy_tsb, .-copy_tsb
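
Annotation (not part of the patch): the per-entry rehash performed by the copy_tsb assembly above corresponds to the C copy_tsb() that this patch deletes from arch/sparc64/mm/tsb.c further down. A minimal C sketch of that logic, using plain loads and stores in place of the ASI_N/ASI_PHYS_USE_EC accesses and the tsb_phys_patch rewriting, and with names invented here for illustration:

/* Illustrative sketch only, not kernel code.  Assumes 8K pages
 * (PAGE_SHIFT == 13) and 16-byte TSB entries, as implied by the
 * 16-byte stride and the "add %o4, 0x8" in the assembly above.
 */
struct tsb_entry { unsigned long tag, tte; };

static void copy_tsb_sketch(const struct tsb_entry *old_tsb, unsigned long old_size,
			    struct tsb_entry *new_tsb, unsigned long new_size,
			    unsigned long pass_bits /* TSB_PASS_BITS */)
{
	unsigned long old_nents = old_size / 16;
	unsigned long new_mask = (new_size / 16) - 1;	/* new tsb hash mask */
	unsigned long i;

	for (i = 0; i < old_nents; i++) {
		unsigned long tag = old_tsb[i].tag;
		unsigned long vaddr, hash;

		if (tag & pass_bits)	/* LOCK or INVALID set?  Skip it. */
			continue;

		/* TAG --> VADDR: the tag holds vaddr >> 22, and the
		 * implied low bits come from the old entry's index.
		 */
		vaddr = tag << 22;
		vaddr |= (i & 511) << 13;

		hash = (vaddr >> 13) & new_mask;	/* index into the new TSB */
		new_tsb[hash].tag = tag;
		new_tsb[hash].tte = old_tsb[i].tte;
	}
}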
diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c
index b97bd054aad3..63b6cc0cd5d5 100644
--- a/arch/sparc64/mm/fault.c
+++ b/arch/sparc64/mm/fault.c
@@ -29,6 +29,7 @@
 #include <asm/lsu.h>
 #include <asm/sections.h>
 #include <asm/kdebug.h>
+#include <asm/mmu_context.h>
 
 /*
  * To debug kernel to catch accesses to certain virtual/physical addresses.
@@ -258,7 +259,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 	struct vm_area_struct *vma;
 	unsigned int insn = 0;
 	int si_code, fault_code;
-	unsigned long address;
+	unsigned long address, mm_rss;
 
 	fault_code = get_thread_fault_code();
 
@@ -407,6 +408,11 @@ good_area:
 	}
 
 	up_read(&mm->mmap_sem);
+
+	mm_rss = get_mm_rss(mm);
+	if (unlikely(mm_rss >= mm->context.tsb_rss_limit))
+		tsb_grow(mm, mm_rss);
+
 	return;
 
 	/*
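
Annotation (not part of the patch): the limit tested in the fault path above is set by tsb_grow() to 3/4 of the TSB's entry count (see the tsb.c hunks below). A minimal sketch of that arithmetic, assuming 16-byte TSB entries:

/* Illustrative only: the grow threshold as a function of TSB size,
 * mirroring ((size / sizeof(struct tsb)) * 3) / 4 from tsb_grow().
 */
static unsigned long tsb_rss_limit_for(unsigned long tsb_bytes)
{
	unsigned long nentries = tsb_bytes / 16;	/* sizeof(struct tsb) assumed to be 16 */

	return (nentries * 3) / 4;	/* e.g. 8192 bytes -> 512 entries -> limit of 384 pages of RSS */
}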
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index b40f6477dea0..d703b67bc7b9 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -279,7 +279,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
 {
 	struct mm_struct *mm;
 	struct tsb *tsb;
-	unsigned long tag;
+	unsigned long tag, flags;
 
 	if (tlb_type != hypervisor) {
 		unsigned long pfn = pte_pfn(pte);
@@ -308,10 +308,15 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
 	}
 
 	mm = vma->vm_mm;
+
+	spin_lock_irqsave(&mm->context.lock, flags);
+
 	tsb = &mm->context.tsb[(address >> PAGE_SHIFT) &
 			       (mm->context.tsb_nentries - 1UL)];
 	tag = (address >> 22UL);
 	tsb_insert(tsb, tag, pte_val(pte));
+
+	spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 void flush_dcache_page(struct page *page)
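
Annotation (not part of the patch): the two statements now guarded by mm->context.lock encode the TSB's direct-mapped layout, with the slot chosen by the low virtual-page-number bits and the tag holding the upper address bits. A minimal sketch, with invented names and 8K pages assumed:

/* Illustrative only: mapping a virtual address to a TSB slot and tag,
 * mirroring the indexing and tag computation in update_mmu_cache().
 */
struct tsb_slot { unsigned long tag, tte; };

static struct tsb_slot *tsb_slot_for(struct tsb_slot *tsb, unsigned long nentries,
				     unsigned long address, unsigned long *tag)
{
	*tag = address >> 22;				/* TAG TARGET (vaddr >> 22) */
	return &tsb[(address >> 13) & (nentries - 1)];	/* direct-mapped index, PAGE_SHIFT == 13 */
}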
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
index f36799b7152c..7fbe1e0cd105 100644
--- a/arch/sparc64/mm/tsb.c
+++ b/arch/sparc64/mm/tsb.c
@@ -48,11 +48,15 @@ void flush_tsb_kernel_range(unsigned long start, unsigned long end)
 void flush_tsb_user(struct mmu_gather *mp)
 {
 	struct mm_struct *mm = mp->mm;
-	struct tsb *tsb = mm->context.tsb;
-	unsigned long nentries = mm->context.tsb_nentries;
-	unsigned long base;
+	unsigned long nentries, base, flags;
+	struct tsb *tsb;
 	int i;
 
+	spin_lock_irqsave(&mm->context.lock, flags);
+
+	tsb = mm->context.tsb;
+	nentries = mm->context.tsb_nentries;
+
 	if (tlb_type == cheetah_plus || tlb_type == hypervisor)
 		base = __pa(tsb);
 	else
@@ -70,6 +74,8 @@ void flush_tsb_user(struct mmu_gather *mp)
 
 		tsb_flush(ent, tag);
 	}
+
+	spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)
@@ -201,86 +207,9 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)
 	}
 }
 
-/* The page tables are locked against modifications while this
- * runs.
- *
- * XXX do some prefetching...
- */
-static void copy_tsb(struct tsb *old_tsb, unsigned long old_size,
-		     struct tsb *new_tsb, unsigned long new_size)
-{
-	unsigned long old_nentries = old_size / sizeof(struct tsb);
-	unsigned long new_nentries = new_size / sizeof(struct tsb);
-	unsigned long i;
-
-	for (i = 0; i < old_nentries; i++) {
-		register unsigned long tag asm("o4");
-		register unsigned long pte asm("o5");
-		unsigned long v, hash;
-
-		if (tlb_type == hypervisor) {
-			__asm__ __volatile__(
-				"ldda [%2] %3, %0"
-				: "=r" (tag), "=r" (pte)
-				: "r" (__pa(&old_tsb[i])),
-				  "i" (ASI_QUAD_LDD_PHYS_4V));
-		} else if (tlb_type == cheetah_plus) {
-			__asm__ __volatile__(
-				"ldda [%2] %3, %0"
-				: "=r" (tag), "=r" (pte)
-				: "r" (__pa(&old_tsb[i])),
-				  "i" (ASI_QUAD_LDD_PHYS));
-		} else {
-			__asm__ __volatile__(
-				"ldda [%2] %3, %0"
-				: "=r" (tag), "=r" (pte)
-				: "r" (&old_tsb[i]),
-				  "i" (ASI_NUCLEUS_QUAD_LDD));
-		}
-
-		if (tag & ((1UL << TSB_TAG_LOCK_BIT) |
-			   (1UL << TSB_TAG_INVALID_BIT)))
-			continue;
-
-		/* We only put base page size PTEs into the TSB,
-		 * but that might change in the future.  This code
-		 * would need to be changed if we start putting larger
-		 * page size PTEs into there.
-		 */
-		WARN_ON((pte & _PAGE_ALL_SZ_BITS) != _PAGE_SZBITS);
-
-		/* The tag holds bits 22 to 63 of the virtual address
-		 * and the context.  Clear out the context, and shift
-		 * up to make a virtual address.
-		 */
-		v = (tag & ((1UL << 42UL) - 1UL)) << 22UL;
-
-		/* The implied bits of the tag (bits 13 to 21) are
-		 * determined by the TSB entry index, so fill that in.
-		 */
-		v |= (i & (512UL - 1UL)) << 13UL;
-
-		hash = tsb_hash(v, new_nentries);
-		if (tlb_type == cheetah_plus ||
-		    tlb_type == hypervisor) {
-			__asm__ __volatile__(
-				"stxa %0, [%1] %2\n\t"
-				"stxa %3, [%4] %2"
-				: /* no outputs */
-				: "r" (tag),
-				  "r" (__pa(&new_tsb[hash].tag)),
-				  "i" (ASI_PHYS_USE_EC),
-				  "r" (pte),
-				  "r" (__pa(&new_tsb[hash].pte)));
-		} else {
-			new_tsb[hash].tag = tag;
-			new_tsb[hash].pte = pte;
-		}
-	}
-}
-
 /* When the RSS of an address space exceeds mm->context.tsb_rss_limit,
- * update_mmu_cache() invokes this routine to try and grow the TSB.
+ * do_sparc64_fault() invokes this routine to try and grow the TSB.
+ *
  * When we reach the maximum TSB size supported, we stick ~0UL into
  * mm->context.tsb_rss_limit so the grow checks in update_mmu_cache()
  * will not trigger any longer.
@@ -293,12 +222,12 @@ static void copy_tsb(struct tsb *old_tsb, unsigned long old_size,
 * the number of entries that the current TSB can hold at once.  Currently,
 * we trigger when the RSS hits 3/4 of the TSB capacity.
 */
-void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
+void tsb_grow(struct mm_struct *mm, unsigned long rss)
 {
 	unsigned long max_tsb_size = 1 * 1024 * 1024;
-	unsigned long size, old_size;
+	unsigned long size, old_size, flags;
 	struct page *page;
-	struct tsb *old_tsb;
+	struct tsb *old_tsb, *new_tsb;
 
 	if (max_tsb_size > (PAGE_SIZE << MAX_ORDER))
 		max_tsb_size = (PAGE_SIZE << MAX_ORDER);
@@ -311,12 +240,51 @@ void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
 			break;
 	}
 
-	page = alloc_pages(gfp_flags, get_order(size));
+	page = alloc_pages(GFP_KERNEL, get_order(size));
 	if (unlikely(!page))
 		return;
 
 	/* Mark all tags as invalid.  */
-	memset(page_address(page), 0x40, size);
+	new_tsb = page_address(page);
+	memset(new_tsb, 0x40, size);
+
+	/* Ok, we are about to commit the changes.  If we are
+	 * growing an existing TSB the locking is very tricky,
+	 * so WATCH OUT!
+	 *
+	 * We have to hold mm->context.lock while committing to the
+	 * new TSB, this synchronizes us with processors in
+	 * flush_tsb_user() and switch_mm() for this address space.
+	 *
+	 * But even with that lock held, processors run asynchronously
+	 * accessing the old TSB via TLB miss handling.  This is OK
+	 * because those actions are just propagating state from the
+	 * Linux page tables into the TSB, page table mappings are not
+	 * being changed.  If a real fault occurs, the processor will
+	 * synchronize with us when it hits flush_tsb_user(), this is
+	 * also true for the case where vmscan is modifying the page
+	 * tables.  The only thing we need to be careful with is to
+	 * skip any locked TSB entries during copy_tsb().
+	 *
+	 * When we finish committing to the new TSB, we have to drop
+	 * the lock and ask all other cpus running this address space
+	 * to run tsb_context_switch() to see the new TSB table.
+	 */
+	spin_lock_irqsave(&mm->context.lock, flags);
+
+	old_tsb = mm->context.tsb;
+	old_size = mm->context.tsb_nentries * sizeof(struct tsb);
+
+	/* Handle multiple threads trying to grow the TSB at the same time.
+	 * One will get in here first, and bump the size and the RSS limit.
+	 * The others will get in here next and hit this check.
+	 */
+	if (unlikely(old_tsb && (rss < mm->context.tsb_rss_limit))) {
+		spin_unlock_irqrestore(&mm->context.lock, flags);
+
+		free_pages((unsigned long) new_tsb, get_order(size));
+		return;
+	}
 
 	if (size == max_tsb_size)
 		mm->context.tsb_rss_limit = ~0UL;
@@ -324,30 +292,37 @@ void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
 		mm->context.tsb_rss_limit =
 			((size / sizeof(struct tsb)) * 3) / 4;
 
-	old_tsb = mm->context.tsb;
-	old_size = mm->context.tsb_nentries * sizeof(struct tsb);
-
-	if (old_tsb)
-		copy_tsb(old_tsb, old_size, page_address(page), size);
+	if (old_tsb) {
+		extern void copy_tsb(unsigned long old_tsb_base,
+				     unsigned long old_tsb_size,
+				     unsigned long new_tsb_base,
+				     unsigned long new_tsb_size);
+		unsigned long old_tsb_base = (unsigned long) old_tsb;
+		unsigned long new_tsb_base = (unsigned long) new_tsb;
+
+		if (tlb_type == cheetah_plus || tlb_type == hypervisor) {
+			old_tsb_base = __pa(old_tsb_base);
+			new_tsb_base = __pa(new_tsb_base);
+		}
+		copy_tsb(old_tsb_base, old_size, new_tsb_base, size);
+	}
 
-	mm->context.tsb = page_address(page);
+	mm->context.tsb = new_tsb;
 	setup_tsb_params(mm, size);
 
+	spin_unlock_irqrestore(&mm->context.lock, flags);
+
 	/* If old_tsb is NULL, we're being invoked for the first time
 	 * from init_new_context().
 	 */
 	if (old_tsb) {
-		/* Now force all other processors to reload the new
-		 * TSB state.
-		 */
-		smp_tsb_sync(mm);
-
-		/* Finally reload it on the local cpu.  No further
-		 * references will remain to the old TSB and we can
-		 * thus free it up.
-		 */
+		/* Reload it on the local cpu.  */
 		tsb_context_switch(mm);
 
+		/* Now force other processors to do the same.  */
+		smp_tsb_sync(mm);
+
+		/* Now it is safe to free the old tsb.  */
 		free_pages((unsigned long) old_tsb, get_order(old_size));
 	}
 }
@@ -363,7 +338,11 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	 * will be confused and think there is an older TSB to free up.
 	 */
 	mm->context.tsb = NULL;
-	tsb_grow(mm, 0, GFP_KERNEL);
+
+	/* If this is fork, inherit the parent's TSB size.  We would
+	 * grow it to that size on the first page fault anyways.
+	 */
+	tsb_grow(mm, get_mm_rss(mm));
 
 	if (unlikely(!mm->context.tsb))
 		return -ENOMEM;
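
Annotation (not part of the patch): condensing the rewritten tsb_grow() above into its ordering. The helper names below are invented purely for illustration and stand in for the corresponding blocks of the real function; the usual kernel headers are assumed.

/* Illustrative sketch of the ordering only, not kernel code. */
static void tsb_grow_sketch(struct mm_struct *mm, unsigned long rss)
{
	struct tsb *new_tsb, *old_tsb;
	unsigned long flags;

	new_tsb = allocate_invalidated_tsb(rss);	/* hypothetical: GFP_KERNEL alloc, tags set invalid, no locks held */
	if (!new_tsb)
		return;

	spin_lock_irqsave(&mm->context.lock, flags);	/* vs. flush_tsb_user() and switch_mm() */
	if (grown_by_another_thread(mm, rss)) {		/* hypothetical: rss < mm->context.tsb_rss_limit recheck */
		spin_unlock_irqrestore(&mm->context.lock, flags);
		release_tsb(new_tsb);			/* hypothetical: free_pages() of the unused TSB */
		return;
	}
	old_tsb = commit_new_tsb(mm, new_tsb);		/* hypothetical: copy_tsb(), install, setup_tsb_params() */
	spin_unlock_irqrestore(&mm->context.lock, flags);

	if (old_tsb) {
		tsb_context_switch(mm);		/* reload the local cpu first */
		smp_tsb_sync(mm);		/* then cross-call the other cpus */
		release_tsb(old_tsb);		/* nothing references the old TSB any more */
	}
}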
diff --git a/include/asm-sparc64/mmu_context.h b/include/asm-sparc64/mmu_context.h
index ca36ea96f64b..e7974321d052 100644
--- a/include/asm-sparc64/mmu_context.h
+++ b/include/asm-sparc64/mmu_context.h
@@ -42,7 +42,7 @@ static inline void tsb_context_switch(struct mm_struct *mm)
 			     __pa(&mm->context.tsb_descr));
 }
 
-extern void tsb_grow(struct mm_struct *mm, unsigned long mm_rss, gfp_t gfp_flags);
+extern void tsb_grow(struct mm_struct *mm, unsigned long mm_rss);
 #ifdef CONFIG_SMP
 extern void smp_tsb_sync(struct mm_struct *mm);
 #else
@@ -74,18 +74,43 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
 	ctx_valid = CTX_VALID(mm->context);
 	if (!ctx_valid)
 		get_new_mmu_context(mm);
-	spin_unlock_irqrestore(&mm->context.lock, flags);
 
-	if (!ctx_valid || (old_mm != mm)) {
-		load_secondary_context(mm);
-		tsb_context_switch(mm);
-	}
+	/* We have to be extremely careful here or else we will miss
+	 * a TSB grow if we switch back and forth between a kernel
+	 * thread and an address space which has it's TSB size increased
+	 * on another processor.
+	 *
+	 * It is possible to play some games in order to optimize the
+	 * switch, but the safest thing to do is to unconditionally
+	 * perform the secondary context load and the TSB context switch.
+	 *
+	 * For reference the bad case is, for address space "A":
+	 *
+	 *		CPU 0			CPU 1
+	 *	run address space A
+	 *	set cpu0's bits in cpu_vm_mask
+	 *	switch to kernel thread, borrow
+	 *	address space A via entry_lazy_tlb
+	 *					run address space A
+	 *					set cpu1's bit in cpu_vm_mask
+	 *					flush_tlb_pending()
+	 *					reset cpu_vm_mask to just cpu1
+	 *					TSB grow
+	 *	run address space A
+	 *	context was valid, so skip
+	 *	TSB context switch
+	 *
+	 * At that point cpu0 continues to use a stale TSB, the one from
+	 * before the TSB grow performed on cpu1.  cpu1 did not cross-call
+	 * cpu0 to update it's TSB because at that point the cpu_vm_mask
+	 * only had cpu1 set in it.
+	 */
+	load_secondary_context(mm);
+	tsb_context_switch(mm);
 
-	/* Even if (mm == old_mm) we _must_ check
-	 * the cpu_vm_mask.  If we do not we could
-	 * corrupt the TLB state because of how
-	 * smp_flush_tlb_{page,range,mm} on sparc64
-	 * and lazy tlb switches work.  -DaveM
+	/* Any time a processor runs a context on an address space
+	 * for the first time, we must flush that context out of the
+	 * local TLB.
 	 */
 	cpu = smp_processor_id();
 	if (!ctx_valid || !cpu_isset(cpu, mm->cpu_vm_mask)) {
@@ -93,6 +118,7 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
 		__flush_tlb_mm(CTX_HWBITS(mm->context),
 			       SECONDARY_CONTEXT);
 	}
+	spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 #define deactivate_mm(tsk,mm)	do { } while (0)
@@ -109,11 +135,11 @@ static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm
 	cpu = smp_processor_id();
 	if (!cpu_isset(cpu, mm->cpu_vm_mask))
 		cpu_set(cpu, mm->cpu_vm_mask);
-	spin_unlock_irqrestore(&mm->context.lock, flags);
 
 	load_secondary_context(mm);
 	__flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT);
 	tsb_context_switch(mm);
+	spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 #endif /* !(__ASSEMBLY__) */