[SPARC64]: Fix and re-enable dynamic TSB sizing.

This is good for up to a 50% performance improvement in some test cases. The problem has been the race conditions, and hopefully I've plugged them all up here. 1) There was a serious race in switch_mm() wrt. lazy TLB switching to and from kernel threads. We could erroneously skip a tsb_context_switch() and thus use a stale TSB across a TSB grow event. There is a big comment now in that function describing exactly how it can happen. 2) All code paths that do something with the TSB need to be guarded with the mm->context.lock spinlock. This makes page table flushing paths properly synchronize with both TSB growing and TLB context changes. 3) TSB growing events are moved to the end of successful fault processing. Previously it was in update_mmu_cache() but that is deadlock prone. At the end of do_sparc64_fault() we hold no spinlocks that could deadlock the TSB grow sequence. We also have dropped the address space semaphore. While we're here, add prefetching to the copy_tsb() routine and put it in assembler into the tsb.S file. This piece of code is quite time critical. There are some small negative side effects to this code which can be improved upon. In particular we grab the mm->context.lock even for the tsb insert done by update_mmu_cache() now and that's a bit excessive. We can get rid of that locking, and the same lock taking in flush_tsb_user(), by disabling PSTATE_IE around the whole operation including the capturing of the tsb pointer and tsb_nentries value. That would work because anyone growing the TSB won't free up the old TSB until all cpus respond to the TSB change cross call. I'm not quite so confident in that optimization to put it in right now, but eventually we might be able to and the description is here for reference. This code seems very solid now. It passes several parallel GCC bootstrap builds, and our favorite "nut cruncher" stress test which is a full "make -j8192" build of a "make allmodconfig" kernel. 
That puts about 256 processes on each cpu's run queue, makes lots of process cpu migrations occur, causes lots of page table and TLB flushing activity, incurs many context version number changes, and it swaps the machine real far out to disk even though there is 16GB of ram on this test system. :-) Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller <davem@davemloft.net> 2006-03-16 05:02:32 -0500
committer: David S. Miller <davem@sunset.davemloft.net> 2006-03-20 04:16:33 -0500
commit: 7a1ac5264108fc3ed22d17a3cdd76212ed1666d1 (patch)
tree: 75378a1b470afa54900f1f15a5b41966d301520d /arch/sparc64/mm
parent: a858f1ca726edc5eb7ed39722f7966d005f1c9ca (diff)
3 files changed, 95 insertions, 105 deletions
diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c
index b97bd054aad3..63b6cc0cd5d5 100644
--- a/arch/sparc64/mm/fault.c
+++ b/arch/sparc64/mm/fault.c
@@ -29,6 +29,7 @@
 #include <asm/lsu.h>
 #include <asm/sections.h>
 #include <asm/kdebug.h>
+#include <asm/mmu_context.h>
 /*
 * To debug kernel to catch accesses to certain virtual/physical addresses.
@@ -258,7 +259,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
        struct vm_area_struct *vma;
        unsigned int insn = 0;
        int si_code, fault_code;
-        unsigned long address;
+        unsigned long address, mm_rss;
        fault_code = get_thread_fault_code();
@@ -407,6 +408,11 @@ good_area:
        }
        up_read(&mm->mmap_sem);
+        mm_rss = get_mm_rss(mm);
+        if (unlikely(mm_rss >= mm->context.tsb_rss_limit))
+                tsb_grow(mm, mm_rss);
        return;
        /*
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index b40f6477dea0..d703b67bc7b9 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -279,7 +279,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
 {
        struct mm_struct *mm;
        struct tsb *tsb;
-        unsigned long tag;
+        unsigned long tag, flags;
        if (tlb_type != hypervisor) {
                unsigned long pfn = pte_pfn(pte);
@@ -308,10 +308,15 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
        }
        mm = vma->vm_mm;
+        spin_lock_irqsave(&mm->context.lock, flags);
        tsb = &mm->context.tsb[(address >> PAGE_SHIFT) &
                               (mm->context.tsb_nentries - 1UL)];
        tag = (address >> 22UL);
        tsb_insert(tsb, tag, pte_val(pte));
+        spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 void flush_dcache_page(struct page *page)
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
index f36799b7152c..7fbe1e0cd105 100644
--- a/arch/sparc64/mm/tsb.c
+++ b/arch/sparc64/mm/tsb.c
@@ -48,11 +48,15 @@ void flush_tsb_kernel_range(unsigned long start, unsigned long end)
 void flush_tsb_user(struct mmu_gather *mp)
 {
        struct mm_struct *mm = mp->mm;
-        struct tsb *tsb = mm->context.tsb;
+        unsigned long nentries, base, flags;
-        unsigned long nentries = mm->context.tsb_nentries;
+        struct tsb *tsb;
-        unsigned long base;
        int i;
+        spin_lock_irqsave(&mm->context.lock, flags);
+        tsb = mm->context.tsb;
+        nentries = mm->context.tsb_nentries;
        if (tlb_type == cheetah_plus || tlb_type == hypervisor)
                base = __pa(tsb);
        else
@@ -70,6 +74,8 @@ void flush_tsb_user(struct mmu_gather *mp)
                tsb_flush(ent, tag);
        }
+        spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)
@@ -201,86 +207,9 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)
        }
 }
-/* The page tables are locked against modifications while this
- * runs.
- *
- * XXX do some prefetching...
- */
-static void copy_tsb(struct tsb *old_tsb, unsigned long old_size,
-                     struct tsb *new_tsb, unsigned long new_size)
-{
-        unsigned long old_nentries = old_size / sizeof(struct tsb);
-        unsigned long new_nentries = new_size / sizeof(struct tsb);
-        unsigned long i;
-        for (i = 0; i < old_nentries; i++) {
-                register unsigned long tag asm("o4");
-                register unsigned long pte asm("o5");
-                unsigned long v, hash;
-                if (tlb_type == hypervisor) {
-                        __asm__ __volatile__(
-                                "ldda [%2] %3, %0"
-                                : "=r" (tag), "=r" (pte)
-                                : "r" (__pa(&old_tsb[i])),
-                                  "i" (ASI_QUAD_LDD_PHYS_4V));
-                } else if (tlb_type == cheetah_plus) {
-                        __asm__ __volatile__(
-                                "ldda [%2] %3, %0"
-                                : "=r" (tag), "=r" (pte)
-                                : "r" (__pa(&old_tsb[i])),
-                                  "i" (ASI_QUAD_LDD_PHYS));
-                } else {
-                        __asm__ __volatile__(
-                                "ldda [%2] %3, %0"
-                                : "=r" (tag), "=r" (pte)
-                                : "r" (&old_tsb[i]),
-                                  "i" (ASI_NUCLEUS_QUAD_LDD));
-                }
-                if (tag & ((1UL << TSB_TAG_LOCK_BIT) |
-                           (1UL << TSB_TAG_INVALID_BIT)))
-                        continue;
-                /* We only put base page size PTEs into the TSB,
-                 * but that might change in the future.  This code
-                 * would need to be changed if we start putting larger
-                 * page size PTEs into there.
-                 */
-                WARN_ON((pte & _PAGE_ALL_SZ_BITS) != _PAGE_SZBITS);
-                /* The tag holds bits 22 to 63 of the virtual address
-                 * and the context.  Clear out the context, and shift
-                 * up to make a virtual address.
-                 */
-                v = (tag & ((1UL << 42UL) - 1UL)) << 22UL;
-                /* The implied bits of the tag (bits 13 to 21) are
-                 * determined by the TSB entry index, so fill that in.
-                 */
-                v |= (i & (512UL - 1UL)) << 13UL;
-                hash = tsb_hash(v, new_nentries);
-                if (tlb_type == cheetah_plus ||
-                    tlb_type == hypervisor) {
-                        __asm__ __volatile__(
-                                "stxa   %0, [%1] %2\n\t"
-                                "stxa   %3, [%4] %2"
-                                : /* no outputs */
-                                : "r" (tag),
-                                  "r" (__pa(&new_tsb[hash].tag)),
-                                  "i" (ASI_PHYS_USE_EC),
-                                  "r" (pte),
-                                  "r" (__pa(&new_tsb[hash].pte)));
-                } else {
-                        new_tsb[hash].tag = tag;
-                        new_tsb[hash].pte = pte;
-                }
-        }
-}
 /* When the RSS of an address space exceeds mm->context.tsb_rss_limit,
- * update_mmu_cache() invokes this routine to try and grow the TSB.
+ * do_sparc64_fault() invokes this routine to try and grow the TSB.
+ *
 * When we reach the maximum TSB size supported, we stick ~0UL into
 * mm->context.tsb_rss_limit so the grow checks in update_mmu_cache()
 * will not trigger any longer.
@@ -293,12 +222,12 @@ static void copy_tsb(struct tsb *old_tsb, unsigned long old_size,
 * the number of entries that the current TSB can hold at once.  Currently,
 * we trigger when the RSS hits 3/4 of the TSB capacity.
 */
-void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
+void tsb_grow(struct mm_struct *mm, unsigned long rss)
 {
        unsigned long max_tsb_size = 1 * 1024 * 1024;
-        unsigned long size, old_size;
+        unsigned long size, old_size, flags;
        struct page *page;
-        struct tsb *old_tsb;
+        struct tsb *old_tsb, *new_tsb;
        if (max_tsb_size > (PAGE_SIZE << MAX_ORDER))
                max_tsb_size = (PAGE_SIZE << MAX_ORDER);
@@ -311,12 +240,51 @@ void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
                        break;
        }
-        page = alloc_pages(gfp_flags, get_order(size));
+        page = alloc_pages(GFP_KERNEL, get_order(size));
        if (unlikely(!page))
                return;
        /* Mark all tags as invalid.  */
-        memset(page_address(page), 0x40, size);
+        new_tsb = page_address(page);
+        memset(new_tsb, 0x40, size);
+        /* Ok, we are about to commit the changes.  If we are
+         * growing an existing TSB the locking is very tricky,
+         * so WATCH OUT!
+         *
+         * We have to hold mm->context.lock while committing to the
+         * new TSB, this synchronizes us with processors in
+         * flush_tsb_user() and switch_mm() for this address space.
+         *
+         * But even with that lock held, processors run asynchronously
+         * accessing the old TSB via TLB miss handling.  This is OK
+         * because those actions are just propagating state from the
+         * Linux page tables into the TSB, page table mappings are not
+         * being changed.  If a real fault occurs, the processor will
+         * synchronize with us when it hits flush_tsb_user(), this is
+         * also true for the case where vmscan is modifying the page
+         * tables.  The only thing we need to be careful with is to
+         * skip any locked TSB entries during copy_tsb().
+         *
+         * When we finish committing to the new TSB, we have to drop
+         * the lock and ask all other cpus running this address space
+         * to run tsb_context_switch() to see the new TSB table.
+         */
+        spin_lock_irqsave(&mm->context.lock, flags);
+        old_tsb = mm->context.tsb;
+        old_size = mm->context.tsb_nentries * sizeof(struct tsb);
+        /* Handle multiple threads trying to grow the TSB at the same time.
+         * One will get in here first, and bump the size and the RSS limit.
+         * The others will get in here next and hit this check.
+         */
+        if (unlikely(old_tsb && (rss < mm->context.tsb_rss_limit))) {
+                spin_unlock_irqrestore(&mm->context.lock, flags);
+                free_pages((unsigned long) new_tsb, get_order(size));
+                return;
+        }
        if (size == max_tsb_size)
                mm->context.tsb_rss_limit = ~0UL;
@@ -324,30 +292,37 @@ void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
                mm->context.tsb_rss_limit =
                        ((size / sizeof(struct tsb)) * 3) / 4;
-        old_tsb = mm->context.tsb;
+        if (old_tsb) {
-        old_size = mm->context.tsb_nentries * sizeof(struct tsb);
+                extern void copy_tsb(unsigned long old_tsb_base,
+                                     unsigned long old_tsb_size,
-        if (old_tsb)
+                                     unsigned long new_tsb_base,
-                copy_tsb(old_tsb, old_size, page_address(page), size);
+                                     unsigned long new_tsb_size);
+                unsigned long old_tsb_base = (unsigned long) old_tsb;
+                unsigned long new_tsb_base = (unsigned long) new_tsb;
+                if (tlb_type == cheetah_plus || tlb_type == hypervisor) {
+                        old_tsb_base = __pa(old_tsb_base);
+                        new_tsb_base = __pa(new_tsb_base);
+                }
+                copy_tsb(old_tsb_base, old_size, new_tsb_base, size);
+        }
-        mm->context.tsb = page_address(page);
+        mm->context.tsb = new_tsb;
        setup_tsb_params(mm, size);
+        spin_unlock_irqrestore(&mm->context.lock, flags);
        /* If old_tsb is NULL, we're being invoked for the first time
         * from init_new_context().
         */
        if (old_tsb) {
-                /* Now force all other processors to reload the new
+                /* Reload it on the local cpu.  */
-                 * TSB state.
-                 */
-                smp_tsb_sync(mm);
-                /* Finally reload it on the local cpu.  No further
-                 * references will remain to the old TSB and we can
-                 * thus free it up.
-                 */
                tsb_context_switch(mm);
+                /* Now force other processors to do the same.  */
+                smp_tsb_sync(mm);
+                /* Now it is safe to free the old tsb.  */
                free_pages((unsigned long) old_tsb, get_order(old_size));
        }
 }
@@ -363,7 +338,11 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
         * will be confused and think there is an older TSB to free up.
         */
        mm->context.tsb = NULL;
-        tsb_grow(mm, 0, GFP_KERNEL);
+        /* If this is fork, inherit the parent's TSB size.  We would
+         * grow it to that size on the first page fault anyways.
+         */
+        tsb_grow(mm, get_mm_rss(mm));
        if (unlikely(!mm->context.tsb))
                return -ENOMEM;
author	David S. Miller <davem@davemloft.net>	2006-03-16 05:02:32 -0500
committer	David S. Miller <davem@sunset.davemloft.net>	2006-03-20 04:16:33 -0500
commit	7a1ac5264108fc3ed22d17a3cdd76212ed1666d1 (patch)
tree	75378a1b470afa54900f1f15a5b41966d301520d /arch/sparc64/mm
parent	a858f1ca726edc5eb7ed39722f7966d005f1c9ca (diff)

diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index b97bd054aad3..63b6cc0cd5d5 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c
@@ -29,6 +29,7 @@
29	#include <asm/lsu.h>	29	#include <asm/lsu.h>
30	#include <asm/sections.h>	30	#include <asm/sections.h>
31	#include <asm/kdebug.h>	31	#include <asm/kdebug.h>
		32	#include <asm/mmu_context.h>
32		33
33	/*	34	/*
34	* To debug kernel to catch accesses to certain virtual/physical addresses.	35	* To debug kernel to catch accesses to certain virtual/physical addresses.
@@ -258,7 +259,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
258	struct vm_area_struct *vma;	259	struct vm_area_struct *vma;
259	unsigned int insn = 0;	260	unsigned int insn = 0;
260	int si_code, fault_code;	261	int si_code, fault_code;
261	unsigned long address;	262	unsigned long address, mm_rss;
262		263
263	fault_code = get_thread_fault_code();	264	fault_code = get_thread_fault_code();
264		265
@@ -407,6 +408,11 @@ good_area:
407	}	408	}
408		409
409	up_read(&mm->mmap_sem);	410	up_read(&mm->mmap_sem);
		411
		412	mm_rss = get_mm_rss(mm);
		413	if (unlikely(mm_rss >= mm->context.tsb_rss_limit))
		414	tsb_grow(mm, mm_rss);
		415
410	return;	416	return;
411		417
412	/*	418	/*


diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index b40f6477dea0..d703b67bc7b9 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c
@@ -279,7 +279,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
279	{	279	{
280	struct mm_struct *mm;	280	struct mm_struct *mm;
281	struct tsb *tsb;	281	struct tsb *tsb;
282	unsigned long tag;	282	unsigned long tag, flags;
283		283
284	if (tlb_type != hypervisor) {	284	if (tlb_type != hypervisor) {
285	unsigned long pfn = pte_pfn(pte);	285	unsigned long pfn = pte_pfn(pte);
@@ -308,10 +308,15 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
308	}	308	}
309		309
310	mm = vma->vm_mm;	310	mm = vma->vm_mm;
		311
		312	spin_lock_irqsave(&mm->context.lock, flags);
		313
311	tsb = &mm->context.tsb[(address >> PAGE_SHIFT) &	314	tsb = &mm->context.tsb[(address >> PAGE_SHIFT) &
312	(mm->context.tsb_nentries - 1UL)];	315	(mm->context.tsb_nentries - 1UL)];
313	tag = (address >> 22UL);	316	tag = (address >> 22UL);
314	tsb_insert(tsb, tag, pte_val(pte));	317	tsb_insert(tsb, tag, pte_val(pte));
		318
		319	spin_unlock_irqrestore(&mm->context.lock, flags);
315	}	320	}
316		321
317	void flush_dcache_page(struct page *page)	322	void flush_dcache_page(struct page *page)


diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c index f36799b7152c..7fbe1e0cd105 100644 --- a/arch/sparc64/mm/tsb.c +++ b/arch/sparc64/mm/tsb.c
@@ -48,11 +48,15 @@ void flush_tsb_kernel_range(unsigned long start, unsigned long end)
48	void flush_tsb_user(struct mmu_gather *mp)	48	void flush_tsb_user(struct mmu_gather *mp)
49	{	49	{
50	struct mm_struct *mm = mp->mm;	50	struct mm_struct *mm = mp->mm;
51	struct tsb *tsb = mm->context.tsb;	51	unsigned long nentries, base, flags;
52	unsigned long nentries = mm->context.tsb_nentries;	52	struct tsb *tsb;
53	unsigned long base;
54	int i;	53	int i;
55		54
		55	spin_lock_irqsave(&mm->context.lock, flags);
		56
		57	tsb = mm->context.tsb;
		58	nentries = mm->context.tsb_nentries;
		59
56	if (tlb_type == cheetah_plus || tlb_type == hypervisor)	60	if (tlb_type == cheetah_plus || tlb_type == hypervisor)
57	base = __pa(tsb);	61	base = __pa(tsb);
58	else	62	else
@@ -70,6 +74,8 @@ void flush_tsb_user(struct mmu_gather *mp)
70		74
71	tsb_flush(ent, tag);	75	tsb_flush(ent, tag);
72	}	76	}
		77
		78	spin_unlock_irqrestore(&mm->context.lock, flags);
73	}	79	}
74		80
75	static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)	81	static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)
@@ -201,86 +207,9 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)
201	}	207	}
202	}	208	}
203		209
204	/* The page tables are locked against modifications while this
205	* runs.
206	*
207	* XXX do some prefetching...
208	*/
209	static void copy_tsb(struct tsb *old_tsb, unsigned long old_size,
210	struct tsb *new_tsb, unsigned long new_size)
211	{
212	unsigned long old_nentries = old_size / sizeof(struct tsb);
213	unsigned long new_nentries = new_size / sizeof(struct tsb);
214	unsigned long i;
215
216	for (i = 0; i < old_nentries; i++) {
217	register unsigned long tag asm("o4");
218	register unsigned long pte asm("o5");
219	unsigned long v, hash;
220
221	if (tlb_type == hypervisor) {
222	__asm__ __volatile__(
223	"ldda [%2] %3, %0"
224	: "=r" (tag), "=r" (pte)
225	: "r" (__pa(&old_tsb[i])),
226	"i" (ASI_QUAD_LDD_PHYS_4V));
227	} else if (tlb_type == cheetah_plus) {
228	__asm__ __volatile__(
229	"ldda [%2] %3, %0"
230	: "=r" (tag), "=r" (pte)
231	: "r" (__pa(&old_tsb[i])),
232	"i" (ASI_QUAD_LDD_PHYS));
233	} else {
234	__asm__ __volatile__(
235	"ldda [%2] %3, %0"
236	: "=r" (tag), "=r" (pte)
237	: "r" (&old_tsb[i]),
238	"i" (ASI_NUCLEUS_QUAD_LDD));
239	}
240
241	if (tag & ((1UL << TSB_TAG_LOCK_BIT) |
242	(1UL << TSB_TAG_INVALID_BIT)))
243	continue;
244
245	/* We only put base page size PTEs into the TSB,
246	* but that might change in the future. This code
247	* would need to be changed if we start putting larger
248	* page size PTEs into there.
249	*/
250	WARN_ON((pte & _PAGE_ALL_SZ_BITS) != _PAGE_SZBITS);
251
252	/* The tag holds bits 22 to 63 of the virtual address
253	* and the context. Clear out the context, and shift
254	* up to make a virtual address.
255	*/
256	v = (tag & ((1UL << 42UL) - 1UL)) << 22UL;
257
258	/* The implied bits of the tag (bits 13 to 21) are
259	* determined by the TSB entry index, so fill that in.
260	*/
261	v |= (i & (512UL - 1UL)) << 13UL;
262
263	hash = tsb_hash(v, new_nentries);
264	if (tlb_type == cheetah_plus ||
265	tlb_type == hypervisor) {
266	__asm__ __volatile__(
267	"stxa %0, [%1] %2\n\t"
268	"stxa %3, [%4] %2"
269	: /* no outputs */
270	: "r" (tag),
271	"r" (__pa(&new_tsb[hash].tag)),
272	"i" (ASI_PHYS_USE_EC),
273	"r" (pte),
274	"r" (__pa(&new_tsb[hash].pte)));
275	} else {
276	new_tsb[hash].tag = tag;
277	new_tsb[hash].pte = pte;
278	}
279	}
280	}
281
282	/* When the RSS of an address space exceeds mm->context.tsb_rss_limit,	210	/* When the RSS of an address space exceeds mm->context.tsb_rss_limit,
283	* update_mmu_cache() invokes this routine to try and grow the TSB.	211	* do_sparc64_fault() invokes this routine to try and grow the TSB.
		212	*
284	* When we reach the maximum TSB size supported, we stick ~0UL into	213	* When we reach the maximum TSB size supported, we stick ~0UL into
285	* mm->context.tsb_rss_limit so the grow checks in update_mmu_cache()	214	* mm->context.tsb_rss_limit so the grow checks in update_mmu_cache()
286	* will not trigger any longer.	215	* will not trigger any longer.
@@ -293,12 +222,12 @@ static void copy_tsb(struct tsb *old_tsb, unsigned long old_size,
293	* the number of entries that the current TSB can hold at once. Currently,	222	* the number of entries that the current TSB can hold at once. Currently,
294	* we trigger when the RSS hits 3/4 of the TSB capacity.	223	* we trigger when the RSS hits 3/4 of the TSB capacity.
295	*/	224	*/
296	void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)	225	void tsb_grow(struct mm_struct *mm, unsigned long rss)
297	{	226	{
298	unsigned long max_tsb_size = 1 * 1024 * 1024;	227	unsigned long max_tsb_size = 1 * 1024 * 1024;
299	unsigned long size, old_size;	228	unsigned long size, old_size, flags;
300	struct page *page;	229	struct page *page;
301	struct tsb *old_tsb;	230	struct tsb *old_tsb, *new_tsb;
302		231
303	if (max_tsb_size > (PAGE_SIZE << MAX_ORDER))	232	if (max_tsb_size > (PAGE_SIZE << MAX_ORDER))
304	max_tsb_size = (PAGE_SIZE << MAX_ORDER);	233	max_tsb_size = (PAGE_SIZE << MAX_ORDER);
@@ -311,12 +240,51 @@ void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
311	break;	240	break;
312	}	241	}
313		242
314	page = alloc_pages(gfp_flags, get_order(size));	243	page = alloc_pages(GFP_KERNEL, get_order(size));
315	if (unlikely(!page))	244	if (unlikely(!page))
316	return;	245	return;
317		246
318	/* Mark all tags as invalid. */	247	/* Mark all tags as invalid. */
319	memset(page_address(page), 0x40, size);	248	new_tsb = page_address(page);
		249	memset(new_tsb, 0x40, size);
		250
		251	/* Ok, we are about to commit the changes. If we are
		252	* growing an existing TSB the locking is very tricky,
		253	* so WATCH OUT!
		254	*
		255	* We have to hold mm->context.lock while committing to the
		256	* new TSB, this synchronizes us with processors in
		257	* flush_tsb_user() and switch_mm() for this address space.
		258	*
		259	* But even with that lock held, processors run asynchronously
		260	* accessing the old TSB via TLB miss handling. This is OK
		261	* because those actions are just propagating state from the
		262	* Linux page tables into the TSB, page table mappings are not
		263	* being changed. If a real fault occurs, the processor will
		264	* synchronize with us when it hits flush_tsb_user(), this is
		265	* also true for the case where vmscan is modifying the page
		266	* tables. The only thing we need to be careful with is to
		267	* skip any locked TSB entries during copy_tsb().
		268	*
		269	* When we finish committing to the new TSB, we have to drop
		270	* the lock and ask all other cpus running this address space
		271	* to run tsb_context_switch() to see the new TSB table.
		272	*/
		273	spin_lock_irqsave(&mm->context.lock, flags);
		274
		275	old_tsb = mm->context.tsb;
		276	old_size = mm->context.tsb_nentries * sizeof(struct tsb);
		277
		278	/* Handle multiple threads trying to grow the TSB at the same time.
		279	* One will get in here first, and bump the size and the RSS limit.
		280	* The others will get in here next and hit this check.
		281	*/
		282	if (unlikely(old_tsb && (rss < mm->context.tsb_rss_limit))) {
		283	spin_unlock_irqrestore(&mm->context.lock, flags);
		284
		285	free_pages((unsigned long) new_tsb, get_order(size));
		286	return;
		287	}
320		288
321	if (size == max_tsb_size)	289	if (size == max_tsb_size)
322	mm->context.tsb_rss_limit = ~0UL;	290	mm->context.tsb_rss_limit = ~0UL;
@@ -324,30 +292,37 @@ void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
324	mm->context.tsb_rss_limit =	292	mm->context.tsb_rss_limit =
325	((size / sizeof(struct tsb)) * 3) / 4;	293	((size / sizeof(struct tsb)) * 3) / 4;
326		294
327	old_tsb = mm->context.tsb;	295	if (old_tsb) {
328	old_size = mm->context.tsb_nentries * sizeof(struct tsb);	296	extern void copy_tsb(unsigned long old_tsb_base,
329		297	unsigned long old_tsb_size,
330	if (old_tsb)	298	unsigned long new_tsb_base,
331	copy_tsb(old_tsb, old_size, page_address(page), size);	299	unsigned long new_tsb_size);
		300	unsigned long old_tsb_base = (unsigned long) old_tsb;
		301	unsigned long new_tsb_base = (unsigned long) new_tsb;
		302
		303	if (tlb_type == cheetah_plus || tlb_type == hypervisor) {
		304	old_tsb_base = __pa(old_tsb_base);
		305	new_tsb_base = __pa(new_tsb_base);
		306	}
		307	copy_tsb(old_tsb_base, old_size, new_tsb_base, size);
		308	}
332		309
333	mm->context.tsb = page_address(page);	310	mm->context.tsb = new_tsb;
334	setup_tsb_params(mm, size);	311	setup_tsb_params(mm, size);
335		312
		313	spin_unlock_irqrestore(&mm->context.lock, flags);
		314
336	/* If old_tsb is NULL, we're being invoked for the first time	315	/* If old_tsb is NULL, we're being invoked for the first time
337	* from init_new_context().	316	* from init_new_context().
338	*/	317	*/
339	if (old_tsb) {	318	if (old_tsb) {
340	/* Now force all other processors to reload the new	319	/* Reload it on the local cpu. */
341	* TSB state.
342	*/
343	smp_tsb_sync(mm);
344
345	/* Finally reload it on the local cpu. No further
346	* references will remain to the old TSB and we can
347	* thus free it up.
348	*/
349	tsb_context_switch(mm);	320	tsb_context_switch(mm);
350		321
		322	/* Now force other processors to do the same. */
		323	smp_tsb_sync(mm);
		324
		325	/* Now it is safe to free the old tsb. */
351	free_pages((unsigned long) old_tsb, get_order(old_size));	326	free_pages((unsigned long) old_tsb, get_order(old_size));
352	}	327	}
353	}	328	}
@@ -363,7 +338,11 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
363	* will be confused and think there is an older TSB to free up.	338	* will be confused and think there is an older TSB to free up.
364	*/	339	*/
365	mm->context.tsb = NULL;	340	mm->context.tsb = NULL;
366	tsb_grow(mm, 0, GFP_KERNEL);	341
		342	/* If this is fork, inherit the parent's TSB size. We would
		343	* grow it to that size on the first page fault anyways.
		344	*/
		345	tsb_grow(mm, get_mm_rss(mm));
367		346
368	if (unlikely(!mm->context.tsb))	347	if (unlikely(!mm->context.tsb))
369	return -ENOMEM;	348	return -ENOMEM;