[POWERPC] Make tlb flush batch use lazy MMU mode

The current tlb flush code on 64-bit powerpc has had a subtle race since we lost the page table lock, due to the possible faulting in of new PTEs after a previous one has been removed but before the corresponding hash entry has been evicted, which can lead to all sorts of fatal problems. This patch reworks the batch code completely. It doesn't use the mmu_gather stuff anymore. Instead, we use the lazy mmu hooks that were added by the paravirt code. They have the nice property that the enter/leave lazy mmu mode pair is always fully contained by the PTE lock for a given range of PTEs. Thus we can guarantee that all batches are flushed on a given CPU before it drops that lock. We also generalize batching for any PTE update that requires a flush. Batching is now enabled on a CPU by arch_enter_lazy_mmu_mode() and disabled by arch_leave_lazy_mmu_mode(). The code expects that this is always contained within a PTE lock section, so that no preemption can happen and no PTE insertion in that range can come from another CPU. When batching is enabled on a CPU, every PTE update that needs a hash flush will use the batch for that flush. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
author: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2007-04-10 03:09:37 -0400
committer: Paul Mackerras <paulus@samba.org> 2007-04-12 14:09:38 -0400
commit: a741e67969577163a4cfc78d7fd2753219087ef1 (patch)
tree: bac4162aaf15367e896429afa60465e201c9204c /arch
parent: e4ee3891db35aa9a069bb403c2a66a8fbfa274d6 (diff)
4 files changed, 49 insertions, 43 deletions
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 949092dccf44..e509aae2feb3 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -305,9 +305,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
                set_dabr(new->thread.dabr);
                __get_cpu_var(current_dabr) = new->thread.dabr;
        }
+#endif /* CONFIG_PPC64 */
-        flush_tlb_pending();
-#endif
        new_thread = &new->thread;
        old_thread = &current->thread;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 924d692bc8f9..d8e503b2e1af 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -428,10 +428,6 @@ void generic_mach_cpu_die(void)
        smp_wmb();
        while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
                cpu_relax();
-#ifdef CONFIG_PPC64
-        flush_tlb_pending();
-#endif
        cpu_set(cpu, cpu_online_map);
        local_irq_enable();
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f6ffaaa7a5bf..8508f973d9cc 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -316,12 +316,11 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 {
        if (pte_present(*ptep)) {
                /* We open-code pte_clear because we need to pass the right
-                 * argument to hpte_update (huge / !huge)
+                 * argument to hpte_need_flush (huge / !huge). Might not be
+                 * necessary anymore if we make hpte_need_flush() get the
+                 * page size from the slices
                 */
-                unsigned long old = pte_update(ptep, ~0UL);
+                pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
-                if (old & _PAGE_HASHPTE)
-                        hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
-                flush_tlb_pending();
        }
        *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
@@ -329,12 +328,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
 {
-        unsigned long old = pte_update(ptep, ~0UL);
+        unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
-        if (old & _PAGE_HASHPTE)
-                hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
-        *ptep = __pte(0);
        return __pte(old);
 }
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index b58baa65c4a7..fd8d08c325eb 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -120,17 +120,20 @@ void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 }
 /*
- * Update the MMU hash table to correspond with a change to
+ * A linux PTE was changed and the corresponding hash table entry
- * a Linux PTE.  If wrprot is true, it is permissible to
+ * neesd to be flushed. This function will either perform the flush
- * change the existing HPTE to read-only rather than removing it
+ * immediately or will batch it up if the current CPU has an active
- * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
+ * batch on it.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
 */
-void hpte_update(struct mm_struct *mm, unsigned long addr,
+void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
-                 pte_t *ptep, unsigned long pte, int huge)
+                     pte_t *ptep, unsigned long pte, int huge)
 {
        struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-        unsigned long vsid;
+        unsigned long vsid, vaddr;
        unsigned int psize;
+        real_pte_t rpte;
        int i;
        i = batch->index;
@@ -151,6 +154,26 @@ void hpte_update(struct mm_struct *mm, unsigned long addr,
        } else
                psize = pte_pagesize_index(pte);
+        /* Build full vaddr */
+        if (!is_kernel_addr(addr)) {
+                vsid = get_vsid(mm->context.id, addr);
+                WARN_ON(vsid == 0);
+        } else
+                vsid = get_kernel_vsid(addr);
+        vaddr = (vsid << 28 ) | (addr & 0x0fffffff);
+        rpte = __real_pte(__pte(pte), ptep);
+        /*
+         * Check if we have an active batch on this CPU. If not, just
+         * flush now and return. For now, we don global invalidates
+         * in that case, might be worth testing the mm cpu mask though
+         * and decide to use local invalidates instead...
+         */
+        if (!batch->active) {
+                flush_hash_page(vaddr, rpte, psize, 0);
+                return;
+        }
        /*
         * This can happen when we are in the middle of a TLB batch and
         * we encounter memory pressure (eg copy_page_range when it tries
@@ -162,47 +185,42 @@ void hpte_update(struct mm_struct *mm, unsigned long addr,
         * batch
         */
        if (i != 0 && (mm != batch->mm || batch->psize != psize)) {
-                flush_tlb_pending();
+                __flush_tlb_pending(batch);
                i = 0;
        }
        if (i == 0) {
                batch->mm = mm;
                batch->psize = psize;
        }
-        if (!is_kernel_addr(addr)) {
+        batch->pte[i] = rpte;
-                vsid = get_vsid(mm->context.id, addr);
+        batch->vaddr[i] = vaddr;
-                WARN_ON(vsid == 0);
-        } else
-                vsid = get_kernel_vsid(addr);
-        batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff);
-        batch->pte[i] = __real_pte(__pte(pte), ptep);
        batch->index = ++i;
        if (i >= PPC64_TLB_BATCH_NR)
-                flush_tlb_pending();
+                __flush_tlb_pending(batch);
 }
+/*
+ * This function is called when terminating an mmu batch or when a batch
+ * is full. It will perform the flush of all the entries currently stored
+ * in a batch.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
 void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 {
-        int i;
-        int cpu;
        cpumask_t tmp;
-        int local = 0;
+        int i, local = 0;
-        BUG_ON(in_interrupt());
-        cpu = get_cpu();
        i = batch->index;
-        tmp = cpumask_of_cpu(cpu);
+        tmp = cpumask_of_cpu(smp_processor_id());
        if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
                local = 1;
        if (i == 1)
                flush_hash_page(batch->vaddr[0], batch->pte[0],
                                batch->psize, local);
        else
                flush_hash_range(i, local);
        batch->index = 0;
-        put_cpu();
 }
 void pte_free_finish(void)
author	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2007-04-10 03:09:37 -0400
committer	Paul Mackerras <paulus@samba.org>	2007-04-12 14:09:38 -0400
commit	a741e67969577163a4cfc78d7fd2753219087ef1 (patch)
tree	bac4162aaf15367e896429afa60465e201c9204c /arch
parent	e4ee3891db35aa9a069bb403c2a66a8fbfa274d6 (diff)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 949092dccf44..e509aae2feb3 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c
@@ -305,9 +305,7 @@ struct task_struct __switch_to(struct task_struct prev,
305	set_dabr(new->thread.dabr);	305	set_dabr(new->thread.dabr);
306	__get_cpu_var(current_dabr) = new->thread.dabr;	306	__get_cpu_var(current_dabr) = new->thread.dabr;
307	}	307	}
308		308	#endif /* CONFIG_PPC64 */
309	flush_tlb_pending();
310	#endif
311		309
312	new_thread = &new->thread;	310	new_thread = &new->thread;
313	old_thread = &current->thread;	311	old_thread = &current->thread;


diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 924d692bc8f9..d8e503b2e1af 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c
@@ -428,10 +428,6 @@ void generic_mach_cpu_die(void)
428	smp_wmb();	428	smp_wmb();
429	while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)	429	while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
430	cpu_relax();	430	cpu_relax();
431
432	#ifdef CONFIG_PPC64
433	flush_tlb_pending();
434	#endif
435	cpu_set(cpu, cpu_online_map);	431	cpu_set(cpu, cpu_online_map);
436	local_irq_enable();	432	local_irq_enable();
437	}	433	}


diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f6ffaaa7a5bf..8508f973d9cc 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c
@@ -316,12 +316,11 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
316	{	316	{
317	if (pte_present(*ptep)) {	317	if (pte_present(*ptep)) {
318	/* We open-code pte_clear because we need to pass the right	318	/* We open-code pte_clear because we need to pass the right
319	* argument to hpte_update (huge / !huge)	319	* argument to hpte_need_flush (huge / !huge). Might not be
		320	* necessary anymore if we make hpte_need_flush() get the
		321	* page size from the slices
320	*/	322	*/
321	unsigned long old = pte_update(ptep, ~0UL);	323	pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
322	if (old & _PAGE_HASHPTE)
323	hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
324	flush_tlb_pending();
325	}	324	}
326	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);	325	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
327	}	326	}
@@ -329,12 +328,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
329	pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,	328	pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
330	pte_t *ptep)	329	pte_t *ptep)
331	{	330	{
332	unsigned long old = pte_update(ptep, ~0UL);	331	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
333
334	if (old & _PAGE_HASHPTE)
335	hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
336	*ptep = __pte(0);
337
338	return __pte(old);	332	return __pte(old);
339	}	333	}
340		334


diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c index b58baa65c4a7..fd8d08c325eb 100644 --- a/arch/powerpc/mm/tlb_64.c +++ b/arch/powerpc/mm/tlb_64.c
@@ -120,17 +120,20 @@ void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
120	}	120	}
121		121
122	/*	122	/*
123	* Update the MMU hash table to correspond with a change to	123	* A linux PTE was changed and the corresponding hash table entry
124	* a Linux PTE. If wrprot is true, it is permissible to	124	* neesd to be flushed. This function will either perform the flush
125	* change the existing HPTE to read-only rather than removing it	125	* immediately or will batch it up if the current CPU has an active
126	* (if we remove it we should clear the _PTE_HPTEFLAGS bits).	126	* batch on it.
		127	*
		128	* Must be called from within some kind of spinlock/non-preempt region...
127	*/	129	*/
128	void hpte_update(struct mm_struct *mm, unsigned long addr,	130	void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
129	pte_t *ptep, unsigned long pte, int huge)	131	pte_t *ptep, unsigned long pte, int huge)
130	{	132	{
131	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);	133	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
132	unsigned long vsid;	134	unsigned long vsid, vaddr;
133	unsigned int psize;	135	unsigned int psize;
		136	real_pte_t rpte;
134	int i;	137	int i;
135		138
136	i = batch->index;	139	i = batch->index;
@@ -151,6 +154,26 @@ void hpte_update(struct mm_struct *mm, unsigned long addr,
151	} else	154	} else
152	psize = pte_pagesize_index(pte);	155	psize = pte_pagesize_index(pte);
153		156
		157	/* Build full vaddr */
		158	if (!is_kernel_addr(addr)) {
		159	vsid = get_vsid(mm->context.id, addr);
		160	WARN_ON(vsid == 0);
		161	} else
		162	vsid = get_kernel_vsid(addr);
		163	vaddr = (vsid << 28 ) | (addr & 0x0fffffff);
		164	rpte = __real_pte(__pte(pte), ptep);
		165
		166	/*
		167	* Check if we have an active batch on this CPU. If not, just
		168	* flush now and return. For now, we don global invalidates
		169	* in that case, might be worth testing the mm cpu mask though
		170	* and decide to use local invalidates instead...
		171	*/
		172	if (!batch->active) {
		173	flush_hash_page(vaddr, rpte, psize, 0);
		174	return;
		175	}
		176
154	/*	177	/*
155	* This can happen when we are in the middle of a TLB batch and	178	* This can happen when we are in the middle of a TLB batch and
156	* we encounter memory pressure (eg copy_page_range when it tries	179	* we encounter memory pressure (eg copy_page_range when it tries
@@ -162,47 +185,42 @@ void hpte_update(struct mm_struct *mm, unsigned long addr,
162	* batch	185	* batch
163	*/	186	*/
164	if (i != 0 && (mm != batch->mm || batch->psize != psize)) {	187	if (i != 0 && (mm != batch->mm || batch->psize != psize)) {
165	flush_tlb_pending();	188	__flush_tlb_pending(batch);
166	i = 0;	189	i = 0;
167	}	190	}
168	if (i == 0) {	191	if (i == 0) {
169	batch->mm = mm;	192	batch->mm = mm;
170	batch->psize = psize;	193	batch->psize = psize;
171	}	194	}
172	if (!is_kernel_addr(addr)) {	195	batch->pte[i] = rpte;
173	vsid = get_vsid(mm->context.id, addr);	196	batch->vaddr[i] = vaddr;
174	WARN_ON(vsid == 0);
175	} else
176	vsid = get_kernel_vsid(addr);
177	batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff);
178	batch->pte[i] = __real_pte(__pte(pte), ptep);
179	batch->index = ++i;	197	batch->index = ++i;
180	if (i >= PPC64_TLB_BATCH_NR)	198	if (i >= PPC64_TLB_BATCH_NR)
181	flush_tlb_pending();	199	__flush_tlb_pending(batch);
182	}	200	}
183		201
		202	/*
		203	* This function is called when terminating an mmu batch or when a batch
		204	* is full. It will perform the flush of all the entries currently stored
		205	* in a batch.
		206	*
		207	* Must be called from within some kind of spinlock/non-preempt region...
		208	*/
184	void __flush_tlb_pending(struct ppc64_tlb_batch *batch)	209	void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
185	{	210	{
186	int i;
187	int cpu;
188	cpumask_t tmp;	211	cpumask_t tmp;
189	int local = 0;	212	int i, local = 0;
190		213
191	BUG_ON(in_interrupt());
192
193	cpu = get_cpu();
194	i = batch->index;	214	i = batch->index;
195	tmp = cpumask_of_cpu(cpu);	215	tmp = cpumask_of_cpu(smp_processor_id());
196	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))	216	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
197	local = 1;	217	local = 1;
198
199	if (i == 1)	218	if (i == 1)
200	flush_hash_page(batch->vaddr[0], batch->pte[0],	219	flush_hash_page(batch->vaddr[0], batch->pte[0],
201	batch->psize, local);	220	batch->psize, local);
202	else	221	else
203	flush_hash_range(i, local);	222	flush_hash_range(i, local);
204	batch->index = 0;	223	batch->index = 0;
205	put_cpu();
206	}	224	}
207		225
208	void pte_free_finish(void)	226	void pte_free_finish(void)