1 files changed, 141 insertions, 29 deletions
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 2c850d9864e3..1a2b36f8866d 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -142,6 +142,76 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
 }
 #endif
+/**
+ * shatter_huge_page() - ensure a given address is mapped by a small page.
+ *
+ * This function converts a huge PTE mapping kernel LOWMEM into a bunch
+ * of small PTEs with the same caching.  No cache flush required, but we
+ * must do a global TLB flush.
+ *
+ * Any caller that wishes to modify a kernel mapping that might
+ * have been made with a huge page should call this function,
+ * since doing so properly avoids race conditions with installing the
+ * newly-shattered page and then flushing all the TLB entries.
+ *
+ * @addr: Address at which to shatter any existing huge page.
+ */
+void shatter_huge_page(unsigned long addr)
+{
+        pgd_t *pgd;
+        pud_t *pud;
+        pmd_t *pmd;
+        unsigned long flags = 0;  /* happy compiler */
+#ifdef __PAGETABLE_PMD_FOLDED
+        struct list_head *pos;
+#endif
+        /* Get a pointer to the pmd entry that we need to change. */
+        addr &= HPAGE_MASK;
+        BUG_ON(pgd_addr_invalid(addr));
+        BUG_ON(addr < PAGE_OFFSET);  /* only for kernel LOWMEM */
+        pgd = swapper_pg_dir + pgd_index(addr);
+        pud = pud_offset(pgd, addr);
+        BUG_ON(!pud_present(*pud));
+        pmd = pmd_offset(pud, addr);
+        BUG_ON(!pmd_present(*pmd));
+        if (!pmd_huge_page(*pmd))
+                return;
+        /*
+         * Grab the pgd_lock, since we may need it to walk the pgd_list,
+         * and since we need some kind of lock here to avoid races.
+         */
+        spin_lock_irqsave(&pgd_lock, flags);
+        if (!pmd_huge_page(*pmd)) {
+                /* Lost the race to convert the huge page. */
+                spin_unlock_irqrestore(&pgd_lock, flags);
+                return;
+        }
+        /* Shatter the huge page into the preallocated L2 page table. */
+        pmd_populate_kernel(&init_mm, pmd,
+                            get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));
+#ifdef __PAGETABLE_PMD_FOLDED
+        /* Walk every pgd on the system and update the pmd there. */
+        list_for_each(pos, &pgd_list) {
+                pmd_t *copy_pmd;
+                pgd = list_to_pgd(pos) + pgd_index(addr);
+                pud = pud_offset(pgd, addr);
+                copy_pmd = pmd_offset(pud, addr);
+                __set_pmd(copy_pmd, *pmd);
+        }
+#endif
+        /* Tell every cpu to notice the change. */
+        flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
+                     cpu_possible_mask, NULL, 0);
+        /* Hold the lock until the TLB flush is finished to avoid races. */
+        spin_unlock_irqrestore(&pgd_lock, flags);
+}
 /*
 * List of all pgd's needed so it can invalidate entries in both cached
 * and uncached pgd's. This is essentially codepath-based locking
@@ -184,9 +254,9 @@ static void pgd_ctor(pgd_t *pgd)
        BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
 #endif
-        clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
+        memcpy(pgd + KERNEL_PGD_INDEX_START,
-                        swapper_pg_dir + KERNEL_PGD_INDEX_START,
+               swapper_pg_dir + KERNEL_PGD_INDEX_START,
-                        KERNEL_PGD_PTRS);
+               KERNEL_PGD_PTRS * sizeof(pgd_t));
        pgd_list_add(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
@@ -220,8 +290,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-        gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
+        gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
        struct page *p;
+#if L2_USER_PGTABLE_ORDER > 0
+        int i;
+#endif
 #ifdef CONFIG_HIGHPTE
        flags |= __GFP_HIGHMEM;
@@ -231,6 +304,18 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
        if (p == NULL)
                return NULL;
+#if L2_USER_PGTABLE_ORDER > 0
+        /*
+         * Make every page have a page_count() of one, not just the first.
+         * We don't use __GFP_COMP since it doesn't look like it works
+         * correctly with tlb_remove_page().
+         */
+        for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+                init_page_count(p+i);
+                inc_zone_page_state(p+i, NR_PAGETABLE);
+        }
+#endif
        pgtable_page_ctor(p);
        return p;
 }
@@ -242,8 +327,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 */
 void pte_free(struct mm_struct *mm, struct page *p)
 {
+        int i;
        pgtable_page_dtor(p);
-        __free_pages(p, L2_USER_PGTABLE_ORDER);
+        __free_page(p);
+        for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+                __free_page(p+i);
+                dec_zone_page_state(p+i, NR_PAGETABLE);
+        }
 }
 void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
@@ -252,8 +344,12 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
        int i;
        pgtable_page_dtor(pte);
-        for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
+        tlb_remove_page(tlb, pte);
+        for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
                tlb_remove_page(tlb, pte + i);
+                dec_zone_page_state(pte + i, NR_PAGETABLE);
+        }
 }
 #ifndef __tilegx__
@@ -335,35 +431,51 @@ int get_remote_cache_cpu(pgprot_t prot)
        return x + y * smp_width;
 }
-void set_pte_order(pte_t *ptep, pte_t pte, int order)
+/*
+ * Convert a kernel VA to a PA and homing information.
+ */
+int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
 {
-        unsigned long pfn = pte_pfn(pte);
+        struct page *page = virt_to_page(va);
-        struct page *page = pfn_to_page(pfn);
+        pte_t null_pte = { 0 };
-        /* Update the home of a PTE if necessary */
+        *cpa = __pa(va);
-        pte = pte_set_home(pte, page_home(page));
+        /* Note that this is not writing a page table, just returning a pte. */
+        *pte = pte_set_home(null_pte, page_home(page));
+        return 0; /* return non-zero if not hfh? */
+}
+EXPORT_SYMBOL(va_to_cpa_and_pte);
+void __set_pte(pte_t *ptep, pte_t pte)
+{
 #ifdef __tilegx__
        *ptep = pte;
 #else
-        /*
+# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
-         * When setting a PTE, write the high bits first, then write
+#  error Must write the present and migrating bits last
-         * the low bits.  This sets the "present" bit only after the
+# endif
-         * other bits are in place.  If a particular PTE update
+        if (pte_present(pte)) {
-         * involves transitioning from one valid PTE to another, it
+                ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
-         * may be necessary to call set_pte_order() more than once,
+                barrier();
-         * transitioning via a suitable intermediate state.
+                ((u32 *)ptep)[0] = (u32)(pte_val(pte));
-         * Note that this sequence also means that if we are transitioning
+        } else {
-         * from any migrating PTE to a non-migrating one, we will not
+                ((u32 *)ptep)[0] = (u32)(pte_val(pte));
-         * see a half-updated PTE with the migrating bit off.
+                barrier();
-         */
+                ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
-#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
+        }
-# error Must write the present and migrating bits last
+#endif /* __tilegx__ */
-#endif
+}
-        ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
-        barrier();
+void set_pte(pte_t *ptep, pte_t pte)
-        ((u32 *)ptep)[0] = (u32)(pte_val(pte));
+{
-#endif
+        struct page *page = pfn_to_page(pte_pfn(pte));
+        /* Update the home of a PTE if necessary */
+        pte = pte_set_home(pte, page_home(page));
+        __set_pte(ptep, pte);
 }
 /* Can this mm load a PTE with cached_priority set? */

diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 2c850d9864e3..1a2b36f8866d 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c
@@ -142,6 +142,76 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
142	}	142	}
143	#endif	143	#endif
144		144
		145	/**
		146	* shatter_huge_page() - ensure a given address is mapped by a small page.
		147	*
		148	* This function converts a huge PTE mapping kernel LOWMEM into a bunch
		149	* of small PTEs with the same caching. No cache flush required, but we
		150	* must do a global TLB flush.
		151	*
		152	* Any caller that wishes to modify a kernel mapping that might
		153	* have been made with a huge page should call this function,
		154	* since doing so properly avoids race conditions with installing the
		155	* newly-shattered page and then flushing all the TLB entries.
		156	*
		157	* @addr: Address at which to shatter any existing huge page.
		158	*/
		159	void shatter_huge_page(unsigned long addr)
		160	{
		161	pgd_t *pgd;
		162	pud_t *pud;
		163	pmd_t *pmd;
		164	unsigned long flags = 0; /* happy compiler */
		165	#ifdef __PAGETABLE_PMD_FOLDED
		166	struct list_head *pos;
		167	#endif
		168
		169	/* Get a pointer to the pmd entry that we need to change. */
		170	addr &= HPAGE_MASK;
		171	BUG_ON(pgd_addr_invalid(addr));
		172	BUG_ON(addr < PAGE_OFFSET); /* only for kernel LOWMEM */
		173	pgd = swapper_pg_dir + pgd_index(addr);
		174	pud = pud_offset(pgd, addr);
		175	BUG_ON(!pud_present(*pud));
		176	pmd = pmd_offset(pud, addr);
		177	BUG_ON(!pmd_present(*pmd));
		178	if (!pmd_huge_page(*pmd))
		179	return;
		180
		181	/*
		182	* Grab the pgd_lock, since we may need it to walk the pgd_list,
		183	* and since we need some kind of lock here to avoid races.
		184	*/
		185	spin_lock_irqsave(&pgd_lock, flags);
		186	if (!pmd_huge_page(*pmd)) {
		187	/* Lost the race to convert the huge page. */
		188	spin_unlock_irqrestore(&pgd_lock, flags);
		189	return;
		190	}
		191
		192	/* Shatter the huge page into the preallocated L2 page table. */
		193	pmd_populate_kernel(&init_mm, pmd,
		194	get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));
		195
		196	#ifdef __PAGETABLE_PMD_FOLDED
		197	/* Walk every pgd on the system and update the pmd there. */
		198	list_for_each(pos, &pgd_list) {
		199	pmd_t *copy_pmd;
		200	pgd = list_to_pgd(pos) + pgd_index(addr);
		201	pud = pud_offset(pgd, addr);
		202	copy_pmd = pmd_offset(pud, addr);
		203	__set_pmd(copy_pmd, *pmd);
		204	}
		205	#endif
		206
		207	/* Tell every cpu to notice the change. */
		208	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
		209	cpu_possible_mask, NULL, 0);
		210
		211	/* Hold the lock until the TLB flush is finished to avoid races. */
		212	spin_unlock_irqrestore(&pgd_lock, flags);
		213	}
		214
145	/*	215	/*
146	* List of all pgd's needed so it can invalidate entries in both cached	216	* List of all pgd's needed so it can invalidate entries in both cached
147	* and uncached pgd's. This is essentially codepath-based locking	217	* and uncached pgd's. This is essentially codepath-based locking
@@ -184,9 +254,9 @@ static void pgd_ctor(pgd_t *pgd)
184	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);	254	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
185	#endif	255	#endif
186		256
187	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,	257	memcpy(pgd + KERNEL_PGD_INDEX_START,
188	swapper_pg_dir + KERNEL_PGD_INDEX_START,	258	swapper_pg_dir + KERNEL_PGD_INDEX_START,
189	KERNEL_PGD_PTRS);	259	KERNEL_PGD_PTRS * sizeof(pgd_t));
190		260
191	pgd_list_add(pgd);	261	pgd_list_add(pgd);
192	spin_unlock_irqrestore(&pgd_lock, flags);	262	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -220,8 +290,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
220		290
221	struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)	291	struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
222	{	292	{
223	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;	293	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
224	struct page *p;	294	struct page *p;
		295	#if L2_USER_PGTABLE_ORDER > 0
		296	int i;
		297	#endif
225		298
226	#ifdef CONFIG_HIGHPTE	299	#ifdef CONFIG_HIGHPTE
227	flags |= __GFP_HIGHMEM;	300	flags |= __GFP_HIGHMEM;
@@ -231,6 +304,18 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
231	if (p == NULL)	304	if (p == NULL)
232	return NULL;	305	return NULL;
233		306
		307	#if L2_USER_PGTABLE_ORDER > 0
		308	/*
		309	* Make every page have a page_count() of one, not just the first.
		310	* We don't use __GFP_COMP since it doesn't look like it works
		311	* correctly with tlb_remove_page().
		312	*/
		313	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
		314	init_page_count(p+i);
		315	inc_zone_page_state(p+i, NR_PAGETABLE);
		316	}
		317	#endif
		318
234	pgtable_page_ctor(p);	319	pgtable_page_ctor(p);
235	return p;	320	return p;
236	}	321	}
@@ -242,8 +327,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
242	*/	327	*/
243	void pte_free(struct mm_struct *mm, struct page *p)	328	void pte_free(struct mm_struct *mm, struct page *p)
244	{	329	{
		330	int i;
		331
245	pgtable_page_dtor(p);	332	pgtable_page_dtor(p);
246	__free_pages(p, L2_USER_PGTABLE_ORDER);	333	__free_page(p);
		334
		335	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
		336	__free_page(p+i);
		337	dec_zone_page_state(p+i, NR_PAGETABLE);
		338	}
247	}	339	}
248		340
249	void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,	341	void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
@@ -252,8 +344,12 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
252	int i;	344	int i;
253		345
254	pgtable_page_dtor(pte);	346	pgtable_page_dtor(pte);
255	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)	347	tlb_remove_page(tlb, pte);
		348
		349	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
256	tlb_remove_page(tlb, pte + i);	350	tlb_remove_page(tlb, pte + i);
		351	dec_zone_page_state(pte + i, NR_PAGETABLE);
		352	}
257	}	353	}
258		354
259	#ifndef __tilegx__	355	#ifndef __tilegx__
@@ -335,35 +431,51 @@ int get_remote_cache_cpu(pgprot_t prot)
335	return x + y * smp_width;	431	return x + y * smp_width;
336	}	432	}
337		433
338	void set_pte_order(pte_t *ptep, pte_t pte, int order)	434	/*
		435	* Convert a kernel VA to a PA and homing information.
		436	*/
		437	int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
339	{	438	{
340	unsigned long pfn = pte_pfn(pte);	439	struct page *page = virt_to_page(va);
341	struct page *page = pfn_to_page(pfn);	440	pte_t null_pte = { 0 };
342		441
343	/* Update the home of a PTE if necessary */	442	*cpa = __pa(va);
344	pte = pte_set_home(pte, page_home(page));	443
		444	/* Note that this is not writing a page table, just returning a pte. */
		445	*pte = pte_set_home(null_pte, page_home(page));
		446
		447	return 0; /* return non-zero if not hfh? */
		448	}
		449	EXPORT_SYMBOL(va_to_cpa_and_pte);
345		450
		451	void __set_pte(pte_t *ptep, pte_t pte)
		452	{
346	#ifdef __tilegx__	453	#ifdef __tilegx__
347	*ptep = pte;	454	*ptep = pte;
348	#else	455	#else
349	/*	456	# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
350	* When setting a PTE, write the high bits first, then write	457	# error Must write the present and migrating bits last
351	* the low bits. This sets the "present" bit only after the	458	# endif
352	* other bits are in place. If a particular PTE update	459	if (pte_present(pte)) {
353	* involves transitioning from one valid PTE to another, it	460	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
354	* may be necessary to call set_pte_order() more than once,	461	barrier();
355	* transitioning via a suitable intermediate state.	462	((u32 *)ptep)[0] = (u32)(pte_val(pte));
356	* Note that this sequence also means that if we are transitioning	463	} else {
357	* from any migrating PTE to a non-migrating one, we will not	464	((u32 *)ptep)[0] = (u32)(pte_val(pte));
358	* see a half-updated PTE with the migrating bit off.	465	barrier();
359	*/	466	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
360	#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32	467	}
361	# error Must write the present and migrating bits last	468	#endif /* __tilegx__ */
362	#endif	469	}
363	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);	470
364	barrier();	471	void set_pte(pte_t *ptep, pte_t pte)
365	((u32 *)ptep)[0] = (u32)(pte_val(pte));	472	{
366	#endif	473	struct page *page = pfn_to_page(pte_pfn(pte));
		474
		475	/* Update the home of a PTE if necessary */
		476	pte = pte_set_home(pte, page_home(page));
		477
		478	__set_pte(ptep, pte);
367	}	479	}
368		480
369	/* Can this mm load a PTE with cached_priority set? */	481	/* Can this mm load a PTE with cached_priority set? */