arch/tile: support 4KB page size as well as 64KB

The Tilera architecture traditionally supports 64KB page sizes to improve TLB utilization and improve performance when the hardware is being used primarily to run a single application.

For more generic server scenarios, it can be beneficial to run with 4KB page sizes, so this commit allows that to be specified (by modifying the arch/tile/include/hv/pagesize.h header).

As part of this change, we also re-worked the PTE management slightly so that PTE writes all go through a __set_pte() function where we can do some additional validation. The set_pte_order() function was eliminated since the "order" argument wasn't being used.

One bug uncovered was in the PCI DMA code, which wasn't properly flushing the specified range. This was benign with 64KB pages, but with 4KB pages we were getting some larger flushes wrong.

The per-cpu memory reservation code also needed updating to conform with the newer percpu code; before it always chose 64KB, and that was always correct, but with 4KB granularity we now have to pay closer attention and reserve the amount of memory that will be requested when the percpu code starts allocating.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
author: Chris Metcalf <cmetcalf@tilera.com> 2011-02-28 16:37:34 -0500
committer: Chris Metcalf <cmetcalf@tilera.com> 2011-03-10 13:17:53 -0500
commit: 76c567fbba50c3da2f4d40e2e551bab26cfd4381 (patch)
tree: 6e3c92a266d0ec255e1930adf5ba5268cd71dee9 /arch/tile/mm/pgtable.c
parent: 09c17eab075ceeafb53935d858c575b6776394d1 (diff)
1 files changed, 141 insertions, 29 deletions
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 2c850d9864e3..1a2b36f8866d 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -142,6 +142,76 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
 }
 #endif
+/**
+ * shatter_huge_page() - ensure a given address is mapped by a small page.
+ *
+ * This function converts a huge PTE mapping kernel LOWMEM into a bunch
+ * of small PTEs with the same caching.  No cache flush required, but we
+ * must do a global TLB flush.
+ *
+ * Any caller that wishes to modify a kernel mapping that might
+ * have been made with a huge page should call this function,
+ * since doing so properly avoids race conditions with installing the
+ * newly-shattered page and then flushing all the TLB entries.
+ *
+ * @addr: Address at which to shatter any existing huge page.
+ */
+void shatter_huge_page(unsigned long addr)
+{
+        pgd_t *pgd;
+        pud_t *pud;
+        pmd_t *pmd;
+        unsigned long flags = 0;  /* happy compiler */
+#ifdef __PAGETABLE_PMD_FOLDED
+        struct list_head *pos;
+#endif
+        /* Get a pointer to the pmd entry that we need to change. */
+        addr &= HPAGE_MASK;
+        BUG_ON(pgd_addr_invalid(addr));
+        BUG_ON(addr < PAGE_OFFSET);  /* only for kernel LOWMEM */
+        pgd = swapper_pg_dir + pgd_index(addr);
+        pud = pud_offset(pgd, addr);
+        BUG_ON(!pud_present(*pud));
+        pmd = pmd_offset(pud, addr);
+        BUG_ON(!pmd_present(*pmd));
+        if (!pmd_huge_page(*pmd))
+                return;
+        /*
+         * Grab the pgd_lock, since we may need it to walk the pgd_list,
+         * and since we need some kind of lock here to avoid races.
+         */
+        spin_lock_irqsave(&pgd_lock, flags);
+        if (!pmd_huge_page(*pmd)) {
+                /* Lost the race to convert the huge page. */
+                spin_unlock_irqrestore(&pgd_lock, flags);
+                return;
+        }
+        /* Shatter the huge page into the preallocated L2 page table. */
+        pmd_populate_kernel(&init_mm, pmd,
+                            get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));
+#ifdef __PAGETABLE_PMD_FOLDED
+        /* Walk every pgd on the system and update the pmd there. */
+        list_for_each(pos, &pgd_list) {
+                pmd_t *copy_pmd;
+                pgd = list_to_pgd(pos) + pgd_index(addr);
+                pud = pud_offset(pgd, addr);
+                copy_pmd = pmd_offset(pud, addr);
+                __set_pmd(copy_pmd, *pmd);
+        }
+#endif
+        /* Tell every cpu to notice the change. */
+        flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
+                     cpu_possible_mask, NULL, 0);
+        /* Hold the lock until the TLB flush is finished to avoid races. */
+        spin_unlock_irqrestore(&pgd_lock, flags);
+}
 /*
 * List of all pgd's needed so it can invalidate entries in both cached
 * and uncached pgd's. This is essentially codepath-based locking
@@ -184,9 +254,9 @@ static void pgd_ctor(pgd_t *pgd)
        BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
 #endif
-        clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
+        memcpy(pgd + KERNEL_PGD_INDEX_START,
-                        swapper_pg_dir + KERNEL_PGD_INDEX_START,
+               swapper_pg_dir + KERNEL_PGD_INDEX_START,
-                        KERNEL_PGD_PTRS);
+               KERNEL_PGD_PTRS * sizeof(pgd_t));
        pgd_list_add(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
@@ -220,8 +290,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-        gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
+        gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
        struct page *p;
+#if L2_USER_PGTABLE_ORDER > 0
+        int i;
+#endif
 #ifdef CONFIG_HIGHPTE
        flags |= __GFP_HIGHMEM;
@@ -231,6 +304,18 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
        if (p == NULL)
                return NULL;
+#if L2_USER_PGTABLE_ORDER > 0
+        /*
+         * Make every page have a page_count() of one, not just the first.
+         * We don't use __GFP_COMP since it doesn't look like it works
+         * correctly with tlb_remove_page().
+         */
+        for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+                init_page_count(p+i);
+                inc_zone_page_state(p+i, NR_PAGETABLE);
+        }
+#endif
        pgtable_page_ctor(p);
        return p;
 }
@@ -242,8 +327,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 */
 void pte_free(struct mm_struct *mm, struct page *p)
 {
+        int i;
        pgtable_page_dtor(p);
-        __free_pages(p, L2_USER_PGTABLE_ORDER);
+        __free_page(p);
+        for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+                __free_page(p+i);
+                dec_zone_page_state(p+i, NR_PAGETABLE);
+        }
 }
 void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
@@ -252,8 +344,12 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
        int i;
        pgtable_page_dtor(pte);
-        for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
+        tlb_remove_page(tlb, pte);
+        for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
                tlb_remove_page(tlb, pte + i);
+                dec_zone_page_state(pte + i, NR_PAGETABLE);
+        }
 }
 #ifndef __tilegx__
@@ -335,35 +431,51 @@ int get_remote_cache_cpu(pgprot_t prot)
        return x + y * smp_width;
 }
-void set_pte_order(pte_t *ptep, pte_t pte, int order)
+/*
+ * Convert a kernel VA to a PA and homing information.
+ */
+int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
 {
-        unsigned long pfn = pte_pfn(pte);
+        struct page *page = virt_to_page(va);
-        struct page *page = pfn_to_page(pfn);
+        pte_t null_pte = { 0 };
-        /* Update the home of a PTE if necessary */
+        *cpa = __pa(va);
-        pte = pte_set_home(pte, page_home(page));
+        /* Note that this is not writing a page table, just returning a pte. */
+        *pte = pte_set_home(null_pte, page_home(page));
+        return 0; /* return non-zero if not hfh? */
+}
+EXPORT_SYMBOL(va_to_cpa_and_pte);
+void __set_pte(pte_t *ptep, pte_t pte)
+{
 #ifdef __tilegx__
        *ptep = pte;
 #else
-        /*
+# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
-         * When setting a PTE, write the high bits first, then write
+#  error Must write the present and migrating bits last
-         * the low bits.  This sets the "present" bit only after the
+# endif
-         * other bits are in place.  If a particular PTE update
+        if (pte_present(pte)) {
-         * involves transitioning from one valid PTE to another, it
+                ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
-         * may be necessary to call set_pte_order() more than once,
+                barrier();
-         * transitioning via a suitable intermediate state.
+                ((u32 *)ptep)[0] = (u32)(pte_val(pte));
-         * Note that this sequence also means that if we are transitioning
+        } else {
-         * from any migrating PTE to a non-migrating one, we will not
+                ((u32 *)ptep)[0] = (u32)(pte_val(pte));
-         * see a half-updated PTE with the migrating bit off.
+                barrier();
-         */
+                ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
-#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
+        }
-# error Must write the present and migrating bits last
+#endif /* __tilegx__ */
-#endif
+}
-        ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
-        barrier();
+void set_pte(pte_t *ptep, pte_t pte)
-        ((u32 *)ptep)[0] = (u32)(pte_val(pte));
+{
-#endif
+        struct page *page = pfn_to_page(pte_pfn(pte));
+        /* Update the home of a PTE if necessary */
+        pte = pte_set_home(pte, page_home(page));
+        __set_pte(ptep, pte);
 }
 /* Can this mm load a PTE with cached_priority set? */
author	Chris Metcalf <cmetcalf@tilera.com>	2011-02-28 16:37:34 -0500
committer	Chris Metcalf <cmetcalf@tilera.com>	2011-03-10 13:17:53 -0500
commit	76c567fbba50c3da2f4d40e2e551bab26cfd4381 (patch)
tree	6e3c92a266d0ec255e1930adf5ba5268cd71dee9 /arch/tile/mm/pgtable.c
parent	09c17eab075ceeafb53935d858c575b6776394d1 (diff)

diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 2c850d9864e3..1a2b36f8866d 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -142,6 +142,76 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
142	}	142	}
143	#endif	143	#endif
144		144
		145	/**
		146	* shatter_huge_page() - ensure a given address is mapped by a small page.
		147	*
		148	* This function converts a huge PTE mapping kernel LOWMEM into a bunch
		149	* of small PTEs with the same caching. No cache flush required, but we
		150	* must do a global TLB flush.
		151	*
		152	* Any caller that wishes to modify a kernel mapping that might
		153	* have been made with a huge page should call this function,
		154	* since doing so properly avoids race conditions with installing the
		155	* newly-shattered page and then flushing all the TLB entries.
		156	*
		157	* @addr: Address at which to shatter any existing huge page.
		158	*/
		159	void shatter_huge_page(unsigned long addr)
		160	{
		161	pgd_t *pgd;
		162	pud_t *pud;
		163	pmd_t *pmd;
		164	unsigned long flags = 0; /* happy compiler */
		165	#ifdef __PAGETABLE_PMD_FOLDED
		166	struct list_head *pos;
		167	#endif
		168
		169	/* Get a pointer to the pmd entry that we need to change. */
		170	addr &= HPAGE_MASK;
		171	BUG_ON(pgd_addr_invalid(addr));
		172	BUG_ON(addr < PAGE_OFFSET); /* only for kernel LOWMEM */
		173	pgd = swapper_pg_dir + pgd_index(addr);
		174	pud = pud_offset(pgd, addr);
		175	BUG_ON(!pud_present(*pud));
		176	pmd = pmd_offset(pud, addr);
		177	BUG_ON(!pmd_present(*pmd));
		178	if (!pmd_huge_page(*pmd))
		179	return;
		180
		181	/*
		182	* Grab the pgd_lock, since we may need it to walk the pgd_list,
		183	* and since we need some kind of lock here to avoid races.
		184	*/
		185	spin_lock_irqsave(&pgd_lock, flags);
		186	if (!pmd_huge_page(*pmd)) {
		187	/* Lost the race to convert the huge page. */
		188	spin_unlock_irqrestore(&pgd_lock, flags);
		189	return;
		190	}
		191
		192	/* Shatter the huge page into the preallocated L2 page table. */
		193	pmd_populate_kernel(&init_mm, pmd,
		194	get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));
		195
		196	#ifdef __PAGETABLE_PMD_FOLDED
		197	/* Walk every pgd on the system and update the pmd there. */
		198	list_for_each(pos, &pgd_list) {
		199	pmd_t *copy_pmd;
		200	pgd = list_to_pgd(pos) + pgd_index(addr);
		201	pud = pud_offset(pgd, addr);
		202	copy_pmd = pmd_offset(pud, addr);
		203	__set_pmd(copy_pmd, *pmd);
		204	}
		205	#endif
		206
		207	/* Tell every cpu to notice the change. */
		208	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
		209	cpu_possible_mask, NULL, 0);
		210
		211	/* Hold the lock until the TLB flush is finished to avoid races. */
		212	spin_unlock_irqrestore(&pgd_lock, flags);
		213	}
		214
145	/*	215	/*
146	* List of all pgd's needed so it can invalidate entries in both cached	216	* List of all pgd's needed so it can invalidate entries in both cached
147	* and uncached pgd's. This is essentially codepath-based locking	217	* and uncached pgd's. This is essentially codepath-based locking
@@ -184,9 +254,9 @@ static void pgd_ctor(pgd_t *pgd)
184	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);	254	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
185	#endif	255	#endif
186		256
187	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,	257	memcpy(pgd + KERNEL_PGD_INDEX_START,
188	swapper_pg_dir + KERNEL_PGD_INDEX_START,	258	swapper_pg_dir + KERNEL_PGD_INDEX_START,
189	KERNEL_PGD_PTRS);	259	KERNEL_PGD_PTRS * sizeof(pgd_t));
190		260
191	pgd_list_add(pgd);	261	pgd_list_add(pgd);
192	spin_unlock_irqrestore(&pgd_lock, flags);	262	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -220,8 +290,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
220		290
221	struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)	291	struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
222	{	292	{
223	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;	293	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
224	struct page *p;	294	struct page *p;
		295	#if L2_USER_PGTABLE_ORDER > 0
		296	int i;
		297	#endif
225		298
226	#ifdef CONFIG_HIGHPTE	299	#ifdef CONFIG_HIGHPTE
227	flags |= __GFP_HIGHMEM;	300	flags |= __GFP_HIGHMEM;
@@ -231,6 +304,18 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
231	if (p == NULL)	304	if (p == NULL)
232	return NULL;	305	return NULL;
233		306
		307	#if L2_USER_PGTABLE_ORDER > 0
		308	/*
		309	* Make every page have a page_count() of one, not just the first.
		310	* We don't use __GFP_COMP since it doesn't look like it works
		311	* correctly with tlb_remove_page().
		312	*/
		313	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
		314	init_page_count(p+i);
		315	inc_zone_page_state(p+i, NR_PAGETABLE);
		316	}
		317	#endif
		318
234	pgtable_page_ctor(p);	319	pgtable_page_ctor(p);
235	return p;	320	return p;
236	}	321	}
@@ -242,8 +327,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
242	*/	327	*/
243	void pte_free(struct mm_struct *mm, struct page *p)	328	void pte_free(struct mm_struct *mm, struct page *p)
244	{	329	{
		330	int i;
		331
245	pgtable_page_dtor(p);	332	pgtable_page_dtor(p);
246	__free_pages(p, L2_USER_PGTABLE_ORDER);	333	__free_page(p);
		334
		335	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
		336	__free_page(p+i);
		337	dec_zone_page_state(p+i, NR_PAGETABLE);
		338	}
247	}	339	}
248		340
249	void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,	341	void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
@@ -252,8 +344,12 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
252	int i;	344	int i;
253		345
254	pgtable_page_dtor(pte);	346	pgtable_page_dtor(pte);
255	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)	347	tlb_remove_page(tlb, pte);
		348
		349	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
256	tlb_remove_page(tlb, pte + i);	350	tlb_remove_page(tlb, pte + i);
		351	dec_zone_page_state(pte + i, NR_PAGETABLE);
		352	}
257	}	353	}
258		354
259	#ifndef __tilegx__	355	#ifndef __tilegx__
@@ -335,35 +431,51 @@ int get_remote_cache_cpu(pgprot_t prot)
335	return x + y * smp_width;	431	return x + y * smp_width;
336	}	432	}
337		433
338	void set_pte_order(pte_t *ptep, pte_t pte, int order)	434	/*
		435	* Convert a kernel VA to a PA and homing information.
		436	*/
		437	int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
339	{	438	{
340	unsigned long pfn = pte_pfn(pte);	439	struct page *page = virt_to_page(va);
341	struct page *page = pfn_to_page(pfn);	440	pte_t null_pte = { 0 };
342		441
343	/* Update the home of a PTE if necessary */	442	*cpa = __pa(va);
344	pte = pte_set_home(pte, page_home(page));	443
		444	/* Note that this is not writing a page table, just returning a pte. */
		445	*pte = pte_set_home(null_pte, page_home(page));
		446
		447	return 0; /* return non-zero if not hfh? */
		448	}
		449	EXPORT_SYMBOL(va_to_cpa_and_pte);
345		450
		451	void __set_pte(pte_t *ptep, pte_t pte)
		452	{
346	#ifdef __tilegx__	453	#ifdef __tilegx__
347	*ptep = pte;	454	*ptep = pte;
348	#else	455	#else
349	/*	456	# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
350	* When setting a PTE, write the high bits first, then write	457	# error Must write the present and migrating bits last
351	* the low bits. This sets the "present" bit only after the	458	# endif
352	* other bits are in place. If a particular PTE update	459	if (pte_present(pte)) {
353	* involves transitioning from one valid PTE to another, it	460	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
354	* may be necessary to call set_pte_order() more than once,	461	barrier();
355	* transitioning via a suitable intermediate state.	462	((u32 *)ptep)[0] = (u32)(pte_val(pte));
356	* Note that this sequence also means that if we are transitioning	463	} else {
357	* from any migrating PTE to a non-migrating one, we will not	464	((u32 *)ptep)[0] = (u32)(pte_val(pte));
358	* see a half-updated PTE with the migrating bit off.	465	barrier();
359	*/	466	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
360	#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32	467	}
361	# error Must write the present and migrating bits last	468	#endif /* __tilegx__ */
362	#endif	469	}
363	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);	470
364	barrier();	471	void set_pte(pte_t *ptep, pte_t pte)
365	((u32 *)ptep)[0] = (u32)(pte_val(pte));	472	{
366	#endif	473	struct page *page = pfn_to_page(pte_pfn(pte));
		474
		475	/* Update the home of a PTE if necessary */
		476	pte = pte_set_home(pte, page_home(page));
		477
		478	__set_pte(ptep, pte);
367	}	479	}
368		480
369	/* Can this mm load a PTE with cached_priority set? */	481	/* Can this mm load a PTE with cached_priority set? */