about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Chris Metcalf <cmetcalf@tilera.com> 2011-02-28 16:37:34 -0500
committer Chris Metcalf <cmetcalf@tilera.com> 2011-03-10 13:17:53 -0500
commit 76c567fbba50c3da2f4d40e2e551bab26cfd4381 (patch)
tree 6e3c92a266d0ec255e1930adf5ba5268cd71dee9
parent 09c17eab075ceeafb53935d858c575b6776394d1 (diff)
arch/tile: support 4KB page size as well as 64KB
The Tilera architecture traditionally supports 64KB page sizes to improve TLB utilization and improve performance when the hardware is being used primarily to run a single application.

For more generic server scenarios, it can be beneficial to run with 4KB page sizes, so this commit allows that to be specified (by modifying the arch/tile/include/hv/pagesize.h header).

As part of this change, we also re-worked the PTE management slightly so that PTE writes all go through a __set_pte() function where we can do some additional validation. The set_pte_order() function was eliminated since the "order" argument wasn't being used.

One bug uncovered was in the PCI DMA code, which wasn't properly flushing the specified range. This was benign with 64KB pages, but with 4KB pages we were getting some larger flushes wrong.

The per-cpu memory reservation code also needed updating to conform with the newer percpu stuff; before it always chose 64KB, and that was always correct, but with 4KB granularity we now have to pay closer attention and reserve the amount of memory that will be requested when the percpu code starts allocating.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
-rw-r--r-- arch/tile/Kconfig 6
-rw-r--r-- arch/tile/include/asm/hugetlb.h 2
-rw-r--r-- arch/tile/include/asm/page.h 34
-rw-r--r-- arch/tile/include/asm/pgalloc.h 7
-rw-r--r-- arch/tile/include/asm/pgtable.h 31
-rw-r--r-- arch/tile/include/asm/pgtable_32.h 8
-rw-r--r-- arch/tile/include/asm/stack.h 3
-rw-r--r-- arch/tile/include/asm/thread_info.h 1
-rw-r--r-- arch/tile/kernel/intvec_32.S 16
-rw-r--r-- arch/tile/kernel/machine_kexec.c 7
-rw-r--r-- arch/tile/kernel/pci-dma.c 38
-rw-r--r-- arch/tile/kernel/process.c 2
-rw-r--r-- arch/tile/kernel/setup.c 20
-rw-r--r-- arch/tile/lib/memcpy_tile64.c 4
-rw-r--r-- arch/tile/mm/homecache.c 2
-rw-r--r-- arch/tile/mm/init.c 18
-rw-r--r-- arch/tile/mm/migrate_32.S 1
-rw-r--r-- arch/tile/mm/pgtable.c 170
18 files changed, 235 insertions, 135 deletions
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index eed0fc5dfe6..f3b78701c21 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -202,12 +202,6 @@ config NODES_SHIFT
202 By default, 2, i.e. 2^2 == 4 DDR2 controllers. 202 By default, 2, i.e. 2^2 == 4 DDR2 controllers.
203 In a system with more controllers, this value should be raised. 203 In a system with more controllers, this value should be raised.
204 204
205# Need 16MB areas to enable hugetlb
206# See build-time check in arch/tile/mm/init.c.
207config FORCE_MAX_ZONEORDER
208 int
209 default 9
210
211choice 205choice
212 depends on !TILEGX 206 depends on !TILEGX
213 prompt "Memory split" if EXPERT 207 prompt "Memory split" if EXPERT
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index 0521c277bbd..d396d180516 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -54,7 +54,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
54static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 54static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
55 pte_t *ptep, pte_t pte) 55 pte_t *ptep, pte_t pte)
56{ 56{
57 set_pte_order(ptep, pte, HUGETLB_PAGE_ORDER); 57 set_pte(ptep, pte);
58} 58}
59 59
60static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, 60static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index 7979a45430d..3eb53525bf9 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -16,10 +16,11 @@
16#define _ASM_TILE_PAGE_H 16#define _ASM_TILE_PAGE_H
17 17
18#include <linux/const.h> 18#include <linux/const.h>
19#include <hv/pagesize.h>
19 20
20/* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */ 21/* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */
21#define PAGE_SHIFT 16 22#define PAGE_SHIFT HV_LOG2_PAGE_SIZE_SMALL
22#define HPAGE_SHIFT 24 23#define HPAGE_SHIFT HV_LOG2_PAGE_SIZE_LARGE
23 24
24#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) 25#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
25#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) 26#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
@@ -29,25 +30,18 @@
29 30
30#ifdef __KERNEL__ 31#ifdef __KERNEL__
31 32
32#include <hv/hypervisor.h>
33#include <arch/chip.h>
34
35/* 33/*
36 * The {,H}PAGE_SHIFT values must match the HV_LOG2_PAGE_SIZE_xxx 34 * If the Kconfig doesn't specify, set a maximum zone order that
37 * definitions in <hv/hypervisor.h>. We validate this at build time 35 * is enough so that we can create huge pages from small pages given
38 * here, and again at runtime during early boot. We provide a 36 * the respective sizes of the two page types. See <linux/mmzone.h>.
39 * separate definition since userspace doesn't have <hv/hypervisor.h>.
40 *
41 * Be careful to distinguish PAGE_SHIFT from HV_PTE_INDEX_PFN, since
42 * they are the same on i386 but not TILE.
43 */ 37 */
44#if HV_LOG2_PAGE_SIZE_SMALL != PAGE_SHIFT 38#ifndef CONFIG_FORCE_MAX_ZONEORDER
45# error Small page size mismatch in Linux 39#define CONFIG_FORCE_MAX_ZONEORDER (HPAGE_SHIFT - PAGE_SHIFT + 1)
46#endif
47#if HV_LOG2_PAGE_SIZE_LARGE != HPAGE_SHIFT
48# error Huge page size mismatch in Linux
49#endif 40#endif
50 41
42#include <hv/hypervisor.h>
43#include <arch/chip.h>
44
51#ifndef __ASSEMBLY__ 45#ifndef __ASSEMBLY__
52 46
53#include <linux/types.h> 47#include <linux/types.h>
@@ -81,12 +75,6 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
81 * Hypervisor page tables are made of the same basic structure. 75 * Hypervisor page tables are made of the same basic structure.
82 */ 76 */
83 77
84typedef __u64 pteval_t;
85typedef __u64 pmdval_t;
86typedef __u64 pudval_t;
87typedef __u64 pgdval_t;
88typedef __u64 pgprotval_t;
89
90typedef HV_PTE pte_t; 78typedef HV_PTE pte_t;
91typedef HV_PTE pgd_t; 79typedef HV_PTE pgd_t;
92typedef HV_PTE pgprot_t; 80typedef HV_PTE pgprot_t;
diff --git a/arch/tile/include/asm/pgalloc.h b/arch/tile/include/asm/pgalloc.h
index cf52791a550..e919c0bdc22 100644
--- a/arch/tile/include/asm/pgalloc.h
+++ b/arch/tile/include/asm/pgalloc.h
@@ -41,9 +41,9 @@
41static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 41static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
42{ 42{
43#ifdef CONFIG_64BIT 43#ifdef CONFIG_64BIT
44 set_pte_order(pmdp, pmd, L2_USER_PGTABLE_ORDER); 44 set_pte(pmdp, pmd);
45#else 45#else
46 set_pte_order(&pmdp->pud.pgd, pmd.pud.pgd, L2_USER_PGTABLE_ORDER); 46 set_pte(&pmdp->pud.pgd, pmd.pud.pgd);
47#endif 47#endif
48} 48}
49 49
@@ -100,6 +100,9 @@ pte_t *get_prealloc_pte(unsigned long pfn);
100/* During init, we can shatter kernel huge pages if needed. */ 100/* During init, we can shatter kernel huge pages if needed. */
101void shatter_pmd(pmd_t *pmd); 101void shatter_pmd(pmd_t *pmd);
102 102
103/* After init, a more complex technique is required. */
104void shatter_huge_page(unsigned long addr);
105
103#ifdef __tilegx__ 106#ifdef __tilegx__
104/* We share a single page allocator for both L1 and L2 page tables. */ 107/* We share a single page allocator for both L1 and L2 page tables. */
105#if HV_L1_SIZE != HV_L2_SIZE 108#if HV_L1_SIZE != HV_L2_SIZE
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index a6604e9485d..1a20b7ef8ea 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -233,15 +233,23 @@ static inline void __pte_clear(pte_t *ptep)
233#define pgd_ERROR(e) \ 233#define pgd_ERROR(e) \
234 pr_err("%s:%d: bad pgd 0x%016llx.\n", __FILE__, __LINE__, pgd_val(e)) 234 pr_err("%s:%d: bad pgd 0x%016llx.\n", __FILE__, __LINE__, pgd_val(e))
235 235
236/* Return PA and protection info for a given kernel VA. */
237int va_to_cpa_and_pte(void *va, phys_addr_t *cpa, pte_t *pte);
238
239/*
240 * __set_pte() ensures we write the 64-bit PTE with 32-bit words in
241 * the right order on 32-bit platforms and also allows us to write
242 * hooks to check valid PTEs, etc., if we want.
243 */
244void __set_pte(pte_t *ptep, pte_t pte);
245
236/* 246/*
237 * set_pte_order() sets the given PTE and also sanity-checks the 247 * set_pte() sets the given PTE and also sanity-checks the
238 * requested PTE against the page homecaching. Unspecified parts 248 * requested PTE against the page homecaching. Unspecified parts
239 * of the PTE are filled in when it is written to memory, i.e. all 249 * of the PTE are filled in when it is written to memory, i.e. all
240 * caching attributes if "!forcecache", or the home cpu if "anyhome". 250 * caching attributes if "!forcecache", or the home cpu if "anyhome".
241 */ 251 */
242extern void set_pte_order(pte_t *ptep, pte_t pte, int order); 252extern void set_pte(pte_t *ptep, pte_t pte);
243
244#define set_pte(ptep, pteval) set_pte_order(ptep, pteval, 0)
245#define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval) 253#define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval)
246#define set_pte_atomic(pteptr, pteval) set_pte(pteptr, pteval) 254#define set_pte_atomic(pteptr, pteval) set_pte(pteptr, pteval)
247 255
@@ -293,21 +301,6 @@ extern void check_mm_caching(struct mm_struct *prev, struct mm_struct *next);
293#define __swp_entry_to_pte(swp) ((pte_t) { (((long long) ((swp).val)) << 32) }) 301#define __swp_entry_to_pte(swp) ((pte_t) { (((long long) ((swp).val)) << 32) })
294 302
295/* 303/*
296 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
297 *
298 * dst - pointer to pgd range anwhere on a pgd page
299 * src - ""
300 * count - the number of pgds to copy.
301 *
302 * dst and src can be on the same page, but the range must not overlap,
303 * and must not cross a page boundary.
304 */
305static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
306{
307 memcpy(dst, src, count * sizeof(pgd_t));
308}
309
310/*
311 * Conversion functions: convert a page and protection to a page entry, 304 * Conversion functions: convert a page and protection to a page entry,
312 * and a page entry and page directory to the page they refer to. 305 * and a page entry and page directory to the page they refer to.
313 */ 306 */
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index 53ec3488474..9f98529761f 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -24,6 +24,7 @@
24#define PGDIR_SIZE HV_PAGE_SIZE_LARGE 24#define PGDIR_SIZE HV_PAGE_SIZE_LARGE
25#define PGDIR_MASK (~(PGDIR_SIZE-1)) 25#define PGDIR_MASK (~(PGDIR_SIZE-1))
26#define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT)) 26#define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT))
27#define SIZEOF_PGD (PTRS_PER_PGD * sizeof(pgd_t))
27 28
28/* 29/*
29 * The level-2 index is defined by the difference between the huge 30 * The level-2 index is defined by the difference between the huge
@@ -33,6 +34,7 @@
33 * this nomenclature is somewhat confusing. 34 * this nomenclature is somewhat confusing.
34 */ 35 */
35#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL)) 36#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL))
37#define SIZEOF_PTE (PTRS_PER_PTE * sizeof(pte_t))
36 38
37#ifndef __ASSEMBLY__ 39#ifndef __ASSEMBLY__
38 40
@@ -94,7 +96,6 @@ static inline int pgd_addr_invalid(unsigned long addr)
94 */ 96 */
95#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 97#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
96#define __HAVE_ARCH_PTEP_SET_WRPROTECT 98#define __HAVE_ARCH_PTEP_SET_WRPROTECT
97#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
98 99
99extern int ptep_test_and_clear_young(struct vm_area_struct *, 100extern int ptep_test_and_clear_young(struct vm_area_struct *,
100 unsigned long addr, pte_t *); 101 unsigned long addr, pte_t *);
@@ -110,6 +111,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
110 return pte; 111 return pte;
111} 112}
112 113
114static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
115{
116 set_pte(&pmdp->pud.pgd, pmdval.pud.pgd);
117}
118
113/* Create a pmd from a PTFN. */ 119/* Create a pmd from a PTFN. */
114static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot) 120static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
115{ 121{
diff --git a/arch/tile/include/asm/stack.h b/arch/tile/include/asm/stack.h
index f908473c322..4d97a2db932 100644
--- a/arch/tile/include/asm/stack.h
+++ b/arch/tile/include/asm/stack.h
@@ -18,13 +18,14 @@
18#include <linux/types.h> 18#include <linux/types.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <asm/backtrace.h> 20#include <asm/backtrace.h>
21#include <asm/page.h>
21#include <hv/hypervisor.h> 22#include <hv/hypervisor.h>
22 23
23/* Everything we need to keep track of a backtrace iteration */ 24/* Everything we need to keep track of a backtrace iteration */
24struct KBacktraceIterator { 25struct KBacktraceIterator {
25 BacktraceIterator it; 26 BacktraceIterator it;
26 struct task_struct *task; /* task we are backtracing */ 27 struct task_struct *task; /* task we are backtracing */
27 HV_PTE *pgtable; /* page table for user space access */ 28 pte_t *pgtable; /* page table for user space access */
28 int end; /* iteration complete. */ 29 int end; /* iteration complete. */
29 int new_context; /* new context is starting */ 30 int new_context; /* new context is starting */
30 int profile; /* profiling, so stop on async intrpt */ 31 int profile; /* profiling, so stop on async intrpt */
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index 3872f2b345d..9e8e9c4dfa2 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -68,6 +68,7 @@ struct thread_info {
68#else 68#else
69#define THREAD_SIZE_ORDER (0) 69#define THREAD_SIZE_ORDER (0)
70#endif 70#endif
71#define THREAD_SIZE_PAGES (1 << THREAD_SIZE_ORDER)
71 72
72#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) 73#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
73#define LOG2_THREAD_SIZE (PAGE_SHIFT + THREAD_SIZE_ORDER) 74#define LOG2_THREAD_SIZE (PAGE_SHIFT + THREAD_SIZE_ORDER)
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index eabf1ef02cb..fffcfa6b3a6 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -1556,7 +1556,10 @@ STD_ENTRY(_sys_clone)
1556 .align 64 1556 .align 64
1557 /* Align much later jump on the start of a cache line. */ 1557 /* Align much later jump on the start of a cache line. */
1558#if !ATOMIC_LOCKS_FOUND_VIA_TABLE() 1558#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
1559 nop; nop 1559 nop
1560#if PAGE_SIZE >= 0x10000
1561 nop
1562#endif
1560#endif 1563#endif
1561ENTRY(sys_cmpxchg) 1564ENTRY(sys_cmpxchg)
1562 1565
@@ -1587,6 +1590,10 @@ ENTRY(sys_cmpxchg)
1587 * NOTE: this must match __atomic_hashed_lock() in lib/atomic_32.c. 1590 * NOTE: this must match __atomic_hashed_lock() in lib/atomic_32.c.
1588 */ 1591 */
1589 1592
1593#if (PAGE_OFFSET & 0xffff) != 0
1594# error Code here assumes PAGE_OFFSET can be loaded with just hi16()
1595#endif
1596
1590#if ATOMIC_LOCKS_FOUND_VIA_TABLE() 1597#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
1591 { 1598 {
1592 /* Check for unaligned input. */ 1599 /* Check for unaligned input. */
@@ -1679,11 +1686,14 @@ ENTRY(sys_cmpxchg)
1679 lw r26, r0 1686 lw r26, r0
1680 } 1687 }
1681 { 1688 {
1682 /* atomic_locks is page aligned so this suffices to get its addr. */ 1689 auli r21, zero, ha16(atomic_locks)
1683 auli r21, zero, hi16(atomic_locks)
1684 1690
1685 bbns r23, .Lcmpxchg_badaddr 1691 bbns r23, .Lcmpxchg_badaddr
1686 } 1692 }
1693#if PAGE_SIZE < 0x10000
1694 /* atomic_locks is page-aligned so for big pages we don't need this. */
1695 addli r21, r21, lo16(atomic_locks)
1696#endif
1687 { 1697 {
1688 /* 1698 /*
1689 * Insert the hash bits into the page-aligned pointer. 1699 * Insert the hash bits into the page-aligned pointer.
diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c
index 0d8b9e93348..e00d7179989 100644
--- a/arch/tile/kernel/machine_kexec.c
+++ b/arch/tile/kernel/machine_kexec.c
@@ -240,8 +240,11 @@ static void setup_quasi_va_is_pa(void)
240 pte = hv_pte(_PAGE_KERNEL | _PAGE_HUGE_PAGE); 240 pte = hv_pte(_PAGE_KERNEL | _PAGE_HUGE_PAGE);
241 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3); 241 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
242 242
243 for (i = 0; i < pgd_index(PAGE_OFFSET); i++) 243 for (i = 0; i < pgd_index(PAGE_OFFSET); i++) {
244 pgtable[i] = pfn_pte(i << (HPAGE_SHIFT - PAGE_SHIFT), pte); 244 unsigned long pfn = i << (HPAGE_SHIFT - PAGE_SHIFT);
245 if (pfn_valid(pfn))
246 __set_pte(&pgtable[i], pfn_pte(pfn, pte));
247 }
245} 248}
246 249
247 250
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index 5ad5e13b0fa..658752b2835 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -86,6 +86,21 @@ EXPORT_SYMBOL(dma_free_coherent);
86 * can count on nothing having been touched. 86 * can count on nothing having been touched.
87 */ 87 */
88 88
89/* Flush a PA range from cache page by page. */
90static void __dma_map_pa_range(dma_addr_t dma_addr, size_t size)
91{
92 struct page *page = pfn_to_page(PFN_DOWN(dma_addr));
93 size_t bytesleft = PAGE_SIZE - (dma_addr & (PAGE_SIZE - 1));
94
95 while ((ssize_t)size > 0) {
96 /* Flush the page. */
97 homecache_flush_cache(page++, 0);
98
99 /* Figure out if we need to continue on the next page. */
100 size -= bytesleft;
101 bytesleft = PAGE_SIZE;
102 }
103}
89 104
90/* 105/*
91 * dma_map_single can be passed any memory address, and there appear 106 * dma_map_single can be passed any memory address, and there appear
@@ -97,26 +112,12 @@ EXPORT_SYMBOL(dma_free_coherent);
97dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, 112dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
98 enum dma_data_direction direction) 113 enum dma_data_direction direction)
99{ 114{
100 struct page *page; 115 dma_addr_t dma_addr = __pa(ptr);
101 dma_addr_t dma_addr;
102 int thispage;
103 116
104 BUG_ON(!valid_dma_direction(direction)); 117 BUG_ON(!valid_dma_direction(direction));
105 WARN_ON(size == 0); 118 WARN_ON(size == 0);
106 119
107 dma_addr = __pa(ptr); 120 __dma_map_pa_range(dma_addr, size);
108
109 /* We might have been handed a buffer that wraps a page boundary */
110 while ((int)size > 0) {
111 /* The amount to flush that's on this page */
112 thispage = PAGE_SIZE - ((unsigned long)ptr & (PAGE_SIZE - 1));
113 thispage = min((int)thispage, (int)size);
114 /* Is this valid for any page we could be handed? */
115 page = pfn_to_page(kaddr_to_pfn(ptr));
116 homecache_flush_cache(page, 0);
117 ptr += thispage;
118 size -= thispage;
119 }
120 121
121 return dma_addr; 122 return dma_addr;
122} 123}
@@ -140,10 +141,8 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
140 WARN_ON(nents == 0 || sglist->length == 0); 141 WARN_ON(nents == 0 || sglist->length == 0);
141 142
142 for_each_sg(sglist, sg, nents, i) { 143 for_each_sg(sglist, sg, nents, i) {
143 struct page *page;
144 sg->dma_address = sg_phys(sg); 144 sg->dma_address = sg_phys(sg);
145 page = pfn_to_page(sg->dma_address >> PAGE_SHIFT); 145 __dma_map_pa_range(sg->dma_address, sg->length);
146 homecache_flush_cache(page, 0);
147 } 146 }
148 147
149 return nents; 148 return nents;
@@ -163,6 +162,7 @@ dma_addr_t dma_map_page(struct device *dev, struct page *page,
163{ 162{
164 BUG_ON(!valid_dma_direction(direction)); 163 BUG_ON(!valid_dma_direction(direction));
165 164
165 BUG_ON(offset + size > PAGE_SIZE);
166 homecache_flush_cache(page, 0); 166 homecache_flush_cache(page, 0);
167 167
168 return page_to_pa(page) + offset; 168 return page_to_pa(page) + offset;
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 5db8b5b63ce..b9cd962e1d3 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -165,7 +165,7 @@ void free_thread_info(struct thread_info *info)
165 kfree(step_state); 165 kfree(step_state);
166 } 166 }
167 167
168 free_page((unsigned long)info); 168 free_pages((unsigned long)info, THREAD_SIZE_ORDER);
169} 169}
170 170
171static void save_arch_state(struct thread_struct *t); 171static void save_arch_state(struct thread_struct *t);
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index f18573643ed..3696b183256 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -59,6 +59,8 @@ unsigned long __initdata node_memmap_pfn[MAX_NUMNODES];
59unsigned long __initdata node_percpu_pfn[MAX_NUMNODES]; 59unsigned long __initdata node_percpu_pfn[MAX_NUMNODES];
60unsigned long __initdata node_free_pfn[MAX_NUMNODES]; 60unsigned long __initdata node_free_pfn[MAX_NUMNODES];
61 61
62static unsigned long __initdata node_percpu[MAX_NUMNODES];
63
62#ifdef CONFIG_HIGHMEM 64#ifdef CONFIG_HIGHMEM
63/* Page frame index of end of lowmem on each controller. */ 65/* Page frame index of end of lowmem on each controller. */
64unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES]; 66unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES];
@@ -554,7 +556,6 @@ static void __init setup_bootmem_allocator(void)
554 reserve_bootmem(crashk_res.start, 556 reserve_bootmem(crashk_res.start,
555 crashk_res.end - crashk_res.start + 1, 0); 557 crashk_res.end - crashk_res.start + 1, 0);
556#endif 558#endif
557
558} 559}
559 560
560void *__init alloc_remap(int nid, unsigned long size) 561void *__init alloc_remap(int nid, unsigned long size)
@@ -568,11 +569,13 @@ void *__init alloc_remap(int nid, unsigned long size)
568 569
569static int __init percpu_size(void) 570static int __init percpu_size(void)
570{ 571{
571 int size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE); 572 int size = __per_cpu_end - __per_cpu_start;
572#ifdef CONFIG_MODULES 573 size += PERCPU_MODULE_RESERVE;
573 if (size < PERCPU_ENOUGH_ROOM) 574 size += PERCPU_DYNAMIC_EARLY_SIZE;
574 size = PERCPU_ENOUGH_ROOM; 575 if (size < PCPU_MIN_UNIT_SIZE)
575#endif 576 size = PCPU_MIN_UNIT_SIZE;
577 size = roundup(size, PAGE_SIZE);
578
576 /* In several places we assume the per-cpu data fits on a huge page. */ 579 /* In several places we assume the per-cpu data fits on a huge page. */
577 BUG_ON(kdata_huge && size > HPAGE_SIZE); 580 BUG_ON(kdata_huge && size > HPAGE_SIZE);
578 return size; 581 return size;
@@ -589,7 +592,6 @@ static inline unsigned long alloc_bootmem_pfn(int size, unsigned long goal)
589static void __init zone_sizes_init(void) 592static void __init zone_sizes_init(void)
590{ 593{
591 unsigned long zones_size[MAX_NR_ZONES] = { 0 }; 594 unsigned long zones_size[MAX_NR_ZONES] = { 0 };
592 unsigned long node_percpu[MAX_NUMNODES] = { 0 };
593 int size = percpu_size(); 595 int size = percpu_size();
594 int num_cpus = smp_height * smp_width; 596 int num_cpus = smp_height * smp_width;
595 int i; 597 int i;
@@ -674,7 +676,7 @@ static void __init zone_sizes_init(void)
674 NODE_DATA(i)->bdata = NODE_DATA(0)->bdata; 676 NODE_DATA(i)->bdata = NODE_DATA(0)->bdata;
675 677
676 free_area_init_node(i, zones_size, start, NULL); 678 free_area_init_node(i, zones_size, start, NULL);
677 printk(KERN_DEBUG " DMA zone: %ld per-cpu pages\n", 679 printk(KERN_DEBUG " Normal zone: %ld per-cpu pages\n",
678 PFN_UP(node_percpu[i])); 680 PFN_UP(node_percpu[i]));
679 681
680 /* Track the type of memory on each node */ 682 /* Track the type of memory on each node */
@@ -1312,6 +1314,8 @@ static void *__init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
1312 1314
1313 BUG_ON(size % PAGE_SIZE != 0); 1315 BUG_ON(size % PAGE_SIZE != 0);
1314 pfn_offset[nid] += size / PAGE_SIZE; 1316 pfn_offset[nid] += size / PAGE_SIZE;
1317 BUG_ON(node_percpu[nid] < size);
1318 node_percpu[nid] -= size;
1315 if (percpu_pfn[cpu] == 0) 1319 if (percpu_pfn[cpu] == 0)
1316 percpu_pfn[cpu] = pfn; 1320 percpu_pfn[cpu] = pfn;
1317 return pfn_to_kaddr(pfn); 1321 return pfn_to_kaddr(pfn);
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
index f7d4a6ad61e..b2fe15e0107 100644
--- a/arch/tile/lib/memcpy_tile64.c
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -96,7 +96,7 @@ static void memcpy_multicache(void *dest, const void *source,
96 newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); 96 newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
97 pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); 97 pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
98 ptep = pte_offset_kernel(pmdp, newsrc); 98 ptep = pte_offset_kernel(pmdp, newsrc);
99 *ptep = src_pte; /* set_pte() would be confused by this */ 99 __set_pte(ptep, src_pte); /* set_pte() would be confused by this */
100 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); 100 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
101 101
102 /* Actually move the data. */ 102 /* Actually move the data. */
@@ -109,7 +109,7 @@ static void memcpy_multicache(void *dest, const void *source,
109 */ 109 */
110 src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); 110 src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
111 src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ 111 src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
112 *ptep = src_pte; /* set_pte() would be confused by this */ 112 __set_pte(ptep, src_pte); /* set_pte() would be confused by this */
113 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); 113 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
114 114
115 /* 115 /*
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index f344f4fc734..cbe6f4f9eca 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -412,7 +412,7 @@ void homecache_change_page_home(struct page *page, int order, int home)
412 pte_t *ptep = virt_to_pte(NULL, kva); 412 pte_t *ptep = virt_to_pte(NULL, kva);
413 pte_t pteval = *ptep; 413 pte_t pteval = *ptep;
414 BUG_ON(!pte_present(pteval) || pte_huge(pteval)); 414 BUG_ON(!pte_present(pteval) || pte_huge(pteval));
415 *ptep = pte_set_home(pteval, home); 415 __set_pte(ptep, pte_set_home(pteval, home));
416 } 416 }
417} 417}
418 418
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index f89ed5dc08d..d6e87fda2fb 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -53,18 +53,6 @@
53 53
54#include "migrate.h" 54#include "migrate.h"
55 55
56/*
57 * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)"
58 * in the Tile Kconfig, but this generates configure warnings.
59 * Do it here and force people to get it right to compile this file.
60 * The problem is that with 4KB small pages and 16MB huge pages,
61 * the default value doesn't allow us to group enough small pages
62 * together to make up a huge page.
63 */
64#if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1
65# error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size"
66#endif
67
68#define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0)) 56#define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))
69 57
70#ifndef __tilegx__ 58#ifndef __tilegx__
@@ -962,11 +950,7 @@ struct kmem_cache *pgd_cache;
962 950
963void __init pgtable_cache_init(void) 951void __init pgtable_cache_init(void)
964{ 952{
965 pgd_cache = kmem_cache_create("pgd", 953 pgd_cache = kmem_cache_create("pgd", SIZEOF_PGD, SIZEOF_PGD, 0, NULL);
966 PTRS_PER_PGD*sizeof(pgd_t),
967 PTRS_PER_PGD*sizeof(pgd_t),
968 0,
969 NULL);
970 if (!pgd_cache) 954 if (!pgd_cache)
971 panic("pgtable_cache_init(): Cannot create pgd cache"); 955 panic("pgtable_cache_init(): Cannot create pgd cache");
972} 956}
diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S
index f738765cd1e..ac01a7cdf77 100644
--- a/arch/tile/mm/migrate_32.S
+++ b/arch/tile/mm/migrate_32.S
@@ -18,6 +18,7 @@
18#include <linux/linkage.h> 18#include <linux/linkage.h>
19#include <linux/threads.h> 19#include <linux/threads.h>
20#include <asm/page.h> 20#include <asm/page.h>
21#include <asm/thread_info.h>
21#include <asm/types.h> 22#include <asm/types.h>
22#include <asm/asm-offsets.h> 23#include <asm/asm-offsets.h>
23#include <hv/hypervisor.h> 24#include <hv/hypervisor.h>
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 2c850d9864e..1a2b36f8866 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -142,6 +142,76 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
142} 142}
143#endif 143#endif
144 144
145/**
146 * shatter_huge_page() - ensure a given address is mapped by a small page.
147 *
148 * This function converts a huge PTE mapping kernel LOWMEM into a bunch
149 * of small PTEs with the same caching. No cache flush required, but we
150 * must do a global TLB flush.
151 *
152 * Any caller that wishes to modify a kernel mapping that might
153 * have been made with a huge page should call this function,
154 * since doing so properly avoids race conditions with installing the
155 * newly-shattered page and then flushing all the TLB entries.
156 *
157 * @addr: Address at which to shatter any existing huge page.
158 */
159void shatter_huge_page(unsigned long addr)
160{
161 pgd_t *pgd;
162 pud_t *pud;
163 pmd_t *pmd;
164 unsigned long flags = 0; /* happy compiler */
165#ifdef __PAGETABLE_PMD_FOLDED
166 struct list_head *pos;
167#endif
168
169 /* Get a pointer to the pmd entry that we need to change. */
170 addr &= HPAGE_MASK;
171 BUG_ON(pgd_addr_invalid(addr));
172 BUG_ON(addr < PAGE_OFFSET); /* only for kernel LOWMEM */
173 pgd = swapper_pg_dir + pgd_index(addr);
174 pud = pud_offset(pgd, addr);
175 BUG_ON(!pud_present(*pud));
176 pmd = pmd_offset(pud, addr);
177 BUG_ON(!pmd_present(*pmd));
178 if (!pmd_huge_page(*pmd))
179 return;
180
181 /*
182 * Grab the pgd_lock, since we may need it to walk the pgd_list,
183 * and since we need some kind of lock here to avoid races.
184 */
185 spin_lock_irqsave(&pgd_lock, flags);
186 if (!pmd_huge_page(*pmd)) {
187 /* Lost the race to convert the huge page. */
188 spin_unlock_irqrestore(&pgd_lock, flags);
189 return;
190 }
191
192 /* Shatter the huge page into the preallocated L2 page table. */
193 pmd_populate_kernel(&init_mm, pmd,
194 get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));
195
196#ifdef __PAGETABLE_PMD_FOLDED
197 /* Walk every pgd on the system and update the pmd there. */
198 list_for_each(pos, &pgd_list) {
199 pmd_t *copy_pmd;
200 pgd = list_to_pgd(pos) + pgd_index(addr);
201 pud = pud_offset(pgd, addr);
202 copy_pmd = pmd_offset(pud, addr);
203 __set_pmd(copy_pmd, *pmd);
204 }
205#endif
206
207 /* Tell every cpu to notice the change. */
208 flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
209 cpu_possible_mask, NULL, 0);
210
211 /* Hold the lock until the TLB flush is finished to avoid races. */
212 spin_unlock_irqrestore(&pgd_lock, flags);
213}
214
145/* 215/*
146 * List of all pgd's needed so it can invalidate entries in both cached 216 * List of all pgd's needed so it can invalidate entries in both cached
147 * and uncached pgd's. This is essentially codepath-based locking 217 * and uncached pgd's. This is essentially codepath-based locking
@@ -184,9 +254,9 @@ static void pgd_ctor(pgd_t *pgd)
184 BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0); 254 BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
185#endif 255#endif
186 256
187 clone_pgd_range(pgd + KERNEL_PGD_INDEX_START, 257 memcpy(pgd + KERNEL_PGD_INDEX_START,
188 swapper_pg_dir + KERNEL_PGD_INDEX_START, 258 swapper_pg_dir + KERNEL_PGD_INDEX_START,
189 KERNEL_PGD_PTRS); 259 KERNEL_PGD_PTRS * sizeof(pgd_t));
190 260
191 pgd_list_add(pgd); 261 pgd_list_add(pgd);
192 spin_unlock_irqrestore(&pgd_lock, flags); 262 spin_unlock_irqrestore(&pgd_lock, flags);
@@ -220,8 +290,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
220 290
221struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) 291struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
222{ 292{
223 gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP; 293 gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
224 struct page *p; 294 struct page *p;
295#if L2_USER_PGTABLE_ORDER > 0
296 int i;
297#endif
225 298
226#ifdef CONFIG_HIGHPTE 299#ifdef CONFIG_HIGHPTE
227 flags |= __GFP_HIGHMEM; 300 flags |= __GFP_HIGHMEM;
@@ -231,6 +304,18 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
231 if (p == NULL) 304 if (p == NULL)
232 return NULL; 305 return NULL;
233 306
307#if L2_USER_PGTABLE_ORDER > 0
308 /*
309 * Make every page have a page_count() of one, not just the first.
310 * We don't use __GFP_COMP since it doesn't look like it works
311 * correctly with tlb_remove_page().
312 */
313 for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
314 init_page_count(p+i);
315 inc_zone_page_state(p+i, NR_PAGETABLE);
316 }
317#endif
318
234 pgtable_page_ctor(p); 319 pgtable_page_ctor(p);
235 return p; 320 return p;
236} 321}
@@ -242,8 +327,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
242 */ 327 */
243void pte_free(struct mm_struct *mm, struct page *p) 328void pte_free(struct mm_struct *mm, struct page *p)
244{ 329{
330 int i;
331
245 pgtable_page_dtor(p); 332 pgtable_page_dtor(p);
246 __free_pages(p, L2_USER_PGTABLE_ORDER); 333 __free_page(p);
334
335 for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
336 __free_page(p+i);
337 dec_zone_page_state(p+i, NR_PAGETABLE);
338 }
247} 339}
248 340
249void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, 341void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
@@ -252,8 +344,12 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
252 int i; 344 int i;
253 345
254 pgtable_page_dtor(pte); 346 pgtable_page_dtor(pte);
255 for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) 347 tlb_remove_page(tlb, pte);
348
349 for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
256 tlb_remove_page(tlb, pte + i); 350 tlb_remove_page(tlb, pte + i);
351 dec_zone_page_state(pte + i, NR_PAGETABLE);
352 }
257} 353}
258 354
259#ifndef __tilegx__ 355#ifndef __tilegx__
@@ -335,35 +431,51 @@ int get_remote_cache_cpu(pgprot_t prot)
335 return x + y * smp_width; 431 return x + y * smp_width;
336} 432}
337 433
338void set_pte_order(pte_t *ptep, pte_t pte, int order) 434/*
435 * Convert a kernel VA to a PA and homing information.
436 */
437int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
339{ 438{
340 unsigned long pfn = pte_pfn(pte); 439 struct page *page = virt_to_page(va);
341 struct page *page = pfn_to_page(pfn); 440 pte_t null_pte = { 0 };
342 441
343 /* Update the home of a PTE if necessary */ 442 *cpa = __pa(va);
344 pte = pte_set_home(pte, page_home(page)); 443
444 /* Note that this is not writing a page table, just returning a pte. */
445 *pte = pte_set_home(null_pte, page_home(page));
446
447 return 0; /* return non-zero if not hfh? */
448}
449EXPORT_SYMBOL(va_to_cpa_and_pte);
345 450
451void __set_pte(pte_t *ptep, pte_t pte)
452{
346#ifdef __tilegx__ 453#ifdef __tilegx__
347 *ptep = pte; 454 *ptep = pte;
348#else 455#else
349 /* 456# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
350 * When setting a PTE, write the high bits first, then write 457# error Must write the present and migrating bits last
351 * the low bits. This sets the "present" bit only after the 458# endif
352 * other bits are in place. If a particular PTE update 459 if (pte_present(pte)) {
353 * involves transitioning from one valid PTE to another, it 460 ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
354 * may be necessary to call set_pte_order() more than once, 461 barrier();
355 * transitioning via a suitable intermediate state. 462 ((u32 *)ptep)[0] = (u32)(pte_val(pte));
356 * Note that this sequence also means that if we are transitioning 463 } else {
357 * from any migrating PTE to a non-migrating one, we will not 464 ((u32 *)ptep)[0] = (u32)(pte_val(pte));
358 * see a half-updated PTE with the migrating bit off. 465 barrier();
359 */ 466 ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
360#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 467 }
361# error Must write the present and migrating bits last 468#endif /* __tilegx__ */
362#endif 469}
363 ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); 470
364 barrier(); 471void set_pte(pte_t *ptep, pte_t pte)
365 ((u32 *)ptep)[0] = (u32)(pte_val(pte)); 472{
366#endif 473 struct page *page = pfn_to_page(pte_pfn(pte));
474
475 /* Update the home of a PTE if necessary */
476 pte = pte_set_home(pte, page_home(page));
477
478 __set_pte(ptep, pte);
367} 479}
368 480
369/* Can this mm load a PTE with cached_priority set? */ 481/* Can this mm load a PTE with cached_priority set? */