author      David Gibson <david@gibson.dropbear.id.au>            2009-10-28 12:27:18 -0400
committer   Benjamin Herrenschmidt <benh@kernel.crashing.org>     2009-10-30 02:20:57 -0400
commit      a0668cdc154e54bf0c85182e0535eea237d53146 (patch)
tree        84efcadf011e16c240ac9b1c948141fc1cc7d324 /arch/powerpc/mm
parent      f71dc176aa06359681c30ba6877ffccab6fba3a6 (diff)
powerpc/mm: Cleanup management of kmem_caches for pagetables
Currently we have a fair bit of rather fiddly code to manage the
various kmem_caches used to store page tables of various levels. We
generally have two caches holding some combination of PGD, PUD and PMD
tables, plus several more for the special hugepage pagetables.
This patch cleans this all up by taking a different approach. Rather
than the caches being designated as for PUDs or for hugeptes for 16M
pages, the caches are simply allocated to be a specific size. Thus
sharing of caches between different types/levels of pagetables happens
naturally. The pagetable size, where needed, is passed around encoded
in the same way as {PGD,PUD,PMD}_INDEX_SIZE; that is n where the
pagetable contains 2^n pointers.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--   arch/powerpc/mm/hugetlbpage.c   |  51
-rw-r--r--   arch/powerpc/mm/init_64.c       |  70
-rw-r--r--   arch/powerpc/mm/pgtable.c       |  25
3 files changed, 80 insertions, 66 deletions
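
The commit message above comes down to one mechanism: a page-table cache is looked up purely by its index size (the n in "2^n pointers"), so any two levels, or any hugepage geometry, with the same n transparently share one kmem_cache. Below is a minimal standalone sketch of that idea in plain userspace C; it is not the kernel code, and the helper names and example index sizes are made up for illustration (the patch itself uses PGT_CACHE(shift) backed by pgtable_cache_add(), shown in the init_64.c hunk further down).

```c
#include <stdio.h>
#include <stdlib.h>

#define MAX_INDEX_SIZE 15	/* assumed upper bound on n, where a table holds 2^n pointers */

struct toy_cache {
	size_t object_size;	/* sizeof(void *) << shift, as in the real patch */
};

/* one slot per distinct index size; stands in for pgtable_cache[] / PGT_CACHE() */
static struct toy_cache *caches[MAX_INDEX_SIZE + 1];

/* create-on-demand lookup keyed only by index size (cf. pgtable_cache_add) */
static struct toy_cache *toy_cache_for(unsigned int shift)
{
	if (!caches[shift]) {
		caches[shift] = calloc(1, sizeof(*caches[shift]));
		if (!caches[shift])
			exit(EXIT_FAILURE);
		caches[shift]->object_size = sizeof(void *) << shift;
	}
	return caches[shift];
}

int main(void)
{
	/* made-up index sizes: two levels with 2^12 entries, one with 2^7 */
	struct toy_cache *pgd = toy_cache_for(12);
	struct toy_cache *pud = toy_cache_for(12);
	struct toy_cache *pmd = toy_cache_for(7);

	printf("pgd and pud share a cache: %s\n", pgd == pud ? "yes" : "no");
	printf("pmd table size: %zu bytes\n", pmd->object_size);
	return 0;
}
```

The point is that the cache key is the table geometry itself, which is what removes the old per-level and per-hugepage-size bookkeeping.
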
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 3d542a9732ae..7230d7a4fbd9 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -43,26 +43,14 @@ static unsigned nr_gpages;
 unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 #define hugepte_shift             mmu_huge_psizes
-#define PTRS_PER_HUGEPTE(psize)   (1 << hugepte_shift[psize])
-#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize])
+#define HUGEPTE_INDEX_SIZE(psize) (mmu_huge_psizes[(psize)])
+#define PTRS_PER_HUGEPTE(psize)   (1 << mmu_huge_psizes[psize])
 
 #define HUGEPD_SHIFT(psize)       (mmu_psize_to_shift(psize) \
-                                   + hugepte_shift[psize])
+                                   + HUGEPTE_INDEX_SIZE(psize))
 #define HUGEPD_SIZE(psize)        (1UL << HUGEPD_SHIFT(psize))
 #define HUGEPD_MASK(psize)        (~(HUGEPD_SIZE(psize)-1))
 
-/* Subtract one from array size because we don't need a cache for 4K since
- * is not a huge page size */
-#define HUGE_PGTABLE_INDEX(psize) (HUGEPTE_CACHE_NUM + psize - 1)
-#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize])
-
-static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
-        [MMU_PAGE_64K]  = "hugepte_cache_64K",
-        [MMU_PAGE_1M]   = "hugepte_cache_1M",
-        [MMU_PAGE_16M]  = "hugepte_cache_16M",
-        [MMU_PAGE_16G]  = "hugepte_cache_16G",
-};
-
 /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
@@ -114,15 +102,15 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address, unsigned int psize)
 {
-        pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
+        pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]),
                                        GFP_KERNEL|__GFP_REPEAT);
 
         if (! new)
                 return -ENOMEM;
 
         spin_lock(&mm->page_table_lock);
         if (!hugepd_none(*hpdp))
-                kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
+                kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new);
         else
                 hpdp->pd = (unsigned long)new | HUGEPD_OK;
         spin_unlock(&mm->page_table_lock);
@@ -271,9 +259,7 @@ static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
 
         hpdp->pd = 0;
         tlb->need_flush = 1;
-        pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
-                                                 HUGEPTE_CACHE_NUM+psize-1,
-                                                 PGF_CACHENUM_MASK));
+        pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -698,8 +684,6 @@ static void __init set_huge_psize(int psize)
         if (mmu_huge_psizes[psize] ||
             mmu_psize_defs[psize].shift == PAGE_SHIFT)
                 return;
-        if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
-                return;
         hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
         switch (mmu_psize_defs[psize].shift) {
@@ -753,9 +737,9 @@ static int __init hugetlbpage_init(void)
         if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                 return -ENODEV;
 
-        /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE
-         * and adjust PTE_NONCACHE_NUM if the number of supported huge page
-         * sizes changes.
+        /* Add supported huge page sizes. Need to change
+         * HUGE_MAX_HSTATE if the number of supported huge page sizes
+         * changes.
          */
         set_huge_psize(MMU_PAGE_16M);
         set_huge_psize(MMU_PAGE_16G);
@@ -769,16 +753,11 @@ static int __init hugetlbpage_init(void)
 
         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                 if (mmu_huge_psizes[psize]) {
-                        pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
-                                kmem_cache_create(
-                                        HUGEPTE_CACHE_NAME(psize),
-                                        HUGEPTE_TABLE_SIZE(psize),
-                                        HUGEPTE_TABLE_SIZE(psize),
-                                        0,
-                                        NULL);
-                        if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
-                                panic("hugetlbpage_init(): could not create %s"\
-                                        "\n", HUGEPTE_CACHE_NAME(psize));
+                        pgtable_cache_add(hugepte_shift[psize], NULL);
+                        if (!PGT_CACHE(hugepte_shift[psize]))
+                                panic("hugetlbpage_init(): could not create "
+                                      "pgtable cache for %d bit pagesize\n",
+                                      mmu_psize_to_shift(psize));
                 }
         }
 
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 335c578b9cc3..82ac61dcd3af 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -119,30 +119,58 @@ static void pmd_ctor(void *addr)
         memset(addr, 0, PMD_TABLE_SIZE);
 }
 
-static const unsigned int pgtable_cache_size[2] = {
-        PGD_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-#ifdef CONFIG_PPC_64K_PAGES
-        "pgd_cache", "pmd_cache",
-#else
-        "pgd_cache", "pud_pmd_cache",
-#endif /* CONFIG_PPC_64K_PAGES */
-};
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need an extra cache per hugepagesize, initialized in
- * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT
- * is not compile time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
-#else
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-#endif
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+/*
+ * Create a kmem_cache() for pagetables. This is not used for PTE
+ * pages - they're linked to struct page, come from the normal free
+ * pages pool and have a different entry size (see real_pte_t) to
+ * everything else. Caches created by this function are used for all
+ * the higher level pagetables, and for hugepage pagetables.
+ */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+{
+        char *name;
+        unsigned long table_size = sizeof(void *) << shift;
+        unsigned long align = table_size;
+
+        /* When batching pgtable pointers for RCU freeing, we store
+         * the index size in the low bits. Table alignment must be
+         * big enough to fit it */
+        unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+        struct kmem_cache *new;
+
+        /* It would be nice if this was a BUILD_BUG_ON(), but at the
+         * moment, gcc doesn't seem to recognize is_power_of_2 as a
+         * constant expression, so so much for that. */
+        BUG_ON(!is_power_of_2(minalign));
+        BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+        if (PGT_CACHE(shift))
+                return; /* Already have a cache of this size */
+
+        align = max_t(unsigned long, align, minalign);
+        name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+        new = kmem_cache_create(name, table_size, align, 0, ctor);
+        PGT_CACHE(shift) = new;
+
+        pr_debug("Allocated pgtable cache for order %d\n", shift);
+}
+
 
 void pgtable_cache_init(void)
 {
-        pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
-        pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
+        pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+        pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+        if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+                panic("Couldn't allocate pgtable caches");
+
+        /* In all current configs, when the PUD index exists it's the
+         * same size as either the pgd or pmd index. Verify that the
+         * initialization above has also created a PUD cache. This
+         * will need re-examiniation if we add new possibilities for
+         * the pagetable layout. */
+        BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 53040931de32..99df697c601a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -49,12 +49,12 @@ struct pte_freelist_batch
 {
         struct rcu_head rcu;
         unsigned int    index;
-        pgtable_free_t  tables[0];
+        unsigned long   tables[0];
 };
 
 #define PTE_FREELIST_SIZE \
         ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-          / sizeof(pgtable_free_t))
+          / sizeof(unsigned long))
 
 static void pte_free_smp_sync(void *arg)
 {
@@ -64,13 +64,13 @@ static void pte_free_smp_sync(void *arg)
 /* This is only called when we are critically out of memory
  * (and fail to get a page in pte_free_tlb).
  */
-static void pgtable_free_now(pgtable_free_t pgf)
+static void pgtable_free_now(void *table, unsigned shift)
 {
         pte_freelist_forced_free++;
 
         smp_call_function(pte_free_smp_sync, NULL, 1);
 
-        pgtable_free(pgf);
+        pgtable_free(table, shift);
 }
 
 static void pte_free_rcu_callback(struct rcu_head *head)
@@ -79,8 +79,12 @@ static void pte_free_rcu_callback(struct rcu_head *head)
                 container_of(head, struct pte_freelist_batch, rcu);
         unsigned int i;
 
-        for (i = 0; i < batch->index; i++)
-                pgtable_free(batch->tables[i]);
+        for (i = 0; i < batch->index; i++) {
+                void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE);
+                unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE;
+
+                pgtable_free(table, shift);
+        }
 
         free_page((unsigned long)batch);
 }
@@ -91,25 +95,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch)
         call_rcu(&batch->rcu, pte_free_rcu_callback);
 }
 
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
         /* This is safe since tlb_gather_mmu has disabled preemption */
         struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+        unsigned long pgf;
 
         if (atomic_read(&tlb->mm->mm_users) < 2 ||
             cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
-                pgtable_free(pgf);
+                pgtable_free(table, shift);
                 return;
         }
 
         if (*batchp == NULL) {
                 *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
                 if (*batchp == NULL) {
-                        pgtable_free_now(pgf);
+                        pgtable_free_now(table, shift);
                         return;
                 }
                 (*batchp)->index = 0;
         }
+        BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+        pgf = (unsigned long)table | shift;
         (*batchp)->tables[(*batchp)->index++] = pgf;
         if ((*batchp)->index == PTE_FREELIST_SIZE) {
                 pte_free_submit(*batchp);
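
A note on the pgtable.c changes: the deferred-free path can store each pending table as a single unsigned long because pgtable_cache_add() guarantees an alignment of at least MAX_PGTABLE_INDEX_SIZE + 1, leaving the low bits of the pointer free to carry the index size until the RCU callback runs. Below is a minimal standalone sketch of that packing in plain userspace C, assuming the mask is a small low-bit value such as 0xf; the helper names and the aligned_alloc() stand-in are illustrative, not the kernel's API.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Assumption for this sketch: the index size fits in the low 4 bits. */
#define MAX_PGTABLE_INDEX_SIZE 0xf

/* Pack a table pointer and its index size into one word (cf. pgtable_free_tlb). */
static uintptr_t pack(void *table, unsigned int shift)
{
	assert(shift <= MAX_PGTABLE_INDEX_SIZE);
	assert(((uintptr_t)table & MAX_PGTABLE_INDEX_SIZE) == 0); /* needs aligned tables */
	return (uintptr_t)table | shift;
}

/* Recover both halves (cf. pte_free_rcu_callback). */
static void unpack(uintptr_t pgf, void **table, unsigned int *shift)
{
	*table = (void *)(pgf & ~(uintptr_t)MAX_PGTABLE_INDEX_SIZE);
	*shift = pgf & MAX_PGTABLE_INDEX_SIZE;
}

int main(void)
{
	unsigned int shift = 9;	/* example: a table of 2^9 pointers */
	/* aligned_alloc() stands in for the aligned kmem_cache allocation */
	void *table = aligned_alloc(MAX_PGTABLE_INDEX_SIZE + 1, sizeof(void *) << shift);
	if (!table)
		return 1;

	uintptr_t pgf = pack(table, shift);
	void *t;
	unsigned int s;

	unpack(pgf, &t, &s);
	printf("round trip ok: %s\n", (t == table && s == shift) ? "yes" : "no");
	free(table);
	return 0;
}
```

Packing the shift into the pointer is what lets struct pte_freelist_batch shrink its tables[] array to plain unsigned longs in place of the old pgtable_free_t.
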