aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRobin Holt <holt@sgi.com>2005-04-25 16:13:16 -0400
committerTony Luck <tony.luck@intel.com>2005-04-25 16:13:16 -0400
commitfde740e4dd4a05ca8957490d468fa9b2770f5bd6 (patch)
tree04bc0221bc6c59379a17f3631fc4bd3c886e1d61
parentff3eb55ed97db3f12964beeffe3d34602d295367 (diff)
[IA64] Percpu quicklist for combined allocator for pgd/pmd/pte.
This patch introduces using the quicklists for pgd, pmd, and pte levels by combining the alloc and free functions into a common set of routines. This greatly simplifies the reading of this header file. This patch is simple but necessary for large numa configurations. It simply ensures that only pages from the local node are added to a cpus quicklist. This prevents the trapping of pages on a remote nodes quicklist by starting a process, touching a large number of pages to fill pmd and pte entries, migrating to another node, and then unmapping or exiting. With those conditions, the pages get trapped and if the machine has more than 100 nodes of the same size, the calculation of the pgtable high water mark will be larger than any single node so page table cache flushing will never occur. I ran lmbench lat_proc fork and lat_proc exec on a zx1 with and without this patch and did not notice any change. On an sn2 machine, there was a slight improvement which is possibly due to pages from other nodes trapped on the test node before starting the run. I did not investigate further. This patch shrinks the quicklist based upon free memory on the node instead of the high/low water marks. I have written it to enable preemption periodically and recalculate the amount to shrink every time we have freed enough pages that the quicklist size should have grown. I rescan the nodes zones each pass because other processess may be draining node memory at the same time as we are adding. Signed-off-by: Robin Holt <holt@sgi.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
-rw-r--r--arch/ia64/mm/contig.c3
-rw-r--r--arch/ia64/mm/discontig.c3
-rw-r--r--arch/ia64/mm/init.c74
-rw-r--r--include/asm-ia64/pgalloc.h144
-rw-r--r--include/asm-ia64/processor.h3
5 files changed, 112 insertions, 115 deletions
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index 6daf15ac8940..91a055f5731f 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -61,7 +61,8 @@ show_mem (void)
61 printk("%d reserved pages\n", reserved); 61 printk("%d reserved pages\n", reserved);
62 printk("%d pages shared\n", shared); 62 printk("%d pages shared\n", shared);
63 printk("%d pages swap cached\n", cached); 63 printk("%d pages swap cached\n", cached);
64 printk("%ld pages in page table cache\n", pgtable_cache_size); 64 printk("%ld pages in page table cache\n",
65 pgtable_quicklist_total_size());
65} 66}
66 67
67/* physical address where the bootmem map is located */ 68/* physical address where the bootmem map is located */
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 3456a9b6971e..c00710929390 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -582,7 +582,8 @@ void show_mem(void)
582 printk("%d reserved pages\n", total_reserved); 582 printk("%d reserved pages\n", total_reserved);
583 printk("%d pages shared\n", total_shared); 583 printk("%d pages shared\n", total_shared);
584 printk("%d pages swap cached\n", total_cached); 584 printk("%d pages swap cached\n", total_cached);
585 printk("Total of %ld pages in page table cache\n", pgtable_cache_size); 585 printk("Total of %ld pages in page table cache\n",
586 pgtable_quicklist_total_size());
586 printk("%d free buffer pages\n", nr_free_buffer_pages()); 587 printk("%d free buffer pages\n", nr_free_buffer_pages());
587} 588}
588 589
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 65cf839573ea..4892be53e227 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -39,6 +39,9 @@
39 39
40DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 40DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
41 41
42DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist);
43DEFINE_PER_CPU(long, __pgtable_quicklist_size);
44
42extern void ia64_tlb_init (void); 45extern void ia64_tlb_init (void);
43 46
44unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL; 47unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
@@ -50,27 +53,53 @@ struct page *vmem_map;
50EXPORT_SYMBOL(vmem_map); 53EXPORT_SYMBOL(vmem_map);
51#endif 54#endif
52 55
53static int pgt_cache_water[2] = { 25, 50 }; 56struct page *zero_page_memmap_ptr; /* map entry for zero page */
54
55struct page *zero_page_memmap_ptr; /* map entry for zero page */
56EXPORT_SYMBOL(zero_page_memmap_ptr); 57EXPORT_SYMBOL(zero_page_memmap_ptr);
57 58
59#define MIN_PGT_PAGES 25UL
60#define MAX_PGT_FREES_PER_PASS 16
61#define PGT_FRACTION_OF_NODE_MEM 16
62
63static inline long
64max_pgt_pages(void)
65{
66 u64 node_free_pages, max_pgt_pages;
67
68#ifndef CONFIG_NUMA
69 node_free_pages = nr_free_pages();
70#else
71 node_free_pages = nr_free_pages_pgdat(NODE_DATA(numa_node_id()));
72#endif
73 max_pgt_pages = node_free_pages / PGT_FRACTION_OF_NODE_MEM;
74 max_pgt_pages = max(max_pgt_pages, MIN_PGT_PAGES);
75 return max_pgt_pages;
76}
77
78static inline long
79min_pages_to_free(void)
80{
81 long pages_to_free;
82
83 pages_to_free = pgtable_quicklist_size - max_pgt_pages();
84 pages_to_free = min(pages_to_free, MAX_PGT_FREES_PER_PASS);
85 return pages_to_free;
86}
87
58void 88void
59check_pgt_cache (void) 89check_pgt_cache(void)
60{ 90{
61 int low, high; 91 long pages_to_free;
62 92
63 low = pgt_cache_water[0]; 93 if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES))
64 high = pgt_cache_water[1]; 94 return;
65 95
66 preempt_disable(); 96 preempt_disable();
67 if (pgtable_cache_size > (u64) high) { 97 while (unlikely((pages_to_free = min_pages_to_free()) > 0)) {
68 do { 98 while (pages_to_free--) {
69 if (pgd_quicklist) 99 free_page((unsigned long)pgtable_quicklist_alloc());
70 free_page((unsigned long)pgd_alloc_one_fast(NULL)); 100 }
71 if (pmd_quicklist) 101 preempt_enable();
72 free_page((unsigned long)pmd_alloc_one_fast(NULL, 0)); 102 preempt_disable();
73 } while (pgtable_cache_size > (u64) low);
74 } 103 }
75 preempt_enable(); 104 preempt_enable();
76} 105}
@@ -523,11 +552,14 @@ void
523mem_init (void) 552mem_init (void)
524{ 553{
525 long reserved_pages, codesize, datasize, initsize; 554 long reserved_pages, codesize, datasize, initsize;
526 unsigned long num_pgt_pages;
527 pg_data_t *pgdat; 555 pg_data_t *pgdat;
528 int i; 556 int i;
529 static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel; 557 static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
530 558
559 BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
560 BUG_ON(PTRS_PER_PMD * sizeof(pmd_t) != PAGE_SIZE);
561 BUG_ON(PTRS_PER_PTE * sizeof(pte_t) != PAGE_SIZE);
562
531#ifdef CONFIG_PCI 563#ifdef CONFIG_PCI
532 /* 564 /*
533 * This needs to be called _after_ the command line has been parsed but _before_ 565 * This needs to be called _after_ the command line has been parsed but _before_
@@ -564,18 +596,6 @@ mem_init (void)
564 num_physpages << (PAGE_SHIFT - 10), codesize >> 10, 596 num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
565 reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10); 597 reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);
566 598
567 /*
568 * Allow for enough (cached) page table pages so that we can map the entire memory
569 * at least once. Each task also needs a couple of page tables pages, so add in a
570 * fudge factor for that (don't use "threads-max" here; that would be wrong!).
571 * Don't allow the cache to be more than 10% of total memory, though.
572 */
573# define NUM_TASKS 500 /* typical number of tasks */
574 num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
575 if (num_pgt_pages > nr_free_pages() / 10)
576 num_pgt_pages = nr_free_pages() / 10;
577 if (num_pgt_pages > (u64) pgt_cache_water[1])
578 pgt_cache_water[1] = num_pgt_pages;
579 599
580 /* 600 /*
581 * For fsyscall entrpoints with no light-weight handler, use the ordinary 601 * For fsyscall entrpoints with no light-weight handler, use the ordinary
diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h
index 0f05dc8bd460..e86a8c331ee6 100644
--- a/include/asm-ia64/pgalloc.h
+++ b/include/asm-ia64/pgalloc.h
@@ -22,146 +22,124 @@
22 22
23#include <asm/mmu_context.h> 23#include <asm/mmu_context.h>
24 24
25/* 25DECLARE_PER_CPU(unsigned long *, __pgtable_quicklist);
26 * Very stupidly, we used to get new pgd's and pmd's, init their contents 26#define pgtable_quicklist __ia64_per_cpu_var(__pgtable_quicklist)
27 * to point to the NULL versions of the next level page table, later on 27DECLARE_PER_CPU(long, __pgtable_quicklist_size);
28 * completely re-init them the same way, then free them up. This wasted 28#define pgtable_quicklist_size __ia64_per_cpu_var(__pgtable_quicklist_size)
29 * a lot of work and caused unnecessary memory traffic. How broken...
30 * We fix this by caching them.
31 */
32#define pgd_quicklist (local_cpu_data->pgd_quick)
33#define pmd_quicklist (local_cpu_data->pmd_quick)
34#define pgtable_cache_size (local_cpu_data->pgtable_cache_sz)
35 29
36static inline pgd_t* 30static inline long pgtable_quicklist_total_size(void)
37pgd_alloc_one_fast (struct mm_struct *mm) 31{
32 long ql_size;
33 int cpuid;
34
35 for_each_online_cpu(cpuid) {
36 ql_size += per_cpu(__pgtable_quicklist_size, cpuid);
37 }
38 return ql_size;
39}
40
41static inline void *pgtable_quicklist_alloc(void)
38{ 42{
39 unsigned long *ret = NULL; 43 unsigned long *ret = NULL;
40 44
41 preempt_disable(); 45 preempt_disable();
42 46
43 ret = pgd_quicklist; 47 ret = pgtable_quicklist;
44 if (likely(ret != NULL)) { 48 if (likely(ret != NULL)) {
45 pgd_quicklist = (unsigned long *)(*ret); 49 pgtable_quicklist = (unsigned long *)(*ret);
46 ret[0] = 0; 50 ret[0] = 0;
47 --pgtable_cache_size; 51 --pgtable_quicklist_size;
48 } else 52 } else {
49 ret = NULL; 53 ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
54 }
50 55
51 preempt_enable(); 56 preempt_enable();
52 57
53 return (pgd_t *) ret; 58 return ret;
54} 59}
55 60
56static inline pgd_t* 61static inline void pgtable_quicklist_free(void *pgtable_entry)
57pgd_alloc (struct mm_struct *mm)
58{ 62{
59 /* the VM system never calls pgd_alloc_one_fast(), so we do it here. */ 63#ifdef CONFIG_NUMA
60 pgd_t *pgd = pgd_alloc_one_fast(mm); 64 unsigned long nid = page_to_nid(virt_to_page(pgtable_entry));
61 65
62 if (unlikely(pgd == NULL)) { 66 if (unlikely(nid != numa_node_id())) {
63 pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); 67 free_page((unsigned long)pgtable_entry);
68 return;
64 } 69 }
65 return pgd; 70#endif
66}
67 71
68static inline void
69pgd_free (pgd_t *pgd)
70{
71 preempt_disable(); 72 preempt_disable();
72 *(unsigned long *)pgd = (unsigned long) pgd_quicklist; 73 *(unsigned long *)pgtable_entry = (unsigned long)pgtable_quicklist;
73 pgd_quicklist = (unsigned long *) pgd; 74 pgtable_quicklist = (unsigned long *)pgtable_entry;
74 ++pgtable_cache_size; 75 ++pgtable_quicklist_size;
75 preempt_enable(); 76 preempt_enable();
76} 77}
77 78
78static inline void 79static inline pgd_t *pgd_alloc(struct mm_struct *mm)
79pud_populate (struct mm_struct *mm, pud_t *pud_entry, pmd_t *pmd)
80{ 80{
81 pud_val(*pud_entry) = __pa(pmd); 81 return pgtable_quicklist_alloc();
82} 82}
83 83
84static inline pmd_t* 84static inline void pgd_free(pgd_t * pgd)
85pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr)
86{ 85{
87 unsigned long *ret = NULL; 86 pgtable_quicklist_free(pgd);
88
89 preempt_disable();
90
91 ret = (unsigned long *)pmd_quicklist;
92 if (likely(ret != NULL)) {
93 pmd_quicklist = (unsigned long *)(*ret);
94 ret[0] = 0;
95 --pgtable_cache_size;
96 }
97
98 preempt_enable();
99
100 return (pmd_t *)ret;
101} 87}
102 88
103static inline pmd_t* 89static inline void
104pmd_alloc_one (struct mm_struct *mm, unsigned long addr) 90pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
105{ 91{
106 pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); 92 pud_val(*pud_entry) = __pa(pmd);
93}
107 94
108 return pmd; 95static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
96{
97 return pgtable_quicklist_alloc();
109} 98}
110 99
111static inline void 100static inline void pmd_free(pmd_t * pmd)
112pmd_free (pmd_t *pmd)
113{ 101{
114 preempt_disable(); 102 pgtable_quicklist_free(pmd);
115 *(unsigned long *)pmd = (unsigned long) pmd_quicklist;
116 pmd_quicklist = (unsigned long *) pmd;
117 ++pgtable_cache_size;
118 preempt_enable();
119} 103}
120 104
121#define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) 105#define __pmd_free_tlb(tlb, pmd) pmd_free(pmd)
122 106
123static inline void 107static inline void
124pmd_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte) 108pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, struct page *pte)
125{ 109{
126 pmd_val(*pmd_entry) = page_to_phys(pte); 110 pmd_val(*pmd_entry) = page_to_phys(pte);
127} 111}
128 112
129static inline void 113static inline void
130pmd_populate_kernel (struct mm_struct *mm, pmd_t *pmd_entry, pte_t *pte) 114pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte)
131{ 115{
132 pmd_val(*pmd_entry) = __pa(pte); 116 pmd_val(*pmd_entry) = __pa(pte);
133} 117}
134 118
135static inline struct page * 119static inline struct page *pte_alloc_one(struct mm_struct *mm,
136pte_alloc_one (struct mm_struct *mm, unsigned long addr) 120 unsigned long addr)
137{ 121{
138 struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); 122 return virt_to_page(pgtable_quicklist_alloc());
139
140 return pte;
141} 123}
142 124
143static inline pte_t * 125static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
144pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr) 126 unsigned long addr)
145{ 127{
146 pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); 128 return pgtable_quicklist_alloc();
147
148 return pte;
149} 129}
150 130
151static inline void 131static inline void pte_free(struct page *pte)
152pte_free (struct page *pte)
153{ 132{
154 __free_page(pte); 133 pgtable_quicklist_free(page_address(pte));
155} 134}
156 135
157static inline void 136static inline void pte_free_kernel(pte_t * pte)
158pte_free_kernel (pte_t *pte)
159{ 137{
160 free_page((unsigned long) pte); 138 pgtable_quicklist_free(pte);
161} 139}
162 140
163#define __pte_free_tlb(tlb, pte) tlb_remove_page((tlb), (pte)) 141#define __pte_free_tlb(tlb, pte) pte_free(pte)
164 142
165extern void check_pgt_cache (void); 143extern void check_pgt_cache(void);
166 144
167#endif /* _ASM_IA64_PGALLOC_H */ 145#endif /* _ASM_IA64_PGALLOC_H */
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 2807f8d766d4..983798ec1791 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -137,9 +137,6 @@ struct cpuinfo_ia64 {
137 __u64 nsec_per_cyc; /* (1000000000<<IA64_NSEC_PER_CYC_SHIFT)/itc_freq */ 137 __u64 nsec_per_cyc; /* (1000000000<<IA64_NSEC_PER_CYC_SHIFT)/itc_freq */
138 __u64 unimpl_va_mask; /* mask of unimplemented virtual address bits (from PAL) */ 138 __u64 unimpl_va_mask; /* mask of unimplemented virtual address bits (from PAL) */
139 __u64 unimpl_pa_mask; /* mask of unimplemented physical address bits (from PAL) */ 139 __u64 unimpl_pa_mask; /* mask of unimplemented physical address bits (from PAL) */
140 __u64 *pgd_quick;
141 __u64 *pmd_quick;
142 __u64 pgtable_cache_sz;
143 __u64 itc_freq; /* frequency of ITC counter */ 140 __u64 itc_freq; /* frequency of ITC counter */
144 __u64 proc_freq; /* frequency of processor */ 141 __u64 proc_freq; /* frequency of processor */
145 __u64 cyc_per_usec; /* itc_freq/1000000 */ 142 __u64 cyc_per_usec; /* itc_freq/1000000 */