author     Martin Schwidefsky <schwidefsky@de.ibm.com>              2010-10-25 10:10:11 -0400
committer  Martin Schwidefsky <sky@mschwide.boeblingen.de.ibm.com>  2010-10-25 10:10:15 -0400
commit     80217147a3d80c8a4e48f06e2f6e965455f3fe2a (patch)
tree       b419ae9ee3ab0e5b92c0ed2a30ff59b76d6a4978 /arch
parent     87799ebab760dd1460f6e4193d4f71ba416d1451 (diff)
[S390] lockless get_user_pages_fast()
Implement get_user_pages_fast without locking in the fastpath on s390.
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
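
For context (not part of this patch): a typical in-kernel caller pins a user buffer with get_user_pages_fast() and later drops each reference with put_page(). The sketch below only illustrates that calling convention against the signature used in this commit; the helper name pin_user_buffer(), its error handling, and the assumption that len > 0 are hypothetical, not taken from the patch.

/*
 * Illustrative sketch only: pin a user buffer via get_user_pages_fast()
 * (the fastpath avoids mmap_sem) and release the pages with put_page().
 * pin_user_buffer() is a hypothetical helper; assumes len > 0.
 */
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>

static int pin_user_buffer(unsigned long uaddr, size_t len, int write,
			   struct page ***pagesp)
{
	unsigned long first = uaddr >> PAGE_SHIFT;
	unsigned long last = (uaddr + len - 1) >> PAGE_SHIFT;
	int nr_pages = last - first + 1;
	struct page **pages;
	int ret, i;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* Tries the lockless fastpath first; falls back internally. */
	ret = get_user_pages_fast(uaddr, nr_pages, write, pages);
	if (ret == nr_pages) {
		*pagesp = pages;
		return nr_pages;
	}
	/* Partial or failed pin: drop whatever was pinned. */
	for (i = 0; i < ret; i++)
		put_page(pages[i]);
	kfree(pages);
	return ret < 0 ? ret : -EFAULT;
}

When the caller is done with the buffer it drops each entry with put_page(pages[i]) and frees the array.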
Diffstat (limited to 'arch')
-rw-r--r--  arch/s390/Kconfig               |   1
-rw-r--r--  arch/s390/include/asm/pgalloc.h |   4
-rw-r--r--  arch/s390/include/asm/pgtable.h |   1
-rw-r--r--  arch/s390/include/asm/tlb.h     |  13
-rw-r--r--  arch/s390/mm/Makefile           |   2
-rw-r--r--  arch/s390/mm/gup.c              | 225
-rw-r--r--  arch/s390/mm/hugetlbpage.c      |   2
-rw-r--r--  arch/s390/mm/init.c             |   2
-rw-r--r--  arch/s390/mm/pgtable.c          | 171
9 files changed, 394 insertions, 27 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 75976a141947..7afc17340500 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -101,6 +101,7 @@ config S390
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_LZO
+	select HAVE_GET_USER_PAGES_FAST
 	select ARCH_INLINE_SPIN_TRYLOCK
 	select ARCH_INLINE_SPIN_TRYLOCK_BH
 	select ARCH_INLINE_SPIN_LOCK
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 68940d0bad91..082eb4e50e8b 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -21,9 +21,11 @@
 
 unsigned long *crst_table_alloc(struct mm_struct *, int);
 void crst_table_free(struct mm_struct *, unsigned long *);
+void crst_table_free_rcu(struct mm_struct *, unsigned long *);
 
 unsigned long *page_table_alloc(struct mm_struct *);
 void page_table_free(struct mm_struct *, unsigned long *);
+void page_table_free_rcu(struct mm_struct *, unsigned long *);
 void disable_noexec(struct mm_struct *, struct task_struct *);
 
 static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
@@ -176,4 +178,6 @@ static inline void pmd_populate(struct mm_struct *mm,
 #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte)
 #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte)
 
+extern void rcu_table_freelist_finish(void);
+
 #endif /* _S390_PGALLOC_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 22a294571000..785229ae39cb 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -316,6 +316,7 @@ extern unsigned long VMALLOC_START;
 
 /* Bits in the segment table entry */
 #define _SEGMENT_ENTRY_ORIGIN	0x7fffffc0UL	/* page table origin */
+#define _SEGMENT_ENTRY_RO	0x200		/* page protection bit */
 #define _SEGMENT_ENTRY_INV	0x20		/* invalid segment table entry */
 #define _SEGMENT_ENTRY_COMMON	0x10		/* common segment bit */
 #define _SEGMENT_ENTRY_PTL	0x0f		/* page table length */
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index fd1c00d08bf5..f1f644f2240a 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -64,10 +64,9 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb,
 	if (!tlb->fullmm && (tlb->nr_ptes > 0 || tlb->nr_pxds < TLB_NR_PTRS))
 		__tlb_flush_mm(tlb->mm);
 	while (tlb->nr_ptes > 0)
-		pte_free(tlb->mm, tlb->array[--tlb->nr_ptes]);
+		page_table_free_rcu(tlb->mm, tlb->array[--tlb->nr_ptes]);
 	while (tlb->nr_pxds < TLB_NR_PTRS)
-		/* pgd_free frees the pointer as region or segment table */
-		pgd_free(tlb->mm, tlb->array[tlb->nr_pxds++]);
+		crst_table_free_rcu(tlb->mm, tlb->array[tlb->nr_pxds++]);
 }
 
 static inline void tlb_finish_mmu(struct mmu_gather *tlb,
@@ -75,6 +74,8 @@ static inline void tlb_finish_mmu(struct mmu_gather *tlb,
 {
 	tlb_flush_mmu(tlb, start, end);
 
+	rcu_table_freelist_finish();
+
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
 
@@ -103,7 +104,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 		if (tlb->nr_ptes >= tlb->nr_pxds)
 			tlb_flush_mmu(tlb, 0, 0);
 	} else
-		pte_free(tlb->mm, pte);
+		page_table_free(tlb->mm, (unsigned long *) pte);
 }
 
 /*
@@ -124,7 +125,7 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
 		if (tlb->nr_ptes >= tlb->nr_pxds)
 			tlb_flush_mmu(tlb, 0, 0);
 	} else
-		pmd_free(tlb->mm, pmd);
+		crst_table_free(tlb->mm, (unsigned long *) pmd);
 #endif
 }
 
@@ -146,7 +147,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 		if (tlb->nr_ptes >= tlb->nr_pxds)
 			tlb_flush_mmu(tlb, 0, 0);
 	} else
-		pud_free(tlb->mm, pud);
+		crst_table_free(tlb->mm, (unsigned long *) pud);
 #endif
 }
 
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index eec054484419..6fbc6f3fbdf2 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -3,6 +3,6 @@
 #
 
 obj-y	 := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o \
-	    page-states.o
+	    page-states.o gup.o
 obj-$(CONFIG_CMM)		+= cmm.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
new file mode 100644
index 000000000000..38e641cdd977
--- /dev/null
+++ b/arch/s390/mm/gup.c
@@ -0,0 +1,225 @@
+/*
+ *  Lockless get_user_pages_fast for s390
+ *
+ *  Copyright IBM Corp. 2010
+ *  Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask, result;
+	pte_t *ptep, pte;
+	struct page *page;
+
+	result = write ? 0 : _PAGE_RO;
+	mask = result | _PAGE_INVALID | _PAGE_SPECIAL;
+
+	ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr);
+	do {
+		pte = *ptep;
+		barrier();
+		if ((pte_val(pte) & mask) != result)
+			return 0;
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		if (!page_cache_get_speculative(page))
+			return 0;
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(page);
+			return 0;
+		}
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask, result;
+	struct page *head, *page;
+	int refs;
+
+	result = write ? 0 : _SEGMENT_ENTRY_RO;
+	mask = result | _SEGMENT_ENTRY_INV;
+	if ((pmd_val(pmd) & mask) != result)
+		return 0;
+	VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT));
+
+	refs = 0;
+	head = pmd_page(pmd);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+	}
+
+	return 1;
+}
+
+
+static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp, pmd;
+
+	pmdp = (pmd_t *) pudp;
+#ifdef CONFIG_64BIT
+	if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+		pmdp = (pmd_t *) pud_deref(pud);
+	pmdp += pmd_index(addr);
+#endif
+	do {
+		pmd = *pmdp;
+		barrier();
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_huge(pmd))) {
+			if (!gup_huge_pmd(pmdp, pmd, addr, next,
+					  write, pages, nr))
+				return 0;
+		} else if (!gup_pte_range(pmdp, pmd, addr, next,
+					  write, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp, pud;
+
+	pudp = (pud_t *) pgdp;
+#ifdef CONFIG_64BIT
+	if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
+		pudp = (pud_t *) pgd_deref(pgd);
+	pudp += pud_index(addr);
+#endif
+	do {
+		pud = *pudp;
+		barrier();
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp, pgd;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+	if (end < start)
+		goto slow_irqon;
+
+	/*
+	 * local_irq_disable() doesn't prevent pagetable teardown, but does
+	 * prevent the pagetables from being freed on s390.
+	 *
+	 * So long as we atomically load page table pointers versus teardown,
+	 * we can follow the address down to the page and take a ref on it.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd = *pgdp;
+		barrier();
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+slow:
+		local_irq_enable();
+slow_irqon:
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index f28c43d2f61d..639cd21f2218 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -68,7 +68,7 @@ void arch_release_hugepage(struct page *page)
 	ptep = (pte_t *) page[1].index;
 	if (!ptep)
 		return;
-	pte_free(&init_mm, ptep);
+	page_table_free(&init_mm, (unsigned long *) ptep);
 	page[1].index = 0;
 }
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 0744fb3536b1..852a3fec1ece 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -38,8 +38,6 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE)));
 
 unsigned long empty_zero_page, zero_page_mask;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 8d999249d357..19338d228c9b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <linux/quicklist.h>
+#include <linux/rcupdate.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -23,6 +24,67 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 
+struct rcu_table_freelist {
+	struct rcu_head rcu;
+	struct mm_struct *mm;
+	unsigned int pgt_index;
+	unsigned int crst_index;
+	unsigned long *table[0];
+};
+
+#define RCU_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
+	  / sizeof(unsigned long))
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);
+
+static void __page_table_free(struct mm_struct *mm, unsigned long *table);
+static void __crst_table_free(struct mm_struct *mm, unsigned long *table);
+
+static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
+{
+	struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
+	struct rcu_table_freelist *batch = *batchp;
+
+	if (batch)
+		return batch;
+	batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
+	if (batch) {
+		batch->mm = mm;
+		batch->pgt_index = 0;
+		batch->crst_index = RCU_FREELIST_SIZE;
+		*batchp = batch;
+	}
+	return batch;
+}
+
+static void rcu_table_freelist_callback(struct rcu_head *head)
+{
+	struct rcu_table_freelist *batch =
+		container_of(head, struct rcu_table_freelist, rcu);
+
+	while (batch->pgt_index > 0)
+		__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
+	while (batch->crst_index < RCU_FREELIST_SIZE)
+		__crst_table_free(batch->mm, batch->table[batch->crst_index++]);
+	free_page((unsigned long) batch);
+}
+
+void rcu_table_freelist_finish(void)
+{
+	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);
+
+	if (!batch)
+		return;
+	call_rcu(&batch->rcu, rcu_table_freelist_callback);
+	__get_cpu_var(rcu_table_freelist) = NULL;
+}
+
+static void smp_sync(void *arg)
+{
+}
+
 #ifndef CONFIG_64BIT
 #define ALLOC_ORDER	1
 #define TABLES_PER_PAGE	4
@@ -78,25 +140,55 @@ unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
 		}
 		page->index = page_to_phys(shadow);
 	}
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	list_add(&page->lru, &mm->context.crst_list);
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	return (unsigned long *) page_to_phys(page);
 }
 
-void crst_table_free(struct mm_struct *mm, unsigned long *table)
+static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
 {
 	unsigned long *shadow = get_shadow_table(table);
-	struct page *page = virt_to_page(table);
 
-	spin_lock(&mm->context.list_lock);
-	list_del(&page->lru);
-	spin_unlock(&mm->context.list_lock);
 	if (shadow)
 		free_pages((unsigned long) shadow, ALLOC_ORDER);
 	free_pages((unsigned long) table, ALLOC_ORDER);
 }
 
+void crst_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	struct page *page = virt_to_page(table);
+
+	spin_lock_bh(&mm->context.list_lock);
+	list_del(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	__crst_table_free(mm, table);
+}
+
+void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
+{
+	struct rcu_table_freelist *batch;
+	struct page *page = virt_to_page(table);
+
+	spin_lock_bh(&mm->context.list_lock);
+	list_del(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	if (atomic_read(&mm->mm_users) < 2 &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+		__crst_table_free(mm, table);
+		return;
+	}
+	batch = rcu_table_freelist_get(mm);
+	if (!batch) {
+		smp_call_function(smp_sync, NULL, 1);
+		__crst_table_free(mm, table);
+		return;
+	}
+	batch->table[--batch->crst_index] = table;
+	if (batch->pgt_index >= batch->crst_index)
+		rcu_table_freelist_finish();
+}
+
 #ifdef CONFIG_64BIT
 int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
 {
@@ -108,7 +200,7 @@ repeat:
 	table = crst_table_alloc(mm, mm->context.noexec);
 	if (!table)
 		return -ENOMEM;
-	spin_lock(&mm->page_table_lock);
+	spin_lock_bh(&mm->page_table_lock);
 	if (mm->context.asce_limit < limit) {
 		pgd = (unsigned long *) mm->pgd;
 		if (mm->context.asce_limit <= (1UL << 31)) {
@@ -130,7 +222,7 @@ repeat:
 		mm->task_size = mm->context.asce_limit;
 		table = NULL;
 	}
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock_bh(&mm->page_table_lock);
 	if (table)
 		crst_table_free(mm, table);
 	if (mm->context.asce_limit < limit)
@@ -182,7 +274,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	unsigned long bits;
 
 	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	page = NULL;
 	if (!list_empty(&mm->context.pgtable_list)) {
 		page = list_first_entry(&mm->context.pgtable_list,
@@ -191,7 +283,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			page = NULL;
 	}
 	if (!page) {
-		spin_unlock(&mm->context.list_lock);
+		spin_unlock_bh(&mm->context.list_lock);
 		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
 		if (!page)
 			return NULL;
@@ -202,7 +294,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			clear_table_pgstes(table);
 		else
 			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
-		spin_lock(&mm->context.list_lock);
+		spin_lock_bh(&mm->context.list_lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
 	}
 	table = (unsigned long *) page_to_phys(page);
@@ -213,10 +305,25 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	page->flags |= bits;
 	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
 		list_move_tail(&page->lru, &mm->context.pgtable_list);
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	return table;
 }
 
+static void __page_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	struct page *page;
+	unsigned long bits;
+
+	bits = ((unsigned long) table) & 15;
+	table = (unsigned long *)(((unsigned long) table) ^ bits);
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	page->flags ^= bits;
+	if (!(page->flags & FRAG_MASK)) {
+		pgtable_page_dtor(page);
+		__free_page(page);
+	}
+}
+
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
 	struct page *page;
@@ -225,7 +332,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
 	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	page->flags ^= bits;
 	if (page->flags & FRAG_MASK) {
 		/* Page now has some free pgtable fragments. */
@@ -234,18 +341,48 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	} else
 		/* All fragments of the 4K page have been freed. */
 		list_del(&page->lru);
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	if (page) {
 		pgtable_page_dtor(page);
 		__free_page(page);
 	}
 }
 
+void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
+{
+	struct rcu_table_freelist *batch;
+	struct page *page;
+	unsigned long bits;
+
+	if (atomic_read(&mm->mm_users) < 2 &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+		page_table_free(mm, table);
+		return;
+	}
+	batch = rcu_table_freelist_get(mm);
+	if (!batch) {
+		smp_call_function(smp_sync, NULL, 1);
+		page_table_free(mm, table);
+		return;
+	}
+	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
+	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	spin_lock_bh(&mm->context.list_lock);
+	/* Delayed freeing with rcu prevents reuse of pgtable fragments */
+	list_del_init(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	table = (unsigned long *)(((unsigned long) table) | bits);
+	batch->table[batch->pgt_index++] = table;
+	if (batch->pgt_index >= batch->crst_index)
+		rcu_table_freelist_finish();
+}
+
 void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
 {
 	struct page *page;
 
-	spin_lock(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.list_lock);
 	/* Free shadow region and segment tables. */
 	list_for_each_entry(page, &mm->context.crst_list, lru)
 		if (page->index) {
@@ -255,7 +392,7 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
 	/* "Free" second halves of page tables. */
 	list_for_each_entry(page, &mm->context.pgtable_list, lru)
 		page->flags &= ~SECOND_HALVES;
-	spin_unlock(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.list_lock);
 	mm->context.noexec = 0;
 	update_mm(mm, tsk);
 }