author		Martin Schwidefsky <schwidefsky@de.ibm.com>	2009-09-11 04:28:57 -0400
committer	Martin Schwidefsky <schwidefsky@de.ibm.com>	2009-09-11 04:29:53 -0400
commit		50aa98bad056a17655864a4d71ebc32d95c629a7
tree		bf8d22851d99583e2ea388766697bf64672d7926
parent		c4de0c1a18237c2727dde8ad392e333539b0af3c
[S390] fix recursive locking on page_table_lock
Suzuki Poulose reported the following recursive locking bug on s390:
Here is the stack trace (see Appendix I for more info):
[<0000000000406ed6>] _spin_lock+0x52/0x94
[<0000000000103bde>] crst_table_free+0x14e/0x1a4
[<00000000001ba684>] __pmd_alloc+0x114/0x1ec
[<00000000001be8d0>] handle_mm_fault+0x2cc/0xb80
[<0000000000407d62>] do_dat_exception+0x2b6/0x3a0
[<0000000000114f8c>] sysc_return+0x0/0x8
[<00000200001642b2>] 0x200001642b2
The page_table_lock is already acquired in __pmd_alloc (mm/memory.c), which then
tries to populate the pud/pgd with a newly allocated pmd. If another thread
populated it before we got the chance, we free the pmd using pmd_free().

On s390x, pmd_free() (and pud_free() as well) is #defined to crst_table_free(),
which acquires the page_table_lock to protect the crst table list updates.
Hence this ends up taking the page_table_lock recursively.
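To make the recursion concrete, here is a lightly trimmed sketch of
__pmd_alloc() as it looked in mm/memory.c at the time (the smp_wmb() barrier
and the __ARCH_HAS_4LEVEL_HACK variant are elided); the comments mark where
s390 re-entered the lock:

int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);	/* first acquisition */
	if (pud_present(*pud))		/* another thread won the race */
		pmd_free(mm, new);	/* on s390 this is crst_table_free(),
					 * which took page_table_lock again
					 * -> recursive lock */
	else
		pud_populate(mm, pud, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}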
The solution suggested by Dave Hansen is to use a new spinlock in the mmu
context to protect access to the crst_list and the pgtable_list.
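With the patch applied, the same call chain acquires two distinct locks that
nest cleanly instead of taking one lock twice; roughly (an annotated sketch,
not verbatim kernel code):

__pmd_alloc()					/* mm/memory.c, unchanged */
	spin_lock(&mm->page_table_lock);	/* outer lock */
	pmd_free()
		crst_table_free()		/* arch/s390/mm/pgtable.c */
			spin_lock(&mm->context.list_lock);	/* new inner lock */
			spin_unlock(&mm->context.list_lock);
	spin_unlock(&mm->page_table_lock);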
Reported-by: Suzuki Poulose <suzuki@in.ibm.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
 arch/s390/include/asm/mmu.h     |  1 +
 arch/s390/include/asm/pgalloc.h |  1 +
 arch/s390/mm/pgtable.c          | 24 ++++++++++++------------
 arch/s390/mm/vmem.c             |  1 +
 4 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index 3b59216e6284..03be99919d62 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -2,6 +2,7 @@
 #define __MMU_H
 
 typedef struct {
+	spinlock_t list_lock;
 	struct list_head crst_list;
 	struct list_head pgtable_list;
 	unsigned long asce_bits;
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index b2658b9220fe..ddad5903341c 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -140,6 +140,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
+	spin_lock_init(&mm->context.list_lock);
 	INIT_LIST_HEAD(&mm->context.crst_list);
 	INIT_LIST_HEAD(&mm->context.pgtable_list);
 	return (pgd_t *) crst_table_alloc(mm, s390_noexec);
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 565667207985..c70215247071 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -78,9 +78,9 @@ unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
 		}
 		page->index = page_to_phys(shadow);
 	}
-	spin_lock(&mm->page_table_lock);
+	spin_lock(&mm->context.list_lock);
 	list_add(&page->lru, &mm->context.crst_list);
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&mm->context.list_lock);
 	return (unsigned long *) page_to_phys(page);
 }
 
@@ -89,9 +89,9 @@ void crst_table_free(struct mm_struct *mm, unsigned long *table)
 	unsigned long *shadow = get_shadow_table(table);
 	struct page *page = virt_to_page(table);
 
-	spin_lock(&mm->page_table_lock);
+	spin_lock(&mm->context.list_lock);
 	list_del(&page->lru);
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&mm->context.list_lock);
 	if (shadow)
 		free_pages((unsigned long) shadow, ALLOC_ORDER);
 	free_pages((unsigned long) table, ALLOC_ORDER);
@@ -182,7 +182,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	unsigned long bits;
 
 	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
-	spin_lock(&mm->page_table_lock);
+	spin_lock(&mm->context.list_lock);
 	page = NULL;
 	if (!list_empty(&mm->context.pgtable_list)) {
 		page = list_first_entry(&mm->context.pgtable_list,
@@ -191,7 +191,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			page = NULL;
 	}
 	if (!page) {
-		spin_unlock(&mm->page_table_lock);
+		spin_unlock(&mm->context.list_lock);
 		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
 		if (!page)
 			return NULL;
@@ -202,7 +202,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			clear_table_pgstes(table);
 		else
 			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
-		spin_lock(&mm->page_table_lock);
+		spin_lock(&mm->context.list_lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
 	}
 	table = (unsigned long *) page_to_phys(page);
@@ -213,7 +213,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	page->flags |= bits;
 	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
 		list_move_tail(&page->lru, &mm->context.pgtable_list);
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&mm->context.list_lock);
 	return table;
 }
 
@@ -225,7 +225,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
 	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	spin_lock(&mm->page_table_lock);
+	spin_lock(&mm->context.list_lock);
 	page->flags ^= bits;
 	if (page->flags & FRAG_MASK) {
 		/* Page now has some free pgtable fragments. */
@@ -234,7 +234,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	} else
 		/* All fragments of the 4K page have been freed. */
 		list_del(&page->lru);
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&mm->context.list_lock);
 	if (page) {
 		pgtable_page_dtor(page);
 		__free_page(page);
@@ -245,7 +245,7 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
 {
 	struct page *page;
 
-	spin_lock(&mm->page_table_lock);
+	spin_lock(&mm->context.list_lock);
 	/* Free shadow region and segment tables. */
 	list_for_each_entry(page, &mm->context.crst_list, lru)
 		if (page->index) {
@@ -255,7 +255,7 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
 	/* "Free" second halves of page tables. */
 	list_for_each_entry(page, &mm->context.pgtable_list, lru)
 		page->flags &= ~SECOND_HALVES;
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&mm->context.list_lock);
 	mm->context.noexec = 0;
 	update_mm(mm, tsk);
 }
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index e4868bfc672f..5f91a38d7592 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -331,6 +331,7 @@ void __init vmem_map_init(void)
 	unsigned long start, end;
 	int i;
 
+	spin_lock_init(&init_mm.context.list_lock);
 	INIT_LIST_HEAD(&init_mm.context.crst_list);
 	INIT_LIST_HEAD(&init_mm.context.pgtable_list);
 	init_mm.context.noexec = 0;