author    Nick Piggin <npiggin@suse.de>    2008-05-14 00:37:36 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2008-05-14 13:05:18 -0400
commit    362a61ad61199e19a61b8e432015e2586b288f5b (patch)
tree      b766e454928eea0db1ec6e301340c27ef5f5244f
parent    73f10281ea96d7e8b4fc1c5d755a7c8eb484155b (diff)
fix SMP data race in pagetable setup vs walking
There is a possible data race in the page table walking code. After the split ptlock patches, it actually seems to have been introduced to the core code, but even before that I think it would have impacted some architectures (powerpc and sparc64, at least, walk the page tables without taking locks eg. see find_linux_pte()).

The race is as follows: The pte page is allocated, zeroed, and its struct page gets its spinlock initialized. The mm-wide ptl is then taken, and then the pte page is inserted into the pagetables.

At this point, the spinlock is not guaranteed to have ordered the previous stores to initialize the pte page with the subsequent store to put it in the page tables. So another Linux page table walker might be walking down (without any locks, because we have split-leaf-ptls), and find that new pte we've inserted. It might try to take the spinlock before the store from the other CPU initializes it. And subsequently it might read a pte_t out before stores from the other CPU have cleared the memory.

There are also similar races in higher levels of the page tables. They obviously don't involve the spinlock, but could see uninitialized memory.

Arch code and hardware pagetable walkers that walk the pagetables without locks could see similar uninitialized memory problems, regardless of whether split ptes are enabled or not.

I prefer to put the barriers in core code, because that's where the higher level logic happens, but the page table accessors are per-arch, and open-coding them everywhere I don't think is an option. I'll put the read-side barriers in alpha arch code for now (other architectures perform data-dependent loads in order).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/asm-alpha/pgtable.h  21
-rw-r--r--  mm/memory.c                  21
2 files changed, 40 insertions, 2 deletions
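The patch boils down to the classic initialise-then-publish pattern: the writer must order its initialising stores before the store that publishes the pointer (the smp_wmb() added in __pte_alloc and friends), and a lockless reader chasing that pointer must order the dependent loads after the pointer load (the smp_read_barrier_depends() added in the alpha accessors). As a rough illustration only, the hypothetical userspace C11 sketch below models the same pairing, with memory_order_release / memory_order_consume standing in for the kernel barriers; the names (pte_page, pmd_slot, populate, walk) are invented for the example and are not kernel code.

/*
 * Hypothetical sketch only -- not kernel code.  C11 release/consume
 * ordering stands in for smp_wmb() / smp_read_barrier_depends(), and
 * a pthread spinlock stands in for the split ptlock.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct pte_page {                       /* stand-in for a freshly allocated pte page */
	pthread_spinlock_t ptl;         /* "split ptlock", initialised before publication */
	unsigned long pte[512];         /* "pte entries", zeroed before publication */
};

static _Atomic(struct pte_page *) pmd_slot;   /* stand-in for the pmd entry */

/* Writer, modelling __pte_alloc(): initialise everything, then publish. */
static void *populate(void *arg)
{
	struct pte_page *page = malloc(sizeof(*page));

	(void)arg;
	if (!page)
		return NULL;
	pthread_spin_init(&page->ptl, PTHREAD_PROCESS_PRIVATE);
	for (size_t i = 0; i < 512; i++)
		page->pte[i] = 0;

	/* Release ordering plays the role of smp_wmb(): the stores above
	 * must be visible before the pointer itself becomes visible. */
	atomic_store_explicit(&pmd_slot, page, memory_order_release);
	return NULL;
}

/* Reader, modelling a lockless page table walker. */
static void *walk(void *arg)
{
	struct pte_page *page;

	(void)arg;
	/* Consume ordering plays the role of smp_read_barrier_depends():
	 * loads that depend on the pointer see the initialised page. */
	while (!(page = atomic_load_explicit(&pmd_slot, memory_order_consume)))
		;                               /* wait for publication */
	pthread_spin_lock(&page->ptl);          /* safe: lock was initialised */
	printf("pte[0] = %lu\n", page->pte[0]); /* safe: entry was zeroed */
	pthread_spin_unlock(&page->ptl);
	return NULL;
}

int main(void)
{
	pthread_t r, w;

	pthread_create(&r, NULL, walk, NULL);
	pthread_create(&w, NULL, populate, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}

Built with something like cc -std=c11 -pthread, dropping the release store models the writer-side race this patch fixes (the walker can observe the pointer before the lock and zeroed entries are visible), while dropping the consume ordering models the reader-side problem that in practice only Alpha exhibits, where dependent loads may be satisfied from stale data even after the new pointer has been seen.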
diff --git a/include/asm-alpha/pgtable.h b/include/asm-alpha/pgtable.h
index 05ce5fba43e3..3f0c59f6d8aa 100644
--- a/include/asm-alpha/pgtable.h
+++ b/include/asm-alpha/pgtable.h
@@ -287,17 +287,34 @@ extern inline pte_t pte_mkspecial(pte_t pte) { return pte; }
 #define pgd_index(address)	(((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 #define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
 
+/*
+ * The smp_read_barrier_depends() in the following functions are required to
+ * order the load of *dir (the pointer in the top level page table) with any
+ * subsequent load of the returned pmd_t *ret (ret is data dependent on *dir).
+ *
+ * If this ordering is not enforced, the CPU might load an older value of
+ * *ret, which may be uninitialized data. See mm/memory.c:__pte_alloc for
+ * more details.
+ *
+ * Note that we never change the mm->pgd pointer after the task is running, so
+ * pgd_offset does not require such a barrier.
+ */
+
 /* Find an entry in the second-level page table.. */
 extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 {
-	return (pmd_t *) pgd_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
+	pmd_t *ret = (pmd_t *) pgd_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
+	smp_read_barrier_depends(); /* see above */
+	return ret;
 }
 
 /* Find an entry in the third-level page table.. */
 extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address)
 {
-	return (pte_t *) pmd_page_vaddr(*dir)
+	pte_t *ret = (pte_t *) pmd_page_vaddr(*dir)
 		+ ((address >> PAGE_SHIFT) & (PTRS_PER_PAGE - 1));
+	smp_read_barrier_depends(); /* see above */
+	return ret;
 }
 
 #define pte_offset_map(dir,addr)	pte_offset_kernel((dir),(addr))
diff --git a/mm/memory.c b/mm/memory.c
index 48c122d42ed7..fb5608a120ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -311,6 +311,21 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	/*
+	 * Ensure all pte setup (eg. pte page lock and page clearing) are
+	 * visible before the pte is made visible to other CPUs by being
+	 * put into page tables.
+	 *
+	 * The other side of the story is the pointer chasing in the page
+	 * table walking code (when walking the page table without locking;
+	 * ie. most of the time). Fortunately, these data accesses consist
+	 * of a chain of data-dependent loads, meaning most CPUs (alpha
+	 * being the notable exception) will already guarantee loads are
+	 * seen in-order. See the alpha page table accessors for the
+	 * smp_read_barrier_depends() barriers in page table walking code.
+	 */
+	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+
 	spin_lock(&mm->page_table_lock);
 	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
 		mm->nr_ptes++;
@@ -329,6 +344,8 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	smp_wmb(); /* See comment in __pte_alloc */
+
 	spin_lock(&init_mm.page_table_lock);
 	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
 		pmd_populate_kernel(&init_mm, pmd, new);
@@ -2619,6 +2636,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	smp_wmb(); /* See comment in __pte_alloc */
+
 	spin_lock(&mm->page_table_lock);
 	if (pgd_present(*pgd))		/* Another has populated it */
 		pud_free(mm, new);
@@ -2640,6 +2659,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	smp_wmb(); /* See comment in __pte_alloc */
+
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (pud_present(*pud))		/* Another has populated it */