aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2008-05-14 00:37:36 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-05-14 13:05:18 -0400
commit362a61ad61199e19a61b8e432015e2586b288f5b (patch)
treeb766e454928eea0db1ec6e301340c27ef5f5244f /mm/memory.c
parent73f10281ea96d7e8b4fc1c5d755a7c8eb484155b (diff)
fix SMP data race in pagetable setup vs walking
There is a possible data race in the page table walking code. After the split ptlock patches, it actually seems to have been introduced to the core code, but even before that I think it would have impacted some architectures (powerpc and sparc64, at least, walk the page tables without taking locks eg. see find_linux_pte()). The race is as follows: The pte page is allocated, zeroed, and its struct page gets its spinlock initialized. The mm-wide ptl is then taken, and then the pte page is inserted into the pagetables. At this point, the spinlock is not guaranteed to have ordered the previous stores to initialize the pte page with the subsequent store to put it in the page tables. So another Linux page table walker might be walking down (without any locks, because we have split-leaf-ptls), and find that new pte we've inserted. It might try to take the spinlock before the store from the other CPU initializes it. And subsequently it might read a pte_t out before stores from the other CPU have cleared the memory. There are also similar races in higher levels of the page tables. They obviously don't involve the spinlock, but could see uninitialized memory. Arch code and hardware pagetable walkers that walk the pagetables without locks could see similar uninitialized memory problems, regardless of whether split ptes are enabled or not. I prefer to put the barriers in core code, because that's where the higher level logic happens, but the page table accessors are per-arch, and open-coding them everywhere I don't think is an option. I'll put the read-side barriers in alpha arch code for now (other architectures perform data-dependent loads in order). Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c21
1 files changed, 21 insertions, 0 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 48c122d42ed7..fb5608a120ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -311,6 +311,21 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
311 if (!new) 311 if (!new)
312 return -ENOMEM; 312 return -ENOMEM;
313 313
314 /*
315 * Ensure all pte setup (eg. pte page lock and page clearing) are
316 * visible before the pte is made visible to other CPUs by being
317 * put into page tables.
318 *
319 * The other side of the story is the pointer chasing in the page
320 * table walking code (when walking the page table without locking;
321 * ie. most of the time). Fortunately, these data accesses consist
322 * of a chain of data-dependent loads, meaning most CPUs (alpha
323 * being the notable exception) will already guarantee loads are
324 * seen in-order. See the alpha page table accessors for the
325 * smp_read_barrier_depends() barriers in page table walking code.
326 */
327 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
328
314 spin_lock(&mm->page_table_lock); 329 spin_lock(&mm->page_table_lock);
315 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 330 if (!pmd_present(*pmd)) { /* Has another populated it ? */
316 mm->nr_ptes++; 331 mm->nr_ptes++;
@@ -329,6 +344,8 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
329 if (!new) 344 if (!new)
330 return -ENOMEM; 345 return -ENOMEM;
331 346
347 smp_wmb(); /* See comment in __pte_alloc */
348
332 spin_lock(&init_mm.page_table_lock); 349 spin_lock(&init_mm.page_table_lock);
333 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 350 if (!pmd_present(*pmd)) { /* Has another populated it ? */
334 pmd_populate_kernel(&init_mm, pmd, new); 351 pmd_populate_kernel(&init_mm, pmd, new);
@@ -2619,6 +2636,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2619 if (!new) 2636 if (!new)
2620 return -ENOMEM; 2637 return -ENOMEM;
2621 2638
2639 smp_wmb(); /* See comment in __pte_alloc */
2640
2622 spin_lock(&mm->page_table_lock); 2641 spin_lock(&mm->page_table_lock);
2623 if (pgd_present(*pgd)) /* Another has populated it */ 2642 if (pgd_present(*pgd)) /* Another has populated it */
2624 pud_free(mm, new); 2643 pud_free(mm, new);
@@ -2640,6 +2659,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2640 if (!new) 2659 if (!new)
2641 return -ENOMEM; 2660 return -ENOMEM;
2642 2661
2662 smp_wmb(); /* See comment in __pte_alloc */
2663
2643 spin_lock(&mm->page_table_lock); 2664 spin_lock(&mm->page_table_lock);
2644#ifndef __ARCH_HAS_4LEVEL_HACK 2665#ifndef __ARCH_HAS_4LEVEL_HACK
2645 if (pud_present(*pud)) /* Another has populated it */ 2666 if (pud_present(*pud)) /* Another has populated it */