aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2008-02-05 01:29:34 -0500
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2008-02-05 12:44:19 -0500
commit0ed361dec36945f3116ee1338638ada9a8920905 (patch)
tree3e0fc6319ef49f6cac82e8203a8aa199302ab9c5
parent62e1c55300f306e06478f460a7eefba085206e0b (diff)
mm: fix PageUptodate data race
After running SetPageUptodate, preceding stores to the page contents to actually bring it uptodate may not be ordered with the store to set the page uptodate. Therefore, another CPU which checks PageUptodate is true, then reads the page contents can get stale data. Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after PageUptodate. Many places that test PageUptodate do so with the page locked, and this would be enough to ensure memory ordering in those places if SetPageUptodate were only called while the page is locked. Unfortunately that is not always the case for some filesystems, but it could be an idea for the future. Also bring the handling of anonymous page uptodateness in line with that of file-backed page management, by marking anon pages as uptodate when they _are_ uptodate, rather than when our implementation requires that they be marked as such. Doing so allows us to get rid of the smp_wmb's in the page copying functions, which were especially added for anonymous pages for an analogous memory ordering problem. Both file and anonymous pages are handled with the same barriers. FAQ: Q. Why not do this in flush_dcache_page? A. Firstly, flush_dcache_page handles only one side (the smp_wmb side) of the ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away memory barriers in a completely unrelated function is nasty; at least in the PageUptodate macros, they are located together with (half) the operations involved in the ordering. Thirdly, the smp_wmb is only required when first bringing the page uptodate, whereas flush_dcache_page should be called each time it is written to through the kernel mapping. It is logically the wrong place to put it. Q. Why does this increase my text size / reduce my performance / etc.? A. Because it is adding the necessary instructions to eliminate the data race. Q. Can it be improved? A. Yes, e.g.
if you were to create a rule that all SetPageUptodate operations run under the page lock, we could avoid the smp_rmb in places where PageUptodate is queried under the page lock. This requires an audit of all filesystems, and at least some would need reworking. That's great that you're interested — I'm eagerly awaiting your patches. Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/highmem.h4
-rw-r--r--include/linux/page-flags.h42
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/memory.c9
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/swap_state.c2
6 files changed, 48 insertions, 13 deletions
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 61a5e5eb27f0..7dcbc82f3b7b 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -68,8 +68,6 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
 	void *addr = kmap_atomic(page, KM_USER0);
 	clear_user_page(addr, vaddr, page);
 	kunmap_atomic(addr, KM_USER0);
-	/* Make sure this page is cleared on other CPU's too before using it */
-	smp_wmb();
 }
 
 #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
@@ -172,8 +170,6 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
 	copy_user_page(vto, vfrom, vaddr, to);
 	kunmap_atomic(vfrom, KM_USER0);
 	kunmap_atomic(vto, KM_USER1);
-	/* Make sure this page is cleared on other CPU's too before using it */
-	smp_wmb();
 }
 
 #endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 209d3a47f50f..bbad43fb8181 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -131,16 +131,52 @@
 #define ClearPageReferenced(page)	clear_bit(PG_referenced, &(page)->flags)
 #define TestClearPageReferenced(page) test_and_clear_bit(PG_referenced, &(page)->flags)
 
-#define PageUptodate(page)	test_bit(PG_uptodate, &(page)->flags)
+static inline int PageUptodate(struct page *page)
+{
+	int ret = test_bit(PG_uptodate, &(page)->flags);
+
+	/*
+	 * Must ensure that the data we read out of the page is loaded
+	 * _after_ we've loaded page->flags to check for PageUptodate.
+	 * We can skip the barrier if the page is not uptodate, because
+	 * we wouldn't be reading anything from it.
+	 *
+	 * See SetPageUptodate() for the other side of the story.
+	 */
+	if (ret)
+		smp_rmb();
+
+	return ret;
+}
+
+static inline void __SetPageUptodate(struct page *page)
+{
+	smp_wmb();
+	__set_bit(PG_uptodate, &(page)->flags);
 #ifdef CONFIG_S390
+	page_clear_dirty(page);
+#endif
+}
+
 static inline void SetPageUptodate(struct page *page)
 {
+#ifdef CONFIG_S390
 	if (!test_and_set_bit(PG_uptodate, &page->flags))
 		page_clear_dirty(page);
-}
 #else
-#define SetPageUptodate(page)	set_bit(PG_uptodate, &(page)->flags)
+	/*
+	 * Memory barrier must be issued before setting the PG_uptodate bit,
+	 * so that all previous stores issued in order to bring the page
+	 * uptodate are actually visible before PageUptodate becomes true.
+	 *
+	 * s390 doesn't need an explicit smp_wmb here because the test and
+	 * set bit already provides full barriers.
+	 */
+	smp_wmb();
+	set_bit(PG_uptodate, &(page)->flags);
 #endif
+}
+
 #define ClearPageUptodate(page)	clear_bit(PG_uptodate, &(page)->flags)
 
 #define PageDirty(page)		test_bit(PG_dirty, &(page)->flags)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db861d8b6c28..1a5642074e34 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -813,6 +813,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	spin_unlock(&mm->page_table_lock);
 	copy_huge_page(new_page, old_page, address, vma);
+	__SetPageUptodate(new_page);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -858,6 +859,7 @@ retry:
 		goto out;
 	}
 	clear_huge_page(page, address);
+	__SetPageUptodate(page);
 
 	if (vma->vm_flags & VM_SHARED) {
 		int err;
diff --git a/mm/memory.c b/mm/memory.c
index 6a9c048f6012..7bb70728bb52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1518,10 +1518,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		memset(kaddr, 0, PAGE_SIZE);
 		kunmap_atomic(kaddr, KM_USER0);
 		flush_dcache_page(dst);
-		return;
-
-	}
-	copy_user_highpage(dst, src, va, vma);
+	} else
+		copy_user_highpage(dst, src, va, vma);
 }
 
 /*
@@ -1630,6 +1628,7 @@ gotten:
 	if (!new_page)
 		goto oom;
 	cow_user_page(new_page, old_page, address, vma);
+	__SetPageUptodate(new_page);
 
 	/*
 	 * Re-check the pte - we dropped the lock
@@ -2102,6 +2101,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = alloc_zeroed_user_highpage_movable(vma, address);
 	if (!page)
 		goto oom;
+	__SetPageUptodate(page);
 
 	entry = mk_pte(page, vma->vm_page_prot);
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2202,6 +2202,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto out;
 		}
 		copy_user_highpage(page, vmf.page, address, vma);
+		__SetPageUptodate(page);
 	} else {
 		/*
 		 * If the page will be shareable, see if the backing
diff --git a/mm/page_io.c b/mm/page_io.c
index 3b97f6850273..065c4480eaf0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -126,7 +126,7 @@ int swap_readpage(struct file *file, struct page *page)
 	int ret = 0;
 
 	BUG_ON(!PageLocked(page));
-	ClearPageUptodate(page);
+	BUG_ON(PageUptodate(page));
 	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
 				end_swap_bio_read);
 	if (bio == NULL) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 65b81c92738f..ec42f01a8d02 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -125,6 +125,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
 	int err;
 
 	BUG_ON(!PageLocked(page));
+	BUG_ON(!PageUptodate(page));
 
 	for (;;) {
 		entry = get_swap_page();
@@ -147,7 +148,6 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
 
 	switch (err) {
 	case 0:				/* Success */
-		SetPageUptodate(page);
 		SetPageDirty(page);
 		return 1;
 	case -EEXIST: