author:    Nick Piggin <npiggin@suse.de>                         2008-07-25 22:45:30 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org>        2008-07-26 15:00:06 -0400
commit:    e286781d5f2e9c846e012a39653a166e9d31777d (patch)
tree:      14958fe6d8f3e0459c96c68b3034ea2433ab85ac /include
parent:    47feff2c8eefe85099f87c43d3096855f0085ca0 (diff)
mm: speculative page references
If we can be sure that elevating the page_count on a pagecache page will
pin it, we can speculatively run this operation, and subsequently check to
see if we hit the right page rather than relying on holding a lock or
otherwise pinning a reference to the page.
This can be done if get_page/put_page behaves consistently throughout the
whole tree (ie. if we "get" the page after it has been used for something
else, we must be able to free it with a put_page).
Actually, there is a period where the count behaves differently: when the
page is free or if it is a constituent page of a compound page. We need
an atomic_inc_not_zero operation to ensure we don't try to grab the page
in either case.
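For illustration, the conditional increment used below is the existing get_page_unless_zero() helper, built on atomic_inc_not_zero(). A minimal sketch (the real helper lives in include/linux/mm.h and may carry extra debug checks):

```c
/*
 * Sketch: take a reference only if _count is currently non-zero.
 * atomic_inc_not_zero() returns 0 without touching the counter when it
 * is already 0 (a free page, or a constituent page of a compound page),
 * so we never take a reference that cannot later be dropped by put_page.
 */
static inline int get_page_unless_zero(struct page *page)
{
	return atomic_inc_not_zero(&page->_count);
}
```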
This patch introduces the core locking protocol to the pagecache (ie.
adds page_cache_get_speculative, and tweaks some update-side code to make
it work).
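In other words, a lookup becomes find / try-get / recheck in a retry loop. The sketch below shows the shape of such a lookup under RCU; lookup_page_speculative() is a hypothetical name, and the real lockless find_get_page() (added in a follow-up patch) uses radix-tree slot dereferencing and additional checks:

```c
/*
 * Sketch: speculative pagecache lookup under RCU.
 * 1. find the page in the radix tree,
 * 2. try to take a speculative reference,
 * 3. recheck that the page is still at this offset in this mapping;
 *    if anything changed, drop the reference and start over.
 */
static struct page *lookup_page_speculative(struct address_space *mapping,
					    pgoff_t offset)
{
	struct page *page;

	rcu_read_lock();
repeat:
	page = radix_tree_lookup(&mapping->page_tree, offset);
	if (page) {
		if (!page_cache_get_speculative(page))
			goto repeat;		/* page was being freed */
		/*
		 * The page may have been removed (and even reused) between
		 * the lookup and the reference grab: verify it is still the
		 * page we wanted before returning it.
		 */
		if (unlikely(page->mapping != mapping ||
			     page->index != offset)) {
			page_cache_release(page);
			goto repeat;
		}
	}
	rcu_read_unlock();
	return page;
}
```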
Thanks to Hugh for pointing out an improvement to the algorithm: setting
page_count to zero when we have control of all references, in order to
hold off speculative getters.
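That improvement is what the page_freeze_refs()/page_unfreeze_refs() pair below implements: with the mapping's tree_lock held, the refcount is atomically dropped to zero so any concurrent speculative getter backs off and retries. A sketch of how a reclaim-style remover can use the pair (try_remove_page() is a hypothetical helper, not the literal mm/vmscan.c code, and the tree_lock primitives changed around this series):

```c
/*
 * Sketch: remove an otherwise-unused page from the pagecache while
 * defeating speculative getters.  The expected count is 2: one reference
 * held by the caller, one held by the pagecache.  Freezing sets _count
 * to 0, so a racing page_cache_get_speculative() sees zero and retries.
 */
static int try_remove_page(struct address_space *mapping, struct page *page)
{
	spin_lock_irq(&mapping->tree_lock);
	if (!page_freeze_refs(page, 2)) {
		/* Extra references exist (e.g. a speculative get won): bail. */
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	__remove_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	/*
	 * Unfreeze with 1 rather than 2: the caller keeps its reference,
	 * and the pagecache's reference is dropped without another atomic op.
	 */
	page_unfreeze_refs(page, 1);
	return 1;
}
```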
[kamezawa.hiroyu@jp.fujitsu.com: fix migration_entry_wait()]
[hugh@veritas.com: fix add_to_page_cache]
[akpm@linux-foundation.org: repair a comment]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include')
-rw-r--r--  include/linux/pagemap.h | 111
1 file changed, 110 insertions, 1 deletion
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ee1ec2c7723c..a81d81890422 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -12,6 +12,7 @@
 #include <asm/uaccess.h>
 #include <linux/gfp.h>
 #include <linux/bitops.h>
+#include <linux/hardirq.h> /* for in_interrupt() */
 
 /*
  * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page
@@ -62,6 +63,98 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+/*
+ * speculatively take a reference to a page.
+ * If the page is free (_count == 0), then _count is untouched, and 0
+ * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ *
+ * This function must be called inside the same rcu_read_lock() section as has
+ * been used to lookup the page in the pagecache radix-tree (or page table):
+ * this allows allocators to use a synchronize_rcu() to stabilize _count.
+ *
+ * Unless an RCU grace period has passed, the count of all pages coming out
+ * of the allocator must be considered unstable. page_count may return higher
+ * than expected, and put_page must be able to do the right thing when the
+ * page has been finished with, no matter what it is subsequently allocated
+ * for (because put_page is what is used here to drop an invalid speculative
+ * reference).
+ *
+ * This is the interesting part of the lockless pagecache (and lockless
+ * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
+ * has the following pattern:
+ * 1. find page in radix tree
+ * 2. conditionally increment refcount
+ * 3. check the page is still in pagecache (if no, goto 1)
+ *
+ * Remove-side that cares about stability of _count (eg. reclaim) has the
+ * following (with tree_lock held for write):
+ * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
+ * B. remove page from pagecache
+ * C. free the page
+ *
+ * There are 2 critical interleavings that matter:
+ * - 2 runs before A: in this case, A sees elevated refcount and bails out
+ * - A runs before 2: in this case, 2 sees zero refcount and retries;
+ *   subsequently, B will complete and 1 will find no page, causing the
+ *   lookup to return NULL.
+ *
+ * It is possible that between 1 and 2, the page is removed then the exact same
+ * page is inserted into the same position in pagecache. That's OK: the
+ * old find_get_page using tree_lock could equally have run before or after
+ * such a re-insertion, depending on order that locks are granted.
+ *
+ * Lookups racing against pagecache insertion isn't a big problem: either 1
+ * will find the page or it will not. Likewise, the old find_get_page could run
+ * either before the insertion or afterwards, depending on timing.
+ */
+static inline int page_cache_get_speculative(struct page *page)
+{
+	VM_BUG_ON(in_interrupt());
+
+#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU)
+# ifdef CONFIG_PREEMPT
+	VM_BUG_ON(!in_atomic());
+# endif
+	/*
+	 * Preempt must be disabled here - we rely on rcu_read_lock doing
+	 * this for us.
+	 *
+	 * Pagecache won't be truncated from interrupt context, so if we have
+	 * found a page in the radix tree here, we have pinned its refcount by
+	 * disabling preempt, and hence no need for the "speculative get" that
+	 * SMP requires.
+	 */
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_inc(&page->_count);
+
+#else
+	if (unlikely(!get_page_unless_zero(page))) {
+		/*
+		 * Either the page has been freed, or will be freed.
+		 * In either case, retry here and the caller should
+		 * do the right thing (see comments above).
+		 */
+		return 0;
+	}
+#endif
+	VM_BUG_ON(PageTail(page));
+
+	return 1;
+}
+
+static inline int page_freeze_refs(struct page *page, int count)
+{
+	return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+}
+
+static inline void page_unfreeze_refs(struct page *page, int count)
+{
+	VM_BUG_ON(page_count(page) != 0);
+	VM_BUG_ON(count == 0);
+
+	atomic_set(&page->_count, count);
+}
+
 #ifdef CONFIG_NUMA
 extern struct page *__page_cache_alloc(gfp_t gfp);
 #else
@@ -133,7 +226,7 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
 	return read_cache_page(mapping, index, filler, data);
 }
 
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		pgoff_t index, gfp_t gfp_mask);
@@ -141,6 +234,22 @@ extern void remove_from_page_cache(struct page *page);
 extern void __remove_from_page_cache(struct page *page);
 
 /*
+ * Like add_to_page_cache_locked, but used to add newly allocated pages:
+ * the page is new, so we can just run SetPageLocked() against it.
+ */
+static inline int add_to_page_cache(struct page *page,
+		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
+{
+	int error;
+
+	SetPageLocked(page);
+	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
+	if (unlikely(error))
+		ClearPageLocked(page);
+	return error;
+}
+
+/*
  * Return byte-offset into filesystem object for page.
  */
 static inline loff_t page_offset(struct page *page)
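A caller of the new add_to_page_cache() wrapper looks much as before, except the page comes back locked on success and the wrapper clears PageLocked itself on failure. A hypothetical usage sketch (grab_new_cache_page() is not a real kernel helper):

```c
/*
 * Sketch: allocate a fresh page and insert it into the pagecache at
 * @index.  On success the page is returned locked, with references held
 * by both the caller and the pagecache.
 */
static struct page *grab_new_cache_page(struct address_space *mapping,
					pgoff_t index)
{
	struct page *page = page_cache_alloc_cold(mapping);

	if (!page)
		return NULL;
	if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) {
		/* Raced with another insertion, or out of memory. */
		page_cache_release(page);
		return NULL;
	}
	return page;	/* caller unlocks once the page is initialised */
}
```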