aboutsummaryrefslogtreecommitdiffstats
path: root/mm/swap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/swap.c')
-rw-r--r--mm/swap.c278
1 files changed, 155 insertions, 123 deletions
diff --git a/mm/swap.c b/mm/swap.c
index 84b26aaabd03..d1100b619e61 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
35 34
36#include "internal.h" 35#include "internal.h"
37 36
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page)
82 81
83static void put_compound_page(struct page *page) 82static void put_compound_page(struct page *page)
84{ 83{
85 if (unlikely(PageTail(page))) { 84 struct page *page_head;
86 /* __split_huge_page_refcount can run under us */
87 struct page *page_head = compound_trans_head(page);
88
89 if (likely(page != page_head &&
90 get_page_unless_zero(page_head))) {
91 unsigned long flags;
92 85
86 if (likely(!PageTail(page))) {
87 if (put_page_testzero(page)) {
93 /* 88 /*
94 * THP can not break up slab pages so avoid taking 89 * By the time all refcounts have been released
95 * compound_lock(). Slab performs non-atomic bit ops 90 * split_huge_page cannot run anymore from under us.
96 * on page->flags for better performance. In particular
97 * slab_unlock() in slub used to be a hot path. It is
98 * still hot on arches that do not support
99 * this_cpu_cmpxchg_double().
100 */ 91 */
101 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 92 if (PageHead(page))
102 if (likely(PageTail(page))) { 93 __put_compound_page(page);
103 /* 94 else
104 * __split_huge_page_refcount 95 __put_single_page(page);
105 * cannot race here. 96 }
106 */ 97 return;
107 VM_BUG_ON(!PageHead(page_head)); 98 }
108 atomic_dec(&page->_mapcount); 99
109 if (put_page_testzero(page_head)) 100 /* __split_huge_page_refcount can run under us */
110 VM_BUG_ON(1); 101 page_head = compound_trans_head(page);
111 if (put_page_testzero(page_head)) 102
112 __put_compound_page(page_head); 103 /*
113 return; 104 * THP can not break up slab pages so avoid taking
114 } else 105 * compound_lock() and skip the tail page refcounting (in
115 /* 106 * _mapcount) too. Slab performs non-atomic bit ops on
116 * __split_huge_page_refcount 107 * page->flags for better performance. In particular
117 * run before us, "page" was a 108 * slab_unlock() in slub used to be a hot path. It is still
118 * THP tail. The split 109 * hot on arches that do not support
119 * page_head has been freed 110 * this_cpu_cmpxchg_double().
120 * and reallocated as slab or 111 *
121 * hugetlbfs page of smaller 112 * If "page" is part of a slab or hugetlbfs page it cannot be
122 * order (only possible if 113 * splitted and the head page cannot change from under us. And
123 * reallocated as slab on 114 * if "page" is part of a THP page under splitting, if the
124 * x86). 115 * head page pointed by the THP tail isn't a THP head anymore,
125 */ 116 * we'll find PageTail clear after smp_rmb() and we'll treat
126 goto skip_lock; 117 * it as a single page.
127 } 118 */
119 if (!__compound_tail_refcounted(page_head)) {
120 /*
121 * If "page" is a THP tail, we must read the tail page
122 * flags after the head page flags. The
123 * split_huge_page side enforces write memory barriers
124 * between clearing PageTail and before the head page
125 * can be freed and reallocated.
126 */
127 smp_rmb();
128 if (likely(PageTail(page))) {
128 /* 129 /*
129 * page_head wasn't a dangling pointer but it 130 * __split_huge_page_refcount cannot race
130 * may not be a head page anymore by the time 131 * here.
131 * we obtain the lock. That is ok as long as it
132 * can't be freed from under us.
133 */ 132 */
134 flags = compound_lock_irqsave(page_head); 133 VM_BUG_ON(!PageHead(page_head));
135 if (unlikely(!PageTail(page))) { 134 VM_BUG_ON(page_mapcount(page) != 0);
136 /* __split_huge_page_refcount run before us */ 135 if (put_page_testzero(page_head)) {
137 compound_unlock_irqrestore(page_head, flags); 136 /*
138skip_lock: 137 * If this is the tail of a slab
139 if (put_page_testzero(page_head)) { 138 * compound page, the tail pin must
140 /* 139 * not be the last reference held on
141 * The head page may have been 140 * the page, because the PG_slab
142 * freed and reallocated as a 141 * cannot be cleared before all tail
143 * compound page of smaller 142 * pins (which skips the _mapcount
144 * order and then freed again. 143 * tail refcounting) have been
145 * All we know is that it 144 * released. For hugetlbfs the tail
146 * cannot have become: a THP 145 * pin may be the last reference on
147 * page, a compound page of 146 * the page instead, because
148 * higher order, a tail page. 147 * PageHeadHuge will not go away until
149 * That is because we still 148 * the compound page enters the buddy
150 * hold the refcount of the 149 * allocator.
151 * split THP tail and 150 */
152 * page_head was the THP head 151 VM_BUG_ON(PageSlab(page_head));
153 * before the split. 152 __put_compound_page(page_head);
154 */
155 if (PageHead(page_head))
156 __put_compound_page(page_head);
157 else
158 __put_single_page(page_head);
159 }
160out_put_single:
161 if (put_page_testzero(page))
162 __put_single_page(page);
163 return;
164 } 153 }
165 VM_BUG_ON(page_head != page->first_page); 154 return;
155 } else
166 /* 156 /*
167 * We can release the refcount taken by 157 * __split_huge_page_refcount run before us,
168 * get_page_unless_zero() now that 158 * "page" was a THP tail. The split page_head
169 * __split_huge_page_refcount() is blocked on 159 * has been freed and reallocated as slab or
170 * the compound_lock. 160 * hugetlbfs page of smaller order (only
161 * possible if reallocated as slab on x86).
171 */ 162 */
172 if (put_page_testzero(page_head)) 163 goto out_put_single;
173 VM_BUG_ON(1); 164 }
174 /* __split_huge_page_refcount will wait now */
175 VM_BUG_ON(page_mapcount(page) <= 0);
176 atomic_dec(&page->_mapcount);
177 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
178 VM_BUG_ON(atomic_read(&page->_count) != 0);
179 compound_unlock_irqrestore(page_head, flags);
180 165
166 if (likely(page != page_head && get_page_unless_zero(page_head))) {
167 unsigned long flags;
168
169 /*
170 * page_head wasn't a dangling pointer but it may not
171 * be a head page anymore by the time we obtain the
172 * lock. That is ok as long as it can't be freed from
173 * under us.
174 */
175 flags = compound_lock_irqsave(page_head);
176 if (unlikely(!PageTail(page))) {
177 /* __split_huge_page_refcount run before us */
178 compound_unlock_irqrestore(page_head, flags);
181 if (put_page_testzero(page_head)) { 179 if (put_page_testzero(page_head)) {
180 /*
181 * The head page may have been freed
182 * and reallocated as a compound page
183 * of smaller order and then freed
184 * again. All we know is that it
185 * cannot have become: a THP page, a
186 * compound page of higher order, a
187 * tail page. That is because we
188 * still hold the refcount of the
189 * split THP tail and page_head was
190 * the THP head before the split.
191 */
182 if (PageHead(page_head)) 192 if (PageHead(page_head))
183 __put_compound_page(page_head); 193 __put_compound_page(page_head);
184 else 194 else
185 __put_single_page(page_head); 195 __put_single_page(page_head);
186 } 196 }
187 } else { 197out_put_single:
188 /* page_head is a dangling pointer */ 198 if (put_page_testzero(page))
189 VM_BUG_ON(PageTail(page)); 199 __put_single_page(page);
190 goto out_put_single; 200 return;
191 } 201 }
192 } else if (put_page_testzero(page)) { 202 VM_BUG_ON(page_head != page->first_page);
193 if (PageHead(page)) 203 /*
194 __put_compound_page(page); 204 * We can release the refcount taken by
195 else 205 * get_page_unless_zero() now that
196 __put_single_page(page); 206 * __split_huge_page_refcount() is blocked on the
207 * compound_lock.
208 */
209 if (put_page_testzero(page_head))
210 VM_BUG_ON(1);
211 /* __split_huge_page_refcount will wait now */
212 VM_BUG_ON(page_mapcount(page) <= 0);
213 atomic_dec(&page->_mapcount);
214 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
215 VM_BUG_ON(atomic_read(&page->_count) != 0);
216 compound_unlock_irqrestore(page_head, flags);
217
218 if (put_page_testzero(page_head)) {
219 if (PageHead(page_head))
220 __put_compound_page(page_head);
221 else
222 __put_single_page(page_head);
223 }
224 } else {
225 /* page_head is a dangling pointer */
226 VM_BUG_ON(PageTail(page));
227 goto out_put_single;
197 } 228 }
198} 229}
199 230
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page)
221 * split_huge_page(). 252 * split_huge_page().
222 */ 253 */
223 unsigned long flags; 254 unsigned long flags;
224 bool got = false; 255 bool got;
225 struct page *page_head = compound_trans_head(page); 256 struct page *page_head = compound_trans_head(page);
226 257
227 if (likely(page != page_head && get_page_unless_zero(page_head))) { 258 /* Ref to put_compound_page() comment. */
228 /* Ref to put_compound_page() comment. */ 259 if (!__compound_tail_refcounted(page_head)) {
229 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 260 smp_rmb();
230 if (likely(PageTail(page))) { 261 if (likely(PageTail(page))) {
231 /* 262 /*
232 * This is a hugetlbfs page or a slab 263 * This is a hugetlbfs page or a slab
233 * page. __split_huge_page_refcount 264 * page. __split_huge_page_refcount
234 * cannot race here. 265 * cannot race here.
235 */ 266 */
236 VM_BUG_ON(!PageHead(page_head)); 267 VM_BUG_ON(!PageHead(page_head));
237 __get_page_tail_foll(page, false); 268 __get_page_tail_foll(page, true);
238 return true; 269 return true;
239 } else { 270 } else {
240 /* 271 /*
241 * __split_huge_page_refcount run 272 * __split_huge_page_refcount run
242 * before us, "page" was a THP 273 * before us, "page" was a THP
243 * tail. The split page_head has been 274 * tail. The split page_head has been
244 * freed and reallocated as slab or 275 * freed and reallocated as slab or
245 * hugetlbfs page of smaller order 276 * hugetlbfs page of smaller order
246 * (only possible if reallocated as 277 * (only possible if reallocated as
247 * slab on x86). 278 * slab on x86).
248 */ 279 */
249 put_page(page_head); 280 return false;
250 return false;
251 }
252 } 281 }
282 }
253 283
284 got = false;
285 if (likely(page != page_head && get_page_unless_zero(page_head))) {
254 /* 286 /*
255 * page_head wasn't a dangling pointer but it 287 * page_head wasn't a dangling pointer but it
256 * may not be a head page anymore by the time 288 * may not be a head page anymore by the time