author		Hugh Dickins <hughd@google.com>		2011-08-03 19:21:22 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-08-03 20:25:23 -0400
commit		7a5d0fbb29936fad7f17b1cb001b0c33a5f13328 (patch)
tree		903c66ee195451f45c782155a8eb6dd6e7217522 /mm
parent		bda97eab0cc9c6385b9f26abdda6459f630f4513 (diff)
tmpfs: convert shmem_truncate_range to radix-swap
Disable the toy swapping implementation in shmem_writepage() - it's hard
to support two schemes at once - and convert shmem_truncate_range() to a
lockless gang lookup of swap entries along with pages, freeing both.
Since the second loop tightens its noose until all entries of either
kind have been squeezed out (and we shall make sure that there's not an
instant when neither is visible), there is no longer a need for yet
another pass below.
shmem_radix_tree_replace() compensates for the lockless lookup by
checking that the expected entry is in place, under lock, before
replacing it. Here it just deletes, but will be used in later patches
to substitute swap entry for page or page for swap entry.
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/shmem.c	192
1 file changed, 146 insertions(+), 46 deletions(-)
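Background on the radix-swap scheme this series moves to (a hedged sketch, not part of this commit's diff): swap entries live directly in the page-cache radix tree as "exceptional" entries, so a single lookup slot can hold either a struct page pointer or a packed swap entry. The encoding follows the swp_to_radix_entry()/radix_to_swp_entry() helpers the series relies on, roughly:

	/* Pack a swp_entry_t into a radix-tree slot as an exceptional entry. */
	static inline void *swp_to_radix_entry(swp_entry_t entry)
	{
		unsigned long value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;
		return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
	}

	/* Recover the swp_entry_t from an exceptional radix-tree entry. */
	static inline swp_entry_t radix_to_swp_entry(void *arg)
	{
		swp_entry_t entry;
		entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
		return entry;
	}

Because the low bits mark the value as exceptional, radix_tree_exceptional_entry() lets callers such as the new shmem_find_get_pages_and_swap() below tell swap entries apart from page pointers without taking any locks.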
diff --git a/mm/shmem.c b/mm/shmem.c
index e101c211ed1f..4439b7d55819 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -238,6 +238,111 @@ static swp_entry_t shmem_get_swap(struct shmem_inode_info *info, pgoff_t index)
 		info->i_direct[index] : (swp_entry_t){0};
 }
 
+/*
+ * Replace item expected in radix tree by a new item, while holding tree lock.
+ */
+static int shmem_radix_tree_replace(struct address_space *mapping,
+			pgoff_t index, void *expected, void *replacement)
+{
+	void **pslot;
+	void *item = NULL;
+
+	VM_BUG_ON(!expected);
+	pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
+	if (pslot)
+		item = radix_tree_deref_slot_protected(pslot,
+							&mapping->tree_lock);
+	if (item != expected)
+		return -ENOENT;
+	if (replacement)
+		radix_tree_replace_slot(pslot, replacement);
+	else
+		radix_tree_delete(&mapping->page_tree, index);
+	return 0;
+}
+
+/*
+ * Like find_get_pages, but collecting swap entries as well as pages.
+ */
+static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
+					pgoff_t start, unsigned int nr_pages,
+					struct page **pages, pgoff_t *indices)
+{
+	unsigned int i;
+	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, indices, start, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_exceptional_entry(page))
+				goto export;
+			/* radix_tree_deref_retry(page) */
+			goto restart;
+		}
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+export:
+		indices[ret] = indices[i];
+		pages[ret] = page;
+		ret++;
+	}
+	if (unlikely(!ret && nr_found))
+		goto restart;
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * Remove swap entry from radix tree, free the swap and its page cache.
+ */
+static int shmem_free_swap(struct address_space *mapping,
+			   pgoff_t index, void *radswap)
+{
+	int error;
+
+	spin_lock_irq(&mapping->tree_lock);
+	error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+	spin_unlock_irq(&mapping->tree_lock);
+	if (!error)
+		free_swap_and_cache(radix_to_swp_entry(radswap));
+	return error;
+}
+
+/*
+ * Pagevec may contain swap entries, so shuffle up pages before releasing.
+ */
+static void shmem_pagevec_release(struct pagevec *pvec)
+{
+	int i, j;
+
+	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		if (!radix_tree_exceptional_entry(page))
+			pvec->pages[j++] = page;
+	}
+	pvec->nr = j;
+	pagevec_release(pvec);
+}
+
+/*
+ * Remove range of pages and swap entries from radix tree, and free them.
+ */
 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
 	struct address_space *mapping = inode->i_mapping;
@@ -246,36 +351,44 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 	pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
 	struct pagevec pvec;
+	pgoff_t indices[PAGEVEC_SIZE];
+	long nr_swaps_freed = 0;
 	pgoff_t index;
-	swp_entry_t swap;
 	int i;
 
 	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while (index <= end && pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+	while (index <= end) {
+		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+							pvec.pages, indices);
+		if (!pvec.nr)
+			break;
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
-			/* We rely upon deletion not changing page->index */
-			index = page->index;
+			index = indices[i];
 			if (index > end)
 				break;
 
-			if (!trylock_page(page))
+			if (radix_tree_exceptional_entry(page)) {
+				nr_swaps_freed += !shmem_free_swap(mapping,
+								index, page);
 				continue;
-			WARN_ON(page->index != index);
-			if (PageWriteback(page)) {
-				unlock_page(page);
-				continue;
+			}
+
+			if (!trylock_page(page))
+				continue;
+			if (page->mapping == mapping) {
+				VM_BUG_ON(PageWriteback(page));
+				truncate_inode_page(mapping, page);
 			}
-			truncate_inode_page(mapping, page);
 			unlock_page(page);
 		}
-		pagevec_release(&pvec);
+		shmem_pagevec_release(&pvec);
 		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
@@ -295,59 +408,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
-		if (!pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+							pvec.pages, indices);
+		if (!pvec.nr) {
 			if (index == start)
 				break;
 			index = start;
 			continue;
 		}
-		if (index == start && pvec.pages[0]->index > end) {
-			pagevec_release(&pvec);
+		if (index == start && indices[0] > end) {
+			shmem_pagevec_release(&pvec);
 			break;
 		}
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
-			/* We rely upon deletion not changing page->index */
-			index = page->index;
+			index = indices[i];
 			if (index > end)
 				break;
 
+			if (radix_tree_exceptional_entry(page)) {
+				nr_swaps_freed += !shmem_free_swap(mapping,
+								index, page);
+				continue;
+			}
+
 			lock_page(page);
-			WARN_ON(page->index != index);
-			wait_on_page_writeback(page);
-			truncate_inode_page(mapping, page);
+			if (page->mapping == mapping) {
+				VM_BUG_ON(PageWriteback(page));
+				truncate_inode_page(mapping, page);
+			}
 			unlock_page(page);
 		}
-		pagevec_release(&pvec);
+		shmem_pagevec_release(&pvec);
 		mem_cgroup_uncharge_end();
 		index++;
 	}
 
-	if (end > SHMEM_NR_DIRECT)
-		end = SHMEM_NR_DIRECT;
-
 	spin_lock(&info->lock);
-	for (index = start; index < end; index++) {
-		swap = shmem_get_swap(info, index);
-		if (swap.val) {
-			free_swap_and_cache(swap);
-			shmem_put_swap(info, index, (swp_entry_t){0});
-			info->swapped--;
-		}
-	}
-
-	if (mapping->nrpages) {
-		spin_unlock(&info->lock);
-		/*
-		 * A page may have meanwhile sneaked in from swap.
-		 */
-		truncate_inode_pages_range(mapping, lstart, lend);
-		spin_lock(&info->lock);
-	}
-
+	info->swapped -= nr_swaps_freed;
 	shmem_recalc_inode(inode);
 	spin_unlock(&info->lock);
 
@@ -552,11 +653,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	/*
-	 * Just for this patch, we have a toy implementation,
-	 * which can swap out only the first SHMEM_NR_DIRECT pages:
-	 * for simple demonstration of where we need to think about swap.
+	 * Disable even the toy swapping implementation, while we convert
+	 * functions one by one to having swap entries in the radix tree.
 	 */
-	if (index >= SHMEM_NR_DIRECT)
+	if (index < ULONG_MAX)
 		goto redirty;
 
 	swap = get_swap_page();
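A note on the writepage hunk: index is a pgoff_t, so the new test `if (index < ULONG_MAX)` is effectively always true, every page gets redirtied, and shmem swapping stays disabled until later patches in the series re-enable it through the radix tree. Those later patches build on the lookup-then-verify pattern of shmem_radix_tree_replace(); a minimal illustrative sketch of the page-for-swap substitution the commit message mentions (the function name and missing bookkeeping are hypothetical, not from this commit):

	/*
	 * Illustrative only: install a page where a swap entry was expected,
	 * holding tree_lock so the earlier lockless lookup cannot race with us.
	 */
	static int example_replace_swap_with_page(struct address_space *mapping,
						  pgoff_t index, void *radswap,
						  struct page *page)
	{
		int error;

		spin_lock_irq(&mapping->tree_lock);
		error = shmem_radix_tree_replace(mapping, index, radswap, page);
		spin_unlock_irq(&mapping->tree_lock);
		return error;	/* -ENOENT if the entry was already gone or changed */
	}

A real caller would also adjust page-cache and swap accounting. shmem_free_swap() above is the deletion variant of the same pattern: it returns 0 only when the expected entry was still in place, which is why the truncation loops count freed entries with nr_swaps_freed += !shmem_free_swap(...).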