author	Hugh Dickins <hughd@google.com>	2012-01-20 17:34:21 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-01-23 11:38:48 -0500
commit	245132643e1cfcd145bbc86a716c1818371fcb93 (patch)
tree	e5bf3cb56efedb059b1a68fd8efd37482131783b
parent	85046579bde15e532983438f86b36856e358f417 (diff)
SHM_UNLOCK: fix Unevictable pages stranded after swap
Commit cc39c6a9bbde ("mm: account skipped entries to avoid looping in
find_get_pages") correctly fixed an infinite loop; but left a problem that
find_get_pages() on shmem would return 0 (appearing to callers to mean end
of tree) when it meets a run of nr_pages swap entries.

The only uses of find_get_pages() on shmem are via pagevec_lookup(), called
from invalidate_mapping_pages(), and from shmctl SHM_UNLOCK's
scan_mapping_unevictable_pages(). The first is already commented, and not
worth worrying about; but the second can leave pages on the Unevictable
list after an unusual sequence of swapping and locking.

Fix that by using shmem_find_get_pages_and_swap() (then ignoring the swap)
instead of pagevec_lookup().

But I don't want to contaminate vmscan.c with shmem internals, nor shmem.c
with LRU locking. So move scan_mapping_unevictable_pages() into shmem.c,
renaming it shmem_unlock_mapping(); and rename check_move_unevictable_page()
to check_move_unevictable_pages(), looping down an array of pages,
oftentimes under the same lock.

Leave out the "rotate unevictable list" block: that's a leftover from when
this was used for /proc/sys/vm/scan_unevictable_pages, whose flawed handling
involved looking at pages at tail of LRU.

Was there significance to the sequence first ClearPageUnevictable, then test
page_evictable, then SetPageUnevictable here? I think not, we're under LRU
lock, and have no barriers between those.

Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: <stable@vger.kernel.org> [back to 3.1 but will need respins]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
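[Editor's note: for orientation, a minimal userspace sketch, not part of the
patch, of the SysV IPC sequence that reaches the code changed here. SHM_LOCK
marks the segment's pages unevictable; SHM_UNLOCK is what now ends up in
shmem_unlock_mapping() to move them back to their evictable lists. The size
and fill pattern are illustrative only.]

#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	size_t size = 4 * 1024 * 1024;
	int id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0600);
	if (id < 0) { perror("shmget"); return 1; }

	char *p = shmat(id, NULL, 0);
	if (p == (void *)-1) { perror("shmat"); return 1; }

	memset(p, 0xaa, size);			/* populate the segment's pages */

	if (shmctl(id, SHM_LOCK, NULL))		/* pages become Unevictable */
		perror("shmctl(SHM_LOCK)");
	/*
	 * If pages of the segment are swapped out under memory pressure
	 * before the lock (or after a racing unlock/lock), the old
	 * pagevec_lookup() scan could mistake a run of swap entries for
	 * end-of-tree and strand pages on the Unevictable list.
	 */
	if (shmctl(id, SHM_UNLOCK, NULL))	/* kernel rescans the mapping */
		perror("shmctl(SHM_UNLOCK)");

	shmdt(p);
	shmctl(id, IPC_RMID, NULL);		/* clean up the segment */
	return 0;
}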
-rw-r--r--	include/linux/shmem_fs.h	1
-rw-r--r--	include/linux/swap.h	2
-rw-r--r--	ipc/shm.c	2
-rw-r--r--	mm/shmem.c	46
-rw-r--r--	mm/vmscan.c	128
5 files changed, 83 insertions, 96 deletions
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index e4c711c6f321..79ab2555b3b0 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -48,6 +48,7 @@ extern struct file *shmem_file_setup(const char *name,
 						loff_t size, unsigned long flags);
 extern int shmem_zero_setup(struct vm_area_struct *);
 extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern void shmem_unlock_mapping(struct address_space *mapping);
 extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 					pgoff_t index, gfp_t gfp_mask);
 extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 06061a7f8e69..3e60228e7299 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -273,7 +273,7 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 #endif
 
 extern int page_evictable(struct page *page, struct vm_area_struct *vma);
-extern void scan_mapping_unevictable_pages(struct address_space *);
+extern void check_move_unevictable_pages(struct page **, int nr_pages);
 
 extern unsigned long scan_unevictable_pages;
 extern int scan_unevictable_handler(struct ctl_table *, int,
diff --git a/ipc/shm.c b/ipc/shm.c
index 854ab58e5f6e..b76be5bda6c2 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -916,7 +916,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
 			shp->mlock_user = NULL;
 			get_file(shm_file);
 			shm_unlock(shp);
-			scan_mapping_unevictable_pages(shm_file->f_mapping);
+			shmem_unlock_mapping(shm_file->f_mapping);
 			fput(shm_file);
 			goto out;
 		}
diff --git a/mm/shmem.c b/mm/shmem.c
index 4aaa53abe302..269d049294ab 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -379,7 +379,7 @@ static int shmem_free_swap(struct address_space *mapping,
 /*
  * Pagevec may contain swap entries, so shuffle up pages before releasing.
  */
-static void shmem_pagevec_release(struct pagevec *pvec)
+static void shmem_deswap_pagevec(struct pagevec *pvec)
 {
 	int i, j;
 
@@ -389,7 +389,36 @@ static void shmem_pagevec_release(struct pagevec *pvec)
 		pvec->pages[j++] = page;
 	}
 	pvec->nr = j;
-	pagevec_release(pvec);
+}
+
+/*
+ * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
+ */
+void shmem_unlock_mapping(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	pgoff_t indices[PAGEVEC_SIZE];
+	pgoff_t index = 0;
+
+	pagevec_init(&pvec, 0);
+	/*
+	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
+	 */
+	while (!mapping_unevictable(mapping)) {
+		/*
+		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
+		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
+		 */
+		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+					PAGEVEC_SIZE, pvec.pages, indices);
+		if (!pvec.nr)
+			break;
+		index = indices[pvec.nr - 1] + 1;
+		shmem_deswap_pagevec(&pvec);
+		check_move_unevictable_pages(pvec.pages, pvec.nr);
+		pagevec_release(&pvec);
+		cond_resched();
+	}
 }
 
 /*
@@ -440,7 +469,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			}
 			unlock_page(page);
 		}
-		shmem_pagevec_release(&pvec);
+		shmem_deswap_pagevec(&pvec);
+		pagevec_release(&pvec);
 		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
@@ -470,7 +500,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			continue;
 		}
 		if (index == start && indices[0] > end) {
-			shmem_pagevec_release(&pvec);
+			shmem_deswap_pagevec(&pvec);
+			pagevec_release(&pvec);
 			break;
 		}
 		mem_cgroup_uncharge_start();
@@ -494,7 +525,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			}
 			unlock_page(page);
 		}
-		shmem_pagevec_release(&pvec);
+		shmem_deswap_pagevec(&pvec);
+		pagevec_release(&pvec);
 		mem_cgroup_uncharge_end();
 		index++;
 	}
@@ -2438,6 +2470,10 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
 	return 0;
 }
 
+void shmem_unlock_mapping(struct address_space *mapping)
+{
+}
+
 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
 	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
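[Editor's note: the mm/shmem.c side splits the old shmem_pagevec_release()
into a pure compaction step, shmem_deswap_pagevec(), followed by an explicit
pagevec_release(). A standalone sketch of that compaction, with a
hypothetical low tag bit standing in for shmem's exceptional radix-tree swap
entries:]

#include <stddef.h>

/* Hypothetical encoding: the low bit tags a slot as a swap entry, much as
 * shmem stores swap entries as exceptional radix-tree entries. */
static int is_swap_entry(void *slot)
{
	return (size_t)slot & 1;
}

/* Shuffle real pages up to the front and drop swap entries: the same
 * in-place compaction shmem_deswap_pagevec() performs on pvec->pages[].
 * Returns the new count, so the caller releases only real pages. */
static int deswap(void **slots, int nr)
{
	int i, j = 0;

	for (i = 0; i < nr; i++)
		if (!is_swap_entry(slots[i]))
			slots[j++] = slots[i];
	return j;
}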
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e097c1026b58..c52b23552659 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -26,7 +26,6 @@
 #include <linux/buffer_head.h>	/* for try_to_release_page(),
 					buffer_heads_over_limit */
 #include <linux/mm_inline.h>
-#include <linux/pagevec.h>
 #include <linux/backing-dev.h>
 #include <linux/rmap.h>
 #include <linux/topology.h>
@@ -661,7 +660,7 @@ redo:
 		 * When racing with an mlock or AS_UNEVICTABLE clearing
 		 * (page is unlocked) make sure that if the other thread
 		 * does not observe our setting of PG_lru and fails
-		 * isolation/check_move_unevictable_page,
+		 * isolation/check_move_unevictable_pages,
 		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
 		 * the page back to the evictable list.
 		 *
@@ -3501,107 +3500,58 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
 
 #ifdef CONFIG_SHMEM
 /**
- * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
- * @page: page to check evictability and move to appropriate lru list
- * @zone: zone page is in
+ * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
+ * @pages:	array of pages to check
+ * @nr_pages:	number of pages to check
  *
- * Checks a page for evictability and moves the page to the appropriate
- * zone lru list.
- *
- * Restrictions: zone->lru_lock must be held, page must be on LRU and must
- * have PageUnevictable set.
+ * Checks pages for evictability and moves them to the appropriate lru list.
  *
  * This function is only used for SysV IPC SHM_UNLOCK.
  */
-static void check_move_unevictable_page(struct page *page, struct zone *zone)
+void check_move_unevictable_pages(struct page **pages, int nr_pages)
 {
 	struct lruvec *lruvec;
+	struct zone *zone = NULL;
+	int pgscanned = 0;
+	int pgrescued = 0;
+	int i;
 
-	VM_BUG_ON(PageActive(page));
-retry:
-	ClearPageUnevictable(page);
-	if (page_evictable(page, NULL)) {
-		enum lru_list l = page_lru_base_type(page);
-
-		__dec_zone_state(zone, NR_UNEVICTABLE);
-		lruvec = mem_cgroup_lru_move_lists(zone, page,
-						   LRU_UNEVICTABLE, l);
-		list_move(&page->lru, &lruvec->lists[l]);
-		__inc_zone_state(zone, NR_INACTIVE_ANON + l);
-		__count_vm_event(UNEVICTABLE_PGRESCUED);
-	} else {
-		/*
-		 * rotate unevictable list
-		 */
-		SetPageUnevictable(page);
-		lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE,
-						   LRU_UNEVICTABLE);
-		list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]);
-		if (page_evictable(page, NULL))
-			goto retry;
-	}
-}
-
-/**
- * scan_mapping_unevictable_pages - scan an address space for evictable pages
- * @mapping: struct address_space to scan for evictable pages
- *
- * Scan all pages in mapping.  Check unevictable pages for
- * evictability and move them to the appropriate zone lru list.
- *
- * This function is only used for SysV IPC SHM_UNLOCK.
- */
-void scan_mapping_unevictable_pages(struct address_space *mapping)
-{
-	pgoff_t next = 0;
-	pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
-			PAGE_CACHE_SHIFT;
-	struct zone *zone;
-	struct pagevec pvec;
-
-	if (mapping->nrpages == 0)
-		return;
-
-	pagevec_init(&pvec, 0);
-	while (next < end &&
-		pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
-		int i;
-		int pg_scanned = 0;
-
-		zone = NULL;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			pgoff_t page_index = page->index;
-			struct zone *pagezone = page_zone(page);
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pages[i];
+		struct zone *pagezone;
 
-			pg_scanned++;
-			if (page_index > next)
-				next = page_index;
-			next++;
+		pgscanned++;
+		pagezone = page_zone(page);
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
 
-			if (pagezone != zone) {
-				if (zone)
-					spin_unlock_irq(&zone->lru_lock);
-				zone = pagezone;
-				spin_lock_irq(&zone->lru_lock);
-			}
+		if (!PageLRU(page) || !PageUnevictable(page))
+			continue;
 
-			if (PageLRU(page) && PageUnevictable(page))
-				check_move_unevictable_page(page, zone);
+		if (page_evictable(page, NULL)) {
+			enum lru_list lru = page_lru_base_type(page);
+
+			VM_BUG_ON(PageActive(page));
+			ClearPageUnevictable(page);
+			__dec_zone_state(zone, NR_UNEVICTABLE);
+			lruvec = mem_cgroup_lru_move_lists(zone, page,
+						LRU_UNEVICTABLE, lru);
+			list_move(&page->lru, &lruvec->lists[lru]);
+			__inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+			pgrescued++;
 		}
-		if (zone)
-			spin_unlock_irq(&zone->lru_lock);
-		pagevec_release(&pvec);
-
-		count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
-		cond_resched();
 	}
+
+	if (zone) {
+		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
+		spin_unlock_irq(&zone->lru_lock);
+	}
 }
-#else
-void scan_mapping_unevictable_pages(struct address_space *mapping)
-{
-}
 #endif /* CONFIG_SHMEM */
 
 static void warn_scan_unevictable_pages(void)
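
[Editor's note: the locking shape the new check_move_unevictable_pages()
relies on, holding one zone's lru_lock across a run of same-zone pages and
cycling it only when the zone changes, sketched outside the kernel with a
pthread mutex standing in for zone->lru_lock. The struct definitions here
are illustrative stand-ins, not kernel types.]

#include <pthread.h>

struct zone { pthread_mutex_t lru_lock; };	/* stand-in for the kernel's */
struct page { struct zone *zone; };		/* zone and page structures  */

static struct zone *page_zone(struct page *page)
{
	return page->zone;
}

/* Take the lock once per run of same-zone pages rather than once per page:
 * callers hand in pages in mapping order, which tend to share a zone. */
static void process_pages(struct page **pages, int nr_pages)
{
	struct zone *zone = NULL;
	int i;

	for (i = 0; i < nr_pages; i++) {
		struct zone *pagezone = page_zone(pages[i]);

		if (pagezone != zone) {		/* zone changed: cycle lock */
			if (zone)
				pthread_mutex_unlock(&zone->lru_lock);
			zone = pagezone;
			pthread_mutex_lock(&zone->lru_lock);
		}
		/* ... per-page work, done under zone->lru_lock ... */
	}
	if (zone)
		pthread_mutex_unlock(&zone->lru_lock);
}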