Diffstat (limited to 'mm/swapfile.c')
 mm/swapfile.c | 150
 1 file changed, 99 insertions(+), 51 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648e1360..02ccab5ad9d9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -27,6 +27,7 @@
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -506,9 +507,24 @@ unsigned int count_swap_pages(int type, int free)
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
+	spinlock_t *ptl;
+	pte_t *pte;
+	int ret = 1;
+
+	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+		ret = -ENOMEM;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
+		if (ret > 0)
+			mem_cgroup_uncharge_page(page);
+		ret = 0;
+		goto out;
+	}
+
 	inc_mm_counter(vma->vm_mm, anon_rss);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +536,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 	 * immediately swapped out again after swapon.
 	 */
 	activate_page(page);
+out:
+	pte_unmap_unlock(pte, ptl);
+	return ret;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
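
The reworked unuse_pte() above issues the mem_cgroup_charge() before taking the pte lock (a GFP_KERNEL charge can sleep, which is not allowed under a spinlock), then rechecks the pte under the lock and drops the charge again if the entry no longer matches. A minimal userspace sketch of that prepare-outside-the-lock, recheck-inside, undo-on-mismatch shape; the slot/fill_slot names and the malloc stand-in are illustrative, not part of this patch:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;			/* shared pointer guarded by slot_lock */

/*
 * Fill the slot only if it is still empty.  The allocation (which may
 * block) is done before the lock is taken; if the recheck under the lock
 * shows someone else got there first, the prepared buffer is freed again.
 */
static int fill_slot(size_t size)
{
	void *buf = malloc(size);	/* prepare outside the critical section */
	int ret = 1;

	if (!buf)
		return -ENOMEM;

	pthread_mutex_lock(&slot_lock);
	if (slot) {			/* recheck under the lock */
		free(buf);		/* undo the preparation */
		ret = 0;
	} else {
		slot = buf;		/* commit while still holding the lock */
	}
	pthread_mutex_unlock(&slot_lock);
	return ret;
}

int main(void)
{
	int first = fill_slot(64);
	int second = fill_slot(64);

	printf("first fill: %d, second fill: %d\n", first, second);
	return 0;
}
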
@@ -528,23 +547,34 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 {
 	pte_t swp_pte = swp_entry_to_pte(entry);
 	pte_t *pte;
-	spinlock_t *ptl;
-	int found = 0;
+	int ret = 0;
 
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	/*
+	 * We don't actually need pte lock while scanning for swp_pte: since
+	 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
+	 * page table while we're scanning; though it could get zapped, and on
+	 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
+	 * of unmatched parts which look like swp_pte, so unuse_pte must
+	 * recheck under pte lock. Scanning without pte lock lets it be
+	 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+	 */
+	pte = pte_offset_map(pmd, addr);
 	do {
 		/*
 		 * swapoff spends a _lot_ of time in this loop!
 		 * Test inline before going to call unuse_pte.
 		 */
 		if (unlikely(pte_same(*pte, swp_pte))) {
-			unuse_pte(vma, pte++, addr, entry, page);
-			found = 1;
-			break;
+			pte_unmap(pte);
+			ret = unuse_pte(vma, pmd, addr, entry, page);
+			if (ret)
+				goto out;
+			pte = pte_offset_map(pmd, addr);
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
-	return found;
+	pte_unmap(pte - 1);
+out:
+	return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
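
The comment added in this hunk spells out the pattern: the unlocked scan only spots candidates, and unuse_pte() retakes the pte lock and rechecks with pte_same() before acting, because the lockless read may be stale or torn. A small userspace analogue of that lockless-scan-then-verify idea; the slots array, slots_lock and helper names are hypothetical, not from the kernel:

#include <pthread.h>
#include <stdio.h>

#define NSLOTS 64

static long slots[NSLOTS];		/* shared table, may change under us */
static pthread_mutex_t slots_lock = PTHREAD_MUTEX_INITIALIZER;

/* Act on a slot only after confirming, under the lock, that it still matches. */
static int use_slot(int i, long want)
{
	int ret = 0;

	pthread_mutex_lock(&slots_lock);
	if (slots[i] == want) {		/* the unlocked sighting may be stale */
		slots[i] = 0;		/* consume the slot */
		ret = 1;
	}
	pthread_mutex_unlock(&slots_lock);
	return ret;
}

/* Scan without the lock; a match is only a hint to be rechecked. */
static int scan_slots(long want)
{
	for (int i = 0; i < NSLOTS; i++)
		if (slots[i] == want && use_slot(i, want))
			return i;
	return -1;
}

int main(void)
{
	slots[7] = 42;
	printf("found at %d\n", scan_slots(42));
	return 0;
}
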
@@ -553,14 +583,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int ret;
 
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (unuse_pte_range(vma, pmd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
@@ -571,14 +603,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 {
 	pud_t *pud;
 	unsigned long next;
+	int ret;
 
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (unuse_pmd_range(vma, pud, addr, next, entry, page))
-			return 1;
+		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
@@ -588,6 +622,7 @@ static int unuse_vma(struct vm_area_struct *vma,
 {
 	pgd_t *pgd;
 	unsigned long addr, end, next;
+	int ret;
 
 	if (page->mapping) {
 		addr = page_address_in_vma(page, vma);
@@ -605,8 +640,9 @@ static int unuse_vma(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (unuse_pud_range(vma, pgd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
@@ -615,6 +651,7 @@ static int unuse_mm(struct mm_struct *mm,
 		swp_entry_t entry, struct page *page)
 {
 	struct vm_area_struct *vma;
+	int ret = 0;
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		/*
@@ -627,15 +664,11 @@ static int unuse_mm(struct mm_struct *mm,
 		lock_page(page);
 	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (vma->anon_vma && unuse_vma(vma, entry, page))
+		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
 			break;
 	}
 	up_read(&mm->mmap_sem);
-	/*
-	 * Currently unuse_mm cannot fail, but leave error handling
-	 * at call sites for now, since we change it from time to time.
-	 */
-	return 0;
+	return (ret < 0)? ret: 0;
 }
 
 /*
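
Across these hunks the boolean found/return-1 convention is replaced by a tri-state result passed up unchanged through unuse_pte_range(), unuse_pmd_range(), unuse_pud_range() and unuse_vma(): negative for an error, positive once the entry has been handled, zero to keep scanning, with unuse_mm() reporting only real errors to its caller. A compact userspace sketch of that propagation; visit(), walk_range() and walk_table() are made-up stand-ins, not kernel functions:

#include <stdio.h>

/*
 * Hypothetical leaf, mirroring the new unuse_pte() convention:
 * < 0 on error, 0 to keep scanning, > 0 once the entry was handled.
 */
static int visit(int slot, int target)
{
	return slot == target;
}

/* One level of the walk: stop and pass up any nonzero result. */
static int walk_range(int start, int end, int target)
{
	for (int i = start; i < end; i++) {
		int ret = visit(i, target);
		if (ret)
			return ret;
	}
	return 0;
}

/* Next level up follows the same rule, so errors and "found" both short-circuit. */
static int walk_table(int nr_ranges, int range_size, int target)
{
	for (int r = 0; r < nr_ranges; r++) {
		int ret = walk_range(r * range_size, (r + 1) * range_size, target);
		if (ret)
			return ret;
	}
	return 0;
}

int main(void)
{
	int ret = walk_table(4, 8, 13);

	/* The top level, like the new unuse_mm(), reports only real errors. */
	printf("walk returned %d, caller sees %d\n", ret, ret < 0 ? ret : 0);
	return 0;
}
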
@@ -730,7 +763,8 @@ static int try_to_unuse(unsigned int type)
 		 */
 		swap_map = &si->swap_map[i];
 		entry = swp_entry(type, i);
-		page = read_swap_cache_async(entry, NULL, 0);
+		page = read_swap_cache_async(entry,
+					GFP_HIGHUSER_MOVABLE, NULL, 0);
 		if (!page) {
 			/*
 			 * Either swap_duplicate() failed because entry
@@ -789,7 +823,7 @@ static int try_to_unuse(unsigned int type)
 			atomic_inc(&new_start_mm->mm_users);
 			atomic_inc(&prev_mm->mm_users);
 			spin_lock(&mmlist_lock);
-			while (*swap_map > 1 && !retval &&
+			while (*swap_map > 1 && !retval && !shmem &&
 					(p = p->next) != &start_mm->mmlist) {
 				mm = list_entry(p, struct mm_struct, mmlist);
 				if (!atomic_inc_not_zero(&mm->mm_users))
@@ -821,6 +855,13 @@ static int try_to_unuse(unsigned int type)
 			mmput(start_mm);
 			start_mm = new_start_mm;
 		}
+		if (shmem) {
+			/* page has already been unlocked and released */
+			if (shmem > 0)
+				continue;
+			retval = shmem;
+			break;
+		}
 		if (retval) {
 			unlock_page(page);
 			page_cache_release(page);
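
The new shmem block above gives this loop in try_to_unuse() a third outcome besides success and failure: a positive value means the page was already unlocked and released on the shmem path, so the loop simply moves on to the next entry, while a negative value aborts with that error. A toy sketch of the same continue/break/fall-through shape; handle_one() and its return values are stand-ins, not from this patch:

#include <stdio.h>

static int handle_one(int i)
{
	if (i == 3)
		return 1;	/* already dealt with elsewhere, skip the rest */
	if (i == 5)
		return -5;	/* hypothetical error */
	return 0;		/* nothing special, keep processing this item */
}

int main(void)
{
	int retval = 0;

	for (int i = 0; i < 8; i++) {
		int shmem = handle_one(i);

		if (shmem) {
			if (shmem > 0)
				continue;
			retval = shmem;
			break;
		}
		printf("processed %d normally\n", i);
	}
	printf("retval = %d\n", retval);
	return 0;
}
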
@@ -859,12 +900,6 @@ static int try_to_unuse(unsigned int type)
 		 * read from disk into another page. Splitting into two
 		 * pages would be incorrect if swap supported "shared
 		 * private" pages, but they are handled by tmpfs files.
-		 *
-		 * Note shmem_unuse already deleted a swappage from
-		 * the swap cache, unless the move to filepage failed:
-		 * in which case it left swappage in cache, lowered its
-		 * swap count to pass quickly through the loops above,
-		 * and now we must reincrement count to try again later.
 		 */
 		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
 			struct writeback_control wbc = {
@@ -875,12 +910,8 @@ static int try_to_unuse(unsigned int type)
 			lock_page(page);
 			wait_on_page_writeback(page);
 		}
-		if (PageSwapCache(page)) {
-			if (shmem)
-				swap_duplicate(entry);
-			else
-				delete_from_swap_cache(page);
-		}
+		if (PageSwapCache(page))
+			delete_from_swap_cache(page);
 
 		/*
 		 * So we could skip searching mms once swap count went
@@ -1768,31 +1799,48 @@ get_swap_info_struct(unsigned type)
  */
 int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 {
+	struct swap_info_struct *si;
 	int our_page_cluster = page_cluster;
-	int ret = 0, i = 1 << our_page_cluster;
-	unsigned long toff;
-	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
+	pgoff_t target, toff;
+	pgoff_t base, end;
+	int nr_pages = 0;
 
 	if (!our_page_cluster)	/* no readahead */
 		return 0;
-	toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster;
-	if (!toff)		/* first page is swap header */
-		toff++, i--;
-	*offset = toff;
+
+	si = &swap_info[swp_type(entry)];
+	target = swp_offset(entry);
+	base = (target >> our_page_cluster) << our_page_cluster;
+	end = base + (1 << our_page_cluster);
+	if (!base)		/* first page is swap header */
+		base++;
 
 	spin_lock(&swap_lock);
-	do {
-		/* Don't read-ahead past the end of the swap area */
-		if (toff >= swapdev->max)
+	if (end > si->max)	/* don't go beyond end of map */
+		end = si->max;
+
+	/* Count contiguous allocated slots above our target */
+	for (toff = target; ++toff < end; nr_pages++) {
+		/* Don't read in free or bad pages */
+		if (!si->swap_map[toff])
 			break;
+		if (si->swap_map[toff] == SWAP_MAP_BAD)
+			break;
+	}
+	/* Count contiguous allocated slots below our target */
+	for (toff = target; --toff >= base; nr_pages++) {
 		/* Don't read in free or bad pages */
-		if (!swapdev->swap_map[toff])
+		if (!si->swap_map[toff])
 			break;
-		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
+		if (si->swap_map[toff] == SWAP_MAP_BAD)
 			break;
-		toff++;
-		ret++;
-	} while (--i);
+	}
 	spin_unlock(&swap_lock);
-	return ret;
+
+	/*
+	 * Indicate starting offset, and return number of pages to get:
+	 * if only 1, say 0, since there's then no readahead to be done.
+	 */
+	*offset = ++toff;
+	return nr_pages? ++nr_pages: 0;
 }
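
The rewritten valid_swaphandles() above picks a readahead window aligned to page_cluster around the faulting offset, clips it to the swap map, counts the contiguous in-use slots above and below the target, reports the offset just after where the downward scan stopped, and returns 0 when only the target page itself would be read. A standalone userspace sketch of the same calculation over a toy swap_map; the readahead_window() name, the SWAP_MAP_BAD value and the sample map are invented for illustration:

#include <stdio.h>

#define SWAP_MAP_BAD 0x8000	/* marker for a defective slot (demo value) */

/*
 * swap_map values: 0 = free, SWAP_MAP_BAD = bad, anything else = in use.
 * Returns the number of pages to read and stores the starting offset.
 */
static int readahead_window(const unsigned short *swap_map, long max,
			    long target, int page_cluster, long *start)
{
	long base, end, toff;
	int nr_pages = 0;

	if (!page_cluster)	/* no readahead */
		return 0;

	base = (target >> page_cluster) << page_cluster;
	end = base + (1L << page_cluster);
	if (!base)		/* first slot is the swap header */
		base++;
	if (end > max)		/* don't go beyond the end of the map */
		end = max;

	/* Count contiguous allocated slots above the target */
	for (toff = target; ++toff < end; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == SWAP_MAP_BAD)
			break;
	/* Count contiguous allocated slots below the target */
	for (toff = target; --toff >= base; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == SWAP_MAP_BAD)
			break;

	*start = toff + 1;	/* first slot of the contiguous run */
	return nr_pages ? nr_pages + 1 : 0;	/* +1 covers the target itself */
}

int main(void)
{
	unsigned short map[16] = { 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1 };
	long start;
	int n = readahead_window(map, 16, 6, 3, &start);

	printf("read %d pages starting at offset %ld\n", n, start);
	return 0;
}
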