Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--  mm/swapfile.c | 150
1 file changed, 99 insertions, 51 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648e1360..02ccab5ad9d9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -27,6 +27,7 @@
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -506,9 +507,24 @@ unsigned int count_swap_pages(int type, int free)
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long addr, swp_entry_t entry, struct page *page)
 {
+        spinlock_t *ptl;
+        pte_t *pte;
+        int ret = 1;
+
+        if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+                ret = -ENOMEM;
+
+        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+        if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
+                if (ret > 0)
+                        mem_cgroup_uncharge_page(page);
+                ret = 0;
+                goto out;
+        }
+
         inc_mm_counter(vma->vm_mm, anon_rss);
         get_page(page);
         set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +536,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
          * immediately swapped out again after swapon.
          */
         activate_page(page);
+out:
+        pte_unmap_unlock(pte, ptl);
+        return ret;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -528,23 +547,34 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
528 | { | 547 | { |
529 | pte_t swp_pte = swp_entry_to_pte(entry); | 548 | pte_t swp_pte = swp_entry_to_pte(entry); |
530 | pte_t *pte; | 549 | pte_t *pte; |
531 | spinlock_t *ptl; | 550 | int ret = 0; |
532 | int found = 0; | ||
533 | 551 | ||
534 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 552 | /* |
553 | * We don't actually need pte lock while scanning for swp_pte: since | ||
554 | * we hold page lock and mmap_sem, swp_pte cannot be inserted into the | ||
555 | * page table while we're scanning; though it could get zapped, and on | ||
556 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse | ||
557 | * of unmatched parts which look like swp_pte, so unuse_pte must | ||
558 | * recheck under pte lock. Scanning without pte lock lets it be | ||
559 | * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. | ||
560 | */ | ||
561 | pte = pte_offset_map(pmd, addr); | ||
535 | do { | 562 | do { |
536 | /* | 563 | /* |
537 | * swapoff spends a _lot_ of time in this loop! | 564 | * swapoff spends a _lot_ of time in this loop! |
538 | * Test inline before going to call unuse_pte. | 565 | * Test inline before going to call unuse_pte. |
539 | */ | 566 | */ |
540 | if (unlikely(pte_same(*pte, swp_pte))) { | 567 | if (unlikely(pte_same(*pte, swp_pte))) { |
541 | unuse_pte(vma, pte++, addr, entry, page); | 568 | pte_unmap(pte); |
542 | found = 1; | 569 | ret = unuse_pte(vma, pmd, addr, entry, page); |
543 | break; | 570 | if (ret) |
571 | goto out; | ||
572 | pte = pte_offset_map(pmd, addr); | ||
544 | } | 573 | } |
545 | } while (pte++, addr += PAGE_SIZE, addr != end); | 574 | } while (pte++, addr += PAGE_SIZE, addr != end); |
546 | pte_unmap_unlock(pte - 1, ptl); | 575 | pte_unmap(pte - 1); |
547 | return found; | 576 | out: |
577 | return ret; | ||
548 | } | 578 | } |
549 | 579 | ||
550 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 580 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
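Note: the comment introduced in unuse_pte_range() above carries the key reasoning of this hunk: the swap-entry scan now runs without the pte lock, and unuse_pte() re-validates the entry after pte_offset_map_lock() before touching anything. The following is a rough userspace analogue of that scan-unlocked, recheck-under-lock pattern, with made-up names and a pthread mutex standing in for the page-table lock; it is an illustrative sketch, not kernel code.

/*
 * Illustrative sketch only: scan a table without the lock, then
 * re-validate under the lock before acting, the way the reworked
 * unuse_pte_range()/unuse_pte() cooperate.  All names are invented.
 */
#include <pthread.h>
#include <stdio.h>

#define NSLOTS 8

static unsigned long table[NSLOTS] = { 0, 0, 42, 0, 42, 0, 0, 0 };
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Act on one slot, but only if it still matches once we hold the lock. */
static int replace_slot(int idx, unsigned long match, unsigned long val)
{
        int ret = 0;

        pthread_mutex_lock(&table_lock);
        if (table[idx] == match) {      /* recheck: it may have changed */
                table[idx] = val;
                ret = 1;
        }
        pthread_mutex_unlock(&table_lock);
        return ret;
}

int main(void)
{
        unsigned long match = 42;
        int i, hits = 0;

        /* Unlocked scan: cheap test first, locked work only on apparent hits. */
        for (i = 0; i < NSLOTS; i++) {
                if (table[i] != match)
                        continue;
                hits += replace_slot(i, match, 0);
        }
        printf("replaced %d slot(s)\n", hits);
        return 0;
}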
@@ -553,14 +583,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 {
         pmd_t *pmd;
         unsigned long next;
+        int ret;
 
         pmd = pmd_offset(pud, addr);
         do {
                 next = pmd_addr_end(addr, end);
                 if (pmd_none_or_clear_bad(pmd))
                         continue;
-                if (unuse_pte_range(vma, pmd, addr, next, entry, page))
-                        return 1;
+                ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+                if (ret)
+                        return ret;
         } while (pmd++, addr = next, addr != end);
         return 0;
 }
@@ -571,14 +603,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 {
         pud_t *pud;
         unsigned long next;
+        int ret;
 
         pud = pud_offset(pgd, addr);
         do {
                 next = pud_addr_end(addr, end);
                 if (pud_none_or_clear_bad(pud))
                         continue;
-                if (unuse_pmd_range(vma, pud, addr, next, entry, page))
-                        return 1;
+                ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+                if (ret)
+                        return ret;
         } while (pud++, addr = next, addr != end);
         return 0;
 }
@@ -588,6 +622,7 @@ static int unuse_vma(struct vm_area_struct *vma,
 {
         pgd_t *pgd;
         unsigned long addr, end, next;
+        int ret;
 
         if (page->mapping) {
                 addr = page_address_in_vma(page, vma);
@@ -605,8 +640,9 @@ static int unuse_vma(struct vm_area_struct *vma,
                 next = pgd_addr_end(addr, end);
                 if (pgd_none_or_clear_bad(pgd))
                         continue;
-                if (unuse_pud_range(vma, pgd, addr, next, entry, page))
-                        return 1;
+                ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+                if (ret)
+                        return ret;
         } while (pgd++, addr = next, addr != end);
         return 0;
 }
@@ -615,6 +651,7 @@ static int unuse_mm(struct mm_struct *mm,
                 swp_entry_t entry, struct page *page)
 {
         struct vm_area_struct *vma;
+        int ret = 0;
 
         if (!down_read_trylock(&mm->mmap_sem)) {
                 /*
@@ -627,15 +664,11 @@ static int unuse_mm(struct mm_struct *mm,
                 lock_page(page);
         }
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                if (vma->anon_vma && unuse_vma(vma, entry, page))
+                if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
                         break;
         }
         up_read(&mm->mmap_sem);
-        /*
-         * Currently unuse_mm cannot fail, but leave error handling
-         * at call sites for now, since we change it from time to time.
-         */
-        return 0;
+        return (ret < 0)? ret: 0;
 }
 
 /*
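Note: with unuse_pte() now able to fail, unuse_pte_range(), unuse_pmd_range(), unuse_pud_range(), unuse_vma() and unuse_mm() all forward a tri-state result in the hunks above: 0 for "not found here", a positive value for "found and handled", and a negative errno (here -ENOMEM from the memcontrol charge) for failure; unuse_mm() then masks the positive case so try_to_unuse() only ever sees real errors. The following standalone sketch of that propagation convention uses invented helper names and assumes the same tri-state meaning.

#include <stdio.h>

/* Convention assumed from the patch: <0 error, 0 not found, >0 found. */
static int leaf_level(int slot)
{
        if (slot == 3)
                return -12;     /* stands in for -ENOMEM */
        return slot == 5;       /* "found" only at slot 5 */
}

static int walk_level(int lo, int hi)
{
        int slot, ret;

        for (slot = lo; slot < hi; slot++) {
                ret = leaf_level(slot);
                if (ret)        /* stop on found *or* error, pass it up */
                        return ret;
        }
        return 0;
}

int main(void)
{
        /* One walk that finds the entry, one that hits the error slot. */
        int found = walk_level(4, 8);   /* returns 1 at slot 5 */
        int err = walk_level(0, 8);     /* returns -12 at slot 3 */

        /* A caller like unuse_mm() reports only real errors upward. */
        printf("found-case reported as %d, error-case as %d\n",
               (found < 0) ? found : 0, (err < 0) ? err : 0);
        return 0;
}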
@@ -730,7 +763,8 @@ static int try_to_unuse(unsigned int type)
                  */
                 swap_map = &si->swap_map[i];
                 entry = swp_entry(type, i);
-                page = read_swap_cache_async(entry, NULL, 0);
+                page = read_swap_cache_async(entry,
+                                        GFP_HIGHUSER_MOVABLE, NULL, 0);
                 if (!page) {
                         /*
                          * Either swap_duplicate() failed because entry
@@ -789,7 +823,7 @@ static int try_to_unuse(unsigned int type)
                         atomic_inc(&new_start_mm->mm_users);
                         atomic_inc(&prev_mm->mm_users);
                         spin_lock(&mmlist_lock);
-                        while (*swap_map > 1 && !retval &&
+                        while (*swap_map > 1 && !retval && !shmem &&
                                         (p = p->next) != &start_mm->mmlist) {
                                 mm = list_entry(p, struct mm_struct, mmlist);
                                 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -821,6 +855,13 @@ static int try_to_unuse(unsigned int type)
                         mmput(start_mm);
                         start_mm = new_start_mm;
                 }
+                if (shmem) {
+                        /* page has already been unlocked and released */
+                        if (shmem > 0)
+                                continue;
+                        retval = shmem;
+                        break;
+                }
                 if (retval) {
                         unlock_page(page);
                         page_cache_release(page);
@@ -859,12 +900,6 @@ static int try_to_unuse(unsigned int type)
                  * read from disk into another page.  Splitting into two
                  * pages would be incorrect if swap supported "shared
                  * private" pages, but they are handled by tmpfs files.
-                 *
-                 * Note shmem_unuse already deleted a swappage from
-                 * the swap cache, unless the move to filepage failed:
-                 * in which case it left swappage in cache, lowered its
-                 * swap count to pass quickly through the loops above,
-                 * and now we must reincrement count to try again later.
                  */
                 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
                         struct writeback_control wbc = {
@@ -875,12 +910,8 @@ static int try_to_unuse(unsigned int type)
                         lock_page(page);
                         wait_on_page_writeback(page);
                 }
-                if (PageSwapCache(page)) {
-                        if (shmem)
-                                swap_duplicate(entry);
-                        else
-                                delete_from_swap_cache(page);
-                }
+                if (PageSwapCache(page))
+                        delete_from_swap_cache(page);
 
                 /*
                  * So we could skip searching mms once swap count went
@@ -1768,31 +1799,48 @@ get_swap_info_struct(unsigned type)
  */
 int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 {
+        struct swap_info_struct *si;
         int our_page_cluster = page_cluster;
-        int ret = 0, i = 1 << our_page_cluster;
-        unsigned long toff;
-        struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
+        pgoff_t target, toff;
+        pgoff_t base, end;
+        int nr_pages = 0;
 
         if (!our_page_cluster)  /* no readahead */
                 return 0;
-        toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster;
-        if (!toff)              /* first page is swap header */
-                toff++, i--;
-        *offset = toff;
+
+        si = &swap_info[swp_type(entry)];
+        target = swp_offset(entry);
+        base = (target >> our_page_cluster) << our_page_cluster;
+        end = base + (1 << our_page_cluster);
+        if (!base)              /* first page is swap header */
+                base++;
 
         spin_lock(&swap_lock);
-        do {
-                /* Don't read-ahead past the end of the swap area */
-                if (toff >= swapdev->max)
+        if (end > si->max)      /* don't go beyond end of map */
+                end = si->max;
+
+        /* Count contiguous allocated slots above our target */
+        for (toff = target; ++toff < end; nr_pages++) {
+                /* Don't read in free or bad pages */
+                if (!si->swap_map[toff])
                         break;
+                if (si->swap_map[toff] == SWAP_MAP_BAD)
+                        break;
+        }
+        /* Count contiguous allocated slots below our target */
+        for (toff = target; --toff >= base; nr_pages++) {
                 /* Don't read in free or bad pages */
-                if (!swapdev->swap_map[toff])
+                if (!si->swap_map[toff])
                         break;
-                if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
+                if (si->swap_map[toff] == SWAP_MAP_BAD)
                         break;
-                toff++;
-                ret++;
-        } while (--i);
+        }
         spin_unlock(&swap_lock);
-        return ret;
+
+        /*
+         * Indicate starting offset, and return number of pages to get:
+         * if only 1, say 0, since there's then no readahead to be done.
+         */
+        *offset = ++toff;
+        return nr_pages? ++nr_pages: 0;
 }
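Note: the rewritten valid_swaphandles() no longer reads one fixed cluster starting at the aligned base; it counts contiguous allocated slots above and below the faulting offset within the cluster-aligned window [base, end), and reports 0 when only the target page itself qualifies. The following self-contained sketch mirrors that window arithmetic with a toy swap map; all names and values in it are local to the example, not kernel definitions.

#include <stdio.h>

#define SWAP_MAP_BAD    0x8000          /* toy stand-in, not the kernel constant */
#define PAGE_CLUSTER    3               /* page_cluster of 3 => window of 8 slots */

/* Toy swap map: 0 = free, SWAP_MAP_BAD = bad, anything else = in use. */
static unsigned short swap_map[32];

/* Mirror of the new logic: count usable slots around target, report start + count. */
static int readahead_window(unsigned long target, unsigned long map_max,
                            unsigned long *offset)
{
        unsigned long toff, base, end;
        int nr_pages = 0;

        base = (target >> PAGE_CLUSTER) << PAGE_CLUSTER;
        end = base + (1 << PAGE_CLUSTER);
        if (!base)                      /* first page is swap header */
                base++;
        if (end > map_max)              /* don't go beyond end of map */
                end = map_max;

        for (toff = target; ++toff < end; nr_pages++)   /* slots above target */
                if (!swap_map[toff] || swap_map[toff] == SWAP_MAP_BAD)
                        break;
        for (toff = target; --toff >= base; nr_pages++) /* slots below target */
                if (!swap_map[toff] || swap_map[toff] == SWAP_MAP_BAD)
                        break;

        *offset = ++toff;
        return nr_pages ? ++nr_pages : 0;       /* 0 or 1 page => no readahead */
}

int main(void)
{
        unsigned long start;
        int i, n;

        for (i = 9; i <= 13; i++)               /* slots 9..13 allocated */
                swap_map[i] = 1;
        n = readahead_window(11, 32, &start);   /* fault at slot 11 */
        printf("read %d pages starting at offset %lu\n", n, start);
        return 0;
}

With slots 9..13 allocated and a fault at slot 11, the sketch reports 5 pages starting at offset 9, matching how the new code widens readahead in both directions instead of always starting at the aligned base.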