author		Ingo Molnar <mingo@elte.hu>	2009-05-07 05:17:13 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-05-07 05:17:34 -0400
commit		44347d947f628060b92449702071bfe1d31dfb75 (patch)
tree		c6ed74610d5b3295df4296659f80f5feb94b28cc /mm
parent		d94fc523f3c35bd8013f04827e94756cbc0212f4 (diff)
parent		413f81eba35d6ede9289b0c8a920c013a84fac71 (diff)
Merge branch 'linus' into tracing/core
Merge reason: tracing/core was on a .30-rc1 base and was missing out on
a handful of tracing fixes present in .30-rc5-almost.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig	2
-rw-r--r--	mm/filemap.c	5
-rw-r--r--	mm/madvise.c	8
-rw-r--r--	mm/memcontrol.c	40
-rw-r--r--	mm/memory.c	112
-rw-r--r--	mm/mmap.c	14
-rw-r--r--	mm/nommu.c	13
-rw-r--r--	mm/shmem.c	35
-rw-r--r--	mm/swap.c	46
-rw-r--r--	mm/util.c	16
-rw-r--r--	mm/vmscan.c	19
11 files changed, 175 insertions, 135 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b53427ad30a3..57971d2ab848 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -213,6 +213,8 @@ config UNEVICTABLE_LRU
 	  will use one page flag and increase the code size a little,
 	  say Y unless you know what you are doing.
 
+	  See Documentation/vm/unevictable-lru.txt for more information.
+
 config HAVE_MLOCK
 	bool
 	default y if MMU=y
diff --git a/mm/filemap.c b/mm/filemap.c
index 2e2d38ebda4b..379ff0bcbf6e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -441,6 +441,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 	}
 	return err;
 }
+EXPORT_SYMBOL(filemap_write_and_wait_range);
 
 /**
  * add_to_page_cache_locked - add a locked page to the pagecache
@@ -567,8 +568,8 @@ EXPORT_SYMBOL(wait_on_page_bit);
 
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
- * @page - Page defining the wait queue of interest
- * @waiter - Waiter to add to the queue
+ * @page: Page defining the wait queue of interest
+ * @waiter: Waiter to add to the queue
  *
  * Add an arbitrary @waiter to the wait queue for the nominated @page.
  */
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574827c8..36d6ea2b6340 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -112,6 +112,14 @@ static long madvise_willneed(struct vm_area_struct * vma,
 	if (!file)
 		return -EBADF;
 
+	/*
+	 * Page cache readahead assumes page cache pages are order-0 which
+	 * is not the case for hugetlbfs. Do not give a bad return value
+	 * but ignore the advice.
+	 */
+	if (vma->vm_flags & VM_HUGETLB)
+		return 0;
+
 	if (file->f_mapping->a_ops->get_xip_mem) {
 		/* no bad return value, but ignore advice */
 		return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2fc6d6c48238..01c2d8f14685 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -932,7 +932,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	if (unlikely(!mem))
 		return 0;
 
-	VM_BUG_ON(mem_cgroup_is_obsolete(mem));
+	VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem));
 
 	while (1) {
 		int ret;
@@ -1024,9 +1024,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
 		return NULL;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit of swapcache is solid under page lock.
-	 */
+	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		if (mem && !css_tryget(&mem->css))
@@ -1040,6 +1038,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
 		mem = NULL;
 		rcu_read_unlock();
 	}
+	unlock_page_cgroup(pc);
 	return mem;
 }
 
@@ -1618,37 +1617,28 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 }
 
 /*
- * A call to try to shrink memory usage under specified resource controller.
- * This is typically used for page reclaiming for shmem for reducing side
- * effect of page allocation from shmem, which is used by some mem_cgroup.
+ * A call to try to shrink memory usage on charge failure at shmem's swapin.
+ * Calling hierarchical_reclaim is not enough because we should update
+ * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
+ * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
+ * not from the memcg which this page would be charged to.
+ * try_charge_swapin does all of these works properly.
  */
-int mem_cgroup_shrink_usage(struct page *page,
+int mem_cgroup_shmem_charge_fallback(struct page *page,
 			struct mm_struct *mm,
 			gfp_t gfp_mask)
 {
 	struct mem_cgroup *mem = NULL;
-	int progress = 0;
-	int retry = MEM_CGROUP_RECLAIM_RETRIES;
+	int ret;
 
 	if (mem_cgroup_disabled())
 		return 0;
-	if (page)
-		mem = try_get_mem_cgroup_from_swapcache(page);
-	if (!mem && mm)
-		mem = try_get_mem_cgroup_from_mm(mm);
-	if (unlikely(!mem))
-		return 0;
 
-	do {
-		progress = mem_cgroup_hierarchical_reclaim(mem,
-					gfp_mask, true, false);
-		progress += mem_cgroup_check_under_limit(mem);
-	} while (!progress && --retry);
+	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
+	if (!ret)
+		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
 
-	css_put(&mem->css);
-	if (!retry)
-		return -ENOMEM;
-	return 0;
+	return ret;
 }
 
 static DEFINE_MUTEX(set_limit_mutex);
diff --git a/mm/memory.c b/mm/memory.c
index cf6873e91c6a..4126dd16778c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				ret = tmp;
 				goto unwritable_page;
 			}
+			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+				lock_page(old_page);
+				if (!old_page->mapping) {
+					ret = 0; /* retry the fault */
+					unlock_page(old_page);
+					goto unwritable_page;
+				}
+			} else
+				VM_BUG_ON(!PageLocked(old_page));
 
 			/*
 			 * Since we dropped the lock we need to revalidate
@@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 */
 			page_table = pte_offset_map_lock(mm, pmd, address,
 							 &ptl);
-			page_cache_release(old_page);
-			if (!pte_same(*page_table, orig_pte))
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				page_cache_release(old_page);
 				goto unlock;
+			}
 
 			page_mkwrite = 1;
 		}
@@ -2094,9 +2105,6 @@ gotten:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty_page) {
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
-
 		/*
 		 * Yes, Virginia, this is actually required to prevent a race
 		 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2113,41 @@ unlock:
 		 *
 		 * do_no_page is protected similarly.
 		 */
-		wait_on_page_locked(dirty_page);
-		set_page_dirty_balance(dirty_page, page_mkwrite);
+		if (!page_mkwrite) {
+			wait_on_page_locked(dirty_page);
+			set_page_dirty_balance(dirty_page, page_mkwrite);
+		}
 		put_page(dirty_page);
+		if (page_mkwrite) {
+			struct address_space *mapping = dirty_page->mapping;
+
+			set_page_dirty(dirty_page);
+			unlock_page(dirty_page);
+			page_cache_release(dirty_page);
+			if (mapping) {
+				/*
+				 * Some device drivers do not set page.mapping
+				 * but still dirty their pages
+				 */
+				balance_dirty_pages_ratelimited(mapping);
+			}
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
 	}
 	return ret;
 oom_free_new:
 	page_cache_release(new_page);
 oom:
-	if (old_page)
+	if (old_page) {
+		if (page_mkwrite) {
+			unlock_page(old_page);
+			page_cache_release(old_page);
+		}
 		page_cache_release(old_page);
+	}
 	return VM_FAULT_OOM;
 
 unwritable_page:
@@ -2458,8 +2491,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
 		ret = VM_FAULT_OOM;
-		unlock_page(page);
-		goto out;
+		goto out_page;
 	}
 
 	/*
@@ -2521,6 +2553,7 @@ out:
 out_nomap:
 	mem_cgroup_cancel_charge_swapin(ptr);
 	pte_unmap_unlock(page_table, ptl);
+out_page:
 	unlock_page(page);
 	page_cache_release(page);
 	return ret;
@@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			int tmp;
 
 			unlock_page(page);
-			vmf.flags |= FAULT_FLAG_MKWRITE;
+			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
 			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
 			if (unlikely(tmp &
 				  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
 				ret = tmp;
-				anon = 1; /* no anon but release vmf.page */
-				goto out_unlocked;
-			}
-			lock_page(page);
-			/*
-			 * XXX: this is not quite right (racy vs
-			 * invalidate) to unlock and relock the page
-			 * like this, however a better fix requires
-			 * reworking page_mkwrite locking API, which
-			 * is better done later.
-			 */
-			if (!page->mapping) {
-				ret = 0;
-				anon = 1; /* no anon but release vmf.page */
-				goto out;
+				goto unwritable_page;
 			}
+			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+				lock_page(page);
+				if (!page->mapping) {
+					ret = 0; /* retry the fault */
+					unlock_page(page);
+					goto unwritable_page;
+				}
+			} else
+				VM_BUG_ON(!PageLocked(page));
 			page_mkwrite = 1;
 		}
 	}
@@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(vmf.page);
-out_unlocked:
-	if (anon)
-		page_cache_release(vmf.page);
-	else if (dirty_page) {
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
+	if (dirty_page) {
+		struct address_space *mapping = page->mapping;
 
-		set_page_dirty_balance(dirty_page, page_mkwrite);
+		if (set_page_dirty(dirty_page))
+			page_mkwrite = 1;
+		unlock_page(dirty_page);
 		put_page(dirty_page);
+		if (page_mkwrite && mapping) {
+			/*
+			 * Some device drivers do not set page.mapping but still
+			 * dirty their pages
+			 */
+			balance_dirty_pages_ratelimited(mapping);
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
+	} else {
+		unlock_page(vmf.page);
+		if (anon)
+			page_cache_release(vmf.page);
 	}
 
 	return ret;
+
+unwritable_page:
+	page_cache_release(page);
+	return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
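
A note on the page_mkwrite convention used above: both do_wp_page() and __do_fault() now accept VM_FAULT_LOCKED from a ->page_mkwrite() handler, meaning the handler returned with the page still locked; otherwise the fault path locks the page itself and rechecks page->mapping, retrying the fault if the page was truncated in the meantime. A minimal sketch of the handler side, for a hypothetical filesystem (names and the truncate-race handling are illustrative, not taken from this commit):

#include <linux/mm.h>
#include <linux/fs.h>

/* Hypothetical ->page_mkwrite() handler following the VM_FAULT_LOCKED
 * convention relied on by the fault paths above. */
static int examplefs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;

	lock_page(page);
	if (page->mapping != inode->i_mapping) {
		/* Raced with truncate; let the caller retry the fault. */
		unlock_page(page);
		return VM_FAULT_NOPAGE;
	}

	/* ... reserve blocks / mark the page writable here ... */

	/* Returning with the page locked; the caller skips lock_page(). */
	return VM_FAULT_LOCKED;
}
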
diff --git a/mm/mmap.c b/mm/mmap.c
index 4a3841186c11..6b7b1a95944b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50;	/* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
 
 /*
  * Check that a process has enough memory to allocate a new virtual
@@ -179,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 	if (mm)
 		allowed -= mm->total_vm / 32;
 
-	/*
-	 * cast `allowed' as a signed long because vm_committed_space
-	 * sometimes has a negative value
-	 */
-	if (atomic_long_read(&vm_committed_space) < (long)allowed)
+	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
 		return 0;
 error:
 	vm_unacct_memory(pages);
@@ -1575,7 +1571,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 	 * Overcommit.. This must be the final test, as it will
 	 * update security statistics.
 	 */
-	if (security_vm_enough_memory(grow))
+	if (security_vm_enough_memory_mm(mm, grow))
 		return -ENOMEM;
 
 	/* Ok, everything looks good - let it rip */
@@ -2481,4 +2477,8 @@ void mm_drop_all_locks(struct mm_struct *mm)
  */
 void __init mmap_init(void)
 {
+	int ret;
+
+	ret = percpu_counter_init(&vm_committed_as, 0);
+	VM_BUG_ON(ret);
 }
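
The switch from the old global atomic_long_t to vm_committed_as above (and in mm/nommu.c below) moves commit accounting onto the generic percpu_counter, which batches per-CPU deltas and folds them into the shared count only once they pass a threshold; percpu_counter_read_positive() then returns a cheap, never-negative approximation, which is why the old "cast as signed long" comment can go. A small user-space model of that batching idea, with illustrative names rather than the kernel API:

#include <stdatomic.h>
#include <stdio.h>

#define BATCH 32	/* flush threshold, analogous to the old ACCT_THRESHOLD */

static _Atomic long shared_count;	/* globally visible total */
static _Thread_local long local_delta;	/* per-thread (per-CPU) cached error */

static void counter_add(long amount)
{
	local_delta += amount;
	if (local_delta >= BATCH || local_delta <= -BATCH) {
		atomic_fetch_add(&shared_count, local_delta);
		local_delta = 0;
	}
}

/* May lag by up to BATCH per thread, but never goes negative. */
static long counter_read_positive(void)
{
	long v = atomic_load(&shared_count);
	return v > 0 ? v : 0;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		counter_add(1);		/* cf. vm_acct_memory(1) */
	counter_add(-40);
	printf("committed (approx): %ld\n", counter_read_positive());
	return 0;
}

The tradeoff is the same one made by the per-CPU ACCT_THRESHOLD code that mm/swap.c drops further down; using percpu_counter just stops mm from open-coding it.
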
diff --git a/mm/nommu.c b/mm/nommu.c
index 72eda4aee2cb..809998aa7b50 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,7 +62,7 @@ void *high_memory;
 struct page *mem_map;
 unsigned long max_mapnr;
 unsigned long num_physpages;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
@@ -463,6 +463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
  */
 void __init mmap_init(void)
 {
+	int ret;
+
+	ret = percpu_counter_init(&vm_committed_as, 0);
+	VM_BUG_ON(ret);
 	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
 }
 
@@ -1847,12 +1851,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 	if (mm)
 		allowed -= mm->total_vm / 32;
 
-	/*
-	 * cast `allowed' as a signed long because vm_committed_space
-	 * sometimes has a negative value
-	 */
-	if (atomic_long_read(&vm_committed_space) < (long)allowed)
+	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
 		return 0;
+
 error:
 	vm_unacct_memory(pages);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index d94d2e9146bc..b25f95ce3db7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -24,6 +24,7 @@
 #include <linux/init.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
+#include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -43,7 +44,6 @@ static struct vfsmount *shm_mnt;
 #include <linux/exportfs.h>
 #include <linux/generic_acl.h>
 #include <linux/mman.h>
-#include <linux/pagemap.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
@@ -65,13 +65,28 @@ static struct vfsmount *shm_mnt;
 #include <asm/div64.h>
 #include <asm/pgtable.h>
 
+/*
+ * The maximum size of a shmem/tmpfs file is limited by the maximum size of
+ * its triple-indirect swap vector - see illustration at shmem_swp_entry().
+ *
+ * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
+ * but one eighth of that on a 64-bit kernel.  With 8kB page size, maximum
+ * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
+ * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
+ *
+ * We use / and * instead of shifts in the definitions below, so that the swap
+ * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
+ */
 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
-#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
-#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
+#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
 
-#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
-#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
+#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
+#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
 
+#define SHMEM_MAX_BYTES  min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
+#define SHMEM_MAX_INDEX  ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
+
+#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
 #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
 
 /* info->flags needs VM_flags to handle pagein/truncate races efficiently */
@@ -1325,8 +1340,12 @@ repeat:
 			shmem_swp_unmap(entry);
 			spin_unlock(&info->lock);
 			if (error == -ENOMEM) {
-				/* allow reclaim from this memory cgroup */
-				error = mem_cgroup_shrink_usage(swappage,
+				/*
+				 * reclaim from proper memory cgroup and
+				 * call memcg's OOM if needed.
+				 */
+				error = mem_cgroup_shmem_charge_fallback(
+								swappage,
 								current->mm,
 								gfp);
 				if (error) {
@@ -2581,7 +2600,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 #define shmem_get_inode(sb, mode, dev, flags)	ramfs_get_inode(sb, mode, dev)
 #define shmem_acct_size(flags, size)		0
 #define shmem_unacct_size(flags, size)		do {} while (0)
-#define SHMEM_MAX_BYTES				LLONG_MAX
+#define SHMEM_MAX_BYTES				MAX_LFS_FILESIZE
 
 #endif /* CONFIG_SHMEM */
 
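
As a rough check of the sizes quoted in the new shmem comment, the swap-vector limit can be evaluated outside the kernel. The sketch below assumes 4kB pages and 16 direct entries (SHMEM_NR_DIRECT in that era's headers); swap entries are stored as unsigned long, so 32-bit and 64-bit kernels differ only in the entry size:

#include <stdio.h>

/* Mirrors SHMSWP_MAX_BYTES for a given sizeof(unsigned long); plain
 * user-space arithmetic, not kernel code. */
static unsigned long long swp_max_bytes(unsigned long long entry_size)
{
	const unsigned long long page_size = 4096;	/* PAGE_CACHE_SIZE */
	const unsigned long long nr_direct = 16;	/* SHMEM_NR_DIRECT (assumed) */
	unsigned long long per_page = page_size / entry_size;	/* ENTRIES_PER_PAGE */
	unsigned long long per_pagepage = per_page * per_page;	/* ENTRIES_PER_PAGEPAGE */
	unsigned long long max_index = nr_direct + (per_pagepage / 2) * (per_page + 1);

	return max_index * page_size;
}

int main(void)
{
	printf("32-bit (4-byte entries): %.2f TiB\n",
	       swp_max_bytes(4) / (double)(1ULL << 40));
	printf("64-bit (8-byte entries): %.1f GiB\n",
	       swp_max_bytes(8) / (double)(1ULL << 30));
	return 0;
}

This prints roughly 2.0 TiB and 256 GiB, matching the "just over 2TB" and "one eighth of that" figures, and shows why SHMEM_MAX_BYTES now also clamps against MAX_LFS_FILESIZE instead of deriving the byte limit from the index alone.
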
diff --git a/mm/swap.c b/mm/swap.c
index bede23ce64ea..cb29ae5d33ab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 
 EXPORT_SYMBOL(pagevec_lookup_tag);
 
-#ifdef CONFIG_SMP
-/*
- * We tolerate a little inaccuracy to avoid ping-ponging the counter between
- * CPUs
- */
-#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)
-
-static DEFINE_PER_CPU(long, committed_space);
-
-void vm_acct_memory(long pages)
-{
-	long *local;
-
-	preempt_disable();
-	local = &__get_cpu_var(committed_space);
-	*local += pages;
-	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
-		atomic_long_add(*local, &vm_committed_space);
-		*local = 0;
-	}
-	preempt_enable();
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* Drop the CPU's cached committed space back into the central pool. */
-static int cpu_swap_callback(struct notifier_block *nfb,
-			     unsigned long action,
-			     void *hcpu)
-{
-	long *committed;
-
-	committed = &per_cpu(committed_space, (long)hcpu);
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		atomic_long_add(*committed, &vm_committed_space);
-		*committed = 0;
-		drain_cpu_pagevecs((long)hcpu);
-	}
-	return NOTIFY_OK;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-#endif /* CONFIG_SMP */
-
 /*
  * Perform any setup for the swap system
  */
@@ -554,7 +511,4 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
-#ifdef CONFIG_HOTPLUG_CPU
-	hotcpu_notifier(cpu_swap_callback, 0);
-#endif
 }
diff --git a/mm/util.c b/mm/util.c
index 6794a336e9af..abc65aa7cdfc 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -225,6 +225,22 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif
 
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
 int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 				int nr_pages, int write, struct page **pages)
 {
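
The kerneldoc added above spells out the gup-fast contract: pin up to nr_pages of user memory starting at start without taking mmap_sem, return how many pages were actually pinned, and leave the caller responsible for dropping each reference. A hypothetical in-kernel caller (not part of this commit) would follow it like this:

#include <linux/mm.h>
#include <linux/slab.h>

/* Hypothetical example following the kerneldoc above; the actual use of
 * the pinned pages is elided. */
static int example_pin_user_buffer(unsigned long uaddr, int nr_pages)
{
	struct page **pages;
	int i, pinned;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	pinned = get_user_pages_fast(uaddr, nr_pages, 1, pages);
	if (pinned < 0) {		/* no pages pinned: -errno */
		kfree(pages);
		return pinned;
	}

	/* ... access the pinned pages here ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);	/* one reference per pinned page */
	kfree(pages);
	return pinned;			/* may be fewer than nr_pages */
}
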
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 39fdfb14eeaa..5fa3eda1f03f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,6 +63,9 @@ struct scan_control {
 	/* Can mapped pages be reclaimed? */
 	int may_unmap;
 
+	/* Can pages be swapped as part of reclaim? */
+	int may_swap;
+
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
 	 * In this context, it doesn't matter that we scan the
@@ -1380,7 +1383,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
 	/* If we have no swap space, do not bother scanning anon pages. */
-	if (nr_swap_pages <= 0) {
+	if (!sc->may_swap || (nr_swap_pages <= 0)) {
 		percent[0] = 0;
 		percent[1] = 100;
 		return;
@@ -1468,7 +1471,7 @@ static void shrink_zone(int priority, struct zone *zone,
 
 	for_each_evictable_lru(l) {
 		int file = is_file_lru(l);
-		int scan;
+		unsigned long scan;
 
 		scan = zone_nr_pages(zone, sc, l);
 		if (priority) {
@@ -1697,6 +1700,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_unmap = 1,
+		.may_swap = 1,
 		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
@@ -1717,6 +1721,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
+		.may_swap = !noswap,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = swappiness,
 		.order = 0,
@@ -1726,9 +1731,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 	};
 	struct zonelist *zonelist;
 
-	if (noswap)
-		sc.may_unmap = 0;
-
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
@@ -1767,6 +1769,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_unmap = 1,
+		.may_swap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
 		.order = order,
@@ -2088,13 +2091,13 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
 				nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
 				if (nr_reclaimed >= nr_pages) {
-					sc->nr_reclaimed = nr_reclaimed;
+					sc->nr_reclaimed += nr_reclaimed;
 					return;
 				}
 			}
 		}
 	}
-	sc->nr_reclaimed = nr_reclaimed;
+	sc->nr_reclaimed += nr_reclaimed;
 }
 
 /*
@@ -2115,6 +2118,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		.may_unmap = 0,
 		.may_writepage = 1,
 		.isolate_pages = isolate_pages_global,
+		.nr_reclaimed = 0,
 	};
 
 	current->reclaim_state = &reclaim_state;
@@ -2297,6 +2301,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.may_swap = 1,
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,