Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig       |  30
-rw-r--r--  mm/filemap.c     |   5
-rw-r--r--  mm/madvise.c     |   8
-rw-r--r--  mm/memcontrol.c  |  40
-rw-r--r--  mm/memory.c      | 112
-rw-r--r--  mm/mmap.c        |  14
-rw-r--r--  mm/nommu.c       |  17
-rw-r--r--  mm/oom_kill.c    |  44
-rw-r--r--  mm/page_alloc.c  |  20
-rw-r--r--  mm/shmem.c       |  35
-rw-r--r--  mm/swap.c        |  46
-rw-r--r--  mm/util.c        |  16
-rw-r--r--  mm/vmalloc.c     |   1
-rw-r--r--  mm/vmscan.c      |  19
14 files changed, 245 insertions, 162 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b53427ad30a3..c2b57d81e153 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -213,6 +213,8 @@ config UNEVICTABLE_LRU
 	  will use one page flag and increase the code size a little,
 	  say Y unless you know what you are doing.
 
+	  See Documentation/vm/unevictable-lru.txt for more information.
+
 config HAVE_MLOCK
 	bool
 	default y if MMU=y
@@ -223,3 +225,31 @@ config HAVE_MLOCKED_PAGE_BIT
 
 config MMU_NOTIFIER
 	bool
+
+config NOMMU_INITIAL_TRIM_EXCESS
+	int "Turn on mmap() excess space trimming before booting"
+	depends on !MMU
+	default 1
+	help
+	  The NOMMU mmap() frequently needs to allocate large contiguous chunks
+	  of memory on which to store mappings, but it can only ask the system
+	  allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
+	  more than it requires.  To deal with this, mmap() is able to trim off
+	  the excess and return it to the allocator.
+
+	  If trimming is enabled, the excess is trimmed off and returned to the
+	  system allocator, which can cause extra fragmentation, particularly
+	  if there are a lot of transient processes.
+
+	  If trimming is disabled, the excess is kept, but not used, which for
+	  long-term mappings means that the space is wasted.
+
+	  Trimming can be dynamically controlled through a sysctl option
+	  (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
+	  excess pages there must be before trimming should occur, or zero if
+	  no trimming is to occur.
+
+	  This option specifies the initial value of this option.  The default
+	  of 1 says that all excess pages should be trimmed.
+
+	  See Documentation/nommu-mmap.txt for more information.
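
The help text above describes the trimming policy in prose.  As a rough sketch
only (the helper name below is invented; the real logic lives in mm/nommu.c and
is driven by sysctl_nr_trim_pages), the decision boils down to comparing the
excess page count of a 2^N*PAGE_SIZE allocation against the
/proc/sys/vm/nr_trim_pages threshold:

	/* Illustrative sketch, not kernel code: trim the tail of an
	 * over-sized NOMMU allocation back to the page allocator once the
	 * excess reaches the nr_trim_pages threshold; a threshold of 0
	 * disables trimming and the excess stays allocated but unused. */
	static void nommu_trim_excess(unsigned long base, unsigned long have,
				      unsigned long need)
	{
		unsigned long excess = have - need;

		if (sysctl_nr_trim_pages && excess >= sysctl_nr_trim_pages)
			while (have > need)
				free_page(base + (--have << PAGE_SHIFT));
	}
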
diff --git a/mm/filemap.c b/mm/filemap.c
index 2e2d38ebda4b..379ff0bcbf6e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -441,6 +441,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 	}
 	return err;
 }
+EXPORT_SYMBOL(filemap_write_and_wait_range);
 
 /**
  * add_to_page_cache_locked - add a locked page to the pagecache
@@ -567,8 +568,8 @@ EXPORT_SYMBOL(wait_on_page_bit);
 
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
- * @page - Page defining the wait queue of interest
- * @waiter - Waiter to add to the queue
+ * @page: Page defining the wait queue of interest
+ * @waiter: Waiter to add to the queue
  *
  * Add an arbitrary @waiter to the wait queue for the nominated @page.
  */
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574827c8..36d6ea2b6340 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -112,6 +112,14 @@ static long madvise_willneed(struct vm_area_struct * vma,
 	if (!file)
 		return -EBADF;
 
+	/*
+	 * Page cache readahead assumes page cache pages are order-0 which
+	 * is not the case for hugetlbfs. Do not give a bad return value
+	 * but ignore the advice.
+	 */
+	if (vma->vm_flags & VM_HUGETLB)
+		return 0;
+
 	if (file->f_mapping->a_ops->get_xip_mem) {
 		/* no bad return value, but ignore advice */
 		return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2fc6d6c48238..01c2d8f14685 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -932,7 +932,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	if (unlikely(!mem))
 		return 0;
 
-	VM_BUG_ON(mem_cgroup_is_obsolete(mem));
+	VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem));
 
 	while (1) {
 		int ret;
@@ -1024,9 +1024,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
 		return NULL;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit of swapcache is solid under page lock.
-	 */
+	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		if (mem && !css_tryget(&mem->css))
@@ -1040,6 +1038,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
 		mem = NULL;
 		rcu_read_unlock();
 	}
+	unlock_page_cgroup(pc);
 	return mem;
 }
 
@@ -1618,37 +1617,28 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 }
 
 /*
- * A call to try to shrink memory usage under specified resource controller.
- * This is typically used for page reclaiming for shmem for reducing side
- * effect of page allocation from shmem, which is used by some mem_cgroup.
+ * A call to try to shrink memory usage on charge failure at shmem's swapin.
+ * Calling hierarchical_reclaim is not enough because we should update
+ * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
+ * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
+ * not from the memcg which this page would be charged to.
+ * try_charge_swapin does all of these works properly.
  */
-int mem_cgroup_shrink_usage(struct page *page,
+int mem_cgroup_shmem_charge_fallback(struct page *page,
 			    struct mm_struct *mm,
 			    gfp_t gfp_mask)
 {
 	struct mem_cgroup *mem = NULL;
-	int progress = 0;
-	int retry = MEM_CGROUP_RECLAIM_RETRIES;
+	int ret;
 
 	if (mem_cgroup_disabled())
 		return 0;
-	if (page)
-		mem = try_get_mem_cgroup_from_swapcache(page);
-	if (!mem && mm)
-		mem = try_get_mem_cgroup_from_mm(mm);
-	if (unlikely(!mem))
-		return 0;
 
-	do {
-		progress = mem_cgroup_hierarchical_reclaim(mem,
-					gfp_mask, true, false);
-		progress += mem_cgroup_check_under_limit(mem);
-	} while (!progress && --retry);
+	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
+	if (!ret)
+		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
 
-	css_put(&mem->css);
-	if (!retry)
-		return -ENOMEM;
-	return 0;
+	return ret;
 }
 
 static DEFINE_MUTEX(set_limit_mutex);
diff --git a/mm/memory.c b/mm/memory.c
index cf6873e91c6a..4126dd16778c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				ret = tmp;
 				goto unwritable_page;
 			}
+			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+				lock_page(old_page);
+				if (!old_page->mapping) {
+					ret = 0; /* retry the fault */
+					unlock_page(old_page);
+					goto unwritable_page;
+				}
+			} else
+				VM_BUG_ON(!PageLocked(old_page));
 
 			/*
 			 * Since we dropped the lock we need to revalidate
@@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 */
 			page_table = pte_offset_map_lock(mm, pmd, address,
 							 &ptl);
-			page_cache_release(old_page);
-			if (!pte_same(*page_table, orig_pte))
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				page_cache_release(old_page);
 				goto unlock;
+			}
 
 			page_mkwrite = 1;
 		}
@@ -2094,9 +2105,6 @@ gotten:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty_page) {
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
-
 		/*
 		 * Yes, Virginia, this is actually required to prevent a race
 		 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2113,41 @@ unlock:
 		 *
 		 * do_no_page is protected similarly.
 		 */
-		wait_on_page_locked(dirty_page);
-		set_page_dirty_balance(dirty_page, page_mkwrite);
+		if (!page_mkwrite) {
+			wait_on_page_locked(dirty_page);
+			set_page_dirty_balance(dirty_page, page_mkwrite);
+		}
 		put_page(dirty_page);
+		if (page_mkwrite) {
+			struct address_space *mapping = dirty_page->mapping;
+
+			set_page_dirty(dirty_page);
+			unlock_page(dirty_page);
+			page_cache_release(dirty_page);
+			if (mapping) {
+				/*
+				 * Some device drivers do not set page.mapping
+				 * but still dirty their pages
+				 */
+				balance_dirty_pages_ratelimited(mapping);
+			}
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
 	}
 	return ret;
 oom_free_new:
 	page_cache_release(new_page);
 oom:
-	if (old_page)
+	if (old_page) {
+		if (page_mkwrite) {
+			unlock_page(old_page);
+			page_cache_release(old_page);
+		}
 		page_cache_release(old_page);
+	}
 	return VM_FAULT_OOM;
 
 unwritable_page:
@@ -2458,8 +2491,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
 		ret = VM_FAULT_OOM;
-		unlock_page(page);
-		goto out;
+		goto out_page;
 	}
 
 	/*
@@ -2521,6 +2553,7 @@ out:
 out_nomap:
 	mem_cgroup_cancel_charge_swapin(ptr);
 	pte_unmap_unlock(page_table, ptl);
+out_page:
 	unlock_page(page);
 	page_cache_release(page);
 	return ret;
@@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			int tmp;
 
 			unlock_page(page);
-			vmf.flags |= FAULT_FLAG_MKWRITE;
+			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
 			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
 			if (unlikely(tmp &
 				  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
 				ret = tmp;
-				anon = 1; /* no anon but release vmf.page */
-				goto out_unlocked;
-			}
-			lock_page(page);
-			/*
-			 * XXX: this is not quite right (racy vs
-			 * invalidate) to unlock and relock the page
-			 * like this, however a better fix requires
-			 * reworking page_mkwrite locking API, which
-			 * is better done later.
-			 */
-			if (!page->mapping) {
-				ret = 0;
-				anon = 1; /* no anon but release vmf.page */
-				goto out;
+				goto unwritable_page;
 			}
+			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+				lock_page(page);
+				if (!page->mapping) {
+					ret = 0; /* retry the fault */
+					unlock_page(page);
+					goto unwritable_page;
+				}
+			} else
+				VM_BUG_ON(!PageLocked(page));
 			page_mkwrite = 1;
 		}
 	}
@@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(vmf.page);
-out_unlocked:
-	if (anon)
-		page_cache_release(vmf.page);
-	else if (dirty_page) {
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
+	if (dirty_page) {
+		struct address_space *mapping = page->mapping;
 
-		set_page_dirty_balance(dirty_page, page_mkwrite);
+		if (set_page_dirty(dirty_page))
+			page_mkwrite = 1;
+		unlock_page(dirty_page);
 		put_page(dirty_page);
+		if (page_mkwrite && mapping) {
+			/*
+			 * Some device drivers do not set page.mapping but still
+			 * dirty their pages
+			 */
+			balance_dirty_pages_ratelimited(mapping);
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
+	} else {
+		unlock_page(vmf.page);
+		if (anon)
+			page_cache_release(vmf.page);
 	}
 
 	return ret;
+
+unwritable_page:
+	page_cache_release(page);
+	return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
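
The do_wp_page() and __do_fault() hunks above move to the convention that a
->page_mkwrite() handler may return with the page already locked, signalled by
VM_FAULT_LOCKED; otherwise the core code locks the page itself and retries the
fault if the page was truncated in the meantime.  A minimal sketch of a handler
written against that convention (the function name is invented and the
block-reservation step is elided; this is an assumption about how filesystem
code uses the interface, not code from this diff):

	static int example_page_mkwrite(struct vm_area_struct *vma,
					struct vm_fault *vmf)
	{
		struct page *page = vmf->page;

		lock_page(page);
		if (page->mapping != vma->vm_file->f_mapping) {
			/* raced with truncate; the core fault code will
			 * notice the lost mapping and retry the fault */
			unlock_page(page);
			return 0;
		}
		/* ... make the page writable (reserve blocks etc) ... */
		return VM_FAULT_LOCKED;	/* page is returned still locked */
	}
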
diff --git a/mm/mmap.c b/mm/mmap.c
index 4a3841186c11..6b7b1a95944b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50;	/* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
 
 /*
  * Check that a process has enough memory to allocate a new virtual
@@ -179,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 	if (mm)
 		allowed -= mm->total_vm / 32;
 
-	/*
-	 * cast `allowed' as a signed long because vm_committed_space
-	 * sometimes has a negative value
-	 */
-	if (atomic_long_read(&vm_committed_space) < (long)allowed)
+	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
 		return 0;
 error:
 	vm_unacct_memory(pages);
@@ -1575,7 +1571,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 	 * Overcommit..  This must be the final test, as it will
 	 * update security statistics.
 	 */
-	if (security_vm_enough_memory(grow))
+	if (security_vm_enough_memory_mm(mm, grow))
 		return -ENOMEM;
 
 	/* Ok, everything looks good - let it rip */
@@ -2481,4 +2477,8 @@ void mm_drop_all_locks(struct mm_struct *mm)
  */
 void __init mmap_init(void)
 {
+	int ret;
+
+	ret = percpu_counter_init(&vm_committed_as, 0);
+	VM_BUG_ON(ret);
 }
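
vm_committed_space was a bare atomic_long_t that mm/swap.c batched per CPU by
hand (see the code removed from mm/swap.c further down); vm_committed_as is a
percpu_counter, which does the same batching and CPU-hotplug draining
internally.  The accounting helpers presumably collapse to thin wrappers along
these lines -- they live outside mm/ so they are not shown in this diff, treat
the exact form as an assumption about the rest of the series:

	extern struct percpu_counter vm_committed_as;

	static inline void vm_acct_memory(long pages)
	{
		/* per-CPU batching happens inside the percpu_counter */
		percpu_counter_add(&vm_committed_as, pages);
	}

	static inline void vm_unacct_memory(long pages)
	{
		vm_acct_memory(-pages);
	}

The overcommit check now reads the counter with
percpu_counter_read_positive(), which clamps the approximate sum at zero, so
the old "cast `allowed' as a signed long" comment is no longer needed.
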
diff --git a/mm/nommu.c b/mm/nommu.c
index 72eda4aee2cb..b571ef707428 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,11 +62,11 @@ void *high_memory;
 struct page *mem_map;
 unsigned long max_mapnr;
 unsigned long num_physpages;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
-int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
+int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 
 atomic_long_t mmap_pages_allocated;
@@ -463,6 +463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
  */
 void __init mmap_init(void)
 {
+	int ret;
+
+	ret = percpu_counter_init(&vm_committed_as, 0);
+	VM_BUG_ON(ret);
 	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
 }
 
@@ -511,8 +515,6 @@ static void add_nommu_region(struct vm_region *region)
 
 	validate_nommu_regions();
 
-	BUG_ON(region->vm_start & ~PAGE_MASK);
-
 	parent = NULL;
 	p = &nommu_region_tree.rb_node;
 	while (*p) {
@@ -1847,12 +1849,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 	if (mm)
 		allowed -= mm->total_vm / 32;
 
-	/*
-	 * cast `allowed' as a signed long because vm_committed_space
-	 * sometimes has a negative value
-	 */
-	if (atomic_long_read(&vm_committed_space) < (long)allowed)
+	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
 		return 0;
+
 error:
 	vm_unacct_memory(pages);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2f3166e308d9..92bcf1db16b2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,34 +514,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
  */
 static void __out_of_memory(gfp_t gfp_mask, int order)
 {
-	if (sysctl_oom_kill_allocating_task) {
-		oom_kill_process(current, gfp_mask, order, 0, NULL,
-				"Out of memory (oom_kill_allocating_task)");
-
-	} else {
-		unsigned long points;
-		struct task_struct *p;
-
-retry:
-		/*
-		 * Rambo mode: Shoot down a process and hope it solves whatever
-		 * issues we may have.
-		 */
-		p = select_bad_process(&points, NULL);
+	struct task_struct *p;
+	unsigned long points;
 
-		if (PTR_ERR(p) == -1UL)
+	if (sysctl_oom_kill_allocating_task)
+		if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
+				"Out of memory (oom_kill_allocating_task)"))
 			return;
+retry:
+	/*
+	 * Rambo mode: Shoot down a process and hope it solves whatever
+	 * issues we may have.
+	 */
+	p = select_bad_process(&points, NULL);
 
-		/* Found nothing?!?! Either we hang forever, or we panic. */
-		if (!p) {
-			read_unlock(&tasklist_lock);
-			panic("Out of memory and no killable processes...\n");
-		}
+	if (PTR_ERR(p) == -1UL)
+		return;
 
-		if (oom_kill_process(p, gfp_mask, order, points, NULL,
-				     "Out of memory"))
-			goto retry;
+	/* Found nothing?!?! Either we hang forever, or we panic. */
+	if (!p) {
+		read_unlock(&tasklist_lock);
+		panic("Out of memory and no killable processes...\n");
 	}
+
+	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			     "Out of memory"))
+		goto retry;
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2f26991fff1..fe753ecf2aa5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2681,6 +2681,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 
 static int zone_batchsize(struct zone *zone)
 {
+#ifdef CONFIG_MMU
 	int batch;
 
 	/*
@@ -2706,9 +2707,26 @@ static int zone_batchsize(struct zone *zone)
 	 * of pages of one half of the possible page colors
 	 * and the other with pages of the other colors.
 	 */
-	batch = (1 << (fls(batch + batch/2)-1)) - 1;
+	batch = rounddown_pow_of_two(batch + batch/2) - 1;
 
 	return batch;
+
+#else
+	/* The deferral and batching of frees should be suppressed under NOMMU
+	 * conditions.
+	 *
+	 * The problem is that NOMMU needs to be able to allocate large chunks
+	 * of contiguous memory as there's no hardware page translation to
+	 * assemble apparent contiguous memory from discontiguous pages.
+	 *
+	 * Queueing large contiguous runs of pages for batching, however,
+	 * causes the pages to actually be freed in smaller chunks.  As there
+	 * can be a significant delay between the individual batches being
+	 * recycled, this leads to the once large chunks of space being
+	 * fragmented and becoming unavailable for high-order allocations.
+	 */
+	return 0;
+#endif
 }
 
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
diff --git a/mm/shmem.c b/mm/shmem.c
index d94d2e9146bc..b25f95ce3db7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -24,6 +24,7 @@
 #include <linux/init.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
+#include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -43,7 +44,6 @@ static struct vfsmount *shm_mnt;
 #include <linux/exportfs.h>
 #include <linux/generic_acl.h>
 #include <linux/mman.h>
-#include <linux/pagemap.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
@@ -65,13 +65,28 @@ static struct vfsmount *shm_mnt;
 #include <asm/div64.h>
 #include <asm/pgtable.h>
 
+/*
+ * The maximum size of a shmem/tmpfs file is limited by the maximum size of
+ * its triple-indirect swap vector - see illustration at shmem_swp_entry().
+ *
+ * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
+ * but one eighth of that on a 64-bit kernel.  With 8kB page size, maximum
+ * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
+ * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
+ *
+ * We use / and * instead of shifts in the definitions below, so that the swap
+ * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
+ */
 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
-#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
-#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
+#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
 
-#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
-#define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
+#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
+#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
 
+#define SHMEM_MAX_BYTES  min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
+#define SHMEM_MAX_INDEX  ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
+
+#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
 #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
 
 /* info->flags needs VM_flags to handle pagein/truncate races efficiently */
@@ -1325,8 +1340,12 @@ repeat:
 			shmem_swp_unmap(entry);
 			spin_unlock(&info->lock);
 			if (error == -ENOMEM) {
-				/* allow reclaim from this memory cgroup */
-				error = mem_cgroup_shrink_usage(swappage,
+				/*
+				 * reclaim from proper memory cgroup and
+				 * call memcg's OOM if needed.
+				 */
+				error = mem_cgroup_shmem_charge_fallback(
+								swappage,
 								current->mm,
 								gfp);
 				if (error) {
@@ -2581,7 +2600,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 #define shmem_get_inode(sb, mode, dev, flags)	ramfs_get_inode(sb, mode, dev)
 #define shmem_acct_size(flags, size)		0
 #define shmem_unacct_size(flags, size)		do {} while (0)
-#define SHMEM_MAX_BYTES				LLONG_MAX
+#define SHMEM_MAX_BYTES				MAX_LFS_FILESIZE
 
 #endif /* CONFIG_SHMEM */
 
diff --git a/mm/swap.c b/mm/swap.c
index bede23ce64ea..cb29ae5d33ab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 
 EXPORT_SYMBOL(pagevec_lookup_tag);
 
-#ifdef CONFIG_SMP
-/*
- * We tolerate a little inaccuracy to avoid ping-ponging the counter between
- * CPUs
- */
-#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)
-
-static DEFINE_PER_CPU(long, committed_space);
-
-void vm_acct_memory(long pages)
-{
-	long *local;
-
-	preempt_disable();
-	local = &__get_cpu_var(committed_space);
-	*local += pages;
-	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
-		atomic_long_add(*local, &vm_committed_space);
-		*local = 0;
-	}
-	preempt_enable();
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* Drop the CPU's cached committed space back into the central pool. */
-static int cpu_swap_callback(struct notifier_block *nfb,
-			     unsigned long action,
-			     void *hcpu)
-{
-	long *committed;
-
-	committed = &per_cpu(committed_space, (long)hcpu);
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		atomic_long_add(*committed, &vm_committed_space);
-		*committed = 0;
-		drain_cpu_pagevecs((long)hcpu);
-	}
-	return NOTIFY_OK;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-#endif /* CONFIG_SMP */
-
 /*
  * Perform any setup for the swap system
  */
@@ -554,7 +511,4 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
-#ifdef CONFIG_HOTPLUG_CPU
-	hotcpu_notifier(cpu_swap_callback, 0);
-#endif
 }
diff --git a/mm/util.c b/mm/util.c
index 2599e83eea17..55bef160b9f1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -223,6 +223,22 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif
 
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
 int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 				int nr_pages, int write, struct page **pages)
 {
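
A short usage sketch for the interface documented above (illustrative caller,
not from this patch): pin a user buffer for I/O, then drop the page references
with put_page() when done:

	static int pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
	{
		int i, pinned;

		/* may fall back to get_user_pages() under mmap_sem */
		pinned = get_user_pages_fast(uaddr, nr_pages, 1, pages);
		if (pinned < 0)
			return pinned;	/* nothing pinned: -errno */

		/* ... hand the (possibly partial) set of pages to the device ... */

		for (i = 0; i < pinned; i++)
			put_page(pages[i]);
		return pinned;		/* may be fewer than nr_pages */
	}
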
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fab19876b4d1..083716ea38c9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -402,6 +402,7 @@ overflow:
 		printk(KERN_WARNING
 			"vmap allocation for size %lu failed: "
 			"use vmalloc=<size> to increase size.\n", size);
+		kfree(va);
 		return ERR_PTR(-EBUSY);
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 39fdfb14eeaa..5fa3eda1f03f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,6 +63,9 @@ struct scan_control {
 	/* Can mapped pages be reclaimed? */
 	int may_unmap;
 
+	/* Can pages be swapped as part of reclaim? */
+	int may_swap;
+
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
 	 * In this context, it doesn't matter that we scan the
@@ -1380,7 +1383,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
 	/* If we have no swap space, do not bother scanning anon pages. */
-	if (nr_swap_pages <= 0) {
+	if (!sc->may_swap || (nr_swap_pages <= 0)) {
 		percent[0] = 0;
 		percent[1] = 100;
 		return;
@@ -1468,7 +1471,7 @@ static void shrink_zone(int priority, struct zone *zone,
 
 	for_each_evictable_lru(l) {
 		int file = is_file_lru(l);
-		int scan;
+		unsigned long scan;
 
 		scan = zone_nr_pages(zone, sc, l);
 		if (priority) {
@@ -1697,6 +1700,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_unmap = 1,
+		.may_swap = 1,
 		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
@@ -1717,6 +1721,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
+		.may_swap = !noswap,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = swappiness,
 		.order = 0,
@@ -1726,9 +1731,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 	};
 	struct zonelist *zonelist;
 
-	if (noswap)
-		sc.may_unmap = 0;
-
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
@@ -1767,6 +1769,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_unmap = 1,
+		.may_swap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
 		.order = order,
@@ -2088,13 +2091,13 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
 			nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
 			if (nr_reclaimed >= nr_pages) {
-				sc->nr_reclaimed = nr_reclaimed;
+				sc->nr_reclaimed += nr_reclaimed;
 				return;
 			}
 		}
 	}
-	sc->nr_reclaimed = nr_reclaimed;
+	sc->nr_reclaimed += nr_reclaimed;
 }
 
 /*
@@ -2115,6 +2118,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		.may_unmap = 0,
 		.may_writepage = 1,
 		.isolate_pages = isolate_pages_global,
+		.nr_reclaimed = 0,
 	};
 
 	current->reclaim_state = &reclaim_state;
@@ -2297,6 +2301,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.may_swap = 1,
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 				       SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,