author		Jens Axboe <jaxboe@fusionio.com>	2010-10-19 03:13:04 -0400
committer	Jens Axboe <jaxboe@fusionio.com>	2010-10-19 03:13:04 -0400
commit		fa251f89903d73989e2f63e13d0eaed1e07ce0da (patch)
tree		3f7fe779941e3b6d67754dd7c44a32f48ea47c74 /mm
parent		dd3932eddf428571762596e17b65f5dc92ca361b (diff)
parent		cd07202cc8262e1669edff0d97715f3dd9260917 (diff)
Merge branch 'v2.6.36-rc8' into for-2.6.37/barrier
Conflicts:
	block/blk-core.c
	drivers/block/loop.c
	mm/swapfile.c

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig		|   2
-rw-r--r--	mm/backing-dev.c	|   9
-rw-r--r--	mm/bounce.c		|   2
-rw-r--r--	mm/compaction.c		|   7
-rw-r--r--	mm/fremap.c		|   7
-rw-r--r--	mm/hugetlb.c		|  24
-rw-r--r--	mm/ksm.c		|   9
-rw-r--r--	mm/memcontrol.c		|  10
-rw-r--r--	mm/memory-failure.c	|  12
-rw-r--r--	mm/memory.c		|  56
-rw-r--r--	mm/memory_hotplug.c	|  16
-rw-r--r--	mm/mlock.c		|   6
-rw-r--r--	mm/mmap.c		|   4
-rw-r--r--	mm/mmzone.c		|  21
-rw-r--r--	mm/oom_kill.c		|  49
-rw-r--r--	mm/page-writeback.c	|  27
-rw-r--r--	mm/page_alloc.c		|  37
-rw-r--r--	mm/percpu.c		|   8
-rw-r--r--	mm/percpu_up.c		|   4
-rw-r--r--	mm/rmap.c		|  42
-rw-r--r--	mm/swapfile.c		| 120
-rw-r--r--	mm/vmscan.c		|  43
-rw-r--r--	mm/vmstat.c		|  16
23 files changed, 319 insertions, 212 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f4e516e9c37c..f0fb9124e410 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -189,7 +189,7 @@ config COMPACTION
 config MIGRATION
 	bool "Page migration"
 	def_bool y
-	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
+	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful in
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index eaa4a5bbe063..65d420499a61 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 struct backing_dev_info noop_backing_dev_info = {
 	.name		= "noop",
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 
@@ -243,6 +244,7 @@ static int __init default_bdi_init(void)
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
 		bdi_register(&default_backing_dev_info, NULL, "default");
+	err = bdi_init(&noop_backing_dev_info);
 
 	return err;
 }
@@ -445,8 +447,8 @@ static int bdi_forker_thread(void *ptr)
 		switch (action) {
 		case FORK_THREAD:
 			__set_current_state(TASK_RUNNING);
-			task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
-					   dev_name(bdi->dev));
+			task = kthread_create(bdi_writeback_thread, &bdi->wb,
+					      "flush-%s", dev_name(bdi->dev));
 			if (IS_ERR(task)) {
 				/*
 				 * If thread creation fails, force writeout of
@@ -457,10 +459,13 @@ static int bdi_forker_thread(void *ptr)
 				/*
 				 * The spinlock makes sure we do not lose
 				 * wake-ups when racing with 'bdi_queue_work()'.
+				 * And as soon as the bdi thread is visible, we
+				 * can start it.
 				 */
 				spin_lock_bh(&bdi->wb_lock);
 				bdi->wb.task = task;
 				spin_unlock_bh(&bdi->wb_lock);
+				wake_up_process(task);
 			}
 			break;
 
diff --git a/mm/bounce.c b/mm/bounce.c
index 13b6dad1eed2..1481de68184b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -116,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
 		 */
 		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
 
-		flush_dcache_page(tovec->bv_page);
 		bounce_copy_vec(tovec, vfrom);
+		flush_dcache_page(tovec->bv_page);
 	}
 }
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 94cce51b0b35..4d709ee59013 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -214,15 +214,16 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
 /* Similar to reclaim, but different enough that they don't share logic */
 static bool too_many_isolated(struct zone *zone)
 {
-
-	unsigned long inactive, isolated;
+	unsigned long active, inactive, isolated;
 
 	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
 			zone_page_state(zone, NR_INACTIVE_ANON);
+	active = zone_page_state(zone, NR_ACTIVE_FILE) +
+			zone_page_state(zone, NR_ACTIVE_ANON);
 	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
 			zone_page_state(zone, NR_ISOLATED_ANON);
 
-	return isolated > inactive;
+	return isolated > (inactive + active) / 2;
 }
 
 /*
diff --git a/mm/fremap.c b/mm/fremap.c
index 46f5dacf90a2..ec520c7b28df 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 {
 	struct mm_struct *mm = current->mm;
 	struct address_space *mapping;
-	unsigned long end = start + size;
 	struct vm_area_struct *vma;
 	int err = -EINVAL;
 	int has_write_lock = 0;
@@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	if (start + size <= start)
 		return err;
 
+	/* Does pgoff wrap? */
+	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+		return err;
+
 	/* Can we represent this offset inside this architecture's pte's? */
 #if PTE_FILE_MAX_BITS < BITS_PER_LONG
 	if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
@@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	if (!(vma->vm_flags & VM_CAN_NONLINEAR))
 		goto out;
 
-	if (end <= start || start < vma->vm_start || end > vma->vm_end)
+	if (start < vma->vm_start || start + size > vma->vm_end)
 		goto out;
 
 	/* Must set VM_NONLINEAR before any pages are populated. */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cc5be788a39f..c03273807182 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2324,11 +2324,8 @@ retry_avoidcopy:
 	 * and just make the page writable */
 	avoidcopy = (page_mapcount(old_page) == 1);
 	if (avoidcopy) {
-		if (!trylock_page(old_page)) {
-			if (PageAnon(old_page))
-				page_move_anon_rmap(old_page, vma, address);
-		} else
-			unlock_page(old_page);
+		if (PageAnon(old_page))
+			page_move_anon_rmap(old_page, vma, address);
 		set_huge_ptep_writable(vma, address, ptep);
 		return 0;
 	}
@@ -2404,7 +2401,7 @@ retry_avoidcopy:
 		set_huge_pte_at(mm, address, ptep,
 				make_huge_pte(vma, new_page, 1));
 		page_remove_rmap(old_page);
-		hugepage_add_anon_rmap(new_page, vma, address);
+		hugepage_add_new_anon_rmap(new_page, vma, address);
 		/* Make the old page be freed below */
 		new_page = old_page;
 		mmu_notifier_invalidate_range_end(mm,
@@ -2631,10 +2628,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 							vma, address);
 	}
 
-	if (!pagecache_page) {
-		page = pte_page(entry);
+	/*
+	 * hugetlb_cow() requires page locks of pte_page(entry) and
+	 * pagecache_page, so here we need take the former one
+	 * when page != pagecache_page or !pagecache_page.
+	 * Note that locking order is always pagecache_page -> page,
+	 * so no worry about deadlock.
+	 */
+	page = pte_page(entry);
+	if (page != pagecache_page)
 		lock_page(page);
-	}
 
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
@@ -2661,9 +2664,8 @@ out_page_table_lock:
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
-	} else {
-		unlock_page(page);
 	}
+	unlock_page(page);
 
 out_mutex:
 	mutex_unlock(&hugetlb_instantiation_mutex);
diff --git a/mm/ksm.c b/mm/ksm.c
index e2ae00458320..65ab5c7067d9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -712,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 	if (!ptep)
 		goto out;
 
-	if (pte_write(*ptep)) {
+	if (pte_write(*ptep) || pte_dirty(*ptep)) {
 		pte_t entry;
 
 		swapped = PageSwapCache(page);
@@ -735,7 +735,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 			set_pte_at(mm, addr, ptep, entry);
 			goto out_unlock;
 		}
-		entry = pte_wrprotect(entry);
+		if (pte_dirty(entry))
+			set_page_dirty(page);
+		entry = pte_mkclean(pte_wrprotect(entry));
 		set_pte_at_notify(mm, addr, ptep, entry);
 	}
 	*orig_pte = *ptep;
@@ -1504,8 +1506,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
 {
 	struct page *new_page;
 
-	unlock_page(page);	/* any racers will COW it, not modify it */
-
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 	if (new_page) {
 		copy_user_highpage(new_page, page, address, vma);
@@ -1521,7 +1521,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
 		add_page_to_unevictable_list(new_page);
 	}
 
-	page_cache_release(page);
 	return new_page;
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3eed583895a6..9be3cf8a5da4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3587,9 +3587,13 @@ unlock:
 
 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
 {
-	__mem_cgroup_threshold(memcg, false);
-	if (do_swap_account)
-		__mem_cgroup_threshold(memcg, true);
+	while (memcg) {
+		__mem_cgroup_threshold(memcg, false);
+		if (do_swap_account)
+			__mem_cgroup_threshold(memcg, true);
+
+		memcg = parent_mem_cgroup(memcg);
+	}
 }
 
 static int compare_thresholds(const void *a, const void *b)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9c26eeca1342..757f6b0accfe 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter);
  * signal.
  */
 static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
-			unsigned long pfn)
+			unsigned long pfn, struct page *page)
 {
 	struct siginfo si;
 	int ret;
@@ -198,7 +198,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
 #ifdef __ARCH_SI_TRAPNO
 	si.si_trapno = trapno;
 #endif
-	si.si_addr_lsb = PAGE_SHIFT;
+	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
 	/*
 	 * Don't use force here, it's convenient if the signal
 	 * can be temporarily blocked.
@@ -235,7 +235,7 @@ void shake_page(struct page *p, int access)
 		int nr;
 		do {
 			nr = shrink_slab(1000, GFP_KERNEL, 1000);
-			if (page_count(p) == 0)
+			if (page_count(p) == 1)
 				break;
 		} while (nr > 10);
 	}
@@ -327,7 +327,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
  * wrong earlier.
  */
 static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
-			  int fail, unsigned long pfn)
+			  int fail, struct page *page, unsigned long pfn)
 {
 	struct to_kill *tk, *next;
 
@@ -352,7 +352,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
 			 * process anyways.
 			 */
 			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
-					      pfn) < 0)
+					      pfn, page) < 0)
 				printk(KERN_ERR
 		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
 					pfn, tk->tsk->comm, tk->tsk->pid);
@@ -928,7 +928,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * any accesses to the poisoned memory.
 	 */
 	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
-		      ret != SWAP_SUCCESS, pfn);
+		      ret != SWAP_SUCCESS, p, pfn);
 
 	return ret;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 2ed2267439df..0e18b4d649ec 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2623,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page;
+	struct page *page, *swapcache = NULL;
 	swp_entry_t entry;
 	pte_t pte;
 	struct mem_cgroup *ptr = NULL;
@@ -2679,10 +2679,25 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
-	page = ksm_might_need_to_copy(page, vma, address);
-	if (!page) {
-		ret = VM_FAULT_OOM;
-		goto out;
+	/*
+	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+	 * release the swapcache from under us. The page pin, and pte_same
+	 * test below, are not enough to exclude that. Even if it is still
+	 * swapcache, we need to check that the page's swap has not changed.
+	 */
+	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
+		goto out_page;
+
+	if (ksm_might_need_to_copy(page, vma, address)) {
+		swapcache = page;
+		page = ksm_does_need_to_copy(page, vma, address);
+
+		if (unlikely(!page)) {
+			ret = VM_FAULT_OOM;
+			page = swapcache;
+			swapcache = NULL;
+			goto out_page;
+		}
 	}
 
 	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -2735,6 +2750,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
 		try_to_free_swap(page);
 	unlock_page(page);
+	if (swapcache) {
+		/*
+		 * Hold the lock to avoid the swap entry to be reused
+		 * until we take the PT lock for the pte_same() check
+		 * (to avoid false positives from pte_same). For
+		 * further safety release the lock after the swap_free
+		 * so that the swap count won't change under a
+		 * parallel locked swapcache.
+		 */
+		unlock_page(swapcache);
+		page_cache_release(swapcache);
+	}
 
 	if (flags & FAULT_FLAG_WRITE) {
 		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@ -2756,15 +2783,17 @@ out_page:
 	unlock_page(page);
 out_release:
 	page_cache_release(page);
+	if (swapcache) {
+		unlock_page(swapcache);
+		page_cache_release(swapcache);
+	}
 	return ret;
 }
 
 /*
- * This is like a special single-page "expand_downwards()",
- * except we must first make sure that 'address-PAGE_SIZE'
+ * This is like a special single-page "expand_{down|up}wards()",
+ * except we must first make sure that 'address{-|+}PAGE_SIZE'
  * doesn't hit another vma.
- *
- * The "find_vma()" will do the right thing even if we wrap
  */
 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
 {
@@ -2783,6 +2812,15 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
 
 		expand_stack(vma, address - PAGE_SIZE);
 	}
+	if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
+		struct vm_area_struct *next = vma->vm_next;
+
+		/* As VM_GROWSDOWN but s/below/above/ */
+		if (next && next->vm_start == address + PAGE_SIZE)
+			return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
+
+		expand_upwards(vma, address + PAGE_SIZE);
+	}
 	return 0;
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a4cfcdc00455..dd186c1a5d53 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -584,19 +584,19 @@ static inline int pageblock_free(struct page *page)
 /* Return the start of the next active pageblock after a given page */
 static struct page *next_active_pageblock(struct page *page)
 {
-	int pageblocks_stride;
-
 	/* Ensure the starting page is pageblock-aligned */
 	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
 
-	/* Move forward by at least 1 * pageblock_nr_pages */
-	pageblocks_stride = 1;
-
 	/* If the entire pageblock is free, move to the end of free page */
-	if (pageblock_free(page))
-		pageblocks_stride += page_order(page) - pageblock_order;
+	if (pageblock_free(page)) {
+		int order;
+		/* be careful. we don't have locks, page_order can be changed.*/
+		order = page_order(page);
+		if ((order < MAX_ORDER) && (order >= pageblock_order))
+			return page + (1 << order);
+	}
 
-	return page + (pageblocks_stride * pageblock_nr_pages);
+	return page + pageblock_nr_pages;
 }
 
 /* Checks if this range of memory is likely to be hot-removable. */
diff --git a/mm/mlock.c b/mm/mlock.c
index cbae7c5b9568..b70919ce4f72 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,12 +135,6 @@ void munlock_vma_page(struct page *page)
 	}
 }
 
-/* Is the vma a continuation of the stack vma above it? */
-static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr)
-{
-	return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
-}
-
 static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	return (vma->vm_flags & VM_GROWSDOWN) &&
diff --git a/mm/mmap.c b/mm/mmap.c
index 331e51af38c9..00161a48a451 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1716,9 +1716,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
  * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
  * vma is the last one with address > vma->vm_end. Have to extend vma.
  */
-#ifndef CONFIG_IA64
-static
-#endif
 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
 	int error;
@@ -2012,6 +2009,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 			removed_exe_file_vma(mm);
 		fput(new->vm_file);
 	}
+	unlink_anon_vmas(new);
  out_free_mpol:
 	mpol_put(pol);
  out_free_vma:
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d1760213..e35bfb82c855 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn,
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+#ifdef CONFIG_SMP
+/* Called when a more accurate view of NR_FREE_PAGES is needed */
+unsigned long zone_nr_free_pages(struct zone *zone)
+{
+	unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
+	/*
+	 * While kswapd is awake, it is considered the zone is under some
+	 * memory pressure. Under pressure, there is a risk that
+	 * per-cpu-counter-drift will allow the min watermark to be breached
+	 * potentially causing a live-lock. While kswapd is awake and
+	 * free pages are low, get a better estimate for free pages
+	 */
+	if (nr_free_pages < zone->percpu_drift_mark &&
+			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+	return nr_free_pages;
+}
+#endif /* CONFIG_SMP */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fc81cb22869e..4029583a1024 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -121,8 +121,8 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
 }
 
 /* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
-		const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p,
+		const struct mem_cgroup *mem, const nodemask_t *nodemask)
 {
 	if (is_global_init(p))
 		return true;
@@ -208,8 +208,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 	 */
 	points += p->signal->oom_score_adj;
 
-	if (points < 0)
-		return 0;
+	/*
+	 * Never return 0 for an eligible task that may be killed since it's
+	 * possible that no single user task uses more than 0.1% of memory and
+	 * no single admin tasks uses more than 3.0%.
+	 */
+	if (points <= 0)
+		return 1;
 	return (points < 1000) ? points : 1000;
 }
 
@@ -339,26 +344,24 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 /**
  * dump_tasks - dump current memory state of all system tasks
  * @mem: current's memory controller, if constrained
+ * @nodemask: nodemask passed to page allocator for mempolicy ooms
  *
- * Dumps the current memory state of all system tasks, excluding kernel threads.
+ * Dumps the current memory state of all eligible tasks. Tasks not in the same
+ * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
+ * are not shown.
  * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
  * value, oom_score_adj value, and name.
  *
- * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
- * shown.
- *
  * Call with tasklist_lock read-locked.
  */
-static void dump_tasks(const struct mem_cgroup *mem)
+static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
 {
 	struct task_struct *p;
 	struct task_struct *task;
 
 	pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
 	for_each_process(p) {
-		if (p->flags & PF_KTHREAD)
-			continue;
-		if (mem && !task_in_mem_cgroup(p, mem))
+		if (oom_unkillable_task(p, mem, nodemask))
 			continue;
 
 		task = find_lock_task_mm(p);
@@ -381,7 +384,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
 }
 
 static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
-			struct mem_cgroup *mem)
+			struct mem_cgroup *mem, const nodemask_t *nodemask)
 {
 	task_lock(current);
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@@ -394,7 +397,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 	mem_cgroup_print_oom_info(mem, p);
 	show_mem();
 	if (sysctl_oom_dump_tasks)
-		dump_tasks(mem);
+		dump_tasks(mem, nodemask);
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -436,7 +439,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	unsigned int victim_points = 0;
 
 	if (printk_ratelimit())
-		dump_header(p, gfp_mask, order, mem);
+		dump_header(p, gfp_mask, order, mem, nodemask);
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -482,7 +485,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
 static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
-				int order)
+				int order, const nodemask_t *nodemask)
 {
 	if (likely(!sysctl_panic_on_oom))
 		return;
@@ -496,7 +499,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 			return;
 	}
 	read_lock(&tasklist_lock);
-	dump_header(NULL, gfp_mask, order, NULL);
+	dump_header(NULL, gfp_mask, order, NULL, nodemask);
 	read_unlock(&tasklist_lock);
 	panic("Out of memory: %s panic_on_oom is enabled\n",
 		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
@@ -509,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 	unsigned int points = 0;
 	struct task_struct *p;
 
-	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
+	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
 	limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
 	read_lock(&tasklist_lock);
 retry:
@@ -641,6 +644,7 @@ static void clear_system_oom(void)
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *nodemask)
 {
+	const nodemask_t *mpol_mask;
 	struct task_struct *p;
 	unsigned long totalpages;
 	unsigned long freed = 0;
@@ -670,7 +674,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 */
 	constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
 						&totalpages);
-	check_panic_on_oom(constraint, gfp_mask, order);
+	mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
+	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
 
 	read_lock(&tasklist_lock);
 	if (sysctl_oom_kill_allocating_task &&
@@ -688,15 +693,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	}
 
 retry:
-	p = select_bad_process(&points, totalpages, NULL,
-			constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
-								 NULL);
+	p = select_bad_process(&points, totalpages, NULL, mpol_mask);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
-		dump_header(NULL, gfp_mask, order, NULL);
+		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
 		read_unlock(&tasklist_lock);
 		panic("Out of memory and no killable processes...\n");
 	}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c09ef5219cbe..e3bccac1f025 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -985,22 +985,16 @@ continue_unlock:
 			}
 		}
 
-			if (wbc->nr_to_write > 0) {
-				if (--wbc->nr_to_write == 0 &&
-				    wbc->sync_mode == WB_SYNC_NONE) {
-					/*
-					 * We stop writing back only if we are
-					 * not doing integrity sync. In case of
-					 * integrity sync we have to keep going
-					 * because someone may be concurrently
-					 * dirtying pages, and we might have
-					 * synced a lot of newly appeared dirty
-					 * pages, but have not synced all of the
-					 * old dirty pages.
-					 */
-					done = 1;
-					break;
-				}
+			/*
+			 * We stop writing back only if we are not doing
+			 * integrity sync. In case of integrity sync we have to
+			 * keep going until we have written all the pages
+			 * we tagged for writeback prior to entering this loop.
+			 */
+			if (--wbc->nr_to_write <= 0 &&
+			    wbc->sync_mode == WB_SYNC_NONE) {
+				done = 1;
+				break;
 			}
 		}
 		pagevec_release(&pvec);
@@ -1132,6 +1126,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 		task_io_account_write(PAGE_CACHE_SIZE);
 	}
 }
+EXPORT_SYMBOL(account_page_dirtied);
 
 /*
  * For address_spaces which do not use buffers. Just tag the page as dirty in
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a9649f4b261e..f12ad1836abe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -588,13 +588,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
 	int migratetype = 0;
 	int batch_free = 0;
+	int to_free = count;
 
 	spin_lock(&zone->lock);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
-	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
-	while (count) {
+	while (to_free) {
 		struct page *page;
 		struct list_head *list;
 
@@ -619,8 +619,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
 			__free_one_page(page, zone, 0, page_private(page));
 			trace_mm_page_pcpu_drain(page, 0, page_private(page));
-		} while (--count && --batch_free && !list_empty(list));
+		} while (--to_free && --batch_free && !list_empty(list));
 	}
+	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
 	spin_unlock(&zone->lock);
 }
 
@@ -631,8 +632,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
-	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 	__free_one_page(page, zone, order, migratetype);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 	spin_unlock(&zone->lock);
 }
 
@@ -1461,7 +1462,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
+	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
 	if (alloc_flags & ALLOC_HIGH)
@@ -1846,6 +1847,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
+	bool drained = false;
 
 	cond_resched();
 
@@ -1864,14 +1866,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 
 	cond_resched();
 
-	if (order != 0)
-		drain_all_pages();
+	if (unlikely(!(*did_some_progress)))
+		return NULL;
 
-	if (likely(*did_some_progress))
-		page = get_page_from_freelist(gfp_mask, nodemask, order,
+retry:
+	page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx,
 					alloc_flags, preferred_zone,
 					migratetype);
+
+	/*
+	 * If an allocation failed after direct reclaim, it could be because
+	 * pages are pinned on the per-cpu lists. Drain them and try again
+	 */
+	if (!page && !drained) {
+		drain_all_pages();
+		drained = true;
+		goto retry;
+	}
+
 	return page;
 }
 
@@ -2423,7 +2436,7 @@ void show_free_areas(void)
2423 " all_unreclaimable? %s" 2436 " all_unreclaimable? %s"
2424 "\n", 2437 "\n",
2425 zone->name, 2438 zone->name,
2426 K(zone_page_state(zone, NR_FREE_PAGES)), 2439 K(zone_nr_free_pages(zone)),
2427 K(min_wmark_pages(zone)), 2440 K(min_wmark_pages(zone)),
2428 K(low_wmark_pages(zone)), 2441 K(low_wmark_pages(zone)),
2429 K(high_wmark_pages(zone)), 2442 K(high_wmark_pages(zone)),
@@ -5169,9 +5182,9 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (!table)
 		panic("Failed to allocate %s hash table\n", tablename);
 
-	printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
+	printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
 	       tablename,
-	       (1U << log2qty),
+	       (1UL << log2qty),
 	       ilog2(size) - PAGE_SHIFT,
 	       size);
 
diff --git a/mm/percpu.c b/mm/percpu.c
index e61dc2cc5873..c76ef3891e0d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -393,7 +393,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
 		goto out_unlock;
 
 	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
-	memcpy(new, chunk->map, old_size);
+	old = chunk->map;
+
+	memcpy(new, old, old_size);
 
 	chunk->map_alloc = new_alloc;
 	chunk->map = new;
@@ -1162,7 +1164,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 	}
 
 	/*
-	 * Don't accept if wastage is over 25%. The
+	 * Don't accept if wastage is over 1/3. The
 	 * greater-than comparison ensures upa==1 always
 	 * passes the following check.
 	 */
@@ -1399,9 +1401,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
 			if (pcpu_first_unit_cpu == NR_CPUS)
 				pcpu_first_unit_cpu = cpu;
+			pcpu_last_unit_cpu = cpu;
 		}
 	}
-	pcpu_last_unit_cpu = cpu;
 	pcpu_nr_units = unit;
 
 	for_each_possible_cpu(cpu)
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
index c4351c7f57d2..db884fae5721 100644
--- a/mm/percpu_up.c
+++ b/mm/percpu_up.c
@@ -14,13 +14,13 @@ void __percpu *__alloc_percpu(size_t size, size_t align)
 	 * percpu sections on SMP for which this path isn't used.
 	 */
 	WARN_ON_ONCE(align > SMP_CACHE_BYTES);
-	return kzalloc(size, GFP_KERNEL);
+	return (void __percpu __force *)kzalloc(size, GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(__alloc_percpu);
 
 void free_percpu(void __percpu *p)
 {
-	kfree(p);
+	kfree(this_cpu_ptr(p));
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 87b9e8ad4509..92e6757f196e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -316,7 +316,7 @@ void __init anon_vma_init(void)
  */
 struct anon_vma *page_lock_anon_vma(struct page *page)
 {
-	struct anon_vma *anon_vma;
+	struct anon_vma *anon_vma, *root_anon_vma;
 	unsigned long anon_mapping;
 
 	rcu_read_lock();
@@ -327,8 +327,21 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 		goto out;
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
-	anon_vma_lock(anon_vma);
-	return anon_vma;
+	root_anon_vma = ACCESS_ONCE(anon_vma->root);
+	spin_lock(&root_anon_vma->lock);
+
+	/*
+	 * If this page is still mapped, then its anon_vma cannot have been
+	 * freed. But if it has been unmapped, we have no security against
+	 * the anon_vma structure being freed and reused (for another anon_vma:
+	 * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot
+	 * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting
+	 * anon_vma->root before page_unlock_anon_vma() is called to unlock.
+	 */
+	if (page_mapped(page))
+		return anon_vma;
+
+	spin_unlock(&root_anon_vma->lock);
 out:
 	rcu_read_unlock();
 	return NULL;
@@ -368,7 +381,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
 	if (PageAnon(page)) {
-		if (vma->anon_vma->root != page_anon_vma(page)->root)
+		struct anon_vma *page__anon_vma = page_anon_vma(page);
+		/*
+		 * Note: swapoff's unuse_vma() is more efficient with this
+		 * check, and needs it to match anon_vma when KSM is active.
+		 */
+		if (!vma->anon_vma || !page__anon_vma ||
+		    vma->anon_vma->root != page__anon_vma->root)
 			return -EFAULT;
 	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
 		if (!vma->vm_file ||
@@ -1551,13 +1570,14 @@ static void __hugepage_set_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address, int exclusive)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
+
 	BUG_ON(!anon_vma);
-	if (!exclusive) {
-		struct anon_vma_chain *avc;
-		avc = list_entry(vma->anon_vma_chain.prev,
-			struct anon_vma_chain, same_vma);
-		anon_vma = avc->anon_vma;
-	}
+
+	if (PageAnon(page))
+		return;
+	if (!exclusive)
+		anon_vma = anon_vma->root;
+
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 	page->mapping = (struct address_space *) anon_vma;
 	page->index = linear_page_index(vma, address);
@@ -1568,6 +1588,8 @@ void hugepage_add_anon_rmap(struct page *page,
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 	int first;
+
+	BUG_ON(!PageLocked(page));
 	BUG_ON(!anon_vma);
 	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	first = atomic_inc_and_test(&page->_mapcount);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e132e1708acc..9fc7bac7db0c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,8 +47,6 @@ long nr_swap_pages;
 long total_swap_pages;
 static int least_priority;
 
-static bool swap_for_hibernation;
-
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
 static const char Bad_offset[] = "Bad swap offset entry ";
@@ -317,10 +315,8 @@ checks:
 	if (offset > si->highest_bit)
 		scan_base = offset = si->lowest_bit;
 
-	/* reuse swap entry of cache-only swap if not hibernation. */
-	if (vm_swap_full()
-	    && usage == SWAP_HAS_CACHE
-	    && si->swap_map[offset] == SWAP_HAS_CACHE) {
+	/* reuse swap entry of cache-only swap if not busy. */
+	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 		int swap_was_freed;
 		spin_unlock(&swap_lock);
 		swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -450,8 +446,6 @@ swp_entry_t get_swap_page(void)
 	spin_lock(&swap_lock);
 	if (nr_swap_pages <= 0)
 		goto noswap;
-	if (swap_for_hibernation)
-		goto noswap;
 	nr_swap_pages--;
 
 	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -484,6 +478,28 @@ noswap:
 	return (swp_entry_t) {0};
 }
 
+/* The only caller of this function is now susupend routine */
+swp_entry_t get_swap_page_of_type(int type)
+{
+	struct swap_info_struct *si;
+	pgoff_t offset;
+
+	spin_lock(&swap_lock);
+	si = swap_info[type];
+	if (si && (si->flags & SWP_WRITEOK)) {
+		nr_swap_pages--;
+		/* This is called for allocating swap entry, not cache */
+		offset = scan_swap_map(si, 1);
+		if (offset) {
+			spin_unlock(&swap_lock);
+			return swp_entry(type, offset);
+		}
+		nr_swap_pages++;
+	}
+	spin_unlock(&swap_lock);
+	return (swp_entry_t) {0};
+}
+
 static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
@@ -667,6 +683,24 @@ int try_to_free_swap(struct page *page)
 	if (page_swapcount(page))
 		return 0;
 
+	/*
+	 * Once hibernation has begun to create its image of memory,
+	 * there's a danger that one of the calls to try_to_free_swap()
+	 * - most probably a call from __try_to_reclaim_swap() while
+	 * hibernation is allocating its own swap pages for the image,
+	 * but conceivably even a call from memory reclaim - will free
+	 * the swap from a page which has already been recorded in the
+	 * image as a clean swapcache page, and then reuse its swap for
+	 * another page of the image. On waking from hibernation, the
+	 * original page might be freed under memory pressure, then
+	 * later read back in from swap, now with the wrong data.
+	 *
+	 * Hibernation clears bits from gfp_allowed_mask to prevent
+	 * memory reclaim from writing to disk, so check that here.
+	 */
+	if (!(gfp_allowed_mask & __GFP_IO))
+		return 0;
+
 	delete_from_swap_cache(page);
 	SetPageDirty(page);
 	return 1;
@@ -743,74 +777,6 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
 #endif
 
 #ifdef CONFIG_HIBERNATION
-
-static pgoff_t hibernation_offset[MAX_SWAPFILES];
-/*
- * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
- * saved swap_map[] image to the disk will be an incomplete because it's
- * changing without synchronization with hibernation snap shot.
- * At resume, we just make swap_for_hibernation=false. We can forget
- * used maps easily.
- */
-void hibernation_freeze_swap(void)
-{
-	int i;
-
-	spin_lock(&swap_lock);
-
-	printk(KERN_INFO "PM: Freeze Swap\n");
-	swap_for_hibernation = true;
-	for (i = 0; i < MAX_SWAPFILES; i++)
-		hibernation_offset[i] = 1;
-	spin_unlock(&swap_lock);
-}
-
-void hibernation_thaw_swap(void)
-{
-	spin_lock(&swap_lock);
-	if (swap_for_hibernation) {
-		printk(KERN_INFO "PM: Thaw Swap\n");
-		swap_for_hibernation = false;
-	}
-	spin_unlock(&swap_lock);
-}
-
-/*
- * Because updateing swap_map[] can make not-saved-status-change,
- * we use our own easy allocator.
- * Please see kernel/power/swap.c, Used swaps are recorded into
- * RB-tree.
- */
-swp_entry_t get_swap_for_hibernation(int type)
-{
-	pgoff_t off;
-	swp_entry_t val = {0};
-	struct swap_info_struct *si;
-
-	spin_lock(&swap_lock);
-
-	si = swap_info[type];
-	if (!si || !(si->flags & SWP_WRITEOK))
-		goto done;
-
-	for (off = hibernation_offset[type]; off < si->max; ++off) {
-		if (!si->swap_map[off])
-			break;
-	}
-	if (off < si->max) {
-		val = swp_entry(type, off);
-		hibernation_offset[type] = off + 1;
-	}
-done:
-	spin_unlock(&swap_lock);
-	return val;
-}
-
-void swap_free_for_hibernation(swp_entry_t ent)
-{
-	/* Nothing to do */
-}
-
 /*
  * Find the swap type that corresponds to given device (if any).
  *
@@ -2081,7 +2047,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 			p->flags |= SWP_SOLIDSTATE;
 			p->cluster_next = 1 + (random32() % p->highest_bit);
 		}
-		if (discard_swap(p) == 0)
+		if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
 			p->flags |= SWP_DISCARDABLE;
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c391c320dbaf..c5dfabf25f11 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1804,12 +1804,11 @@ static void shrink_zone(int priority, struct zone *zone,
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static bool shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
-	bool all_unreclaimable = true;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1827,8 +1826,38 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 		}
 
 		shrink_zone(priority, zone, sc);
-		all_unreclaimable = false;
 	}
+}
+
+static bool zone_reclaimable(struct zone *zone)
+{
+	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
+/*
+ * As hibernation is going on, kswapd is freezed so that it can't mark
+ * the zone into all_unreclaimable. It can't handle OOM during hibernation.
+ * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
+ */
+static bool all_unreclaimable(struct zonelist *zonelist,
+		struct scan_control *sc)
+{
+	struct zoneref *z;
+	struct zone *zone;
+	bool all_unreclaimable = true;
+
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+			gfp_zone(sc->gfp_mask), sc->nodemask) {
+		if (!populated_zone(zone))
+			continue;
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+			continue;
+		if (zone_reclaimable(zone)) {
+			all_unreclaimable = false;
+			break;
+		}
+	}
+
 	return all_unreclaimable;
 }
 
@@ -1852,7 +1881,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	int priority;
-	bool all_unreclaimable;
 	unsigned long total_scanned = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct zoneref *z;
@@ -1869,7 +1897,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		all_unreclaimable = shrink_zones(priority, zonelist, sc);
+		shrink_zones(priority, zonelist, sc);
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
@@ -1931,7 +1959,7 @@ out:
 		return sc->nr_reclaimed;
 
 	/* top priority shrink_zones still had more to do? don't OOM, then */
-	if (scanning_global_lru(sc) && !all_unreclaimable)
+	if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
 		return 1;
 
 	return 0;
@@ -2197,8 +2225,7 @@ loop_again:
 			total_scanned += sc.nr_scanned;
 			if (zone->all_unreclaimable)
 				continue;
-			if (nr_slab == 0 &&
-			    zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
+			if (nr_slab == 0 && !zone_reclaimable(zone))
 				zone->all_unreclaimable = 1;
 			/*
 			 * If we've done a decent amount of scanning and
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f389168f9a83..355a9e669aaa 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -138,11 +138,24 @@ static void refresh_zone_stat_thresholds(void)
 	int threshold;
 
 	for_each_populated_zone(zone) {
+		unsigned long max_drift, tolerate_drift;
+
 		threshold = calculate_threshold(zone);
 
 		for_each_online_cpu(cpu)
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
+
+		/*
+		 * Only set percpu_drift_mark if there is a danger that
+		 * NR_FREE_PAGES reports the low watermark is ok when in fact
+		 * the min watermark could be breached by an allocation
+		 */
+		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+		max_drift = num_online_cpus() * threshold;
+		if (max_drift > tolerate_drift)
+			zone->percpu_drift_mark = high_wmark_pages(zone) +
+					max_drift;
 	}
 }
 
@@ -813,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
813 "\n scanned %lu" 826 "\n scanned %lu"
814 "\n spanned %lu" 827 "\n spanned %lu"
815 "\n present %lu", 828 "\n present %lu",
816 zone_page_state(zone, NR_FREE_PAGES), 829 zone_nr_free_pages(zone),
817 min_wmark_pages(zone), 830 min_wmark_pages(zone),
818 low_wmark_pages(zone), 831 low_wmark_pages(zone),
819 high_wmark_pages(zone), 832 high_wmark_pages(zone),
@@ -998,6 +1011,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
+		refresh_zone_stat_thresholds();
 		start_cpu_timer(cpu);
 		node_set_state(cpu_to_node(cpu), N_CPU);
 		break;