aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-10-08 03:14:51 -0400
committerIngo Molnar <mingo@elte.hu>2010-10-08 03:15:00 -0400
commit153db80f8cf74e8700cac96305b6c0b92918f17c (patch)
treec2afb28e7b3f4fbf0aacd9edd39d7f895321ca0c /mm
parent5fd03ddab7fdbc44bfb2d183a4531c26a8dbca5a (diff)
parentcb655d0f3d57c23db51b981648e452988c0223f9 (diff)
Merge commit 'v2.6.36-rc7' into core/memblock
Merge reason: Update from -rc3 to -rc7. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/backing-dev.c9
-rw-r--r--mm/bounce.c2
-rw-r--r--mm/compaction.c7
-rw-r--r--mm/fremap.c7
-rw-r--r--mm/hugetlb.c24
-rw-r--r--mm/ksm.c9
-rw-r--r--mm/memory.c41
-rw-r--r--mm/memory_hotplug.c16
-rw-r--r--mm/mlock.c6
-rw-r--r--mm/mmap.c1
-rw-r--r--mm/mmzone.c21
-rw-r--r--mm/oom_kill.c49
-rw-r--r--mm/page_alloc.c33
-rw-r--r--mm/percpu.c8
-rw-r--r--mm/percpu_up.c4
-rw-r--r--mm/rmap.c23
-rw-r--r--mm/swapfile.c129
-rw-r--r--mm/vmscan.c43
-rw-r--r--mm/vmstat.c16
20 files changed, 269 insertions, 181 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f4e516e9c37c..f0fb9124e410 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -189,7 +189,7 @@ config COMPACTION
189config MIGRATION 189config MIGRATION
190 bool "Page migration" 190 bool "Page migration"
191 def_bool y 191 def_bool y
192 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE 192 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION
193 help 193 help
194 Allows the migration of the physical location of pages of processes 194 Allows the migration of the physical location of pages of processes
195 while the virtual addresses are not changed. This is useful in 195 while the virtual addresses are not changed. This is useful in
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index eaa4a5bbe063..65d420499a61 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
30 30
31struct backing_dev_info noop_backing_dev_info = { 31struct backing_dev_info noop_backing_dev_info = {
32 .name = "noop", 32 .name = "noop",
33 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
33}; 34};
34EXPORT_SYMBOL_GPL(noop_backing_dev_info); 35EXPORT_SYMBOL_GPL(noop_backing_dev_info);
35 36
@@ -243,6 +244,7 @@ static int __init default_bdi_init(void)
243 err = bdi_init(&default_backing_dev_info); 244 err = bdi_init(&default_backing_dev_info);
244 if (!err) 245 if (!err)
245 bdi_register(&default_backing_dev_info, NULL, "default"); 246 bdi_register(&default_backing_dev_info, NULL, "default");
247 err = bdi_init(&noop_backing_dev_info);
246 248
247 return err; 249 return err;
248} 250}
@@ -445,8 +447,8 @@ static int bdi_forker_thread(void *ptr)
445 switch (action) { 447 switch (action) {
446 case FORK_THREAD: 448 case FORK_THREAD:
447 __set_current_state(TASK_RUNNING); 449 __set_current_state(TASK_RUNNING);
448 task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s", 450 task = kthread_create(bdi_writeback_thread, &bdi->wb,
449 dev_name(bdi->dev)); 451 "flush-%s", dev_name(bdi->dev));
450 if (IS_ERR(task)) { 452 if (IS_ERR(task)) {
451 /* 453 /*
452 * If thread creation fails, force writeout of 454 * If thread creation fails, force writeout of
@@ -457,10 +459,13 @@ static int bdi_forker_thread(void *ptr)
457 /* 459 /*
458 * The spinlock makes sure we do not lose 460 * The spinlock makes sure we do not lose
459 * wake-ups when racing with 'bdi_queue_work()'. 461 * wake-ups when racing with 'bdi_queue_work()'.
462 * And as soon as the bdi thread is visible, we
463 * can start it.
460 */ 464 */
461 spin_lock_bh(&bdi->wb_lock); 465 spin_lock_bh(&bdi->wb_lock);
462 bdi->wb.task = task; 466 bdi->wb.task = task;
463 spin_unlock_bh(&bdi->wb_lock); 467 spin_unlock_bh(&bdi->wb_lock);
468 wake_up_process(task);
464 } 469 }
465 break; 470 break;
466 471
diff --git a/mm/bounce.c b/mm/bounce.c
index 13b6dad1eed2..1481de68184b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -116,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
116 */ 116 */
117 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; 117 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
118 118
119 flush_dcache_page(tovec->bv_page);
120 bounce_copy_vec(tovec, vfrom); 119 bounce_copy_vec(tovec, vfrom);
120 flush_dcache_page(tovec->bv_page);
121 } 121 }
122} 122}
123 123
diff --git a/mm/compaction.c b/mm/compaction.c
index 94cce51b0b35..4d709ee59013 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -214,15 +214,16 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
214/* Similar to reclaim, but different enough that they don't share logic */ 214/* Similar to reclaim, but different enough that they don't share logic */
215static bool too_many_isolated(struct zone *zone) 215static bool too_many_isolated(struct zone *zone)
216{ 216{
217 217 unsigned long active, inactive, isolated;
218 unsigned long inactive, isolated;
219 218
220 inactive = zone_page_state(zone, NR_INACTIVE_FILE) + 219 inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
221 zone_page_state(zone, NR_INACTIVE_ANON); 220 zone_page_state(zone, NR_INACTIVE_ANON);
221 active = zone_page_state(zone, NR_ACTIVE_FILE) +
222 zone_page_state(zone, NR_ACTIVE_ANON);
222 isolated = zone_page_state(zone, NR_ISOLATED_FILE) + 223 isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
223 zone_page_state(zone, NR_ISOLATED_ANON); 224 zone_page_state(zone, NR_ISOLATED_ANON);
224 225
225 return isolated > inactive; 226 return isolated > (inactive + active) / 2;
226} 227}
227 228
228/* 229/*
diff --git a/mm/fremap.c b/mm/fremap.c
index 46f5dacf90a2..ec520c7b28df 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
125{ 125{
126 struct mm_struct *mm = current->mm; 126 struct mm_struct *mm = current->mm;
127 struct address_space *mapping; 127 struct address_space *mapping;
128 unsigned long end = start + size;
129 struct vm_area_struct *vma; 128 struct vm_area_struct *vma;
130 int err = -EINVAL; 129 int err = -EINVAL;
131 int has_write_lock = 0; 130 int has_write_lock = 0;
@@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
142 if (start + size <= start) 141 if (start + size <= start)
143 return err; 142 return err;
144 143
144 /* Does pgoff wrap? */
145 if (pgoff + (size >> PAGE_SHIFT) < pgoff)
146 return err;
147
145 /* Can we represent this offset inside this architecture's pte's? */ 148 /* Can we represent this offset inside this architecture's pte's? */
146#if PTE_FILE_MAX_BITS < BITS_PER_LONG 149#if PTE_FILE_MAX_BITS < BITS_PER_LONG
147 if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) 150 if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
@@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
168 if (!(vma->vm_flags & VM_CAN_NONLINEAR)) 171 if (!(vma->vm_flags & VM_CAN_NONLINEAR))
169 goto out; 172 goto out;
170 173
171 if (end <= start || start < vma->vm_start || end > vma->vm_end) 174 if (start < vma->vm_start || start + size > vma->vm_end)
172 goto out; 175 goto out;
173 176
174 /* Must set VM_NONLINEAR before any pages are populated. */ 177 /* Must set VM_NONLINEAR before any pages are populated. */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cc5be788a39f..c03273807182 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2324,11 +2324,8 @@ retry_avoidcopy:
2324 * and just make the page writable */ 2324 * and just make the page writable */
2325 avoidcopy = (page_mapcount(old_page) == 1); 2325 avoidcopy = (page_mapcount(old_page) == 1);
2326 if (avoidcopy) { 2326 if (avoidcopy) {
2327 if (!trylock_page(old_page)) { 2327 if (PageAnon(old_page))
2328 if (PageAnon(old_page)) 2328 page_move_anon_rmap(old_page, vma, address);
2329 page_move_anon_rmap(old_page, vma, address);
2330 } else
2331 unlock_page(old_page);
2332 set_huge_ptep_writable(vma, address, ptep); 2329 set_huge_ptep_writable(vma, address, ptep);
2333 return 0; 2330 return 0;
2334 } 2331 }
@@ -2404,7 +2401,7 @@ retry_avoidcopy:
2404 set_huge_pte_at(mm, address, ptep, 2401 set_huge_pte_at(mm, address, ptep,
2405 make_huge_pte(vma, new_page, 1)); 2402 make_huge_pte(vma, new_page, 1));
2406 page_remove_rmap(old_page); 2403 page_remove_rmap(old_page);
2407 hugepage_add_anon_rmap(new_page, vma, address); 2404 hugepage_add_new_anon_rmap(new_page, vma, address);
2408 /* Make the old page be freed below */ 2405 /* Make the old page be freed below */
2409 new_page = old_page; 2406 new_page = old_page;
2410 mmu_notifier_invalidate_range_end(mm, 2407 mmu_notifier_invalidate_range_end(mm,
@@ -2631,10 +2628,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2631 vma, address); 2628 vma, address);
2632 } 2629 }
2633 2630
2634 if (!pagecache_page) { 2631 /*
2635 page = pte_page(entry); 2632 * hugetlb_cow() requires page locks of pte_page(entry) and
2633 * pagecache_page, so here we need take the former one
2634 * when page != pagecache_page or !pagecache_page.
2635 * Note that locking order is always pagecache_page -> page,
2636 * so no worry about deadlock.
2637 */
2638 page = pte_page(entry);
2639 if (page != pagecache_page)
2636 lock_page(page); 2640 lock_page(page);
2637 }
2638 2641
2639 spin_lock(&mm->page_table_lock); 2642 spin_lock(&mm->page_table_lock);
2640 /* Check for a racing update before calling hugetlb_cow */ 2643 /* Check for a racing update before calling hugetlb_cow */
@@ -2661,9 +2664,8 @@ out_page_table_lock:
2661 if (pagecache_page) { 2664 if (pagecache_page) {
2662 unlock_page(pagecache_page); 2665 unlock_page(pagecache_page);
2663 put_page(pagecache_page); 2666 put_page(pagecache_page);
2664 } else {
2665 unlock_page(page);
2666 } 2667 }
2668 unlock_page(page);
2667 2669
2668out_mutex: 2670out_mutex:
2669 mutex_unlock(&hugetlb_instantiation_mutex); 2671 mutex_unlock(&hugetlb_instantiation_mutex);
diff --git a/mm/ksm.c b/mm/ksm.c
index e2ae00458320..65ab5c7067d9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -712,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
712 if (!ptep) 712 if (!ptep)
713 goto out; 713 goto out;
714 714
715 if (pte_write(*ptep)) { 715 if (pte_write(*ptep) || pte_dirty(*ptep)) {
716 pte_t entry; 716 pte_t entry;
717 717
718 swapped = PageSwapCache(page); 718 swapped = PageSwapCache(page);
@@ -735,7 +735,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
735 set_pte_at(mm, addr, ptep, entry); 735 set_pte_at(mm, addr, ptep, entry);
736 goto out_unlock; 736 goto out_unlock;
737 } 737 }
738 entry = pte_wrprotect(entry); 738 if (pte_dirty(entry))
739 set_page_dirty(page);
740 entry = pte_mkclean(pte_wrprotect(entry));
739 set_pte_at_notify(mm, addr, ptep, entry); 741 set_pte_at_notify(mm, addr, ptep, entry);
740 } 742 }
741 *orig_pte = *ptep; 743 *orig_pte = *ptep;
@@ -1504,8 +1506,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
1504{ 1506{
1505 struct page *new_page; 1507 struct page *new_page;
1506 1508
1507 unlock_page(page); /* any racers will COW it, not modify it */
1508
1509 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1509 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1510 if (new_page) { 1510 if (new_page) {
1511 copy_user_highpage(new_page, page, address, vma); 1511 copy_user_highpage(new_page, page, address, vma);
@@ -1521,7 +1521,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
1521 add_page_to_unevictable_list(new_page); 1521 add_page_to_unevictable_list(new_page);
1522 } 1522 }
1523 1523
1524 page_cache_release(page);
1525 return new_page; 1524 return new_page;
1526} 1525}
1527 1526
diff --git a/mm/memory.c b/mm/memory.c
index 6b2ab1051851..0e18b4d649ec 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2623,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2623 unsigned int flags, pte_t orig_pte) 2623 unsigned int flags, pte_t orig_pte)
2624{ 2624{
2625 spinlock_t *ptl; 2625 spinlock_t *ptl;
2626 struct page *page; 2626 struct page *page, *swapcache = NULL;
2627 swp_entry_t entry; 2627 swp_entry_t entry;
2628 pte_t pte; 2628 pte_t pte;
2629 struct mem_cgroup *ptr = NULL; 2629 struct mem_cgroup *ptr = NULL;
@@ -2679,10 +2679,25 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2679 lock_page(page); 2679 lock_page(page);
2680 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2680 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2681 2681
2682 page = ksm_might_need_to_copy(page, vma, address); 2682 /*
2683 if (!page) { 2683 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
2684 ret = VM_FAULT_OOM; 2684 * release the swapcache from under us. The page pin, and pte_same
2685 goto out; 2685 * test below, are not enough to exclude that. Even if it is still
2686 * swapcache, we need to check that the page's swap has not changed.
2687 */
2688 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2689 goto out_page;
2690
2691 if (ksm_might_need_to_copy(page, vma, address)) {
2692 swapcache = page;
2693 page = ksm_does_need_to_copy(page, vma, address);
2694
2695 if (unlikely(!page)) {
2696 ret = VM_FAULT_OOM;
2697 page = swapcache;
2698 swapcache = NULL;
2699 goto out_page;
2700 }
2686 } 2701 }
2687 2702
2688 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2703 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -2735,6 +2750,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2735 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2750 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2736 try_to_free_swap(page); 2751 try_to_free_swap(page);
2737 unlock_page(page); 2752 unlock_page(page);
2753 if (swapcache) {
2754 /*
2755 * Hold the lock to avoid the swap entry to be reused
2756 * until we take the PT lock for the pte_same() check
2757 * (to avoid false positives from pte_same). For
2758 * further safety release the lock after the swap_free
2759 * so that the swap count won't change under a
2760 * parallel locked swapcache.
2761 */
2762 unlock_page(swapcache);
2763 page_cache_release(swapcache);
2764 }
2738 2765
2739 if (flags & FAULT_FLAG_WRITE) { 2766 if (flags & FAULT_FLAG_WRITE) {
2740 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2767 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@ -2756,6 +2783,10 @@ out_page:
2756 unlock_page(page); 2783 unlock_page(page);
2757out_release: 2784out_release:
2758 page_cache_release(page); 2785 page_cache_release(page);
2786 if (swapcache) {
2787 unlock_page(swapcache);
2788 page_cache_release(swapcache);
2789 }
2759 return ret; 2790 return ret;
2760} 2791}
2761 2792
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a4cfcdc00455..dd186c1a5d53 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -584,19 +584,19 @@ static inline int pageblock_free(struct page *page)
584/* Return the start of the next active pageblock after a given page */ 584/* Return the start of the next active pageblock after a given page */
585static struct page *next_active_pageblock(struct page *page) 585static struct page *next_active_pageblock(struct page *page)
586{ 586{
587 int pageblocks_stride;
588
589 /* Ensure the starting page is pageblock-aligned */ 587 /* Ensure the starting page is pageblock-aligned */
590 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 588 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
591 589
592 /* Move forward by at least 1 * pageblock_nr_pages */
593 pageblocks_stride = 1;
594
595 /* If the entire pageblock is free, move to the end of free page */ 590 /* If the entire pageblock is free, move to the end of free page */
596 if (pageblock_free(page)) 591 if (pageblock_free(page)) {
597 pageblocks_stride += page_order(page) - pageblock_order; 592 int order;
593 /* be careful. we don't have locks, page_order can be changed.*/
594 order = page_order(page);
595 if ((order < MAX_ORDER) && (order >= pageblock_order))
596 return page + (1 << order);
597 }
598 598
599 return page + (pageblocks_stride * pageblock_nr_pages); 599 return page + pageblock_nr_pages;
600} 600}
601 601
602/* Checks if this range of memory is likely to be hot-removable. */ 602/* Checks if this range of memory is likely to be hot-removable. */
diff --git a/mm/mlock.c b/mm/mlock.c
index cbae7c5b9568..b70919ce4f72 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,12 +135,6 @@ void munlock_vma_page(struct page *page)
135 } 135 }
136} 136}
137 137
138/* Is the vma a continuation of the stack vma above it? */
139static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr)
140{
141 return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
142}
143
144static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) 138static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
145{ 139{
146 return (vma->vm_flags & VM_GROWSDOWN) && 140 return (vma->vm_flags & VM_GROWSDOWN) &&
diff --git a/mm/mmap.c b/mm/mmap.c
index 6128dc8e5ede..00161a48a451 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2009,6 +2009,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2009 removed_exe_file_vma(mm); 2009 removed_exe_file_vma(mm);
2010 fput(new->vm_file); 2010 fput(new->vm_file);
2011 } 2011 }
2012 unlink_anon_vmas(new);
2012 out_free_mpol: 2013 out_free_mpol:
2013 mpol_put(pol); 2014 mpol_put(pol);
2014 out_free_vma: 2015 out_free_vma:
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d1760213..e35bfb82c855 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fc81cb22869e..4029583a1024 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -121,8 +121,8 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
121} 121}
122 122
123/* return true if the task is not adequate as candidate victim task. */ 123/* return true if the task is not adequate as candidate victim task. */
124static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem, 124static bool oom_unkillable_task(struct task_struct *p,
125 const nodemask_t *nodemask) 125 const struct mem_cgroup *mem, const nodemask_t *nodemask)
126{ 126{
127 if (is_global_init(p)) 127 if (is_global_init(p))
128 return true; 128 return true;
@@ -208,8 +208,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
208 */ 208 */
209 points += p->signal->oom_score_adj; 209 points += p->signal->oom_score_adj;
210 210
211 if (points < 0) 211 /*
212 return 0; 212 * Never return 0 for an eligible task that may be killed since it's
213 * possible that no single user task uses more than 0.1% of memory and
214 * no single admin tasks uses more than 3.0%.
215 */
216 if (points <= 0)
217 return 1;
213 return (points < 1000) ? points : 1000; 218 return (points < 1000) ? points : 1000;
214} 219}
215 220
@@ -339,26 +344,24 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
339/** 344/**
340 * dump_tasks - dump current memory state of all system tasks 345 * dump_tasks - dump current memory state of all system tasks
341 * @mem: current's memory controller, if constrained 346 * @mem: current's memory controller, if constrained
347 * @nodemask: nodemask passed to page allocator for mempolicy ooms
342 * 348 *
343 * Dumps the current memory state of all system tasks, excluding kernel threads. 349 * Dumps the current memory state of all eligible tasks. Tasks not in the same
350 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
351 * are not shown.
344 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 352 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
345 * value, oom_score_adj value, and name. 353 * value, oom_score_adj value, and name.
346 * 354 *
347 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
348 * shown.
349 *
350 * Call with tasklist_lock read-locked. 355 * Call with tasklist_lock read-locked.
351 */ 356 */
352static void dump_tasks(const struct mem_cgroup *mem) 357static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
353{ 358{
354 struct task_struct *p; 359 struct task_struct *p;
355 struct task_struct *task; 360 struct task_struct *task;
356 361
357 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 362 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
358 for_each_process(p) { 363 for_each_process(p) {
359 if (p->flags & PF_KTHREAD) 364 if (oom_unkillable_task(p, mem, nodemask))
360 continue;
361 if (mem && !task_in_mem_cgroup(p, mem))
362 continue; 365 continue;
363 366
364 task = find_lock_task_mm(p); 367 task = find_lock_task_mm(p);
@@ -381,7 +384,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
381} 384}
382 385
383static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 386static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
384 struct mem_cgroup *mem) 387 struct mem_cgroup *mem, const nodemask_t *nodemask)
385{ 388{
386 task_lock(current); 389 task_lock(current);
387 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 390 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@@ -394,7 +397,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
394 mem_cgroup_print_oom_info(mem, p); 397 mem_cgroup_print_oom_info(mem, p);
395 show_mem(); 398 show_mem();
396 if (sysctl_oom_dump_tasks) 399 if (sysctl_oom_dump_tasks)
397 dump_tasks(mem); 400 dump_tasks(mem, nodemask);
398} 401}
399 402
400#define K(x) ((x) << (PAGE_SHIFT-10)) 403#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -436,7 +439,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
436 unsigned int victim_points = 0; 439 unsigned int victim_points = 0;
437 440
438 if (printk_ratelimit()) 441 if (printk_ratelimit())
439 dump_header(p, gfp_mask, order, mem); 442 dump_header(p, gfp_mask, order, mem, nodemask);
440 443
441 /* 444 /*
442 * If the task is already exiting, don't alarm the sysadmin or kill 445 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -482,7 +485,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
482 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 485 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
483 */ 486 */
484static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 487static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
485 int order) 488 int order, const nodemask_t *nodemask)
486{ 489{
487 if (likely(!sysctl_panic_on_oom)) 490 if (likely(!sysctl_panic_on_oom))
488 return; 491 return;
@@ -496,7 +499,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
496 return; 499 return;
497 } 500 }
498 read_lock(&tasklist_lock); 501 read_lock(&tasklist_lock);
499 dump_header(NULL, gfp_mask, order, NULL); 502 dump_header(NULL, gfp_mask, order, NULL, nodemask);
500 read_unlock(&tasklist_lock); 503 read_unlock(&tasklist_lock);
501 panic("Out of memory: %s panic_on_oom is enabled\n", 504 panic("Out of memory: %s panic_on_oom is enabled\n",
502 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 505 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
@@ -509,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
509 unsigned int points = 0; 512 unsigned int points = 0;
510 struct task_struct *p; 513 struct task_struct *p;
511 514
512 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0); 515 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
513 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; 516 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
514 read_lock(&tasklist_lock); 517 read_lock(&tasklist_lock);
515retry: 518retry:
@@ -641,6 +644,7 @@ static void clear_system_oom(void)
641void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 644void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
642 int order, nodemask_t *nodemask) 645 int order, nodemask_t *nodemask)
643{ 646{
647 const nodemask_t *mpol_mask;
644 struct task_struct *p; 648 struct task_struct *p;
645 unsigned long totalpages; 649 unsigned long totalpages;
646 unsigned long freed = 0; 650 unsigned long freed = 0;
@@ -670,7 +674,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
670 */ 674 */
671 constraint = constrained_alloc(zonelist, gfp_mask, nodemask, 675 constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
672 &totalpages); 676 &totalpages);
673 check_panic_on_oom(constraint, gfp_mask, order); 677 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
678 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
674 679
675 read_lock(&tasklist_lock); 680 read_lock(&tasklist_lock);
676 if (sysctl_oom_kill_allocating_task && 681 if (sysctl_oom_kill_allocating_task &&
@@ -688,15 +693,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
688 } 693 }
689 694
690retry: 695retry:
691 p = select_bad_process(&points, totalpages, NULL, 696 p = select_bad_process(&points, totalpages, NULL, mpol_mask);
692 constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
693 NULL);
694 if (PTR_ERR(p) == -1UL) 697 if (PTR_ERR(p) == -1UL)
695 goto out; 698 goto out;
696 699
697 /* Found nothing?!?! Either we hang forever, or we panic. */ 700 /* Found nothing?!?! Either we hang forever, or we panic. */
698 if (!p) { 701 if (!p) {
699 dump_header(NULL, gfp_mask, order, NULL); 702 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
700 read_unlock(&tasklist_lock); 703 read_unlock(&tasklist_lock);
701 panic("Out of memory and no killable processes...\n"); 704 panic("Out of memory and no killable processes...\n");
702 } 705 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 768ea486df58..9536017108ec 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -589,13 +589,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
589{ 589{
590 int migratetype = 0; 590 int migratetype = 0;
591 int batch_free = 0; 591 int batch_free = 0;
592 int to_free = count;
592 593
593 spin_lock(&zone->lock); 594 spin_lock(&zone->lock);
594 zone->all_unreclaimable = 0; 595 zone->all_unreclaimable = 0;
595 zone->pages_scanned = 0; 596 zone->pages_scanned = 0;
596 597
597 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 598 while (to_free) {
598 while (count) {
599 struct page *page; 599 struct page *page;
600 struct list_head *list; 600 struct list_head *list;
601 601
@@ -620,8 +620,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
620 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 620 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
621 __free_one_page(page, zone, 0, page_private(page)); 621 __free_one_page(page, zone, 0, page_private(page));
622 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 622 trace_mm_page_pcpu_drain(page, 0, page_private(page));
623 } while (--count && --batch_free && !list_empty(list)); 623 } while (--to_free && --batch_free && !list_empty(list));
624 } 624 }
625 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
625 spin_unlock(&zone->lock); 626 spin_unlock(&zone->lock);
626} 627}
627 628
@@ -632,8 +633,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
632 zone->all_unreclaimable = 0; 633 zone->all_unreclaimable = 0;
633 zone->pages_scanned = 0; 634 zone->pages_scanned = 0;
634 635
635 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
636 __free_one_page(page, zone, order, migratetype); 636 __free_one_page(page, zone, order, migratetype);
637 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
637 spin_unlock(&zone->lock); 638 spin_unlock(&zone->lock);
638} 639}
639 640
@@ -1462,7 +1463,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1462{ 1463{
1463 /* free_pages my go negative - that's OK */ 1464 /* free_pages my go negative - that's OK */
1464 long min = mark; 1465 long min = mark;
1465 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; 1466 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1466 int o; 1467 int o;
1467 1468
1468 if (alloc_flags & ALLOC_HIGH) 1469 if (alloc_flags & ALLOC_HIGH)
@@ -1847,6 +1848,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1847 struct page *page = NULL; 1848 struct page *page = NULL;
1848 struct reclaim_state reclaim_state; 1849 struct reclaim_state reclaim_state;
1849 struct task_struct *p = current; 1850 struct task_struct *p = current;
1851 bool drained = false;
1850 1852
1851 cond_resched(); 1853 cond_resched();
1852 1854
@@ -1865,14 +1867,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1865 1867
1866 cond_resched(); 1868 cond_resched();
1867 1869
1868 if (order != 0) 1870 if (unlikely(!(*did_some_progress)))
1869 drain_all_pages(); 1871 return NULL;
1870 1872
1871 if (likely(*did_some_progress)) 1873retry:
1872 page = get_page_from_freelist(gfp_mask, nodemask, order, 1874 page = get_page_from_freelist(gfp_mask, nodemask, order,
1873 zonelist, high_zoneidx, 1875 zonelist, high_zoneidx,
1874 alloc_flags, preferred_zone, 1876 alloc_flags, preferred_zone,
1875 migratetype); 1877 migratetype);
1878
1879 /*
1880 * If an allocation failed after direct reclaim, it could be because
1881 * pages are pinned on the per-cpu lists. Drain them and try again
1882 */
1883 if (!page && !drained) {
1884 drain_all_pages();
1885 drained = true;
1886 goto retry;
1887 }
1888
1876 return page; 1889 return page;
1877} 1890}
1878 1891
@@ -2424,7 +2437,7 @@ void show_free_areas(void)
2424 " all_unreclaimable? %s" 2437 " all_unreclaimable? %s"
2425 "\n", 2438 "\n",
2426 zone->name, 2439 zone->name,
2427 K(zone_page_state(zone, NR_FREE_PAGES)), 2440 K(zone_nr_free_pages(zone)),
2428 K(min_wmark_pages(zone)), 2441 K(min_wmark_pages(zone)),
2429 K(low_wmark_pages(zone)), 2442 K(low_wmark_pages(zone)),
2430 K(high_wmark_pages(zone)), 2443 K(high_wmark_pages(zone)),
diff --git a/mm/percpu.c b/mm/percpu.c
index e61dc2cc5873..c76ef3891e0d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -393,7 +393,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
393 goto out_unlock; 393 goto out_unlock;
394 394
395 old_size = chunk->map_alloc * sizeof(chunk->map[0]); 395 old_size = chunk->map_alloc * sizeof(chunk->map[0]);
396 memcpy(new, chunk->map, old_size); 396 old = chunk->map;
397
398 memcpy(new, old, old_size);
397 399
398 chunk->map_alloc = new_alloc; 400 chunk->map_alloc = new_alloc;
399 chunk->map = new; 401 chunk->map = new;
@@ -1162,7 +1164,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1162 } 1164 }
1163 1165
1164 /* 1166 /*
1165 * Don't accept if wastage is over 25%. The 1167 * Don't accept if wastage is over 1/3. The
1166 * greater-than comparison ensures upa==1 always 1168 * greater-than comparison ensures upa==1 always
1167 * passes the following check. 1169 * passes the following check.
1168 */ 1170 */
@@ -1399,9 +1401,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1399 1401
1400 if (pcpu_first_unit_cpu == NR_CPUS) 1402 if (pcpu_first_unit_cpu == NR_CPUS)
1401 pcpu_first_unit_cpu = cpu; 1403 pcpu_first_unit_cpu = cpu;
1404 pcpu_last_unit_cpu = cpu;
1402 } 1405 }
1403 } 1406 }
1404 pcpu_last_unit_cpu = cpu;
1405 pcpu_nr_units = unit; 1407 pcpu_nr_units = unit;
1406 1408
1407 for_each_possible_cpu(cpu) 1409 for_each_possible_cpu(cpu)
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
index c4351c7f57d2..db884fae5721 100644
--- a/mm/percpu_up.c
+++ b/mm/percpu_up.c
@@ -14,13 +14,13 @@ void __percpu *__alloc_percpu(size_t size, size_t align)
14 * percpu sections on SMP for which this path isn't used. 14 * percpu sections on SMP for which this path isn't used.
15 */ 15 */
16 WARN_ON_ONCE(align > SMP_CACHE_BYTES); 16 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
17 return kzalloc(size, GFP_KERNEL); 17 return (void __percpu __force *)kzalloc(size, GFP_KERNEL);
18} 18}
19EXPORT_SYMBOL_GPL(__alloc_percpu); 19EXPORT_SYMBOL_GPL(__alloc_percpu);
20 20
21void free_percpu(void __percpu *p) 21void free_percpu(void __percpu *p)
22{ 22{
23 kfree(p); 23 kfree(this_cpu_ptr(p));
24} 24}
25EXPORT_SYMBOL_GPL(free_percpu); 25EXPORT_SYMBOL_GPL(free_percpu);
26 26
diff --git a/mm/rmap.c b/mm/rmap.c
index f6f0d2dda2ea..92e6757f196e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -381,7 +381,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
381unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 381unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
382{ 382{
383 if (PageAnon(page)) { 383 if (PageAnon(page)) {
384 if (vma->anon_vma->root != page_anon_vma(page)->root) 384 struct anon_vma *page__anon_vma = page_anon_vma(page);
385 /*
386 * Note: swapoff's unuse_vma() is more efficient with this
387 * check, and needs it to match anon_vma when KSM is active.
388 */
389 if (!vma->anon_vma || !page__anon_vma ||
390 vma->anon_vma->root != page__anon_vma->root)
385 return -EFAULT; 391 return -EFAULT;
386 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 392 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
387 if (!vma->vm_file || 393 if (!vma->vm_file ||
@@ -1564,13 +1570,14 @@ static void __hugepage_set_anon_rmap(struct page *page,
1564 struct vm_area_struct *vma, unsigned long address, int exclusive) 1570 struct vm_area_struct *vma, unsigned long address, int exclusive)
1565{ 1571{
1566 struct anon_vma *anon_vma = vma->anon_vma; 1572 struct anon_vma *anon_vma = vma->anon_vma;
1573
1567 BUG_ON(!anon_vma); 1574 BUG_ON(!anon_vma);
1568 if (!exclusive) { 1575
1569 struct anon_vma_chain *avc; 1576 if (PageAnon(page))
1570 avc = list_entry(vma->anon_vma_chain.prev, 1577 return;
1571 struct anon_vma_chain, same_vma); 1578 if (!exclusive)
1572 anon_vma = avc->anon_vma; 1579 anon_vma = anon_vma->root;
1573 } 1580
1574 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 1581 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1575 page->mapping = (struct address_space *) anon_vma; 1582 page->mapping = (struct address_space *) anon_vma;
1576 page->index = linear_page_index(vma, address); 1583 page->index = linear_page_index(vma, address);
@@ -1581,6 +1588,8 @@ void hugepage_add_anon_rmap(struct page *page,
1581{ 1588{
1582 struct anon_vma *anon_vma = vma->anon_vma; 1589 struct anon_vma *anon_vma = vma->anon_vma;
1583 int first; 1590 int first;
1591
1592 BUG_ON(!PageLocked(page));
1584 BUG_ON(!anon_vma); 1593 BUG_ON(!anon_vma);
1585 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1594 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1586 first = atomic_inc_and_test(&page->_mapcount); 1595 first = atomic_inc_and_test(&page->_mapcount);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f3f9c59a73a..7c703ff2f36f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,8 +47,6 @@ long nr_swap_pages;
47long total_swap_pages; 47long total_swap_pages;
48static int least_priority; 48static int least_priority;
49 49
50static bool swap_for_hibernation;
51
52static const char Bad_file[] = "Bad swap file entry "; 50static const char Bad_file[] = "Bad swap file entry ";
53static const char Unused_file[] = "Unused swap file entry "; 51static const char Unused_file[] = "Unused swap file entry ";
54static const char Bad_offset[] = "Bad swap offset entry "; 52static const char Bad_offset[] = "Bad swap offset entry ";
@@ -141,8 +139,7 @@ static int discard_swap(struct swap_info_struct *si)
141 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
142 if (nr_blocks) { 140 if (nr_blocks) {
143 err = blkdev_issue_discard(si->bdev, start_block, 141 err = blkdev_issue_discard(si->bdev, start_block,
144 nr_blocks, GFP_KERNEL, 142 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
145 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
146 if (err) 143 if (err)
147 return err; 144 return err;
148 cond_resched(); 145 cond_resched();
@@ -153,8 +150,7 @@ static int discard_swap(struct swap_info_struct *si)
153 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
154 151
155 err = blkdev_issue_discard(si->bdev, start_block, 152 err = blkdev_issue_discard(si->bdev, start_block,
156 nr_blocks, GFP_KERNEL, 153 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
157 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
158 if (err) 154 if (err)
159 break; 155 break;
160 156
@@ -193,8 +189,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
193 start_block <<= PAGE_SHIFT - 9; 189 start_block <<= PAGE_SHIFT - 9;
194 nr_blocks <<= PAGE_SHIFT - 9; 190 nr_blocks <<= PAGE_SHIFT - 9;
195 if (blkdev_issue_discard(si->bdev, start_block, 191 if (blkdev_issue_discard(si->bdev, start_block,
196 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT | 192 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT))
197 BLKDEV_IFL_BARRIER))
198 break; 193 break;
199 } 194 }
200 195
@@ -320,10 +315,8 @@ checks:
320 if (offset > si->highest_bit) 315 if (offset > si->highest_bit)
321 scan_base = offset = si->lowest_bit; 316 scan_base = offset = si->lowest_bit;
322 317
323 /* reuse swap entry of cache-only swap if not hibernation. */ 318 /* reuse swap entry of cache-only swap if not busy. */
324 if (vm_swap_full() 319 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
325 && usage == SWAP_HAS_CACHE
326 && si->swap_map[offset] == SWAP_HAS_CACHE) {
327 int swap_was_freed; 320 int swap_was_freed;
328 spin_unlock(&swap_lock); 321 spin_unlock(&swap_lock);
329 swap_was_freed = __try_to_reclaim_swap(si, offset); 322 swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -453,8 +446,6 @@ swp_entry_t get_swap_page(void)
453 spin_lock(&swap_lock); 446 spin_lock(&swap_lock);
454 if (nr_swap_pages <= 0) 447 if (nr_swap_pages <= 0)
455 goto noswap; 448 goto noswap;
456 if (swap_for_hibernation)
457 goto noswap;
458 nr_swap_pages--; 449 nr_swap_pages--;
459 450
460 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 451 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -487,6 +478,28 @@ noswap:
487 return (swp_entry_t) {0}; 478 return (swp_entry_t) {0};
488} 479}
489 480
481/* The only caller of this function is now susupend routine */
482swp_entry_t get_swap_page_of_type(int type)
483{
484 struct swap_info_struct *si;
485 pgoff_t offset;
486
487 spin_lock(&swap_lock);
488 si = swap_info[type];
489 if (si && (si->flags & SWP_WRITEOK)) {
490 nr_swap_pages--;
491 /* This is called for allocating swap entry, not cache */
492 offset = scan_swap_map(si, 1);
493 if (offset) {
494 spin_unlock(&swap_lock);
495 return swp_entry(type, offset);
496 }
497 nr_swap_pages++;
498 }
499 spin_unlock(&swap_lock);
500 return (swp_entry_t) {0};
501}
502
490static struct swap_info_struct *swap_info_get(swp_entry_t entry) 503static struct swap_info_struct *swap_info_get(swp_entry_t entry)
491{ 504{
492 struct swap_info_struct *p; 505 struct swap_info_struct *p;
@@ -670,6 +683,24 @@ int try_to_free_swap(struct page *page)
670 if (page_swapcount(page)) 683 if (page_swapcount(page))
671 return 0; 684 return 0;
672 685
686 /*
687 * Once hibernation has begun to create its image of memory,
688 * there's a danger that one of the calls to try_to_free_swap()
689 * - most probably a call from __try_to_reclaim_swap() while
690 * hibernation is allocating its own swap pages for the image,
691 * but conceivably even a call from memory reclaim - will free
692 * the swap from a page which has already been recorded in the
693 * image as a clean swapcache page, and then reuse its swap for
694 * another page of the image. On waking from hibernation, the
695 * original page might be freed under memory pressure, then
696 * later read back in from swap, now with the wrong data.
697 *
698 * Hibernation clears bits from gfp_allowed_mask to prevent
699 * memory reclaim from writing to disk, so check that here.
700 */
701 if (!(gfp_allowed_mask & __GFP_IO))
702 return 0;
703
673 delete_from_swap_cache(page); 704 delete_from_swap_cache(page);
674 SetPageDirty(page); 705 SetPageDirty(page);
675 return 1; 706 return 1;
@@ -746,74 +777,6 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
746#endif 777#endif
747 778
748#ifdef CONFIG_HIBERNATION 779#ifdef CONFIG_HIBERNATION
749
750static pgoff_t hibernation_offset[MAX_SWAPFILES];
751/*
752 * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
753 * saved swap_map[] image to the disk will be an incomplete because it's
754 * changing without synchronization with hibernation snap shot.
755 * At resume, we just make swap_for_hibernation=false. We can forget
756 * used maps easily.
757 */
758void hibernation_freeze_swap(void)
759{
760 int i;
761
762 spin_lock(&swap_lock);
763
764 printk(KERN_INFO "PM: Freeze Swap\n");
765 swap_for_hibernation = true;
766 for (i = 0; i < MAX_SWAPFILES; i++)
767 hibernation_offset[i] = 1;
768 spin_unlock(&swap_lock);
769}
770
771void hibernation_thaw_swap(void)
772{
773 spin_lock(&swap_lock);
774 if (swap_for_hibernation) {
775 printk(KERN_INFO "PM: Thaw Swap\n");
776 swap_for_hibernation = false;
777 }
778 spin_unlock(&swap_lock);
779}
780
781/*
782 * Because updateing swap_map[] can make not-saved-status-change,
783 * we use our own easy allocator.
784 * Please see kernel/power/swap.c, Used swaps are recorded into
785 * RB-tree.
786 */
787swp_entry_t get_swap_for_hibernation(int type)
788{
789 pgoff_t off;
790 swp_entry_t val = {0};
791 struct swap_info_struct *si;
792
793 spin_lock(&swap_lock);
794
795 si = swap_info[type];
796 if (!si || !(si->flags & SWP_WRITEOK))
797 goto done;
798
799 for (off = hibernation_offset[type]; off < si->max; ++off) {
800 if (!si->swap_map[off])
801 break;
802 }
803 if (off < si->max) {
804 val = swp_entry(type, off);
805 hibernation_offset[type] = off + 1;
806 }
807done:
808 spin_unlock(&swap_lock);
809 return val;
810}
811
812void swap_free_for_hibernation(swp_entry_t ent)
813{
814 /* Nothing to do */
815}
816
817/* 780/*
818 * Find the swap type that corresponds to given device (if any). 781 * Find the swap type that corresponds to given device (if any).
819 * 782 *
@@ -2084,7 +2047,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2084 p->flags |= SWP_SOLIDSTATE; 2047 p->flags |= SWP_SOLIDSTATE;
2085 p->cluster_next = 1 + (random32() % p->highest_bit); 2048 p->cluster_next = 1 + (random32() % p->highest_bit);
2086 } 2049 }
2087 if (discard_swap(p) == 0) 2050 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
2088 p->flags |= SWP_DISCARDABLE; 2051 p->flags |= SWP_DISCARDABLE;
2089 } 2052 }
2090 2053
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c391c320dbaf..c5dfabf25f11 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1804,12 +1804,11 @@ static void shrink_zone(int priority, struct zone *zone,
1804 * If a zone is deemed to be full of pinned pages then just give it a light 1804 * If a zone is deemed to be full of pinned pages then just give it a light
1805 * scan then give up on it. 1805 * scan then give up on it.
1806 */ 1806 */
1807static bool shrink_zones(int priority, struct zonelist *zonelist, 1807static void shrink_zones(int priority, struct zonelist *zonelist,
1808 struct scan_control *sc) 1808 struct scan_control *sc)
1809{ 1809{
1810 struct zoneref *z; 1810 struct zoneref *z;
1811 struct zone *zone; 1811 struct zone *zone;
1812 bool all_unreclaimable = true;
1813 1812
1814 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1813 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1815 gfp_zone(sc->gfp_mask), sc->nodemask) { 1814 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1827,8 +1826,38 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
1827 } 1826 }
1828 1827
1829 shrink_zone(priority, zone, sc); 1828 shrink_zone(priority, zone, sc);
1830 all_unreclaimable = false;
1831 } 1829 }
1830}
1831
1832static bool zone_reclaimable(struct zone *zone)
1833{
1834 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1835}
1836
1837/*
1838 * As hibernation is going on, kswapd is freezed so that it can't mark
1839 * the zone into all_unreclaimable. It can't handle OOM during hibernation.
1840 * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
1841 */
1842static bool all_unreclaimable(struct zonelist *zonelist,
1843 struct scan_control *sc)
1844{
1845 struct zoneref *z;
1846 struct zone *zone;
1847 bool all_unreclaimable = true;
1848
1849 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1850 gfp_zone(sc->gfp_mask), sc->nodemask) {
1851 if (!populated_zone(zone))
1852 continue;
1853 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1854 continue;
1855 if (zone_reclaimable(zone)) {
1856 all_unreclaimable = false;
1857 break;
1858 }
1859 }
1860
1832 return all_unreclaimable; 1861 return all_unreclaimable;
1833} 1862}
1834 1863
@@ -1852,7 +1881,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1852 struct scan_control *sc) 1881 struct scan_control *sc)
1853{ 1882{
1854 int priority; 1883 int priority;
1855 bool all_unreclaimable;
1856 unsigned long total_scanned = 0; 1884 unsigned long total_scanned = 0;
1857 struct reclaim_state *reclaim_state = current->reclaim_state; 1885 struct reclaim_state *reclaim_state = current->reclaim_state;
1858 struct zoneref *z; 1886 struct zoneref *z;
@@ -1869,7 +1897,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1869 sc->nr_scanned = 0; 1897 sc->nr_scanned = 0;
1870 if (!priority) 1898 if (!priority)
1871 disable_swap_token(); 1899 disable_swap_token();
1872 all_unreclaimable = shrink_zones(priority, zonelist, sc); 1900 shrink_zones(priority, zonelist, sc);
1873 /* 1901 /*
1874 * Don't shrink slabs when reclaiming memory from 1902 * Don't shrink slabs when reclaiming memory from
1875 * over limit cgroups 1903 * over limit cgroups
@@ -1931,7 +1959,7 @@ out:
1931 return sc->nr_reclaimed; 1959 return sc->nr_reclaimed;
1932 1960
1933 /* top priority shrink_zones still had more to do? don't OOM, then */ 1961 /* top priority shrink_zones still had more to do? don't OOM, then */
1934 if (scanning_global_lru(sc) && !all_unreclaimable) 1962 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
1935 return 1; 1963 return 1;
1936 1964
1937 return 0; 1965 return 0;
@@ -2197,8 +2225,7 @@ loop_again:
2197 total_scanned += sc.nr_scanned; 2225 total_scanned += sc.nr_scanned;
2198 if (zone->all_unreclaimable) 2226 if (zone->all_unreclaimable)
2199 continue; 2227 continue;
2200 if (nr_slab == 0 && 2228 if (nr_slab == 0 && !zone_reclaimable(zone))
2201 zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
2202 zone->all_unreclaimable = 1; 2229 zone->all_unreclaimable = 1;
2203 /* 2230 /*
2204 * If we've done a decent amount of scanning and 2231 * If we've done a decent amount of scanning and
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f389168f9a83..355a9e669aaa 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -138,11 +138,24 @@ static void refresh_zone_stat_thresholds(void)
138 int threshold; 138 int threshold;
139 139
140 for_each_populated_zone(zone) { 140 for_each_populated_zone(zone) {
141 unsigned long max_drift, tolerate_drift;
142
141 threshold = calculate_threshold(zone); 143 threshold = calculate_threshold(zone);
142 144
143 for_each_online_cpu(cpu) 145 for_each_online_cpu(cpu)
144 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 146 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
145 = threshold; 147 = threshold;
148
149 /*
150 * Only set percpu_drift_mark if there is a danger that
151 * NR_FREE_PAGES reports the low watermark is ok when in fact
152 * the min watermark could be breached by an allocation
153 */
154 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
155 max_drift = num_online_cpus() * threshold;
156 if (max_drift > tolerate_drift)
157 zone->percpu_drift_mark = high_wmark_pages(zone) +
158 max_drift;
146 } 159 }
147} 160}
148 161
@@ -813,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
813 "\n scanned %lu" 826 "\n scanned %lu"
814 "\n spanned %lu" 827 "\n spanned %lu"
815 "\n present %lu", 828 "\n present %lu",
816 zone_page_state(zone, NR_FREE_PAGES), 829 zone_nr_free_pages(zone),
817 min_wmark_pages(zone), 830 min_wmark_pages(zone),
818 low_wmark_pages(zone), 831 low_wmark_pages(zone),
819 high_wmark_pages(zone), 832 high_wmark_pages(zone),
@@ -998,6 +1011,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
998 switch (action) { 1011 switch (action) {
999 case CPU_ONLINE: 1012 case CPU_ONLINE:
1000 case CPU_ONLINE_FROZEN: 1013 case CPU_ONLINE_FROZEN:
1014 refresh_zone_stat_thresholds();
1001 start_cpu_timer(cpu); 1015 start_cpu_timer(cpu);
1002 node_set_state(cpu_to_node(cpu), N_CPU); 1016 node_set_state(cpu_to_node(cpu), N_CPU);
1003 break; 1017 break;