Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c       1
-rw-r--r--  mm/filemap.c         37
-rw-r--r--  mm/hugetlb.c          3
-rw-r--r--  mm/ksm.c              7
-rw-r--r--  mm/memcontrol.c     101
-rw-r--r--  mm/memory-failure.c   8
-rw-r--r--  mm/memory_hotplug.c  31
-rw-r--r--  mm/mempolicy.c        3
-rw-r--r--  mm/migrate.c          2
-rw-r--r--  mm/mmap.c            16
-rw-r--r--  mm/nommu.c           29
-rw-r--r--  mm/page-writeback.c   2
-rw-r--r--  mm/page_alloc.c      33
-rw-r--r--  mm/pagewalk.c         5
-rw-r--r--  mm/percpu.c          10
-rw-r--r--  mm/shmem.c            9
-rw-r--r--  mm/slab.c            76
-rw-r--r--  mm/slob.c             5
-rw-r--r--  mm/slub.c            77
-rw-r--r--  mm/truncate.c         4
-rw-r--r--  mm/util.c            21
-rw-r--r--  mm/vmalloc.c         28
-rw-r--r--  mm/vmscan.c           9
-rw-r--r--  mm/vmstat.c         155
24 files changed, 394 insertions, 278 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee59013..1a8894eadf72 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -279,7 +279,6 @@ static unsigned long isolate_migratepages(struct zone *zone,
 		/* Successfully isolated */
 		del_page_from_lru_list(zone, page, page_lru(page));
 		list_add(&page->lru, migratelist);
-		mem_cgroup_del_lru(page);
 		cc->nr_migratepages++;

 		/* Avoid isolating too much */
diff --git a/mm/filemap.c b/mm/filemap.c
index 61ba5e405791..ca389394fa2a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,9 +102,6 @@
  *    ->inode_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
- *  ->task->proc_lock
- *    ->dcache_lock		(proc_pid_lookup)
- *
  *  (code doesn't rely on that order, so you could switch it around)
  *  ->tasklist_lock             (memory_failure, collect_procs_ao)
  *    ->i_mmap_lock
@@ -143,13 +140,18 @@ void __remove_from_page_cache(struct page *page)
 void remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
+	void (*freepage)(struct page *);

 	BUG_ON(!PageLocked(page));

+	freepage = mapping->a_ops->freepage;
 	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);
+
+	if (freepage)
+		freepage(page);
 }
 EXPORT_SYMBOL(remove_from_page_cache);

@@ -644,7 +646,9 @@ repeat:
 	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
 	if (pagep) {
 		page = radix_tree_deref_slot(pagep);
-		if (unlikely(!page || page == RADIX_TREE_RETRY))
+		if (unlikely(!page))
+			goto out;
+		if (radix_tree_deref_retry(page))
 			goto repeat;

 		if (!page_cache_get_speculative(page))
@@ -660,6 +664,7 @@ repeat:
 			goto repeat;
 		}
 	}
+out:
 	rcu_read_unlock();

 	return page;
@@ -777,12 +782,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
-		/*
-		 * this can only trigger if nr_found == 1, making livelock
-		 * a non issue.
-		 */
-		if (unlikely(page == RADIX_TREE_RETRY))
+		if (radix_tree_deref_retry(page)) {
+			if (ret)
+				start = pages[ret-1]->index;
 			goto restart;
+		}

 		if (!page_cache_get_speculative(page))
 			goto repeat;
@@ -830,11 +834,7 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
-		/*
-		 * this can only trigger if nr_found == 1, making livelock
-		 * a non issue.
-		 */
-		if (unlikely(page == RADIX_TREE_RETRY))
+		if (radix_tree_deref_retry(page))
 			goto restart;

 		if (page->mapping == NULL || page->index != index)
@@ -887,11 +887,7 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
-		/*
-		 * this can only trigger if nr_found == 1, making livelock
-		 * a non issue.
-		 */
-		if (unlikely(page == RADIX_TREE_RETRY))
+		if (radix_tree_deref_retry(page))
 			goto restart;

 		if (!page_cache_get_speculative(page))
@@ -1029,6 +1025,9 @@ find_page:
 				goto page_not_up_to_date;
 			if (!trylock_page(page))
 				goto page_not_up_to_date;
+			/* Did it get truncated before we got the lock? */
+			if (!page->mapping)
+				goto page_not_up_to_date_locked;
 			if (!mapping->a_ops->is_partially_uptodate(page,
 								desc, offset))
 				goto page_not_up_to_date_locked;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c4a3558589ab..85855240933d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2738,7 +2738,8 @@ out_page_table_lock:
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
 	}
-	unlock_page(page);
+	if (page != pagecache_page)
+		unlock_page(page);

 out_mutex:
 	mutex_unlock(&hugetlb_instantiation_mutex);
diff --git a/mm/ksm.c b/mm/ksm.c
index 65ab5c7067d9..43bc893470b4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1724,8 +1724,13 @@ static int ksm_memory_callback(struct notifier_block *self,
 		/*
 		 * Keep it very simple for now: just lock out ksmd and
 		 * MADV_UNMERGEABLE while any memory is going offline.
+		 * mutex_lock_nested() is necessary because lockdep was alarmed
+		 * that here we take ksm_thread_mutex inside notifier chain
+		 * mutex, and later take notifier chain mutex inside
+		 * ksm_thread_mutex to unlock it. But that's safe because both
+		 * are inside mem_hotplug_mutex.
 		 */
-		mutex_lock(&ksm_thread_mutex);
+		mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
 		break;

 	case MEM_OFFLINE:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9a99cfaf0a19..00bb8a64d028 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
-static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+
+/* for remember boot option*/
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata = 0;
+#endif
+
 #else
 #define do_swap_account		(0)
 #endif
@@ -278,13 +285,14 @@ enum move_type {

 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
-	spinlock_t	  lock; /* for from, to, moving_task */
+	spinlock_t	  lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task;	/* a task moving charges */
+	struct mm_struct *mm;
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -1917,19 +1925,18 @@ again:

 	rcu_read_lock();
 	p = rcu_dereference(mm->owner);
-	VM_BUG_ON(!p);
 	/*
-	 * because we don't have task_lock(), "p" can exit while
-	 * we're here. In that case, "mem" can point to root
-	 * cgroup but never be NULL. (and task_struct itself is freed
-	 * by RCU, cgroup itself is RCU safe.) Then, we have small
-	 * risk here to get wrong cgroup. But such kind of mis-account
-	 * by race always happens because we don't have cgroup_mutex().
-	 * It's overkill and we allow that small race, here.
+	 * Because we don't have task_lock(), "p" can exit.
+	 * In that case, "mem" can point to root or p can be NULL with
+	 * race with swapoff. Then, we have small risk of mis-accouning.
+	 * But such kind of mis-account by race always happens because
+	 * we don't have cgroup_mutex(). It's overkill and we allo that
+	 * small race, here.
+	 * (*) swapoff at el will charge against mm-struct not against
+	 * task-struct. So, mm->owner can be NULL.
 	 */
 	mem = mem_cgroup_from_task(p);
-	VM_BUG_ON(!mem);
-	if (mem_cgroup_is_root(mem)) {
+	if (!mem || mem_cgroup_is_root(mem)) {
 		rcu_read_unlock();
 		goto done;
 	}
@@ -2152,7 +2159,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 {
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
-	VM_BUG_ON(!PageCgroupLocked(pc));
+	VM_BUG_ON(!page_is_cgroup_locked(pc));
 	VM_BUG_ON(!PageCgroupUsed(pc));
 	VM_BUG_ON(pc->mem_cgroup != from);

@@ -4208,15 +4215,17 @@ static struct mem_cgroup *mem_cgroup_alloc(void)

 	memset(mem, 0, size);
 	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
-	if (!mem->stat) {
-		if (size < PAGE_SIZE)
-			kfree(mem);
-		else
-			vfree(mem);
-		mem = NULL;
-	}
+	if (!mem->stat)
+		goto out_free;
 	spin_lock_init(&mem->pcp_counter_lock);
 	return mem;
+
+out_free:
+	if (size < PAGE_SIZE)
+		kfree(mem);
+	else
+		vfree(mem);
+	return NULL;
 }

 /*
@@ -4629,7 +4638,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 	unsigned long precharge;
 	struct vm_area_struct *vma;

-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4641,7 +4650,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 		walk_page_range(vma->vm_start, vma->vm_end,
 						&mem_cgroup_count_precharge_walk);
 	}
-	up_read(&mm->mmap_sem);

 	precharge = mc.precharge;
 	mc.precharge = 0;
@@ -4692,11 +4700,16 @@ static void mem_cgroup_clear_mc(void)

 		mc.moved_swap = 0;
 	}
+	if (mc.mm) {
+		up_read(&mc.mm->mmap_sem);
+		mmput(mc.mm);
+	}
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
-	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mc.moving_task = NULL;
+	mc.mm = NULL;
 	mem_cgroup_end_move(from);
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
@@ -4722,12 +4735,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 		return 0;
 	/* We move charges only when we move a owner of the mm */
 	if (mm->owner == p) {
+		/*
+		 * We do all the move charge works under one mmap_sem to
+		 * avoid deadlock with down_write(&mmap_sem)
+		 * -> try_charge() -> if (mc.moving_task) -> sleep.
+		 */
+		down_read(&mm->mmap_sem);
+
 		VM_BUG_ON(mc.from);
 		VM_BUG_ON(mc.to);
 		VM_BUG_ON(mc.precharge);
 		VM_BUG_ON(mc.moved_charge);
 		VM_BUG_ON(mc.moved_swap);
 		VM_BUG_ON(mc.moving_task);
+		VM_BUG_ON(mc.mm);
+
 		mem_cgroup_start_move(from);
 		spin_lock(&mc.lock);
 		mc.from = from;
@@ -4735,14 +4757,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 		mc.precharge = 0;
 		mc.moved_charge = 0;
 		mc.moved_swap = 0;
-		mc.moving_task = current;
 		spin_unlock(&mc.lock);
+		mc.moving_task = current;
+		mc.mm = mm;

 		ret = mem_cgroup_precharge_mc(mm);
 		if (ret)
 			mem_cgroup_clear_mc();
-		}
-		mmput(mm);
+			/* We call up_read() and mmput() in clear_mc(). */
+		} else
+			mmput(mm);
 	}
 	return ret;
 }
@@ -4830,7 +4854,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 	struct vm_area_struct *vma;

 	lru_add_drain_all();
-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4849,7 +4873,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 			 */
 			break;
 	}
-	up_read(&mm->mmap_sem);
 }

 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4858,17 +4881,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
-	struct mm_struct *mm;
-
-	if (!mc.to)
+	if (!mc.mm)
 		/* no need to move charge */
 		return;

-	mm = get_task_mm(p);
-	if (mm) {
-		mem_cgroup_move_charge(mm);
-		mmput(mm);
-	}
+	mem_cgroup_move_charge(mc.mm);
 	mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
@@ -4909,10 +4926,20 @@ struct cgroup_subsys mem_cgroup_subsys = {
 };

 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static int __init enable_swap_account(char *s)
+{
+	/* consider enabled if no parameter or 1 is given */
+	if (!s || !strcmp(s, "1"))
+		really_do_swap_account = 1;
+	else if (!strcmp(s, "0"))
+		really_do_swap_account = 0;
+	return 1;
+}
+__setup("swapaccount", enable_swap_account);

 static int __init disable_swap_account(char *s)
 {
-	really_do_swap_account = 0;
+	enable_swap_account("0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 124324134ff6..46ab2c044b0e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -51,6 +51,7 @@
 #include <linux/slab.h>
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
 #include "internal.h"

 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1230,11 +1231,10 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 		return 1;

 	/*
-	 * The lock_system_sleep prevents a race with memory hotplug,
-	 * because the isolation assumes there's only a single user.
+	 * The lock_memory_hotplug prevents a race with memory hotplug.
 	 * This is a big hammer, a better would be nicer.
 	 */
-	lock_system_sleep();
+	lock_memory_hotplug();

 	/*
 	 * Isolate the page, so that it doesn't get reallocated if it
@@ -1264,7 +1264,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 		ret = 1;
 	}
 	unset_migratetype_isolate(p);
-	unlock_system_sleep();
+	unlock_memory_hotplug();
 	return ret;
 }

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9260314a221e..2c6523af5473 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,23 @@

 #include "internal.h"

+DEFINE_MUTEX(mem_hotplug_mutex);
+
+void lock_memory_hotplug(void)
+{
+	mutex_lock(&mem_hotplug_mutex);
+
+	/* for exclusive hibernation if CONFIG_HIBERNATION=y */
+	lock_system_sleep();
+}
+
+void unlock_memory_hotplug(void)
+{
+	unlock_system_sleep();
+	mutex_unlock(&mem_hotplug_mutex);
+}
+
+
 /* add this memory to iomem resource */
 static struct resource *register_memory_resource(u64 start, u64 size)
 {
@@ -493,7 +510,7 @@ int mem_online_node(int nid)
 	pg_data_t *pgdat;
 	int ret;

-	lock_system_sleep();
+	lock_memory_hotplug();
 	pgdat = hotadd_new_pgdat(nid, 0);
 	if (pgdat) {
 		ret = -ENOMEM;
@@ -504,7 +521,7 @@ int mem_online_node(int nid)
 	BUG_ON(ret);

 out:
-	unlock_system_sleep();
+	unlock_memory_hotplug();
 	return ret;
 }

@@ -516,7 +533,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	struct resource *res;
 	int ret;

-	lock_system_sleep();
+	lock_memory_hotplug();

 	res = register_memory_resource(start, size);
 	ret = -EEXIST;
@@ -563,7 +580,7 @@ error:
 	release_memory_resource(res);

 out:
-	unlock_system_sleep();
+	unlock_memory_hotplug();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory);
@@ -791,7 +808,7 @@ static int offline_pages(unsigned long start_pfn,
 	if (!test_pages_in_a_zone(start_pfn, end_pfn))
 		return -EINVAL;

-	lock_system_sleep();
+	lock_memory_hotplug();

 	zone = page_zone(pfn_to_page(start_pfn));
 	node = zone_to_nid(zone);
@@ -880,7 +897,7 @@ repeat:
 	writeback_set_ratelimit();

 	memory_notify(MEM_OFFLINE, &arg);
-	unlock_system_sleep();
+	unlock_memory_hotplug();
 	return 0;

 failed_removal:
@@ -891,7 +908,7 @@ failed_removal:
 	undo_isolate_page_range(start_pfn, end_pfn);

 out:
-	unlock_system_sleep();
+	unlock_memory_hotplug();
 	return ret;
 }

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4a57f135b76e..11ff260fb282 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1307,15 +1307,18 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
 		goto out;

 	/* Find the mm_struct */
+	rcu_read_lock();
 	read_lock(&tasklist_lock);
 	task = pid ? find_task_by_vpid(pid) : current;
 	if (!task) {
 		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		err = -ESRCH;
 		goto out;
 	}
 	mm = get_task_mm(task);
 	read_unlock(&tasklist_lock);
+	rcu_read_unlock();

 	err = -EINVAL;
 	if (!mm)
diff --git a/mm/migrate.c b/mm/migrate.c
index fe5a3c6a5426..6ae8a66a7045 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,6 +35,8 @@
 #include <linux/hugetlb.h>
 #include <linux/gfp.h>

+#include <asm/tlbflush.h>
+
 #include "internal.h"

 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
diff --git a/mm/mmap.c b/mm/mmap.c
index b179abb1474a..50a4aa0255a0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2462,6 +2462,7 @@ int install_special_mapping(struct mm_struct *mm,
 			    unsigned long addr, unsigned long len,
 			    unsigned long vm_flags, struct page **pages)
 {
+	int ret;
 	struct vm_area_struct *vma;

 	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2479,16 +2480,23 @@ int install_special_mapping(struct mm_struct *mm,
 	vma->vm_ops = &special_mapping_vmops;
 	vma->vm_private_data = pages;

-	if (unlikely(insert_vm_struct(mm, vma))) {
-		kmem_cache_free(vm_area_cachep, vma);
-		return -ENOMEM;
-	}
+	ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+	if (ret)
+		goto out;
+
+	ret = insert_vm_struct(mm, vma);
+	if (ret)
+		goto out;

 	mm->total_vm += len >> PAGE_SHIFT;

 	perf_event_mmap(vma);

 	return 0;
+
+out:
+	kmem_cache_free(vm_area_cachep, vma);
+	return ret;
 }

 static DEFINE_MUTEX(mm_all_locks_mutex);
diff --git a/mm/nommu.c b/mm/nommu.c
index 3613517c7592..ef4045d010d5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
  *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
  *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
  *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
- *  Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
+ *  Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
  */

 #include <linux/module.h>
@@ -328,6 +328,7 @@ void *vmalloc_node(unsigned long size, int node)
 {
 	return vmalloc(size);
 }
+EXPORT_SYMBOL(vmalloc_node);

 /**
  * vzalloc_node - allocate memory on a specific node with zero fill
@@ -440,6 +441,31 @@ void __attribute__((weak)) vmalloc_sync_all(void)
 {
 }

+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size:	size of the area
+ *
+ * Returns:	NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range.  No actual mappings
+ * are created.  If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+	BUG();
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+	BUG();
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 		   struct page *page)
 {
@@ -1717,6 +1743,7 @@ void exit_mmap(struct mm_struct *mm)
 		mm->mmap = vma->vm_next;
 		delete_vma_from_mm(vma);
 		delete_vma(mm, vma);
+		cond_resched();
 	}

 	kleave("");
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b840afa89761..b4edfe7ce06c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -563,7 +563,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 			break;		/* We've done our duty */
 		}
 		trace_wbc_balance_dirty_wait(&wbc, bdi);
-		__set_current_state(TASK_INTERRUPTIBLE);
+		__set_current_state(TASK_UNINTERRUPTIBLE);
 		io_schedule_timeout(pause);

 		/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07a654486f75..ff7e15872398 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  * only be modified with pm_mutex held, unless the suspend/hibernate code is
  * guaranteed not to run in parallel with that modification).
  */
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
 {
 	WARN_ON(!mutex_is_locked(&pm_mutex));
-	gfp_allowed_mask = mask;
+	if (saved_gfp_mask) {
+		gfp_allowed_mask = saved_gfp_mask;
+		saved_gfp_mask = 0;
+	}
 }

-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
 {
-	gfp_t ret = gfp_allowed_mask;
-
 	WARN_ON(!mutex_is_locked(&pm_mutex));
-	gfp_allowed_mask &= ~mask;
-	return ret;
+	WARN_ON(saved_gfp_mask);
+	saved_gfp_mask = gfp_allowed_mask;
+	gfp_allowed_mask &= ~GFP_IOFS;
 }
 #endif /* CONFIG_PM_SLEEP */

@@ -3008,14 +3013,6 @@ static __init_refok int __build_all_zonelists(void *data)
 		build_zonelist_cache(pgdat);
 	}

-#ifdef CONFIG_MEMORY_HOTPLUG
-	/* Setup real pagesets for the new zone */
-	if (data) {
-		struct zone *zone = data;
-		setup_zone_pageset(zone);
-	}
-#endif
-
 	/*
 	 * Initialize the boot_pagesets that are going to be used
 	 * for bootstrapping processors. The real pagesets for
@@ -3064,7 +3061,11 @@ void build_all_zonelists(void *data)
 	} else {
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
-		stop_machine(__build_all_zonelists, data, NULL);
+#ifdef CONFIG_MEMORY_HOTPLUG
+		if (data)
+			setup_zone_pageset((struct zone *)data);
+#endif
+		stop_machine(__build_all_zonelists, NULL, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8b1a2ce21ee5..38cc58b8b2b0 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -139,7 +139,6 @@ int walk_page_range(unsigned long addr, unsigned long end,
 	pgd_t *pgd;
 	unsigned long next;
 	int err = 0;
-	struct vm_area_struct *vma;

 	if (addr >= end)
 		return err;
@@ -149,15 +148,17 @@ int walk_page_range(unsigned long addr, unsigned long end,

 	pgd = pgd_offset(walk->mm, addr);
 	do {
+		struct vm_area_struct *uninitialized_var(vma);
+
 		next = pgd_addr_end(addr, end);

+#ifdef CONFIG_HUGETLB_PAGE
 		/*
 		 * handle hugetlb vma individually because pagetable walk for
 		 * the hugetlb page is dependent on the architecture and
 		 * we can't handled it in the same manner as non-huge pages.
 		 */
 		vma = find_vma(walk->mm, addr);
-#ifdef CONFIG_HUGETLB_PAGE
 		if (vma && is_vm_hugetlb_page(vma)) {
 			if (vma->vm_end < next)
 				next = vma->vm_end;
diff --git a/mm/percpu.c b/mm/percpu.c
index efe816856a9d..3dd4984bdef8 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size)

 	if (size <= PAGE_SIZE)
 		return kzalloc(size, GFP_KERNEL);
-	else {
-		void *ptr = vmalloc(size);
-		if (ptr)
-			memset(ptr, 0, size);
-		return ptr;
-	}
+	else
+		return vzalloc(size);
 }

 /**
@@ -1268,7 +1264,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,

 	/* we're done parsing the input, undefine BUG macro and dump config */
 #undef PCPU_SETUP_BUG_ON
-	pcpu_dump_alloc_info(KERN_INFO, ai);
+	pcpu_dump_alloc_info(KERN_DEBUG, ai);

 	pcpu_nr_groups = ai->nr_groups;
 	pcpu_group_offsets = group_offsets;
diff --git a/mm/shmem.c b/mm/shmem.c
index 47fdeeb9d636..5ee67c990602 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
 	return &p->vfs_inode;
 }

+static void shmem_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+}
+
 static void shmem_destroy_inode(struct inode *inode)
 {
 	if ((inode->i_mode & S_IFMT) == S_IFREG) {
 		/* only struct inode is valid if it's an inline symlink */
 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
 	}
-	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+	call_rcu(&inode->i_rcu, shmem_i_callback);
 }

 static void init_once(void *foo)
diff --git a/mm/slab.c b/mm/slab.c
index b1e40dafbab3..264037449f08 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -829,12 +829,12 @@ static void init_reap_node(int cpu)

 static void next_reap_node(void)
 {
-	int node = __get_cpu_var(slab_reap_node);
+	int node = __this_cpu_read(slab_reap_node);

 	node = next_node(node, node_online_map);
 	if (unlikely(node >= MAX_NUMNODES))
 		node = first_node(node_online_map);
-	__get_cpu_var(slab_reap_node) = node;
+	__this_cpu_write(slab_reap_node, node);
 }

 #else
@@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
  */
 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
 {
-	int node = __get_cpu_var(slab_reap_node);
+	int node = __this_cpu_read(slab_reap_node);

 	if (l3->alien) {
 		struct array_cache *ac = l3->alien[node];
@@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		 * anything expensive but will only modify reap_work
 		 * and reschedule the timer.
 		 */
-		cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
+		cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
 		/* Now the cache_reaper is guaranteed to be not running. */
 		per_cpu(slab_reap_work, cpu).work.func = NULL;
 		break;
@@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
 /*
  * Map pages beginning at addr to the given cache and slab. This is required
  * for the slab allocator to be able to lookup the cache and slab of a
- * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ * virtual address for kfree, ksize, and slab debugging.
  */
 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
 			   void *addr)
@@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 EXPORT_SYMBOL(kmem_cache_alloc);

 #ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
+void *
+kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
 {
-	return __cache_alloc(cachep, flags, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
+	void *ret;

-/**
- * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
- * @cachep: the cache we're checking against
- * @ptr: pointer to validate
- *
- * This verifies that the untrusted pointer looks sane;
- * it is _not_ a guarantee that the pointer is actually
- * part of the slab cache in question, but it at least
- * validates that the pointer can be dereferenced and
- * looks half-way sane.
- *
- * Currently only used for dentry validation.
- */
-int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
-{
-	unsigned long size = cachep->buffer_size;
-	struct page *page;
+	ret = __cache_alloc(cachep, flags, __builtin_return_address(0));

-	if (unlikely(!kern_ptr_validate(ptr, size)))
-		goto out;
-	page = virt_to_page(ptr);
-	if (unlikely(!PageSlab(page)))
-		goto out;
-	if (unlikely(page_get_cache(page) != cachep))
-		goto out;
-	return 1;
-out:
-	return 0;
+	trace_kmalloc(_RET_IP_, ret,
+		      size, slab_buffer_size(cachep), flags);
+	return ret;
 }
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif

 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 EXPORT_SYMBOL(kmem_cache_alloc_node);

 #ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
-				    gfp_t flags,
-				    int nodeid)
+void *kmem_cache_alloc_node_trace(size_t size,
+				  struct kmem_cache *cachep,
+				  gfp_t flags,
+				  int nodeid)
 {
-	return __cache_alloc_node(cachep, flags, nodeid,
+	void *ret;
+
+	ret = __cache_alloc_node(cachep, flags, nodeid,
 				  __builtin_return_address(0));
+	trace_kmalloc_node(_RET_IP_, ret,
+			   size, slab_buffer_size(cachep),
+			   flags, nodeid);
+	return ret;
 }
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
 #endif

 static __always_inline void *
 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
 {
 	struct kmem_cache *cachep;
-	void *ret;

 	cachep = kmem_find_general_cachep(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
-
-	trace_kmalloc_node((unsigned long) caller, ret,
-			   size, cachep->buffer_size, flags, node);
-
-	return ret;
+	return kmem_cache_alloc_node_trace(size, cachep, flags, node);
 }

 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
diff --git a/mm/slob.c b/mm/slob.c
index 617b6d6c42c7..3588eaaef726 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -678,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d)
 }
 EXPORT_SYMBOL(kmem_cache_shrink);

-int kmem_ptr_validate(struct kmem_cache *a, const void *b)
-{
-	return 0;
-}
-
 static unsigned int slob_ready __read_mostly;

 int slab_is_available(void)
diff --git a/mm/slub.c b/mm/slub.c
index 8fd5401bb071..008cd743a36a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
 #include <linux/math64.h>
 #include <linux/fault-inject.h>

+#include <trace/events/kmem.h>
+
 /*
  * Lock order:
  *   1. slab_lock(page)
@@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 EXPORT_SYMBOL(kmem_cache_alloc);

 #ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
+void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+	void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+	return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
 {
-	return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+	void *ret = kmalloc_order(size, flags, order);
+	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+	return ret;
 }
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
+EXPORT_SYMBOL(kmalloc_order_trace);
 #endif

 #ifdef CONFIG_NUMA
@@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 EXPORT_SYMBOL(kmem_cache_alloc_node);

 #ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
-				    gfp_t gfpflags,
-				    int node)
+void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+				    gfp_t gfpflags,
+				    int node, size_t size)
 {
-	return slab_alloc(s, gfpflags, node, _RET_IP_);
+	void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+
+	trace_kmalloc_node(_RET_IP_, ret,
+			   size, s->size, gfpflags, node);
+	return ret;
 }
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
 #endif
 #endif

@@ -1917,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
 }
 EXPORT_SYMBOL(kmem_cache_free);

-/* Figure out on which slab page the object resides */
-static struct page *get_object_page(const void *x)
-{
-	struct page *page = virt_to_head_page(x);
-
-	if (!PageSlab(page))
-		return NULL;
-
-	return page;
-}
-
 /*
  * Object placement in a slab is made very easy because we always start at
  * offset 0. If we tune the size of the object to the alignment then we can
@@ -2386,35 +2391,6 @@ error:
 }

 /*
- * Check if a given pointer is valid
- */
-int kmem_ptr_validate(struct kmem_cache *s, const void *object)
-{
-	struct page *page;
-
-	if (!kern_ptr_validate(object, s->size))
-		return 0;
-
-	page = get_object_page(object);
-
-	if (!page || s != page->slab)
-		/* No slab or wrong slab */
-		return 0;
-
-	if (!check_valid_pointer(s, page, object))
-		return 0;
-
-	/*
-	 * We could also check if the object is on the slabs freelist.
-	 * But this would be too expensive and it seems that the main
-	 * purpose of kmem_ptr_valid() is to check if the object belongs
-	 * to a certain slab.
-	 */
-	return 1;
-}
-EXPORT_SYMBOL(kmem_ptr_validate);
-
-/*
  * Determine the size of a slab object
  */
 unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -3273,9 +3249,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 		kfree(n);
 		kfree(s);
 	}
+err:
 	up_write(&slub_lock);

-err:
 	if (flags & SLAB_PANIC)
 		panic("Cannot create slabcache %s\n", name);
 	else
@@ -3401,13 +3377,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page,

 	for_each_free_object(p, s, page->freelist) {
 		set_bit(slab_index(p, s, addr), map);
-		if (!check_object(s, page, p, 0))
+		if (!check_object(s, page, p, SLUB_RED_INACTIVE))
 			return 0;
 	}

 	for_each_object(p, s, addr, page->objects)
 		if (!test_bit(slab_index(p, s, addr), map))
-			if (!check_object(s, page, p, 1))
+			if (!check_object(s, page, p, SLUB_RED_ACTIVE))
 				return 0;
 	return 1;
 }
@@ -3862,6 +3838,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
 		x += sprintf(buf + x, " N%d=%lu",
 				node, nodes[node]);
 #endif
+	up_read(&slub_lock);
 	kfree(nodes);
 	return x + sprintf(buf + x, "\n");
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index ba887bff48c5..3c2d5ddfa0d4 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	__remove_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);
+
+	if (mapping->a_ops->freepage)
+		mapping->a_ops->freepage(page);
+
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
 failed:
diff --git a/mm/util.c b/mm/util.c
index 73dac81e9f78..f126975ef23e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,27 +186,6 @@ void kzfree(const void *p)
 }
 EXPORT_SYMBOL(kzfree);

-int kern_ptr_validate(const void *ptr, unsigned long size)
-{
-	unsigned long addr = (unsigned long)ptr;
-	unsigned long min_addr = PAGE_OFFSET;
-	unsigned long align_mask = sizeof(void *) - 1;
-
-	if (unlikely(addr < min_addr))
-		goto out;
-	if (unlikely(addr > (unsigned long)high_memory - size))
-		goto out;
-	if (unlikely(addr & align_mask))
-		goto out;
-	if (unlikely(!kern_addr_valid(addr)))
-		goto out;
-	if (unlikely(!kern_addr_valid(addr + size - 1)))
-		goto out;
-	return 1;
-out:
-	return 0;
-}
-
 /*
  * strndup_user - duplicate an existing string from user space
  * @s: The string to duplicate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3d66b3dc5cb..eb5cc7d00c5a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,8 +31,6 @@
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>

-bool vmap_lazy_unmap __read_mostly = true;
-
 /*** Page table manipulation functions ***/

 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
@@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void)
 {
 	unsigned int log;

-	if (!vmap_lazy_unmap)
-		return 0;
-
 	log = fls(num_online_cpus());

 	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
@@ -566,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 		if (va->va_end > *end)
 			*end = va->va_end;
 		nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
-		unmap_vmap_area(va);
 		list_add_tail(&va->purge_list, &valist);
 		va->flags |= VM_LAZY_FREEING;
 		va->flags &= ~VM_LAZY_FREE;
@@ -611,10 +605,11 @@ static void purge_vmap_area_lazy(void)
 }

 /*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
+ * Free a vmap area, caller ensuring that the area has been unmapped
+ * and flush_cache_vunmap had been called for the correct range
+ * previously.
  */
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+static void free_vmap_area_noflush(struct vmap_area *va)
 {
 	va->flags |= VM_LAZY_FREE;
 	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
@@ -623,6 +618,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
 }

 /*
+ * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
+ * called for the correct range previously.
+ */
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+{
+	unmap_vmap_area(va);
+	free_vmap_area_noflush(va);
+}
+
+/*
  * Free and unmap a vmap area
  */
 static void free_unmap_vmap_area(struct vmap_area *va)
@@ -798,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb)
 	spin_unlock(&vmap_block_tree_lock);
 	BUG_ON(tmp != vb);

-	free_unmap_vmap_area_noflush(vb->va);
+	free_vmap_area_noflush(vb->va);
 	call_rcu(&vb->rcu_head, rcu_free_vb);
 }

@@ -936,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size)
 	rcu_read_unlock();
 	BUG_ON(!vb);

+	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+
 	spin_lock(&vb->lock);
 	BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));

@@ -988,7 +995,6 @@ void vm_unmap_aliases(void)

 			s = vb->va->va_start + (i << PAGE_SHIFT);
 			e = vb->va->va_start + (j << PAGE_SHIFT);
-			vunmap_page_range(s, e);
 			flush = 1;

 			if (s < start)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b8a6fdc21312..9ca587c69274 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -494,9 +494,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 		spin_unlock_irq(&mapping->tree_lock);
 		swapcache_free(swap, page);
 	} else {
+		void (*freepage)(struct page *);
+
+		freepage = mapping->a_ops->freepage;
+
 		__remove_from_page_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
 		mem_cgroup_uncharge_cache_page(page);
+
+		if (freepage != NULL)
+			freepage(page);
 	}

 	return 1;
@@ -913,7 +920,7 @@ keep_lumpy:
 	 * back off and wait for congestion to clear because further reclaim
 	 * will encounter the same problem
 	 */
-	if (nr_dirty == nr_congested)
+	if (nr_dirty == nr_congested && nr_dirty != 0)
 		zone_set_flag(zone, ZONE_CONGESTED);

 	free_page_list(&free_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 42eac4d33216..312d728976f1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -167,36 +167,24 @@ static void refresh_zone_stat_thresholds(void)
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 				int delta)
 {
-	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-
-	s8 *p = pcp->vm_stat_diff + item;
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
 	long x;
+	long t;
+
+	x = delta + __this_cpu_read(*p);

-	x = delta + *p;
+	t = __this_cpu_read(pcp->stat_threshold);

-	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+	if (unlikely(x > t || x < -t)) {
 		zone_page_state_add(x, zone, item);
 		x = 0;
 	}
-	*p = x;
+	__this_cpu_write(*p, x);
 }
 EXPORT_SYMBOL(__mod_zone_page_state);

 /*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
-					int delta)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__mod_zone_page_state(zone, item, delta);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
  * Optimized increment and decrement functions.
  *
  * These are only for a single page and therefore can take a struct page *
@@ -221,16 +209,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
  */
 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-	s8 *p = pcp->vm_stat_diff + item;
-
-	(*p)++;
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	s8 v, t;

-	if (unlikely(*p > pcp->stat_threshold)) {
-		int overstep = pcp->stat_threshold / 2;
+	v = __this_cpu_inc_return(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v > t)) {
+		s8 overstep = t >> 1;

-		zone_page_state_add(*p + overstep, zone, item);
-		*p = -overstep;
+		zone_page_state_add(v + overstep, zone, item);
+		__this_cpu_write(*p, -overstep);
 	}
 }

@@ -242,16 +231,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);

 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-	s8 *p = pcp->vm_stat_diff + item;
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	s8 v, t;

-	(*p)--;
+	v = __this_cpu_dec_return(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v < - t)) {
+		s8 overstep = t >> 1;

-	if (unlikely(*p < - pcp->stat_threshold)) {
-		int overstep = pcp->stat_threshold / 2;
-
-		zone_page_state_add(*p - overstep, zone, item);
-		*p = overstep;
+		zone_page_state_add(v - overstep, zone, item);
+		__this_cpu_write(*p, overstep);
 	}
 }

@@ -261,6 +251,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 }
 EXPORT_SYMBOL(__dec_zone_page_state);

+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should handled:
+ *     0       No overstepping
+ *     1       Overstepping half of threshold
+ *     -1      Overstepping minus half of threshold
+*/
+static inline void mod_state(struct zone *zone,
+       enum zone_stat_item item, int delta, int overstep_mode)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	long o, n, t, z;
+
+	do {
+		z = 0;  /* overflow to zone counters */
+
+		/*
+		 * The fetching of the stat_threshold is racy. We may apply
+		 * a counter threshold to the wrong the cpu if we get
+		 * rescheduled while executing here. However, the following
+		 * will apply the threshold again and therefore bring the
+		 * counter under the threshold.
+		 */
+		t = this_cpu_read(pcp->stat_threshold);
+
+		o = this_cpu_read(*p);
+		n = delta + o;
+
+		if (n > t || n < -t) {
+			int os = overstep_mode * (t >> 1) ;
+
+			/* Overflow must be added to zone counters */
+			z = n + os;
+			n = -os;
+		}
+	} while (this_cpu_cmpxchg(*p, o, n) != o);
+
+	if (z)
+		zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__mod_zone_page_state(zone, item, delta);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
 void inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
 	unsigned long flags;
@@ -291,6 +367,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
+#endif

 /*
  * Update the zone counters for one cpu.
@@ -750,8 +827,6 @@ static const char * const vmstat_text[] = {
750 "nr_shmem", 827 "nr_shmem",
751 "nr_dirtied", 828 "nr_dirtied",
752 "nr_written", 829 "nr_written",
753 "nr_dirty_threshold",
754 "nr_dirty_background_threshold",
755 830
756#ifdef CONFIG_NUMA 831#ifdef CONFIG_NUMA
757 "numa_hit", 832 "numa_hit",
@@ -761,6 +836,8 @@ static const char * const vmstat_text[] = {
761 "numa_local", 836 "numa_local",
762 "numa_other", 837 "numa_other",
763#endif 838#endif
839 "nr_dirty_threshold",
840 "nr_dirty_background_threshold",
764 841
765#ifdef CONFIG_VM_EVENT_COUNTERS 842#ifdef CONFIG_VM_EVENT_COUNTERS
766 "pgpgin", 843 "pgpgin",
@@ -1033,7 +1110,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
-		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
 		per_cpu(vmstat_work, cpu).work.func = NULL;
 		break;
 	case CPU_DOWN_FAILED: