Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig       |  6
-rw-r--r--  mm/hugetlb.c     |  2
-rw-r--r--  mm/kmemleak.c    |  4
-rw-r--r--  mm/memcontrol.c  | 23
-rw-r--r--  mm/mempolicy.c   | 84
-rw-r--r--  mm/mempool.c     |  4
-rw-r--r--  mm/mmap.c        |  3
-rw-r--r--  mm/nommu.c       |  7
-rw-r--r--  mm/oom_kill.c    | 64
-rw-r--r--  mm/page_alloc.c  | 25
-rw-r--r--  mm/percpu.c      | 35
-rw-r--r--  mm/rmap.c        |  1
-rw-r--r--  mm/swapfile.c    |  4
-rw-r--r--  mm/vmscan.c      |  9
14 files changed, 176 insertions, 95 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c948d4ca8bde..fe5f674d7a7d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR
	  For most ia64, ppc64 and x86 users with lots of address space
	  a value of 65536 is reasonable and should cause no problems.
	  On arm and other archs it should not be higher than 32768.
-	  Programs which use vm86 functionality would either need additional
-	  permissions from either the LSM or the capabilities module or have
-	  this protection disabled.
+	  Programs which use vm86 functionality or have some need to map
+	  this low address space will need CAP_SYS_RAWIO or disable this
+	  protection by setting the value to 0.
 
	  This value can be changed after boot using the
	  /proc/sys/vm/mmap_min_addr tunable.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0351e31f474..cafdcee154e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2370,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
 
 	spin_lock(&inode->i_lock);
-	inode->i_blocks -= blocks_per_huge_page(h);
+	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
 	hugetlb_put_quota(inode->i_mapping, (chg - freed));
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5aabd41ffb8f..487267310a84 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1217,7 +1217,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
 	}
 	object = NULL;
 out:
-	rcu_read_unlock();
 	return object;
 }
 
@@ -1233,13 +1232,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 	++(*pos);
 
-	rcu_read_lock();
 	list_for_each_continue_rcu(n, &object_list) {
 		next_obj = list_entry(n, struct kmemleak_object, object_list);
 		if (get_object(next_obj))
 			break;
 	}
-	rcu_read_unlock();
 
 	put_object(prev_obj);
 	return next_obj;
@@ -1255,6 +1252,7 @@ static void kmemleak_seq_stop(struct seq_file *seq, void *v)
 	 * kmemleak_seq_start may return ERR_PTR if the scan_mutex
 	 * waiting was interrupted, so only release it if !IS_ERR.
 	 */
+	rcu_read_unlock();
 	mutex_unlock(&scan_mutex);
 	if (v)
 		put_object(v);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e717964cb5a0..fd4529d86de5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	ret = 0;
 out:
 	unlock_page_cgroup(pc);
+	/*
+	 * We charges against "to" which may not have any tasks. Then, "to"
+	 * can be under rmdir(). But in current implementation, caller of
+	 * this function is just force_empty() and it's garanteed that
+	 * "to" is never removed. So, we don't check rmdir status here.
+	 */
 	return ret;
 }
 
@@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 		return;
 	if (!ptr)
 		return;
+	cgroup_exclude_rmdir(&ptr->css);
 	pc = lookup_page_cgroup(page);
 	mem_cgroup_lru_del_before_commit_swapcache(page);
 	__mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 		}
 		rcu_read_unlock();
 	}
-	/* add this page(page_cgroup) to the LRU we want. */
-
+	/*
+	 * At swapin, we may charge account against cgroup which has no tasks.
+	 * So, rmdir()->pre_destroy() can be called while we do this charge.
+	 * In that case, we need to call pre_destroy() again. check it here.
+	 */
+	cgroup_release_and_wakeup_rmdir(&ptr->css);
 }
 
 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
@@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 
 	if (!mem)
 		return;
-
+	cgroup_exclude_rmdir(&mem->css);
 	/* at migration success, oldpage->mapping is NULL. */
 	if (oldpage->mapping) {
 		target = oldpage;
@@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 	 */
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
 		mem_cgroup_uncharge_page(target);
+	/*
+	 * At migration, we may charge account against cgroup which has no tasks
+	 * So, rmdir()->pre_destroy() can be called while we do this charge.
+	 * In that case, we need to call pre_destroy() again. check it here.
+	 */
+	cgroup_release_and_wakeup_rmdir(&mem->css);
 }
 
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08e2c4da63a..7dd9d9f80694 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
  * Must be called holding task's alloc_lock to protect task's mems_allowed
  * and mempolicy. May also be called holding the mmap_semaphore for write.
  */
-static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_set_nodemask(struct mempolicy *pol,
+		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 {
-	nodemask_t cpuset_context_nmask;
 	int ret;
 
 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 	if (pol == NULL)
 		return 0;
+	/* Check N_HIGH_MEMORY */
+	nodes_and(nsc->mask1,
+		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
 
 	VM_BUG_ON(!nodes);
 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 		nodes = NULL;	/* explicit local allocation */
 	else {
 		if (pol->flags & MPOL_F_RELATIVE_NODES)
-			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
-					       &cpuset_current_mems_allowed);
+			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 		else
-			nodes_and(cpuset_context_nmask, *nodes,
-				  cpuset_current_mems_allowed);
+			nodes_and(nsc->mask2, *nodes, nsc->mask1);
+
 		if (mpol_store_user_nodemask(pol))
 			pol->w.user_nodemask = *nodes;
 		else
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 			cpuset_current_mems_allowed;
 	}
 
-	ret = mpol_ops[pol->mode].create(pol,
-			nodes ? &cpuset_context_nmask : NULL);
+	if (nodes)
+		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+	else
+		ret = mpol_ops[pol->mode].create(pol, NULL);
 	return ret;
 }
 
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 {
 	struct mempolicy *new, *old;
 	struct mm_struct *mm = current->mm;
+	NODEMASK_SCRATCH(scratch);
 	int ret;
 
-	new = mpol_new(mode, flags, nodes);
-	if (IS_ERR(new))
-		return PTR_ERR(new);
+	if (!scratch)
+		return -ENOMEM;
 
+	new = mpol_new(mode, flags, nodes);
+	if (IS_ERR(new)) {
+		ret = PTR_ERR(new);
+		goto out;
+	}
 	/*
 	 * prevent changing our mempolicy while show_numa_maps()
 	 * is using it.
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	if (mm)
 		down_write(&mm->mmap_sem);
 	task_lock(current);
-	ret = mpol_set_nodemask(new, nodes);
+	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
 		task_unlock(current);
 		if (mm)
 			up_write(&mm->mmap_sem);
 		mpol_put(new);
-		return ret;
+		goto out;
 	}
646 old = current->mempolicy; 655 old = current->mempolicy;
647 current->mempolicy = new; 656 current->mempolicy = new;
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 		up_write(&mm->mmap_sem);
 
 	mpol_put(old);
-	return 0;
+	ret = 0;
+out:
+	NODEMASK_SCRATCH_FREE(scratch);
+	return ret;
 }
 
 /*
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (err)
 			return err;
 	}
-	down_write(&mm->mmap_sem);
-	task_lock(current);
-	err = mpol_set_nodemask(new, nmask);
-	task_unlock(current);
+	{
+		NODEMASK_SCRATCH(scratch);
+		if (scratch) {
+			down_write(&mm->mmap_sem);
+			task_lock(current);
+			err = mpol_set_nodemask(new, nmask, scratch);
+			task_unlock(current);
+			if (err)
+				up_write(&mm->mmap_sem);
+		} else
+			err = -ENOMEM;
+		NODEMASK_SCRATCH_FREE(scratch);
+	}
 	if (err) {
-		up_write(&mm->mmap_sem);
 		mpol_put(new);
 		return err;
 	}
@@ -1891,6 +1911,7 @@ restart:
  * Install non-NULL @mpol in inode's shared policy rb-tree.
  * On entry, the current task has a reference on a non-NULL @mpol.
  * This must be released on exit.
+ * This is called at get_inode() calls and we can use GFP_KERNEL.
  */
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 {
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 	if (mpol) {
 		struct vm_area_struct pvma;
 		struct mempolicy *new;
+		NODEMASK_SCRATCH(scratch);
 
+		if (!scratch)
+			return;
 		/* contextualize the tmpfs mount point mempolicy */
 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
 		if (IS_ERR(new)) {
 			mpol_put(mpol);	/* drop our ref on sb mpol */
+			NODEMASK_SCRATCH_FREE(scratch);
 			return;		/* no valid nodemask intersection */
 		}
 
 		task_lock(current);
-		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
+		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
 		task_unlock(current);
 		mpol_put(mpol);	/* drop our ref on sb mpol */
 		if (ret) {
+			NODEMASK_SCRATCH_FREE(scratch);
 			mpol_put(new);
 			return;
 		}
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
 		mpol_put(new);			/* drop initial ref */
+		NODEMASK_SCRATCH_FREE(scratch);
 	}
 }
 
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			err = 1;
 		else {
 			int ret;
-
-			task_lock(current);
-			ret = mpol_set_nodemask(new, &nodes);
-			task_unlock(current);
-			if (ret)
+			NODEMASK_SCRATCH(scratch);
+			if (scratch) {
+				task_lock(current);
+				ret = mpol_set_nodemask(new, &nodes, scratch);
+				task_unlock(current);
+			} else
+				ret = -ENOMEM;
+			NODEMASK_SCRATCH_FREE(scratch);
+			if (ret) {
 				err = 1;
-			else if (no_context) {
+				mpol_put(new);
+			} else if (no_context) {
 				/* save for contextualization */
 				new->w.user_nodemask = nodes;
 			}
diff --git a/mm/mempool.c b/mm/mempool.c
index a46eb1b4bb66..32e75d400503 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab);
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
 {
-	size_t size = (size_t)(long)pool_data;
+	size_t size = (size_t)pool_data;
 	return kmalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kmalloc);
 
 void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
 {
-	size_t size = (size_t) pool_data;
+	size_t size = (size_t)pool_data;
 	return kzalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kzalloc);
diff --git a/mm/mmap.c b/mm/mmap.c
index 34579b23ebd5..8101de490c73 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 struct percpu_counter vm_committed_as;
 
-/* amount of vm to protect from userspace access */
-unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-
 /*
  * Check that a process has enough memory to allocate a new virtual
  * mapping. 0 means there is enough memory for the allocation to
diff --git a/mm/nommu.c b/mm/nommu.c
index 53cab10fece4..4bde489ec431 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 
-/* amount of vm to protect from userspace access */
-unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-
 atomic_long_t mmap_pages_allocated;
 
 EXPORT_SYMBOL(mem_map);
@@ -922,6 +919,10 @@ static int validate_mmap_request(struct file *file,
 		if (!file->f_op->read)
 			capabilities &= ~BDI_CAP_MAP_COPY;
 
+		/* The file shall have been opened with read permission. */
+		if (!(file->f_mode & FMODE_READ))
+			return -EACCES;
+
 		if (flags & MAP_SHARED) {
 			/* do checks for writing, appending and locking */
 			if ((prot & PROT_WRITE) &&
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 175a67a78a99..a7b2460e922b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
 	struct task_struct *child;
-	int oom_adj;
 
 	task_lock(p);
 	mm = p->mm;
@@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 		task_unlock(p);
 		return 0;
 	}
-	oom_adj = mm->oom_adj;
-	if (oom_adj == OOM_DISABLE) {
-		task_unlock(p);
-		return 0;
-	}
 
 	/*
 	 * The memory size of the process is the basis for the badness.
@@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 		points /= 8;
 
 	/*
-	 * Adjust the score by oom_adj.
+	 * Adjust the score by oomkilladj.
 	 */
-	if (oom_adj) {
-		if (oom_adj > 0) {
+	if (p->oomkilladj) {
+		if (p->oomkilladj > 0) {
 			if (!points)
 				points = 1;
-			points <<= oom_adj;
+			points <<= p->oomkilladj;
 		} else
-			points >>= -(oom_adj);
+			points >>= -(p->oomkilladj);
 	}
 
 #ifdef DEBUG
@@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
 			*ppoints = ULONG_MAX;
 		}
 
+		if (p->oomkilladj == OOM_DISABLE)
+			continue;
+
 		points = badness(p, uptime.tv_sec);
-		if (points > *ppoints) {
+		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
 		}
@@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
 		}
 		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
 		       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
-		       get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
+		       get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+		       p->comm);
 		task_unlock(p);
 	} while_each_thread(g, p);
 }
@@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
 		return;
 	}
 
-	if (!p->mm)
+	if (!p->mm) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill an mm-less task!\n");
 		return;
+	}
 
 	if (verbose)
 		printk(KERN_ERR "Killed process %d (%s)\n",
@@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p)
 	struct mm_struct *mm;
 	struct task_struct *g, *q;
 
-	task_lock(p);
 	mm = p->mm;
-	if (!mm || mm->oom_adj == OOM_DISABLE) {
-		task_unlock(p);
+
+	/* WARNING: mm may not be dereferenced since we did not obtain its
+	 * value from get_task_mm(p). This is OK since all we need to do is
+	 * compare mm to q->mm below.
+	 *
+	 * Furthermore, even if mm contains a non-NULL value, p->mm may
+	 * change to NULL at any time since we do not hold task_lock(p).
+	 * However, this is of no concern to us.
+	 */
+
+	if (mm == NULL)
 		return 1;
-	}
-	task_unlock(p);
+
+	/*
+	 * Don't kill the process if any threads are set to OOM_DISABLE
+	 */
+	do_each_thread(g, q) {
+		if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
+			return 1;
+	} while_each_thread(g, q);
+
 	__oom_kill_task(p, 1);
 
 	/*
@@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	struct task_struct *c;
 
 	if (printk_ratelimit()) {
-		task_lock(current);
 		printk(KERN_WARNING "%s invoked oom-killer: "
-			"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
-			current->comm, gfp_mask, order,
-			current->mm ? current->mm->oom_adj : OOM_DISABLE);
+			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+			current->comm, gfp_mask, order, current->oomkilladj);
+		task_lock(current);
 		cpuset_print_task_mems_allowed(current);
 		task_unlock(current);
 		dump_stack();
@@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
-	 * if its mm is still attached.
 	 */
-	if (p->mm && (p->flags & PF_EXITING)) {
+	if (p->flags & PF_EXITING) {
 		__oom_kill_task(p, 0);
 		return 0;
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index caa92689aac9..5cc986eb9f6f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -882,7 +882,7 @@ retry_reserve:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
-			int migratetype)
+			int migratetype, int cold)
 {
 	int i;
 
@@ -901,7 +901,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * merge IO requests if the physical pages are ordered
 		 * properly.
 		 */
-		list_add(&page->lru, list);
+		if (likely(cold == 0))
+			list_add(&page->lru, list);
+		else
+			list_add_tail(&page->lru, list);
 		set_page_private(page, migratetype);
 		list = &page->lru;
 	}
@@ -1119,7 +1122,8 @@ again:
 		local_irq_save(flags);
 		if (!pcp->count) {
 			pcp->count = rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
+					pcp->batch, &pcp->list,
+					migratetype, cold);
 			if (unlikely(!pcp->count))
 				goto failed;
 		}
@@ -1138,7 +1142,8 @@ again:
 		/* Allocate more to the pcp list if necessary */
 		if (unlikely(&page->lru == &pcp->list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
+					pcp->batch, &pcp->list,
+					migratetype, cold);
 			page = list_entry(pcp->list.next, struct page, lru);
 		}
 
@@ -1740,8 +1745,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * be using allocators in order of preference for an area that is
 	 * too large.
 	 */
-	if (WARN_ON_ONCE(order >= MAX_ORDER))
+	if (order >= MAX_ORDER) {
+		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
 		return NULL;
+	}
 
 	/*
 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1789,6 +1796,10 @@ rebalance:
 	if (p->flags & PF_MEMALLOC)
 		goto nopage;
 
+	/* Avoid allocations with no watermarks from looping endlessly */
+	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+		goto nopage;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -2533,7 +2544,6 @@ static void build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 
-	memset(node_load, 0, sizeof(node_load));
 	memset(node_order, 0, sizeof(node_order));
 	j = 0;
 
@@ -2642,6 +2652,9 @@ static int __build_all_zonelists(void *dummy)
 {
 	int nid;
 
+#ifdef CONFIG_NUMA
+	memset(node_load, 0, sizeof(node_load));
+#endif
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
 
diff --git a/mm/percpu.c b/mm/percpu.c
index b70f2acd8853..5fe37842e0ea 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,12 @@
  *
  * This is percpu allocator which can handle both static and dynamic
  * areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of num_possible_cpus() units and the first chunk
- * is used for static percpu variables in the kernel image (special
- * boot time alloc/init handling necessary as these areas need to be
- * brought up before allocation services are running). Unit grows as
- * necessary and all units grow or shrink in unison. When a chunk is
- * filled up, another chunk is allocated. ie. in vmalloc area
+ * chunk is consisted of nr_cpu_ids units and the first chunk is used
+ * for static percpu variables in the kernel image (special boot time
+ * alloc/init handling necessary as these areas need to be brought up
+ * before allocation services are running). Unit grows as necessary
+ * and all units grow or shrink in unison. When a chunk is filled up,
+ * another chunk is allocated. ie. in vmalloc area
  *
  * c0                 c1                 c2
  * -------------------  -------------------  ------------
@@ -558,7 +558,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 		       bool flush_tlb)
 {
-	unsigned int last = num_possible_cpus() - 1;
+	unsigned int last = nr_cpu_ids - 1;
 	unsigned int cpu;
 
 	/* unmap must not be done on immutable chunk */
@@ -643,7 +643,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
  */
 static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
+	unsigned int last = nr_cpu_ids - 1;
 	unsigned int cpu;
 	int err;
 
@@ -749,7 +749,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
 	chunk->page = chunk->page_ar;
 
-	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+	chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
 	if (!chunk->vm) {
 		free_pcpu_chunk(chunk);
 		return NULL;
@@ -1067,9 +1067,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				PFN_UP(size_sum));
 
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
-	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+	pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
-		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+		+ nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
 
 	if (dyn_size < 0)
 		dyn_size = pcpu_unit_size - static_size - reserved_size;
@@ -1248,7 +1248,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	} else
 		pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 
-	chunk_size = pcpue_unit_size * num_possible_cpus();
+	chunk_size = pcpue_unit_size * nr_cpu_ids;
 
 	pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 					    __pa(MAX_DMA_ADDRESS));
@@ -1259,12 +1259,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	}
 
 	/* return the leftover and copy */
-	for_each_possible_cpu(cpu) {
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
 		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
 
-		free_bootmem(__pa(ptr + pcpue_size),
-			     pcpue_unit_size - pcpue_size);
-		memcpy(ptr, __per_cpu_load, static_size);
+		if (cpu_possible(cpu)) {
+			free_bootmem(__pa(ptr + pcpue_size),
+				     pcpue_unit_size - pcpue_size);
+			memcpy(ptr, __per_cpu_load, static_size);
+		} else
+			free_bootmem(__pa(ptr), pcpue_unit_size);
 	}
 
 	/* we're ready, commit */
diff --git a/mm/rmap.c b/mm/rmap.c
index 836c6c63e1f2..0895b5c7cbff 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -358,6 +358,7 @@ static int page_referenced_one(struct page *page,
 	 */
 	if (vma->vm_flags & VM_LOCKED) {
 		*mapcount = 1;	/* break early from loop */
+		*vm_flags |= VM_LOCKED;
 		goto out_unmap;
 	}
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d1ade1a48ee7..8ffdc0d23c53 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 
 		if (!bdev) {
 			if (bdev_p)
-				*bdev_p = bdget(sis->bdev->bd_dev);
+				*bdev_p = bdgrab(sis->bdev);
 
 			spin_unlock(&swap_lock);
 			return i;
@@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 					struct swap_extent, list);
 			if (se->start_block == offset) {
 				if (bdev_p)
-					*bdev_p = bdget(sis->bdev->bd_dev);
+					*bdev_p = bdgrab(sis->bdev);
 
 				spin_unlock(&swap_lock);
 				bdput(bdev);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dea7abd31098..94e86dd6954c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -630,9 +630,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		referenced = page_referenced(page, 1,
 						sc->mem_cgroup, &vm_flags);
-		/* In active use or really unfreeable? Activate it. */
+		/*
+		 * In active use or really unfreeable? Activate it.
+		 * If page which have PG_mlocked lost isoltation race,
+		 * try_to_unmap moves it to unevictable list
+		 */
 		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
-					referenced && page_mapping_inuse(page))
+					referenced && page_mapping_inuse(page)
+					&& !(vm_flags & VM_LOCKED))
 			goto activate_locked;
 
 		/*