author    James Morris <jmorris@macbook.(none)>  2009-12-03 01:33:40 -0500
committer James Morris <jmorris@macbook.(none)>  2009-12-03 01:33:40 -0500
commit    c84d6efd363a3948eb32ec40d46bab6338580454 (patch)
tree      3ba7ac46e6626fe8ac843834588609eb6ccee5c6 /mm
parent    7539cf4b92be4aecc573ea962135f246a7a33401 (diff)
parent    22763c5cf3690a681551162c15d34d935308c8d7 (diff)
Merge branch 'master' into next
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  13
-rw-r--r--  mm/backing-dev.c     |  27
-rw-r--r--  mm/highmem.c         |  17
-rw-r--r--  mm/kmemleak.c        |   5
-rw-r--r--  mm/ksm.c             |  11
-rw-r--r--  mm/memcontrol.c      | 127
-rw-r--r--  mm/memory-failure.c  |  59
-rw-r--r--  mm/memory.c          |  14
-rw-r--r--  mm/memory_hotplug.c  |  24
-rw-r--r--  mm/mempolicy.c       |  13
-rw-r--r--  mm/migrate.c         |   2
-rw-r--r--  mm/nommu.c           |   6
-rw-r--r--  mm/page-writeback.c  |   3
-rw-r--r--  mm/page_alloc.c      |   7
-rw-r--r--  mm/percpu.c          | 217
-rw-r--r--  mm/rmap.c            |   4
-rw-r--r--  mm/swapfile.c        |  15
-rw-r--r--  mm/vmalloc.c         |  50
-rw-r--r--  mm/vmscan.c          |  14
19 files changed, 390 insertions, 238 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index edd300aca173..44cf6f0a3a6d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -67,7 +67,7 @@ config DISCONTIGMEM
67 67
68config SPARSEMEM 68config SPARSEMEM
69 def_bool y 69 def_bool y
70 depends on SPARSEMEM_MANUAL 70 depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
71 71
72config FLATMEM 72config FLATMEM
73 def_bool y 73 def_bool y
@@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP
128config MEMORY_HOTPLUG 128config MEMORY_HOTPLUG
129 bool "Allow for memory hot-add" 129 bool "Allow for memory hot-add"
130 depends on SPARSEMEM || X86_64_ACPI_NUMA 130 depends on SPARSEMEM || X86_64_ACPI_NUMA
131 depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG 131 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
132 depends on (IA64 || X86 || PPC64 || SUPERH || S390) 132 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
133
134comment "Memory hotplug is currently incompatible with Software Suspend"
135 depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
136 133
137config MEMORY_HOTPLUG_SPARSE 134config MEMORY_HOTPLUG_SPARSE
138 def_bool y 135 def_bool y
@@ -224,7 +221,9 @@ config KSM
224 the many instances by a single resident page with that content, so 221 the many instances by a single resident page with that content, so
225 saving memory until one or another app needs to modify the content. 222 saving memory until one or another app needs to modify the content.
226 Recommended for use with KVM, or with other duplicative applications. 223 Recommended for use with KVM, or with other duplicative applications.
227 See Documentation/vm/ksm.txt for more information. 224 See Documentation/vm/ksm.txt for more information: KSM is inactive
225 until a program has madvised that an area is MADV_MERGEABLE, and
226 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
228 227
229config DEFAULT_MMAP_MIN_ADDR 228config DEFAULT_MMAP_MIN_ADDR
230 int "Low address space to protect from user allocation" 229 int "Low address space to protect from user allocation"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3d3accb1f800..67a33a5a1a93 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -92,7 +92,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
92 "BdiDirtyThresh: %8lu kB\n" 92 "BdiDirtyThresh: %8lu kB\n"
93 "DirtyThresh: %8lu kB\n" 93 "DirtyThresh: %8lu kB\n"
94 "BackgroundThresh: %8lu kB\n" 94 "BackgroundThresh: %8lu kB\n"
95 "WriteBack threads:%8lu\n" 95 "WritebackThreads: %8lu\n"
96 "b_dirty: %8lu\n" 96 "b_dirty: %8lu\n"
97 "b_io: %8lu\n" 97 "b_io: %8lu\n"
98 "b_more_io: %8lu\n" 98 "b_more_io: %8lu\n"
@@ -604,15 +604,36 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
604 604
605 /* 605 /*
606 * Finally, kill the kernel threads. We don't need to be RCU 606 * Finally, kill the kernel threads. We don't need to be RCU
607 * safe anymore, since the bdi is gone from visibility. 607 * safe anymore, since the bdi is gone from visibility. Force
608 * unfreeze of the thread before calling kthread_stop(), otherwise
609 * it would never exet if it is currently stuck in the refrigerator.
608 */ 610 */
609 list_for_each_entry(wb, &bdi->wb_list, list) 611 list_for_each_entry(wb, &bdi->wb_list, list) {
612 wb->task->flags &= ~PF_FROZEN;
610 kthread_stop(wb->task); 613 kthread_stop(wb->task);
614 }
615}
616
617/*
618 * This bdi is going away now, make sure that no super_blocks point to it
619 */
620static void bdi_prune_sb(struct backing_dev_info *bdi)
621{
622 struct super_block *sb;
623
624 spin_lock(&sb_lock);
625 list_for_each_entry(sb, &super_blocks, s_list) {
626 if (sb->s_bdi == bdi)
627 sb->s_bdi = NULL;
628 }
629 spin_unlock(&sb_lock);
611} 630}
612 631
613void bdi_unregister(struct backing_dev_info *bdi) 632void bdi_unregister(struct backing_dev_info *bdi)
614{ 633{
615 if (bdi->dev) { 634 if (bdi->dev) {
635 bdi_prune_sb(bdi);
636
616 if (!bdi_cap_flush_forker(bdi)) 637 if (!bdi_cap_flush_forker(bdi))
617 bdi_wb_shutdown(bdi); 638 bdi_wb_shutdown(bdi);
618 bdi_debug_unregister(bdi); 639 bdi_debug_unregister(bdi);
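
The comment added to bdi_wb_shutdown() above gives the reason for clearing PF_FROZEN: a thread parked in the refrigerator never notices the stop request, so kthread_stop() would block forever. The general rule is to wake a sleeping worker before joining it; a small userspace analogue with pthreads (names are illustrative, none of this is kernel API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int stop;

static void *worker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!stop)                   /* "refrigerator": sleeps until woken */
                pthread_cond_wait(&wake, &lock);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);

        /*
         * Shutdown: set the flag *and* wake the sleeper before joining.
         * Joining without the wakeup would hang, which is the failure
         * mode the PF_FROZEN clearing avoids for kthreads.
         */
        pthread_mutex_lock(&lock);
        stop = 1;
        pthread_cond_signal(&wake);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        printf("worker stopped cleanly\n");
        return 0;
}
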
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -426,16 +426,21 @@ void __init page_address_init(void)
426 426
427void debug_kmap_atomic(enum km_type type) 427void debug_kmap_atomic(enum km_type type)
428{ 428{
429 static unsigned warn_count = 10; 429 static int warn_count = 10;
430 430
431 if (unlikely(warn_count == 0)) 431 if (unlikely(warn_count < 0))
432 return; 432 return;
433 433
434 if (unlikely(in_interrupt())) { 434 if (unlikely(in_interrupt())) {
435 if (in_irq()) { 435 if (in_nmi()) {
436 if (type != KM_NMI && type != KM_NMI_PTE) {
437 WARN_ON(1);
438 warn_count--;
439 }
440 } else if (in_irq()) {
436 if (type != KM_IRQ0 && type != KM_IRQ1 && 441 if (type != KM_IRQ0 && type != KM_IRQ1 &&
437 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && 442 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
438 type != KM_BOUNCE_READ) { 443 type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
439 WARN_ON(1); 444 WARN_ON(1);
440 warn_count--; 445 warn_count--;
441 } 446 }
@@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type)
452 } 457 }
453 458
454 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || 459 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
455 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { 460 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
461 type == KM_IRQ_PTE || type == KM_NMI ||
462 type == KM_NMI_PTE ) {
456 if (!irqs_disabled()) { 463 if (!irqs_disabled()) {
457 WARN_ON(1); 464 WARN_ON(1);
458 warn_count--; 465 warn_count--;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 4ea4510e2996..8bf765c4f58d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -833,12 +833,15 @@ static void early_alloc(struct early_log *log)
833 */ 833 */
834 rcu_read_lock(); 834 rcu_read_lock();
835 object = create_object((unsigned long)log->ptr, log->size, 835 object = create_object((unsigned long)log->ptr, log->size,
836 log->min_count, GFP_KERNEL); 836 log->min_count, GFP_ATOMIC);
837 if (!object)
838 goto out;
837 spin_lock_irqsave(&object->lock, flags); 839 spin_lock_irqsave(&object->lock, flags);
838 for (i = 0; i < log->trace_len; i++) 840 for (i = 0; i < log->trace_len; i++)
839 object->trace[i] = log->trace[i]; 841 object->trace[i] = log->trace[i];
840 object->trace_len = log->trace_len; 842 object->trace_len = log->trace_len;
841 spin_unlock_irqrestore(&object->lock, flags); 843 spin_unlock_irqrestore(&object->lock, flags);
844out:
842 rcu_read_unlock(); 845 rcu_read_unlock();
843} 846}
844 847
diff --git a/mm/ksm.c b/mm/ksm.c
index f7edac356f46..5575f8628fef 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -184,11 +184,6 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock);
184 sizeof(struct __struct), __alignof__(struct __struct),\ 184 sizeof(struct __struct), __alignof__(struct __struct),\
185 (__flags), NULL) 185 (__flags), NULL)
186 186
187static void __init ksm_init_max_kernel_pages(void)
188{
189 ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
190}
191
192static int __init ksm_slab_init(void) 187static int __init ksm_slab_init(void)
193{ 188{
194 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); 189 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
@@ -1017,6 +1012,7 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
1017 struct rmap_item *tree_rmap_item; 1012 struct rmap_item *tree_rmap_item;
1018 int ret; 1013 int ret;
1019 1014
1015 cond_resched();
1020 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1016 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1021 page2[0] = get_mergeable_page(tree_rmap_item); 1017 page2[0] = get_mergeable_page(tree_rmap_item);
1022 if (!page2[0]) 1018 if (!page2[0])
@@ -1673,7 +1669,7 @@ static int __init ksm_init(void)
1673 struct task_struct *ksm_thread; 1669 struct task_struct *ksm_thread;
1674 int err; 1670 int err;
1675 1671
1676 ksm_init_max_kernel_pages(); 1672 ksm_max_kernel_pages = totalram_pages / 4;
1677 1673
1678 err = ksm_slab_init(); 1674 err = ksm_slab_init();
1679 if (err) 1675 if (err)
@@ -1697,6 +1693,9 @@ static int __init ksm_init(void)
1697 kthread_stop(ksm_thread); 1693 kthread_stop(ksm_thread);
1698 goto out_free2; 1694 goto out_free2;
1699 } 1695 }
1696#else
1697 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
1698
1700#endif /* CONFIG_SYSFS */ 1699#endif /* CONFIG_SYSFS */
1701 1700
1702 return 0; 1701 return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2b98a6875c0..f99f5991d6bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *page)
313static void 313static void
314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
315 struct mem_cgroup_per_zone *mz, 315 struct mem_cgroup_per_zone *mz,
316 struct mem_cgroup_tree_per_zone *mctz) 316 struct mem_cgroup_tree_per_zone *mctz,
317 unsigned long long new_usage_in_excess)
317{ 318{
318 struct rb_node **p = &mctz->rb_root.rb_node; 319 struct rb_node **p = &mctz->rb_root.rb_node;
319 struct rb_node *parent = NULL; 320 struct rb_node *parent = NULL;
@@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
322 if (mz->on_tree) 323 if (mz->on_tree)
323 return; 324 return;
324 325
325 mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res); 326 mz->usage_in_excess = new_usage_in_excess;
327 if (!mz->usage_in_excess)
328 return;
326 while (*p) { 329 while (*p) {
327 parent = *p; 330 parent = *p;
328 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 331 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -353,16 +356,6 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
353} 356}
354 357
355static void 358static void
356mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
357 struct mem_cgroup_per_zone *mz,
358 struct mem_cgroup_tree_per_zone *mctz)
359{
360 spin_lock(&mctz->lock);
361 __mem_cgroup_insert_exceeded(mem, mz, mctz);
362 spin_unlock(&mctz->lock);
363}
364
365static void
366mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 359mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
367 struct mem_cgroup_per_zone *mz, 360 struct mem_cgroup_per_zone *mz,
368 struct mem_cgroup_tree_per_zone *mctz) 361 struct mem_cgroup_tree_per_zone *mctz)
@@ -392,34 +385,36 @@ static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
392 385
393static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 386static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
394{ 387{
395 unsigned long long prev_usage_in_excess, new_usage_in_excess; 388 unsigned long long excess;
396 bool updated_tree = false;
397 struct mem_cgroup_per_zone *mz; 389 struct mem_cgroup_per_zone *mz;
398 struct mem_cgroup_tree_per_zone *mctz; 390 struct mem_cgroup_tree_per_zone *mctz;
399 391 int nid = page_to_nid(page);
400 mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page)); 392 int zid = page_zonenum(page);
401 mctz = soft_limit_tree_from_page(page); 393 mctz = soft_limit_tree_from_page(page);
402 394
403 /* 395 /*
404 * We do updates in lazy mode, mem's are removed 396 * Necessary to update all ancestors when hierarchy is used.
405 * lazily from the per-zone, per-node rb tree 397 * because their event counter is not touched.
406 */ 398 */
407 prev_usage_in_excess = mz->usage_in_excess; 399 for (; mem; mem = parent_mem_cgroup(mem)) {
408 400 mz = mem_cgroup_zoneinfo(mem, nid, zid);
409 new_usage_in_excess = res_counter_soft_limit_excess(&mem->res); 401 excess = res_counter_soft_limit_excess(&mem->res);
410 if (prev_usage_in_excess) { 402 /*
411 mem_cgroup_remove_exceeded(mem, mz, mctz); 403 * We have to update the tree if mz is on RB-tree or
412 updated_tree = true; 404 * mem is over its softlimit.
413 } 405 */
414 if (!new_usage_in_excess) 406 if (excess || mz->on_tree) {
415 goto done; 407 spin_lock(&mctz->lock);
416 mem_cgroup_insert_exceeded(mem, mz, mctz); 408 /* if on-tree, remove it */
417 409 if (mz->on_tree)
418done: 410 __mem_cgroup_remove_exceeded(mem, mz, mctz);
419 if (updated_tree) { 411 /*
420 spin_lock(&mctz->lock); 412 * Insert again. mz->usage_in_excess will be updated.
421 mz->usage_in_excess = new_usage_in_excess; 413 * If excess is 0, no tree ops.
422 spin_unlock(&mctz->lock); 414 */
415 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
416 spin_unlock(&mctz->lock);
417 }
423 } 418 }
424} 419}
425 420
@@ -447,9 +442,10 @@ static struct mem_cgroup_per_zone *
447__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 442__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
448{ 443{
449 struct rb_node *rightmost = NULL; 444 struct rb_node *rightmost = NULL;
450 struct mem_cgroup_per_zone *mz = NULL; 445 struct mem_cgroup_per_zone *mz;
451 446
452retry: 447retry:
448 mz = NULL;
453 rightmost = rb_last(&mctz->rb_root); 449 rightmost = rb_last(&mctz->rb_root);
454 if (!rightmost) 450 if (!rightmost)
455 goto done; /* Nothing to reclaim from */ 451 goto done; /* Nothing to reclaim from */
@@ -1270,9 +1266,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1270 gfp_t gfp_mask, struct mem_cgroup **memcg, 1266 gfp_t gfp_mask, struct mem_cgroup **memcg,
1271 bool oom, struct page *page) 1267 bool oom, struct page *page)
1272{ 1268{
1273 struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit; 1269 struct mem_cgroup *mem, *mem_over_limit;
1274 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1270 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1275 struct res_counter *fail_res, *soft_fail_res = NULL; 1271 struct res_counter *fail_res;
1276 1272
1277 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1273 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
1278 /* Don't account this! */ 1274 /* Don't account this! */
@@ -1304,17 +1300,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1304 1300
1305 if (mem_cgroup_is_root(mem)) 1301 if (mem_cgroup_is_root(mem))
1306 goto done; 1302 goto done;
1307 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, 1303 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
1308 &soft_fail_res);
1309 if (likely(!ret)) { 1304 if (likely(!ret)) {
1310 if (!do_swap_account) 1305 if (!do_swap_account)
1311 break; 1306 break;
1312 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 1307 ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
1313 &fail_res, NULL); 1308 &fail_res);
1314 if (likely(!ret)) 1309 if (likely(!ret))
1315 break; 1310 break;
1316 /* mem+swap counter fails */ 1311 /* mem+swap counter fails */
1317 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); 1312 res_counter_uncharge(&mem->res, PAGE_SIZE);
1318 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1313 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1319 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1314 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1320 memsw); 1315 memsw);
@@ -1353,16 +1348,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1353 } 1348 }
1354 } 1349 }
1355 /* 1350 /*
1356 * Insert just the ancestor, we should trickle down to the correct 1351 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1357 * cgroup for reclaim, since the other nodes will be below their 1352 * if they exceeds softlimit.
1358 * soft limit
1359 */ 1353 */
1360 if (soft_fail_res) { 1354 if (mem_cgroup_soft_limit_check(mem))
1361 mem_over_soft_limit = 1355 mem_cgroup_update_tree(mem, page);
1362 mem_cgroup_from_res_counter(soft_fail_res, res);
1363 if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
1364 mem_cgroup_update_tree(mem_over_soft_limit, page);
1365 }
1366done: 1356done:
1367 return 0; 1357 return 0;
1368nomem: 1358nomem:
@@ -1437,10 +1427,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1437 if (unlikely(PageCgroupUsed(pc))) { 1427 if (unlikely(PageCgroupUsed(pc))) {
1438 unlock_page_cgroup(pc); 1428 unlock_page_cgroup(pc);
1439 if (!mem_cgroup_is_root(mem)) { 1429 if (!mem_cgroup_is_root(mem)) {
1440 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); 1430 res_counter_uncharge(&mem->res, PAGE_SIZE);
1441 if (do_swap_account) 1431 if (do_swap_account)
1442 res_counter_uncharge(&mem->memsw, PAGE_SIZE, 1432 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1443 NULL);
1444 } 1433 }
1445 css_put(&mem->css); 1434 css_put(&mem->css);
1446 return; 1435 return;
@@ -1519,7 +1508,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1519 goto out; 1508 goto out;
1520 1509
1521 if (!mem_cgroup_is_root(from)) 1510 if (!mem_cgroup_is_root(from))
1522 res_counter_uncharge(&from->res, PAGE_SIZE, NULL); 1511 res_counter_uncharge(&from->res, PAGE_SIZE);
1523 mem_cgroup_charge_statistics(from, pc, false); 1512 mem_cgroup_charge_statistics(from, pc, false);
1524 1513
1525 page = pc->page; 1514 page = pc->page;
@@ -1539,7 +1528,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1539 } 1528 }
1540 1529
1541 if (do_swap_account && !mem_cgroup_is_root(from)) 1530 if (do_swap_account && !mem_cgroup_is_root(from))
1542 res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); 1531 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1543 css_put(&from->css); 1532 css_put(&from->css);
1544 1533
1545 css_get(&to->css); 1534 css_get(&to->css);
@@ -1610,9 +1599,9 @@ uncharge:
1610 css_put(&parent->css); 1599 css_put(&parent->css);
1611 /* uncharge if move fails */ 1600 /* uncharge if move fails */
1612 if (!mem_cgroup_is_root(parent)) { 1601 if (!mem_cgroup_is_root(parent)) {
1613 res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); 1602 res_counter_uncharge(&parent->res, PAGE_SIZE);
1614 if (do_swap_account) 1603 if (do_swap_account)
1615 res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); 1604 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1616 } 1605 }
1617 return ret; 1606 return ret;
1618} 1607}
@@ -1803,8 +1792,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1803 * calling css_tryget 1792 * calling css_tryget
1804 */ 1793 */
1805 if (!mem_cgroup_is_root(memcg)) 1794 if (!mem_cgroup_is_root(memcg))
1806 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, 1795 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1807 NULL);
1808 mem_cgroup_swap_statistics(memcg, false); 1796 mem_cgroup_swap_statistics(memcg, false);
1809 mem_cgroup_put(memcg); 1797 mem_cgroup_put(memcg);
1810 } 1798 }
@@ -1831,9 +1819,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1831 if (!mem) 1819 if (!mem)
1832 return; 1820 return;
1833 if (!mem_cgroup_is_root(mem)) { 1821 if (!mem_cgroup_is_root(mem)) {
1834 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); 1822 res_counter_uncharge(&mem->res, PAGE_SIZE);
1835 if (do_swap_account) 1823 if (do_swap_account)
1836 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); 1824 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1837 } 1825 }
1838 css_put(&mem->css); 1826 css_put(&mem->css);
1839} 1827}
@@ -1848,7 +1836,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1848 struct page_cgroup *pc; 1836 struct page_cgroup *pc;
1849 struct mem_cgroup *mem = NULL; 1837 struct mem_cgroup *mem = NULL;
1850 struct mem_cgroup_per_zone *mz; 1838 struct mem_cgroup_per_zone *mz;
1851 bool soft_limit_excess = false;
1852 1839
1853 if (mem_cgroup_disabled()) 1840 if (mem_cgroup_disabled())
1854 return NULL; 1841 return NULL;
@@ -1888,10 +1875,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1888 } 1875 }
1889 1876
1890 if (!mem_cgroup_is_root(mem)) { 1877 if (!mem_cgroup_is_root(mem)) {
1891 res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); 1878 res_counter_uncharge(&mem->res, PAGE_SIZE);
1892 if (do_swap_account && 1879 if (do_swap_account &&
1893 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1880 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1894 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); 1881 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1895 } 1882 }
1896 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1883 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1897 mem_cgroup_swap_statistics(mem, true); 1884 mem_cgroup_swap_statistics(mem, true);
@@ -1908,7 +1895,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1908 mz = page_cgroup_zoneinfo(pc); 1895 mz = page_cgroup_zoneinfo(pc);
1909 unlock_page_cgroup(pc); 1896 unlock_page_cgroup(pc);
1910 1897
1911 if (soft_limit_excess && mem_cgroup_soft_limit_check(mem)) 1898 if (mem_cgroup_soft_limit_check(mem))
1912 mem_cgroup_update_tree(mem, page); 1899 mem_cgroup_update_tree(mem, page);
1913 /* at swapout, this memcg will be accessed to record to swap */ 1900 /* at swapout, this memcg will be accessed to record to swap */
1914 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1901 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
@@ -1986,7 +1973,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
1986 * This memcg can be obsolete one. We avoid calling css_tryget 1973 * This memcg can be obsolete one. We avoid calling css_tryget
1987 */ 1974 */
1988 if (!mem_cgroup_is_root(memcg)) 1975 if (!mem_cgroup_is_root(memcg))
1989 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); 1976 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1990 mem_cgroup_swap_statistics(memcg, false); 1977 mem_cgroup_swap_statistics(memcg, false);
1991 mem_cgroup_put(memcg); 1978 mem_cgroup_put(memcg);
1992 } 1979 }
@@ -2233,6 +2220,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2233 unsigned long reclaimed; 2220 unsigned long reclaimed;
2234 int loop = 0; 2221 int loop = 0;
2235 struct mem_cgroup_tree_per_zone *mctz; 2222 struct mem_cgroup_tree_per_zone *mctz;
2223 unsigned long long excess;
2236 2224
2237 if (order > 0) 2225 if (order > 0)
2238 return 0; 2226 return 0;
@@ -2284,9 +2272,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2284 break; 2272 break;
2285 } while (1); 2273 } while (1);
2286 } 2274 }
2287 mz->usage_in_excess =
2288 res_counter_soft_limit_excess(&mz->mem->res);
2289 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 2275 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2276 excess = res_counter_soft_limit_excess(&mz->mem->res);
2290 /* 2277 /*
2291 * One school of thought says that we should not add 2278 * One school of thought says that we should not add
2292 * back the node to the tree if reclaim returns 0. 2279 * back the node to the tree if reclaim returns 0.
@@ -2295,8 +2282,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2295 * memory to reclaim from. Consider this as a longer 2282 * memory to reclaim from. Consider this as a longer
2296 * term TODO. 2283 * term TODO.
2297 */ 2284 */
2298 if (mz->usage_in_excess) 2285 /* If excess == 0, no tree ops */
2299 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz); 2286 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2300 spin_unlock(&mctz->lock); 2287 spin_unlock(&mctz->lock);
2301 css_put(&mz->mem->css); 2288 css_put(&mz->mem->css);
2302 loop++; 2289 loop++;
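
The reworked mem_cgroup_update_tree() above walks from the charged group up through every ancestor, since only the leaf's event counter fired; each level is re-queued on the soft-limit tree only if it is actually in excess. Stripped of the memcg machinery, the walk is just a parent-pointer loop (struct and field names here are invented for illustration):

#include <stdio.h>

struct group {
        const char   *name;
        long          excess;   /* usage above the soft limit, 0 if none */
        struct group *parent;
};

/* revisit the group and every ancestor, as the soft-limit update now does */
static void update_tree(struct group *g)
{
        for (; g; g = g->parent) {
                if (g->excess)
                        printf("re-queue %s (excess=%ld)\n", g->name, g->excess);
                else
                        printf("skip %s\n", g->name);
        }
}

int main(void)
{
        struct group root = { "root", 0,  NULL  };
        struct group mid  = { "mid",  40, &root };
        struct group leaf = { "leaf", 10, &mid  };

        update_tree(&leaf);
        return 0;
}
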
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 729d4b15b645..dacc64183874 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -35,6 +35,7 @@
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/page-flags.h> 36#include <linux/page-flags.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/ksm.h>
38#include <linux/rmap.h> 39#include <linux/rmap.h>
39#include <linux/pagemap.h> 40#include <linux/pagemap.h>
40#include <linux/swap.h> 41#include <linux/swap.h>
@@ -370,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
370 int ret = FAILED; 371 int ret = FAILED;
371 struct address_space *mapping; 372 struct address_space *mapping;
372 373
373 if (!isolate_lru_page(p))
374 page_cache_release(p);
375
376 /* 374 /*
377 * For anonymous pages we're done the only reference left 375 * For anonymous pages we're done the only reference left
378 * should be the one m_f() holds. 376 * should be the one m_f() holds.
@@ -498,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
498 */ 496 */
499static int me_swapcache_dirty(struct page *p, unsigned long pfn) 497static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500{ 498{
501 int ret = FAILED;
502
503 ClearPageDirty(p); 499 ClearPageDirty(p);
504 /* Trigger EIO in shmem: */ 500 /* Trigger EIO in shmem: */
505 ClearPageUptodate(p); 501 ClearPageUptodate(p);
506 502
507 if (!isolate_lru_page(p)) { 503 return DELAYED;
508 page_cache_release(p);
509 ret = DELAYED;
510 }
511
512 return ret;
513} 504}
514 505
515static int me_swapcache_clean(struct page *p, unsigned long pfn) 506static int me_swapcache_clean(struct page *p, unsigned long pfn)
516{ 507{
517 int ret = FAILED;
518
519 if (!isolate_lru_page(p)) {
520 page_cache_release(p);
521 ret = RECOVERED;
522 }
523 delete_from_swap_cache(p); 508 delete_from_swap_cache(p);
524 return ret; 509
510 return RECOVERED;
525} 511}
526 512
527/* 513/*
@@ -611,8 +597,6 @@ static struct page_state {
611 { 0, 0, "unknown page state", me_unknown }, 597 { 0, 0, "unknown page state", me_unknown },
612}; 598};
613 599
614#undef lru
615
616static void action_result(unsigned long pfn, char *msg, int result) 600static void action_result(unsigned long pfn, char *msg, int result)
617{ 601{
618 struct page *page = NULL; 602 struct page *page = NULL;
@@ -629,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
629 unsigned long pfn, int ref) 613 unsigned long pfn, int ref)
630{ 614{
631 int result; 615 int result;
616 int count;
632 617
633 result = ps->action(p, pfn); 618 result = ps->action(p, pfn);
634 action_result(pfn, ps->msg, result); 619 action_result(pfn, ps->msg, result);
635 if (page_count(p) != 1 + ref) 620
621 count = page_count(p) - 1 - ref;
622 if (count != 0)
636 printk(KERN_ERR 623 printk(KERN_ERR
637 "MCE %#lx: %s page still referenced by %d users\n", 624 "MCE %#lx: %s page still referenced by %d users\n",
638 pfn, ps->msg, page_count(p) - 1); 625 pfn, ps->msg, count);
639 626
640 /* Could do more checks here if page looks ok */ 627 /* Could do more checks here if page looks ok */
641 /* 628 /*
@@ -661,12 +648,9 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
661 int i; 648 int i;
662 int kill = 1; 649 int kill = 1;
663 650
664 if (PageReserved(p) || PageCompound(p) || PageSlab(p)) 651 if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
665 return; 652 return;
666 653
667 if (!PageLRU(p))
668 lru_add_drain_all();
669
670 /* 654 /*
671 * This check implies we don't kill processes if their pages 655 * This check implies we don't kill processes if their pages
672 * are in the swap cache early. Those are always late kills. 656 * are in the swap cache early. Those are always late kills.
@@ -738,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
738 722
739int __memory_failure(unsigned long pfn, int trapno, int ref) 723int __memory_failure(unsigned long pfn, int trapno, int ref)
740{ 724{
725 unsigned long lru_flag;
741 struct page_state *ps; 726 struct page_state *ps;
742 struct page *p; 727 struct page *p;
743 int res; 728 int res;
@@ -775,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
775 } 760 }
776 761
777 /* 762 /*
763 * We ignore non-LRU pages for good reasons.
764 * - PG_locked is only well defined for LRU pages and a few others
765 * - to avoid races with __set_page_locked()
766 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
767 * The check (unnecessarily) ignores LRU pages being isolated and
768 * walked by the page reclaim code, however that's not a big loss.
769 */
770 if (!PageLRU(p))
771 lru_add_drain_all();
772 lru_flag = p->flags & lru;
773 if (isolate_lru_page(p)) {
774 action_result(pfn, "non LRU", IGNORED);
775 put_page(p);
776 return -EBUSY;
777 }
778 page_cache_release(p);
779
780 /*
778 * Lock the page and wait for writeback to finish. 781 * Lock the page and wait for writeback to finish.
779 * It's very difficult to mess with pages currently under IO 782 * It's very difficult to mess with pages currently under IO
780 * and in many cases impossible, so we just avoid it here. 783 * and in many cases impossible, so we just avoid it here.
@@ -790,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
790 /* 793 /*
791 * Torn down by someone else? 794 * Torn down by someone else?
792 */ 795 */
793 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 796 if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
794 action_result(pfn, "already truncated LRU", IGNORED); 797 action_result(pfn, "already truncated LRU", IGNORED);
795 res = 0; 798 res = 0;
796 goto out; 799 goto out;
@@ -798,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
798 801
799 res = -EBUSY; 802 res = -EBUSY;
800 for (ps = error_states;; ps++) { 803 for (ps = error_states;; ps++) {
801 if ((p->flags & ps->mask) == ps->res) { 804 if (((p->flags | lru_flag)& ps->mask) == ps->res) {
802 res = page_action(ps, p, pfn, ref); 805 res = page_action(ps, p, pfn, ref);
803 break; 806 break;
804 } 807 }
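
The page_action()/error_states code touched above dispatches on page flags through a table of mask/value pairs ending in a catch-all entry; the change only folds the saved lru bit back into the flags before matching. The table idiom on its own looks like this (flag bits and handlers are invented for the example):

#include <stdio.h>

#define F_DIRTY 0x1UL
#define F_LRU   0x2UL

struct state {
        unsigned long mask;     /* which bits matter */
        unsigned long res;      /* required value of those bits */
        const char   *msg;
        int         (*action)(unsigned long flags);
};

static int dirty_lru(unsigned long f) { (void)f; printf("dirty LRU page\n"); return 0; }
static int clean_lru(unsigned long f) { (void)f; printf("clean LRU page\n"); return 0; }
static int unknown(unsigned long f)   { (void)f; printf("unknown state\n");  return -1; }

static struct state states[] = {
        { F_DIRTY | F_LRU, F_DIRTY | F_LRU, "dirty LRU", dirty_lru },
        { F_DIRTY | F_LRU, F_LRU,           "clean LRU", clean_lru },
        { 0,               0,               "unknown",   unknown   },  /* catch-all */
};

int main(void)
{
        unsigned long flags = F_LRU;    /* e.g. p->flags | lru_flag */
        struct state *ps;

        for (ps = states; ; ps++)
                if ((flags & ps->mask) == ps->res)
                        return ps->action(flags);
}
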
diff --git a/mm/memory.c b/mm/memory.c
index 7e91b5f9f690..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -641,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
641 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 641 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
642 unsigned long addr, unsigned long end) 642 unsigned long addr, unsigned long end)
643{ 643{
644 pte_t *orig_src_pte, *orig_dst_pte;
644 pte_t *src_pte, *dst_pte; 645 pte_t *src_pte, *dst_pte;
645 spinlock_t *src_ptl, *dst_ptl; 646 spinlock_t *src_ptl, *dst_ptl;
646 int progress = 0; 647 int progress = 0;
@@ -654,6 +655,8 @@ again:
654 src_pte = pte_offset_map_nested(src_pmd, addr); 655 src_pte = pte_offset_map_nested(src_pmd, addr);
655 src_ptl = pte_lockptr(src_mm, src_pmd); 656 src_ptl = pte_lockptr(src_mm, src_pmd);
656 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 657 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
658 orig_src_pte = src_pte;
659 orig_dst_pte = dst_pte;
657 arch_enter_lazy_mmu_mode(); 660 arch_enter_lazy_mmu_mode();
658 661
659 do { 662 do {
@@ -677,9 +680,9 @@ again:
677 680
678 arch_leave_lazy_mmu_mode(); 681 arch_leave_lazy_mmu_mode();
679 spin_unlock(src_ptl); 682 spin_unlock(src_ptl);
680 pte_unmap_nested(src_pte - 1); 683 pte_unmap_nested(orig_src_pte);
681 add_mm_rss(dst_mm, rss[0], rss[1]); 684 add_mm_rss(dst_mm, rss[0], rss[1]);
682 pte_unmap_unlock(dst_pte - 1, dst_ptl); 685 pte_unmap_unlock(orig_dst_pte, dst_ptl);
683 cond_resched(); 686 cond_resched();
684 if (addr != end) 687 if (addr != end)
685 goto again; 688 goto again;
@@ -1820,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1820 token = pmd_pgtable(*pmd); 1823 token = pmd_pgtable(*pmd);
1821 1824
1822 do { 1825 do {
1823 err = fn(pte, token, addr, data); 1826 err = fn(pte++, token, addr, data);
1824 if (err) 1827 if (err)
1825 break; 1828 break;
1826 } while (pte++, addr += PAGE_SIZE, addr != end); 1829 } while (addr += PAGE_SIZE, addr != end);
1827 1830
1828 arch_leave_lazy_mmu_mode(); 1831 arch_leave_lazy_mmu_mode();
1829 1832
@@ -2539,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2539 } else if (PageHWPoison(page)) { 2542 } else if (PageHWPoison(page)) {
2540 ret = VM_FAULT_HWPOISON; 2543 ret = VM_FAULT_HWPOISON;
2541 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2544 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2542 goto out; 2545 goto out_release;
2543 } 2546 }
2544 2547
2545 lock_page(page); 2548 lock_page(page);
@@ -2611,6 +2614,7 @@ out_nomap:
2611 pte_unmap_unlock(page_table, ptl); 2614 pte_unmap_unlock(page_table, ptl);
2612out_page: 2615out_page:
2613 unlock_page(page); 2616 unlock_page(page);
2617out_release:
2614 page_cache_release(page); 2618 page_cache_release(page);
2615 return ret; 2619 return ret;
2616} 2620}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 821dee596377..2047465cd27c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
26#include <linux/migrate.h> 26#include <linux/migrate.h>
27#include <linux/page-isolation.h> 27#include <linux/page-isolation.h>
28#include <linux/pfn.h> 28#include <linux/pfn.h>
29#include <linux/suspend.h>
29 30
30#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
31 32
@@ -447,7 +448,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
447} 448}
448#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 449#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
449 450
450static pg_data_t *hotadd_new_pgdat(int nid, u64 start) 451/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
452static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
451{ 453{
452 struct pglist_data *pgdat; 454 struct pglist_data *pgdat;
453 unsigned long zones_size[MAX_NR_ZONES] = {0}; 455 unsigned long zones_size[MAX_NR_ZONES] = {0};
@@ -484,14 +486,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
484 struct resource *res; 486 struct resource *res;
485 int ret; 487 int ret;
486 488
489 lock_system_sleep();
490
487 res = register_memory_resource(start, size); 491 res = register_memory_resource(start, size);
492 ret = -EEXIST;
488 if (!res) 493 if (!res)
489 return -EEXIST; 494 goto out;
490 495
491 if (!node_online(nid)) { 496 if (!node_online(nid)) {
492 pgdat = hotadd_new_pgdat(nid, start); 497 pgdat = hotadd_new_pgdat(nid, start);
498 ret = -ENOMEM;
493 if (!pgdat) 499 if (!pgdat)
494 return -ENOMEM; 500 goto out;
495 new_pgdat = 1; 501 new_pgdat = 1;
496 } 502 }
497 503
@@ -514,7 +520,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
514 BUG_ON(ret); 520 BUG_ON(ret);
515 } 521 }
516 522
517 return ret; 523 goto out;
524
518error: 525error:
519 /* rollback pgdat allocation and others */ 526 /* rollback pgdat allocation and others */
520 if (new_pgdat) 527 if (new_pgdat)
@@ -522,6 +529,8 @@ error:
522 if (res) 529 if (res)
523 release_memory_resource(res); 530 release_memory_resource(res);
524 531
532out:
533 unlock_system_sleep();
525 return ret; 534 return ret;
526} 535}
527EXPORT_SYMBOL_GPL(add_memory); 536EXPORT_SYMBOL_GPL(add_memory);
@@ -758,6 +767,8 @@ int offline_pages(unsigned long start_pfn,
758 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 767 if (!test_pages_in_a_zone(start_pfn, end_pfn))
759 return -EINVAL; 768 return -EINVAL;
760 769
770 lock_system_sleep();
771
761 zone = page_zone(pfn_to_page(start_pfn)); 772 zone = page_zone(pfn_to_page(start_pfn));
762 node = zone_to_nid(zone); 773 node = zone_to_nid(zone);
763 nr_pages = end_pfn - start_pfn; 774 nr_pages = end_pfn - start_pfn;
@@ -765,7 +776,7 @@ int offline_pages(unsigned long start_pfn,
765 /* set above range as isolated */ 776 /* set above range as isolated */
766 ret = start_isolate_page_range(start_pfn, end_pfn); 777 ret = start_isolate_page_range(start_pfn, end_pfn);
767 if (ret) 778 if (ret)
768 return ret; 779 goto out;
769 780
770 arg.start_pfn = start_pfn; 781 arg.start_pfn = start_pfn;
771 arg.nr_pages = nr_pages; 782 arg.nr_pages = nr_pages;
@@ -843,6 +854,7 @@ repeat:
843 writeback_set_ratelimit(); 854 writeback_set_ratelimit();
844 855
845 memory_notify(MEM_OFFLINE, &arg); 856 memory_notify(MEM_OFFLINE, &arg);
857 unlock_system_sleep();
846 return 0; 858 return 0;
847 859
848failed_removal: 860failed_removal:
@@ -852,6 +864,8 @@ failed_removal:
852 /* pushback to free area */ 864 /* pushback to free area */
853 undo_isolate_page_range(start_pfn, end_pfn); 865 undo_isolate_page_range(start_pfn, end_pfn);
854 866
867out:
868 unlock_system_sleep();
855 return ret; 869 return ret;
856} 870}
857 871
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7dd9d9f80694..4545d5944243 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1024,7 +1024,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1024 1024
1025 err = migrate_prep(); 1025 err = migrate_prep();
1026 if (err) 1026 if (err)
1027 return err; 1027 goto mpol_out;
1028 } 1028 }
1029 { 1029 {
1030 NODEMASK_SCRATCH(scratch); 1030 NODEMASK_SCRATCH(scratch);
@@ -1039,10 +1039,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1039 err = -ENOMEM; 1039 err = -ENOMEM;
1040 NODEMASK_SCRATCH_FREE(scratch); 1040 NODEMASK_SCRATCH_FREE(scratch);
1041 } 1041 }
1042 if (err) { 1042 if (err)
1043 mpol_put(new); 1043 goto mpol_out;
1044 return err; 1044
1045 }
1046 vma = check_range(mm, start, end, nmask, 1045 vma = check_range(mm, start, end, nmask,
1047 flags | MPOL_MF_INVERT, &pagelist); 1046 flags | MPOL_MF_INVERT, &pagelist);
1048 1047
@@ -1058,9 +1057,11 @@ static long do_mbind(unsigned long start, unsigned long len,
1058 1057
1059 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1058 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1060 err = -EIO; 1059 err = -EIO;
1061 } 1060 } else
1061 putback_lru_pages(&pagelist);
1062 1062
1063 up_write(&mm->mmap_sem); 1063 up_write(&mm->mmap_sem);
1064 mpol_out:
1064 mpol_put(new); 1065 mpol_put(new);
1065 return err; 1066 return err;
1066} 1067}
diff --git a/mm/migrate.c b/mm/migrate.c
index 1a4bf4813780..7dbcb22316d2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -602,7 +602,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
602 struct page *newpage = get_new_page(page, private, &result); 602 struct page *newpage = get_new_page(page, private, &result);
603 int rcu_locked = 0; 603 int rcu_locked = 0;
604 int charge = 0; 604 int charge = 0;
605 struct mem_cgroup *mem; 605 struct mem_cgroup *mem = NULL;
606 606
607 if (!newpage) 607 if (!newpage)
608 return -ENOMEM; 608 return -ENOMEM;
diff --git a/mm/nommu.c b/mm/nommu.c
index 5189b5aed8c0..9876fa0c3ad3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1362,9 +1362,11 @@ share:
1362error_just_free: 1362error_just_free:
1363 up_write(&nommu_region_sem); 1363 up_write(&nommu_region_sem);
1364error: 1364error:
1365 fput(region->vm_file); 1365 if (region->vm_file)
1366 fput(region->vm_file);
1366 kmem_cache_free(vm_region_jar, region); 1367 kmem_cache_free(vm_region_jar, region);
1367 fput(vma->vm_file); 1368 if (vma->vm_file)
1369 fput(vma->vm_file);
1368 if (vma->vm_flags & VM_EXECUTABLE) 1370 if (vma->vm_flags & VM_EXECUTABLE)
1369 removed_exe_file_vma(vma->vm_mm); 1371 removed_exe_file_vma(vma->vm_mm);
1370 kmem_cache_free(vm_area_cachep, vma); 1372 kmem_cache_free(vm_area_cachep, vma);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a3b14090b1fb..2c5d79236ead 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -566,7 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping,
566 if (pages_written >= write_chunk) 566 if (pages_written >= write_chunk)
567 break; /* We've done our duty */ 567 break; /* We've done our duty */
568 568
569 schedule_timeout_interruptible(pause); 569 __set_current_state(TASK_INTERRUPTIBLE);
570 io_schedule_timeout(pause);
570 571
571 /* 572 /*
572 * Increase the delay for each loop, up to our previous 573 * Increase the delay for each loop, up to our previous
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bf720550b44d..2bc2ac63f41e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1769,7 +1769,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1769 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1769 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1770 */ 1770 */
1771 alloc_flags &= ~ALLOC_CPUSET; 1771 alloc_flags &= ~ALLOC_CPUSET;
1772 } else if (unlikely(rt_task(p))) 1772 } else if (unlikely(rt_task(p)) && !in_interrupt())
1773 alloc_flags |= ALLOC_HARDER; 1773 alloc_flags |= ALLOC_HARDER;
1774 1774
1775 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 1775 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
@@ -1817,9 +1817,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1817 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1817 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1818 goto nopage; 1818 goto nopage;
1819 1819
1820restart:
1820 wake_all_kswapd(order, zonelist, high_zoneidx); 1821 wake_all_kswapd(order, zonelist, high_zoneidx);
1821 1822
1822restart:
1823 /* 1823 /*
1824 * OK, we're below the kswapd watermark and have kicked background 1824 * OK, we're below the kswapd watermark and have kicked background
1825 * reclaim. Now things get more complex, so set up alloc_flags according 1825 * reclaim. Now things get more complex, so set up alloc_flags according
@@ -2183,7 +2183,7 @@ void show_free_areas(void)
2183 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 2183 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2184 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 2184 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2185 " unevictable:%lu" 2185 " unevictable:%lu"
2186 " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" 2186 " dirty:%lu writeback:%lu unstable:%lu\n"
2187 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2187 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2188 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2188 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2189 global_page_state(NR_ACTIVE_ANON), 2189 global_page_state(NR_ACTIVE_ANON),
@@ -2196,7 +2196,6 @@ void show_free_areas(void)
2196 global_page_state(NR_FILE_DIRTY), 2196 global_page_state(NR_FILE_DIRTY),
2197 global_page_state(NR_WRITEBACK), 2197 global_page_state(NR_WRITEBACK),
2198 global_page_state(NR_UNSTABLE_NFS), 2198 global_page_state(NR_UNSTABLE_NFS),
2199 nr_blockdev_pages(),
2200 global_page_state(NR_FREE_PAGES), 2199 global_page_state(NR_FREE_PAGES),
2201 global_page_state(NR_SLAB_RECLAIMABLE), 2200 global_page_state(NR_SLAB_RECLAIMABLE),
2202 global_page_state(NR_SLAB_UNRECLAIMABLE), 2201 global_page_state(NR_SLAB_UNRECLAIMABLE),
diff --git a/mm/percpu.c b/mm/percpu.c
index 43d8cacfdaa5..5adfc268b408 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -153,7 +153,10 @@ static int pcpu_reserved_chunk_limit;
153 * 153 *
154 * During allocation, pcpu_alloc_mutex is kept locked all the time and 154 * During allocation, pcpu_alloc_mutex is kept locked all the time and
155 * pcpu_lock is grabbed and released as necessary. All actual memory 155 * pcpu_lock is grabbed and released as necessary. All actual memory
156 * allocations are done using GFP_KERNEL with pcpu_lock released. 156 * allocations are done using GFP_KERNEL with pcpu_lock released. In
157 * general, percpu memory can't be allocated with irq off but
158 * irqsave/restore are still used in alloc path so that it can be used
159 * from early init path - sched_init() specifically.
157 * 160 *
158 * Free path accesses and alters only the index data structures, so it 161 * Free path accesses and alters only the index data structures, so it
159 * can be safely called from atomic context. When memory needs to be 162 * can be safely called from atomic context. When memory needs to be
@@ -352,62 +355,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
352} 355}
353 356
354/** 357/**
355 * pcpu_extend_area_map - extend area map for allocation 358 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
356 * @chunk: target chunk 359 * @chunk: chunk of interest
357 * 360 *
358 * Extend area map of @chunk so that it can accomodate an allocation. 361 * Determine whether area map of @chunk needs to be extended to
359 * A single allocation can split an area into three areas, so this 362 * accomodate a new allocation.
360 * function makes sure that @chunk->map has at least two extra slots.
361 * 363 *
362 * CONTEXT: 364 * CONTEXT:
363 * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired 365 * pcpu_lock.
364 * if area map is extended.
365 * 366 *
366 * RETURNS: 367 * RETURNS:
367 * 0 if noop, 1 if successfully extended, -errno on failure. 368 * New target map allocation length if extension is necessary, 0
369 * otherwise.
368 */ 370 */
369static int pcpu_extend_area_map(struct pcpu_chunk *chunk) 371static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
370{ 372{
371 int new_alloc; 373 int new_alloc;
372 int *new;
373 size_t size;
374 374
375 /* has enough? */
376 if (chunk->map_alloc >= chunk->map_used + 2) 375 if (chunk->map_alloc >= chunk->map_used + 2)
377 return 0; 376 return 0;
378 377
379 spin_unlock_irq(&pcpu_lock);
380
381 new_alloc = PCPU_DFL_MAP_ALLOC; 378 new_alloc = PCPU_DFL_MAP_ALLOC;
382 while (new_alloc < chunk->map_used + 2) 379 while (new_alloc < chunk->map_used + 2)
383 new_alloc *= 2; 380 new_alloc *= 2;
384 381
385 new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); 382 return new_alloc;
386 if (!new) { 383}
387 spin_lock_irq(&pcpu_lock); 384
385/**
386 * pcpu_extend_area_map - extend area map of a chunk
387 * @chunk: chunk of interest
388 * @new_alloc: new target allocation length of the area map
389 *
390 * Extend area map of @chunk to have @new_alloc entries.
391 *
392 * CONTEXT:
393 * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
394 *
395 * RETURNS:
396 * 0 on success, -errno on failure.
397 */
398static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
399{
400 int *old = NULL, *new = NULL;
401 size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
402 unsigned long flags;
403
404 new = pcpu_mem_alloc(new_size);
405 if (!new)
388 return -ENOMEM; 406 return -ENOMEM;
389 }
390 407
391 /* 408 /* acquire pcpu_lock and switch to new area map */
392 * Acquire pcpu_lock and switch to new area map. Only free 409 spin_lock_irqsave(&pcpu_lock, flags);
393 * could have happened inbetween, so map_used couldn't have 410
394 * grown. 411 if (new_alloc <= chunk->map_alloc)
395 */ 412 goto out_unlock;
396 spin_lock_irq(&pcpu_lock);
397 BUG_ON(new_alloc < chunk->map_used + 2);
398 413
399 size = chunk->map_alloc * sizeof(chunk->map[0]); 414 old_size = chunk->map_alloc * sizeof(chunk->map[0]);
400 memcpy(new, chunk->map, size); 415 memcpy(new, chunk->map, old_size);
401 416
402 /* 417 /*
403 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is 418 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
404 * one of the first chunks and still using static map. 419 * one of the first chunks and still using static map.
405 */ 420 */
406 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) 421 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
407 pcpu_mem_free(chunk->map, size); 422 old = chunk->map;
408 423
409 chunk->map_alloc = new_alloc; 424 chunk->map_alloc = new_alloc;
410 chunk->map = new; 425 chunk->map = new;
426 new = NULL;
427
428out_unlock:
429 spin_unlock_irqrestore(&pcpu_lock, flags);
430
431 /*
432 * pcpu_mem_free() might end up calling vfree() which uses
433 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
434 */
435 pcpu_mem_free(old, old_size);
436 pcpu_mem_free(new, new_size);
437
411 return 0; 438 return 0;
412} 439}
413 440
@@ -1043,8 +1070,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
1043 */ 1070 */
1044static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1071static void *pcpu_alloc(size_t size, size_t align, bool reserved)
1045{ 1072{
1073 static int warn_limit = 10;
1046 struct pcpu_chunk *chunk; 1074 struct pcpu_chunk *chunk;
1047 int slot, off; 1075 const char *err;
1076 int slot, off, new_alloc;
1077 unsigned long flags;
1048 1078
1049 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { 1079 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
1050 WARN(true, "illegal size (%zu) or align (%zu) for " 1080 WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -1053,17 +1083,31 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
1053 } 1083 }
1054 1084
1055 mutex_lock(&pcpu_alloc_mutex); 1085 mutex_lock(&pcpu_alloc_mutex);
1056 spin_lock_irq(&pcpu_lock); 1086 spin_lock_irqsave(&pcpu_lock, flags);
1057 1087
1058 /* serve reserved allocations from the reserved chunk if available */ 1088 /* serve reserved allocations from the reserved chunk if available */
1059 if (reserved && pcpu_reserved_chunk) { 1089 if (reserved && pcpu_reserved_chunk) {
1060 chunk = pcpu_reserved_chunk; 1090 chunk = pcpu_reserved_chunk;
1061 if (size > chunk->contig_hint || 1091
1062 pcpu_extend_area_map(chunk) < 0) 1092 if (size > chunk->contig_hint) {
1093 err = "alloc from reserved chunk failed";
1063 goto fail_unlock; 1094 goto fail_unlock;
1095 }
1096
1097 while ((new_alloc = pcpu_need_to_extend(chunk))) {
1098 spin_unlock_irqrestore(&pcpu_lock, flags);
1099 if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
1100 err = "failed to extend area map of reserved chunk";
1101 goto fail_unlock_mutex;
1102 }
1103 spin_lock_irqsave(&pcpu_lock, flags);
1104 }
1105
1064 off = pcpu_alloc_area(chunk, size, align); 1106 off = pcpu_alloc_area(chunk, size, align);
1065 if (off >= 0) 1107 if (off >= 0)
1066 goto area_found; 1108 goto area_found;
1109
1110 err = "alloc from reserved chunk failed";
1067 goto fail_unlock; 1111 goto fail_unlock;
1068 } 1112 }
1069 1113
@@ -1074,13 +1118,20 @@ restart:
1074 if (size > chunk->contig_hint) 1118 if (size > chunk->contig_hint)
1075 continue; 1119 continue;
1076 1120
1077 switch (pcpu_extend_area_map(chunk)) { 1121 new_alloc = pcpu_need_to_extend(chunk);
1078 case 0: 1122 if (new_alloc) {
1079 break; 1123 spin_unlock_irqrestore(&pcpu_lock, flags);
1080 case 1: 1124 if (pcpu_extend_area_map(chunk,
1081 goto restart; /* pcpu_lock dropped, restart */ 1125 new_alloc) < 0) {
1082 default: 1126 err = "failed to extend area map";
1083 goto fail_unlock; 1127 goto fail_unlock_mutex;
1128 }
1129 spin_lock_irqsave(&pcpu_lock, flags);
1130 /*
1131 * pcpu_lock has been dropped, need to
1132 * restart cpu_slot list walking.
1133 */
1134 goto restart;
1084 } 1135 }
1085 1136
1086 off = pcpu_alloc_area(chunk, size, align); 1137 off = pcpu_alloc_area(chunk, size, align);
@@ -1090,23 +1141,26 @@ restart:
1090 } 1141 }
1091 1142
1092 /* hmmm... no space left, create a new chunk */ 1143 /* hmmm... no space left, create a new chunk */
1093 spin_unlock_irq(&pcpu_lock); 1144 spin_unlock_irqrestore(&pcpu_lock, flags);
1094 1145
1095 chunk = alloc_pcpu_chunk(); 1146 chunk = alloc_pcpu_chunk();
1096 if (!chunk) 1147 if (!chunk) {
1148 err = "failed to allocate new chunk";
1097 goto fail_unlock_mutex; 1149 goto fail_unlock_mutex;
1150 }
1098 1151
1099 spin_lock_irq(&pcpu_lock); 1152 spin_lock_irqsave(&pcpu_lock, flags);
1100 pcpu_chunk_relocate(chunk, -1); 1153 pcpu_chunk_relocate(chunk, -1);
1101 goto restart; 1154 goto restart;
1102 1155
1103area_found: 1156area_found:
1104 spin_unlock_irq(&pcpu_lock); 1157 spin_unlock_irqrestore(&pcpu_lock, flags);
1105 1158
1106 /* populate, map and clear the area */ 1159 /* populate, map and clear the area */
1107 if (pcpu_populate_chunk(chunk, off, size)) { 1160 if (pcpu_populate_chunk(chunk, off, size)) {
1108 spin_lock_irq(&pcpu_lock); 1161 spin_lock_irqsave(&pcpu_lock, flags);
1109 pcpu_free_area(chunk, off); 1162 pcpu_free_area(chunk, off);
1163 err = "failed to populate";
1110 goto fail_unlock; 1164 goto fail_unlock;
1111 } 1165 }
1112 1166
@@ -1116,9 +1170,16 @@ area_found:
1116 return __addr_to_pcpu_ptr(chunk->base_addr + off); 1170 return __addr_to_pcpu_ptr(chunk->base_addr + off);
1117 1171
1118fail_unlock: 1172fail_unlock:
1119 spin_unlock_irq(&pcpu_lock); 1173 spin_unlock_irqrestore(&pcpu_lock, flags);
1120fail_unlock_mutex: 1174fail_unlock_mutex:
1121 mutex_unlock(&pcpu_alloc_mutex); 1175 mutex_unlock(&pcpu_alloc_mutex);
1176 if (warn_limit) {
1177 pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
1178 "%s\n", size, align, err);
1179 dump_stack();
1180 if (!--warn_limit)
1181 pr_info("PERCPU: limit reached, disable warning\n");
1182 }
1122 return NULL; 1183 return NULL;
1123} 1184}
1124 1185
@@ -1347,6 +1408,10 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1347 struct pcpu_alloc_info *ai; 1408 struct pcpu_alloc_info *ai;
1348 unsigned int *cpu_map; 1409 unsigned int *cpu_map;
1349 1410
1411 /* this function may be called multiple times */
1412 memset(group_map, 0, sizeof(group_map));
1413 memset(group_cnt, 0, sizeof(group_map));
1414
1350 /* 1415 /*
1351 * Determine min_unit_size, alloc_size and max_upa such that 1416 * Determine min_unit_size, alloc_size and max_upa such that
1352 * alloc_size is multiple of atom_size and is the smallest 1417 * alloc_size is multiple of atom_size and is the smallest
@@ -1574,6 +1639,7 @@ static void pcpu_dump_alloc_info(const char *lvl,
1574int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, 1639int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1575 void *base_addr) 1640 void *base_addr)
1576{ 1641{
1642 static char cpus_buf[4096] __initdata;
1577 static int smap[2], dmap[2]; 1643 static int smap[2], dmap[2];
1578 size_t dyn_size = ai->dyn_size; 1644 size_t dyn_size = ai->dyn_size;
1579 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; 1645 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
@@ -1585,17 +1651,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1585 int *unit_map; 1651 int *unit_map;
1586 int group, unit, i; 1652 int group, unit, i;
1587 1653
1654 cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
1655
1656#define PCPU_SETUP_BUG_ON(cond) do { \
1657 if (unlikely(cond)) { \
1658 pr_emerg("PERCPU: failed to initialize, %s", #cond); \
1659 pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
1660 pcpu_dump_alloc_info(KERN_EMERG, ai); \
1661 BUG(); \
1662 } \
1663} while (0)
1664
1588 /* sanity checks */ 1665 /* sanity checks */
1589 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1666 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1590 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1667 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1591 BUG_ON(ai->nr_groups <= 0); 1668 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1592 BUG_ON(!ai->static_size); 1669 PCPU_SETUP_BUG_ON(!ai->static_size);
1593 BUG_ON(!base_addr); 1670 PCPU_SETUP_BUG_ON(!base_addr);
1594 BUG_ON(ai->unit_size < size_sum); 1671 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1595 BUG_ON(ai->unit_size & ~PAGE_MASK); 1672 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1596 BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1673 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1597
1598 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1599 1674
1600 /* process group information and build config tables accordingly */ 1675 /* process group information and build config tables accordingly */
1601 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); 1676 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
@@ -1604,7 +1679,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1604 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); 1679 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
1605 1680
1606 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1681 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1607 unit_map[cpu] = NR_CPUS; 1682 unit_map[cpu] = UINT_MAX;
1608 pcpu_first_unit_cpu = NR_CPUS; 1683 pcpu_first_unit_cpu = NR_CPUS;
1609 1684
1610 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { 1685 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
@@ -1618,8 +1693,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1618 if (cpu == NR_CPUS) 1693 if (cpu == NR_CPUS)
1619 continue; 1694 continue;
1620 1695
1621 BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); 1696 PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);
1622 BUG_ON(unit_map[cpu] != NR_CPUS); 1697 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
1698 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
1623 1699
1624 unit_map[cpu] = unit + i; 1700 unit_map[cpu] = unit + i;
1625 unit_off[cpu] = gi->base_offset + i * ai->unit_size; 1701 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
@@ -1632,7 +1708,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1632 pcpu_nr_units = unit; 1708 pcpu_nr_units = unit;
1633 1709
1634 for_each_possible_cpu(cpu) 1710 for_each_possible_cpu(cpu)
1635 BUG_ON(unit_map[cpu] == NR_CPUS); 1711 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
1712
1713 /* we're done parsing the input, undefine BUG macro and dump config */
1714#undef PCPU_SETUP_BUG_ON
1715 pcpu_dump_alloc_info(KERN_INFO, ai);
1636 1716
1637 pcpu_nr_groups = ai->nr_groups; 1717 pcpu_nr_groups = ai->nr_groups;
1638 pcpu_group_offsets = group_offsets; 1718 pcpu_group_offsets = group_offsets;
@@ -1782,7 +1862,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
1782 void *base = (void *)ULONG_MAX; 1862 void *base = (void *)ULONG_MAX;
1783 void **areas = NULL; 1863 void **areas = NULL;
1784 struct pcpu_alloc_info *ai; 1864 struct pcpu_alloc_info *ai;
1785 size_t size_sum, areas_size; 1865 size_t size_sum, areas_size, max_distance;
1786 int group, i, rc; 1866 int group, i, rc;
1787 1867
1788 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, 1868 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
@@ -1832,8 +1912,25 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
1832 } 1912 }
1833 1913
1834 /* base address is now known, determine group base offsets */ 1914 /* base address is now known, determine group base offsets */
1835 for (group = 0; group < ai->nr_groups; group++) 1915 max_distance = 0;
1916 for (group = 0; group < ai->nr_groups; group++) {
1836 ai->groups[group].base_offset = areas[group] - base; 1917 ai->groups[group].base_offset = areas[group] - base;
1918 max_distance = max_t(size_t, max_distance,
1919 ai->groups[group].base_offset);
1920 }
1921 max_distance += ai->unit_size;
1922
1923 /* warn if maximum distance is further than 75% of vmalloc space */
1924 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
1925 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1926 "space 0x%lx\n",
1927 max_distance, VMALLOC_END - VMALLOC_START);
1928#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1929 /* and fail if we have fallback */
1930 rc = -EINVAL;
1931 goto out_free;
1932#endif
1933 }
1837 1934
1838 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", 1935 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
1839 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, 1936 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
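
Further down, pcpu_embed_first_chunk() now records how far the per-group areas ended up from the lowest base address and, after adding one unit_size, warns when that spread exceeds three quarters of the vmalloc range. A standalone restatement of that computation, with a made-up span standing in for VMALLOC_END - VMALLOC_START:

    #include <stdio.h>
    #include <stddef.h>

    #define FAKE_VMALLOC_SPAN (128UL << 20)         /* pretend 128 MB of vmalloc space */

    static int check_embed_spread(const size_t *base_offset, int nr_groups,
                                  size_t unit_size)
    {
            size_t max_distance = 0;
            int group;

            for (group = 0; group < nr_groups; group++)
                    if (base_offset[group] > max_distance)
                            max_distance = base_offset[group];
            max_distance += unit_size;

            /* mirror the 75%-of-vmalloc warning threshold */
            if (max_distance > FAKE_VMALLOC_SPAN * 3 / 4) {
                    fprintf(stderr, "max_distance=0x%zx too large for span 0x%lx\n",
                            max_distance, FAKE_VMALLOC_SPAN);
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            /* group bases spread across 0, 16 MB and 100 MB: trips the check */
            size_t offsets[] = { 0, 16UL << 20, 100UL << 20 };

            return check_embed_spread(offsets, 3, 1UL << 20) ? 1 : 0;
    }

With these example offsets the check fires, which corresponds to the new pr_warning() path; when CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK is set the kernel additionally bails out with -EINVAL so the page-mapped first chunk can be used as the fallback.
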
diff --git a/mm/rmap.c b/mm/rmap.c
index 28aafe2b5306..dd43373a483f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -242,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
242} 242}
243 243
244/* 244/*
245 * At what user virtual address is page expected in vma? checking that the 245 * At what user virtual address is page expected in vma?
246 * page matches the vma: currently only used on anon pages, by unuse_vma; 246 * checking that the page matches the vma.
247 */ 247 */
248unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 248unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
249{ 249{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4de7f02f820b..9c590eef7912 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1151,8 +1151,7 @@ static int try_to_unuse(unsigned int type)
1151 } else 1151 } else
1152 retval = unuse_mm(mm, entry, page); 1152 retval = unuse_mm(mm, entry, page);
1153 1153
1154 if (set_start_mm && 1154 if (set_start_mm && *swap_map < swcount) {
1155 swap_count(*swap_map) < swcount) {
1156 mmput(new_start_mm); 1155 mmput(new_start_mm);
1157 atomic_inc(&mm->mm_users); 1156 atomic_inc(&mm->mm_users);
1158 new_start_mm = mm; 1157 new_start_mm = mm;
@@ -1974,12 +1973,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1974 goto bad_swap; 1973 goto bad_swap;
1975 } 1974 }
1976 1975
1977 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 1976 if (p->bdev) {
1978 p->flags |= SWP_SOLIDSTATE; 1977 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1979 p->cluster_next = 1 + (random32() % p->highest_bit); 1978 p->flags |= SWP_SOLIDSTATE;
1979 p->cluster_next = 1 + (random32() % p->highest_bit);
1980 }
1981 if (discard_swap(p) == 0)
1982 p->flags |= SWP_DISCARDABLE;
1980 } 1983 }
1981 if (discard_swap(p) == 0)
1982 p->flags |= SWP_DISCARDABLE;
1983 1984
1984 mutex_lock(&swapon_mutex); 1985 mutex_lock(&swapon_mutex);
1985 spin_lock(&swap_lock); 1986 spin_lock(&swap_lock);
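
The swapon() hunk above folds the non-rotational probe and the discard probe under a single if (p->bdev) test, so a swap area without a backing block device skips both instead of passing a NULL device to bdev_get_queue(). A hedged sketch of that restructuring with hypothetical stand-in types (struct blockdev and struct swapdev are not kernel structures):

    #include <stdbool.h>
    #include <stdio.h>

    struct blockdev { bool nonrot; bool supports_discard; };
    struct swapdev  { struct blockdev *bdev; unsigned int flags; };

    #define SWP_SOLIDSTATE  0x1
    #define SWP_DISCARDABLE 0x2

    static void probe_swap_device(struct swapdev *p)
    {
            if (p->bdev) {                          /* guard both probes */
                    if (p->bdev->nonrot)
                            p->flags |= SWP_SOLIDSTATE;
                    if (p->bdev->supports_discard)
                            p->flags |= SWP_DISCARDABLE;
            }
    }

    int main(void)
    {
            /* e.g. a swap file on a filesystem without a backing block device */
            struct swapdev file_backed = { .bdev = NULL, .flags = 0 };

            probe_swap_device(&file_backed);        /* safe: nothing dereferenced */
            printf("flags=%#x\n", file_backed.flags);
            return 0;
    }
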
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 69511e663234..0f551a4a44cd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/sched.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/spinlock.h> 17#include <linux/spinlock.h>
17#include <linux/interrupt.h> 18#include <linux/interrupt.h>
@@ -25,10 +26,10 @@
25#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
26#include <linux/pfn.h> 27#include <linux/pfn.h>
27#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
28#include <linux/highmem.h>
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h>
32 33
33 34
34/*** Page table manipulation functions ***/ 35/*** Page table manipulation functions ***/
@@ -1156,12 +1157,11 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1156} 1157}
1157 1158
1158static struct vm_struct *__get_vm_area_node(unsigned long size, 1159static struct vm_struct *__get_vm_area_node(unsigned long size,
1159 unsigned long flags, unsigned long start, unsigned long end, 1160 unsigned long align, unsigned long flags, unsigned long start,
1160 int node, gfp_t gfp_mask, void *caller) 1161 unsigned long end, int node, gfp_t gfp_mask, void *caller)
1161{ 1162{
1162 static struct vmap_area *va; 1163 static struct vmap_area *va;
1163 struct vm_struct *area; 1164 struct vm_struct *area;
1164 unsigned long align = 1;
1165 1165
1166 BUG_ON(in_interrupt()); 1166 BUG_ON(in_interrupt());
1167 if (flags & VM_IOREMAP) { 1167 if (flags & VM_IOREMAP) {
@@ -1201,7 +1201,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1201struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1201struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1202 unsigned long start, unsigned long end) 1202 unsigned long start, unsigned long end)
1203{ 1203{
1204 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1204 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1205 __builtin_return_address(0)); 1205 __builtin_return_address(0));
1206} 1206}
1207EXPORT_SYMBOL_GPL(__get_vm_area); 1207EXPORT_SYMBOL_GPL(__get_vm_area);
@@ -1210,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1210 unsigned long start, unsigned long end, 1210 unsigned long start, unsigned long end,
1211 void *caller) 1211 void *caller)
1212{ 1212{
1213 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1213 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1214 caller); 1214 caller);
1215} 1215}
1216 1216
@@ -1225,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1225 */ 1225 */
1226struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1226struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1227{ 1227{
1228 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1228 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1229 -1, GFP_KERNEL, __builtin_return_address(0)); 1229 -1, GFP_KERNEL, __builtin_return_address(0));
1230} 1230}
1231 1231
1232struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1232struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1233 void *caller) 1233 void *caller)
1234{ 1234{
1235 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1235 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1236 -1, GFP_KERNEL, caller); 1236 -1, GFP_KERNEL, caller);
1237} 1237}
1238 1238
1239struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, 1239struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1240 int node, gfp_t gfp_mask) 1240 int node, gfp_t gfp_mask)
1241{ 1241{
1242 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, 1242 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1243 gfp_mask, __builtin_return_address(0)); 1243 node, gfp_mask, __builtin_return_address(0));
1244} 1244}
1245 1245
1246static struct vm_struct *find_vm_area(const void *addr) 1246static struct vm_struct *find_vm_area(const void *addr)
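
The vmalloc.c changes in this and the following hunks thread an explicit align argument through __get_vm_area_node() and __vmalloc_node(); the existing wrappers all pass 1, so behaviour is unchanged for current callers, while a new caller can ask for stricter alignment (vmalloc_user() does so below). A minimal userspace sketch of that wrapper-with-default pattern, assuming only the C library (alloc_area() and alloc_area_aligned() are invented names):

    #define _POSIX_C_SOURCE 200112L
    #include <stdio.h>
    #include <stdlib.h>

    /* Core helper takes the alignment explicitly, like the reworked __get_vm_area_node(). */
    static void *alloc_area_aligned(size_t size, size_t align)
    {
            void *p = NULL;

            if (align < sizeof(void *))             /* posix_memalign's minimum */
                    align = sizeof(void *);
            if (posix_memalign(&p, align, size))
                    return NULL;
            return p;
    }

    /* Pre-existing entry point keeps its signature and passes the old default of 1. */
    static void *alloc_area(size_t size)
    {
            return alloc_area_aligned(size, 1);
    }

    int main(void)
    {
            void *a = alloc_area(4096);                     /* default alignment */
            void *b = alloc_area_aligned(4096, 1 << 16);    /* stricter request */

            printf("a=%p b=%p\n", a, b);
            free(a);
            free(b);
            return 0;
    }
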
@@ -1403,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count,
1403} 1403}
1404EXPORT_SYMBOL(vmap); 1404EXPORT_SYMBOL(vmap);
1405 1405
1406static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1406static void *__vmalloc_node(unsigned long size, unsigned long align,
1407 gfp_t gfp_mask, pgprot_t prot,
1407 int node, void *caller); 1408 int node, void *caller);
1408static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1409static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1409 pgprot_t prot, int node, void *caller) 1410 pgprot_t prot, int node, void *caller)
@@ -1417,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1417 area->nr_pages = nr_pages; 1418 area->nr_pages = nr_pages;
1418 /* Please note that the recursion is strictly bounded. */ 1419 /* Please note that the recursion is strictly bounded. */
1419 if (array_size > PAGE_SIZE) { 1420 if (array_size > PAGE_SIZE) {
1420 pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, 1421 pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO,
1421 PAGE_KERNEL, node, caller); 1422 PAGE_KERNEL, node, caller);
1422 area->flags |= VM_VPAGES; 1423 area->flags |= VM_VPAGES;
1423 } else { 1424 } else {
@@ -1476,6 +1477,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1476/** 1477/**
1477 * __vmalloc_node - allocate virtually contiguous memory 1478 * __vmalloc_node - allocate virtually contiguous memory
1478 * @size: allocation size 1479 * @size: allocation size
1480 * @align: desired alignment
1479 * @gfp_mask: flags for the page level allocator 1481 * @gfp_mask: flags for the page level allocator
1480 * @prot: protection mask for the allocated pages 1482 * @prot: protection mask for the allocated pages
1481 * @node: node to use for allocation or -1 1483 * @node: node to use for allocation or -1
@@ -1485,8 +1487,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1485 * allocator with @gfp_mask flags. Map them into contiguous 1487 * allocator with @gfp_mask flags. Map them into contiguous
1486 * kernel virtual space, using a pagetable protection of @prot. 1488 * kernel virtual space, using a pagetable protection of @prot.
1487 */ 1489 */
1488static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1490static void *__vmalloc_node(unsigned long size, unsigned long align,
1489 int node, void *caller) 1491 gfp_t gfp_mask, pgprot_t prot,
1492 int node, void *caller)
1490{ 1493{
1491 struct vm_struct *area; 1494 struct vm_struct *area;
1492 void *addr; 1495 void *addr;
@@ -1496,8 +1499,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1496 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1499 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1497 return NULL; 1500 return NULL;
1498 1501
1499 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, 1502 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
1500 node, gfp_mask, caller); 1503 VMALLOC_END, node, gfp_mask, caller);
1501 1504
1502 if (!area) 1505 if (!area)
1503 return NULL; 1506 return NULL;
@@ -1516,7 +1519,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1516 1519
1517void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1520void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1518{ 1521{
1519 return __vmalloc_node(size, gfp_mask, prot, -1, 1522 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
1520 __builtin_return_address(0)); 1523 __builtin_return_address(0));
1521} 1524}
1522EXPORT_SYMBOL(__vmalloc); 1525EXPORT_SYMBOL(__vmalloc);
@@ -1532,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc);
1532 */ 1535 */
1533void *vmalloc(unsigned long size) 1536void *vmalloc(unsigned long size)
1534{ 1537{
1535 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1538 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1536 -1, __builtin_return_address(0)); 1539 -1, __builtin_return_address(0));
1537} 1540}
1538EXPORT_SYMBOL(vmalloc); 1541EXPORT_SYMBOL(vmalloc);
@@ -1549,7 +1552,8 @@ void *vmalloc_user(unsigned long size)
1549 struct vm_struct *area; 1552 struct vm_struct *area;
1550 void *ret; 1553 void *ret;
1551 1554
1552 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1555 ret = __vmalloc_node(size, SHMLBA,
1556 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1553 PAGE_KERNEL, -1, __builtin_return_address(0)); 1557 PAGE_KERNEL, -1, __builtin_return_address(0));
1554 if (ret) { 1558 if (ret) {
1555 area = find_vm_area(ret); 1559 area = find_vm_area(ret);
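
vmalloc_user() is the one caller that immediately uses the new align parameter, requesting SHMLBA alignment (hence the <asm/shmparam.h> include added at the top of the file), presumably so an allocation that is later remapped into user space satisfies the same cache-colour constraints as a SysV shared-memory attach. A small userspace illustration of what an SHMLBA-aligned allocation looks like; it demonstrates only the alignment request, not remap_vmalloc_range():

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/shm.h>            /* SHMLBA: page-sized or larger on Linux */

    int main(void)
    {
            size_t size = 16 * (size_t)SHMLBA;      /* multiple of the alignment */
            void *buf = aligned_alloc(SHMLBA, size);

            if (!buf)
                    return 1;
            printf("SHMLBA=%zu buf=%p misalignment=%zu\n",
                   (size_t)SHMLBA, buf, (size_t)((uintptr_t)buf % SHMLBA));
            free(buf);
            return 0;
    }
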
@@ -1572,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user);
1572 */ 1576 */
1573void *vmalloc_node(unsigned long size, int node) 1577void *vmalloc_node(unsigned long size, int node)
1574{ 1578{
1575 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1579 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1576 node, __builtin_return_address(0)); 1580 node, __builtin_return_address(0));
1577} 1581}
1578EXPORT_SYMBOL(vmalloc_node); 1582EXPORT_SYMBOL(vmalloc_node);
@@ -1595,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node);
1595 1599
1596void *vmalloc_exec(unsigned long size) 1600void *vmalloc_exec(unsigned long size)
1597{ 1601{
1598 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1602 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1599 -1, __builtin_return_address(0)); 1603 -1, __builtin_return_address(0));
1600} 1604}
1601 1605
@@ -1616,7 +1620,7 @@ void *vmalloc_exec(unsigned long size)
1616 */ 1620 */
1617void *vmalloc_32(unsigned long size) 1621void *vmalloc_32(unsigned long size)
1618{ 1622{
1619 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, 1623 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1620 -1, __builtin_return_address(0)); 1624 -1, __builtin_return_address(0));
1621} 1625}
1622EXPORT_SYMBOL(vmalloc_32); 1626EXPORT_SYMBOL(vmalloc_32);
@@ -1633,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size)
1633 struct vm_struct *area; 1637 struct vm_struct *area;
1634 void *ret; 1638 void *ret;
1635 1639
1636 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1640 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1637 -1, __builtin_return_address(0)); 1641 -1, __builtin_return_address(0));
1638 if (ret) { 1642 if (ret) {
1639 area = find_vm_area(ret); 1643 area = find_vm_area(ret);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 64e438898832..777af57fd8c8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -544,6 +544,16 @@ redo:
544 */ 544 */
545 lru = LRU_UNEVICTABLE; 545 lru = LRU_UNEVICTABLE;
546 add_page_to_unevictable_list(page); 546 add_page_to_unevictable_list(page);
547 /*
548 * When racing with an mlock clearing (page is
549 * unlocked), make sure that if the other thread does
550 * not observe our setting of PG_lru and fails
551 * isolation, we see PG_mlocked cleared below and move
552 * the page back to the evictable list.
553 *
554 * The other side is TestClearPageMlocked().
555 */
556 smp_mb();
547 } 557 }
548 558
549 /* 559 /*
@@ -1088,7 +1098,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1088 int lumpy_reclaim = 0; 1098 int lumpy_reclaim = 0;
1089 1099
1090 while (unlikely(too_many_isolated(zone, file, sc))) { 1100 while (unlikely(too_many_isolated(zone, file, sc))) {
1091 congestion_wait(WRITE, HZ/10); 1101 congestion_wait(BLK_RW_ASYNC, HZ/10);
1092 1102
1093 /* We are about to die and free our memory. Return now. */ 1103 /* We are about to die and free our memory. Return now. */
1094 if (fatal_signal_pending(current)) 1104 if (fatal_signal_pending(current))
@@ -1356,7 +1366,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1356 * IO, plus JVM can create lots of anon VM_EXEC pages, 1366 * IO, plus JVM can create lots of anon VM_EXEC pages,
1357 * so we ignore them here. 1367 * so we ignore them here.
1358 */ 1368 */
1359 if ((vm_flags & VM_EXEC) && !PageAnon(page)) { 1369 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1360 list_add(&page->lru, &l_active); 1370 list_add(&page->lru, &l_active);
1361 continue; 1371 continue;
1362 } 1372 }
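
The final hunk narrows the "keep executable pages active" heuristic from !PageAnon() to page_is_file_cache(): only file-backed VM_EXEC pages get the extra protection, while anonymous or swap-backed executable mappings (the JVM case called out in the comment) age normally. A trivial restatement of the adjusted predicate with invented names:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical page descriptor: just the two properties the heuristic needs. */
    struct fake_page {
            bool vm_exec;           /* referenced through a VM_EXEC mapping   */
            bool file_backed;       /* page cache page, not anon/swap-backed  */
    };

    /* Old test: vm_exec && !anon.  New test: vm_exec && file_backed. */
    static bool keep_on_active_list(const struct fake_page *page)
    {
            return page->vm_exec && page->file_backed;
    }

    int main(void)
    {
            struct fake_page libc_text = { .vm_exec = true, .file_backed = true  };
            struct fake_page jvm_heap  = { .vm_exec = true, .file_backed = false };

            printf("libc text protected: %d\n", keep_on_active_list(&libc_text)); /* 1 */
            printf("jvm heap protected:  %d\n", keep_on_active_list(&jvm_heap));  /* 0 */
            return 0;
    }
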