author    J. Bruce Fields <bfields@citi.umich.edu>    2009-11-23 12:34:58 -0500
committer J. Bruce Fields <bfields@citi.umich.edu>    2009-11-23 12:34:58 -0500
commit    9b8b317d58084b9a44f6f33b355c4278d9f841fb (patch)
tree      e0df89800bf4301c4017db3cdf04a2056ec1a852 /mm
parent    78c210efdefe07131f91ed512a3308b15bb14e2f (diff)
parent    648f4e3e50c4793d9dbf9a09afa193631f76fa26 (diff)
Merge commit 'v2.6.32-rc8' into HEAD
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             9
-rw-r--r--  mm/backing-dev.c      17
-rw-r--r--  mm/highmem.c          17
-rw-r--r--  mm/ksm.c               1
-rw-r--r--  mm/memory-failure.c   59
-rw-r--r--  mm/memory.c           14
-rw-r--r--  mm/memory_hotplug.c   24
-rw-r--r--  mm/mempolicy.c        13
-rw-r--r--  mm/migrate.c           2
-rw-r--r--  mm/nommu.c             6
-rw-r--r--  mm/page_alloc.c        7
-rw-r--r--  mm/percpu.c          139
-rw-r--r--  mm/swapfile.c          3
-rw-r--r--  mm/vmscan.c           14
14 files changed, 212 insertions, 113 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 57963c6063d1..44cf6f0a3a6d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -67,7 +67,7 @@ config DISCONTIGMEM
 
 config SPARSEMEM
 	def_bool y
-	depends on SPARSEMEM_MANUAL
+	depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
 
 config FLATMEM
 	def_bool y
@@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
-	depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
-	depends on (IA64 || X86 || PPC64 || SUPERH || S390)
-
-comment "Memory hotplug is currently incompatible with Software Suspend"
-	depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
+	depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
+	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
 
 config MEMORY_HOTPLUG_SPARSE
 	def_bool y
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5a37e2055717..11aee09dd2a6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -610,9 +610,26 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	kthread_stop(wb->task);
 }
 
+/*
+ * This bdi is going away now, make sure that no super_blocks point to it
+ */
+static void bdi_prune_sb(struct backing_dev_info *bdi)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (sb->s_bdi == bdi)
+			sb->s_bdi = NULL;
+	}
+	spin_unlock(&sb_lock);
+}
+
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
+		bdi_prune_sb(bdi);
+
 		if (!bdi_cap_flush_forker(bdi))
 			bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
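
The new bdi_prune_sb() walks the global super_blocks list under sb_lock and clears any sb->s_bdi still pointing at the departing bdi, so later lookups see NULL rather than a dangling pointer. A minimal sketch of the reader side this protects; sb_bdi_or_default() is a hypothetical helper, not part of this commit:

	/* Hypothetical consumer: fall back once the bdi has been pruned. */
	static struct backing_dev_info *sb_bdi_or_default(struct super_block *sb)
	{
		return sb->s_bdi ? sb->s_bdi : &default_backing_dev_info;
	}
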
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -426,16 +426,21 @@ void __init page_address_init(void)
 
 void debug_kmap_atomic(enum km_type type)
 {
-	static unsigned warn_count = 10;
+	static int warn_count = 10;
 
-	if (unlikely(warn_count == 0))
+	if (unlikely(warn_count < 0))
 		return;
 
 	if (unlikely(in_interrupt())) {
-		if (in_irq()) {
+		if (in_nmi()) {
+			if (type != KM_NMI && type != KM_NMI_PTE) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		} else if (in_irq()) {
 			if (type != KM_IRQ0 && type != KM_IRQ1 &&
 			    type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
-			    type != KM_BOUNCE_READ) {
+			    type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
 				WARN_ON(1);
 				warn_count--;
 			}
@@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type)
 	}
 
 	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
-	    type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+	    type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
+	    type == KM_IRQ_PTE || type == KM_NMI ||
+	    type == KM_NMI_PTE ) {
 		if (!irqs_disabled()) {
 			WARN_ON(1);
 			warn_count--;
diff --git a/mm/ksm.c b/mm/ksm.c
index bef1af4f77e3..5575f8628fef 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1012,6 +1012,7 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
 		struct rmap_item *tree_rmap_item;
 		int ret;
 
+		cond_resched();
 		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
 		page2[0] = get_mergeable_page(tree_rmap_item);
 		if (!page2[0])
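
unstable_tree_search_insert() can walk a large rbtree in a context that is allowed to sleep, so a cond_resched() per iteration keeps the scan preemption-friendly. A sketch of the idiom under that assumption; walk_items(), struct item and process() are hypothetical:

	/* Yield periodically inside a potentially long, sleepable loop. */
	static void walk_items(struct list_head *head)
	{
		struct item *it;		/* hypothetical element type */

		list_for_each_entry(it, head, list) {
			cond_resched();		/* process context only */
			process(it);		/* hypothetical per-item work */
		}
	}
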
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 729d4b15b645..dacc64183874 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -35,6 +35,7 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/sched.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
@@ -370,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 	int ret = FAILED;
 	struct address_space *mapping;
 
-	if (!isolate_lru_page(p))
-		page_cache_release(p);
-
 	/*
 	 * For anonymous pages we're done the only reference left
 	 * should be the one m_f() holds.
@@ -498,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
-	int ret = FAILED;
-
 	ClearPageDirty(p);
 	/* Trigger EIO in shmem: */
 	ClearPageUptodate(p);
 
-	if (!isolate_lru_page(p)) {
-		page_cache_release(p);
-		ret = DELAYED;
-	}
-
-	return ret;
+	return DELAYED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
-	int ret = FAILED;
-
-	if (!isolate_lru_page(p)) {
-		page_cache_release(p);
-		ret = RECOVERED;
-	}
 	delete_from_swap_cache(p);
-	return ret;
+
+	return RECOVERED;
 }
 
 /*
@@ -611,8 +597,6 @@ static struct page_state {
 	{ 0,		0,		"unknown page state",	me_unknown },
 };
 
-#undef lru
-
 static void action_result(unsigned long pfn, char *msg, int result)
 {
 	struct page *page = NULL;
@@ -629,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
 			unsigned long pfn, int ref)
 {
 	int result;
+	int count;
 
 	result = ps->action(p, pfn);
 	action_result(pfn, ps->msg, result);
-	if (page_count(p) != 1 + ref)
+
+	count = page_count(p) - 1 - ref;
+	if (count != 0)
 		printk(KERN_ERR
 		       "MCE %#lx: %s page still referenced by %d users\n",
-		       pfn, ps->msg, page_count(p) - 1);
+		       pfn, ps->msg, count);
 
 	/* Could do more checks here if page looks ok */
 	/*
@@ -661,12 +648,9 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int i;
 	int kill = 1;
 
-	if (PageReserved(p) || PageCompound(p) || PageSlab(p))
+	if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
 		return;
 
-	if (!PageLRU(p))
-		lru_add_drain_all();
-
 	/*
 	 * This check implies we don't kill processes if their pages
 	 * are in the swap cache early. Those are always late kills.
@@ -738,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 
 int __memory_failure(unsigned long pfn, int trapno, int ref)
 {
+	unsigned long lru_flag;
 	struct page_state *ps;
 	struct page *p;
 	int res;
@@ -775,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 	}
 
 	/*
+	 * We ignore non-LRU pages for good reasons.
+	 * - PG_locked is only well defined for LRU pages and a few others
+	 * - to avoid races with __set_page_locked()
+	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+	 * The check (unnecessarily) ignores LRU pages being isolated and
+	 * walked by the page reclaim code, however that's not a big loss.
+	 */
+	if (!PageLRU(p))
+		lru_add_drain_all();
+	lru_flag = p->flags & lru;
+	if (isolate_lru_page(p)) {
+		action_result(pfn, "non LRU", IGNORED);
+		put_page(p);
+		return -EBUSY;
+	}
+	page_cache_release(p);
+
+	/*
 	 * Lock the page and wait for writeback to finish.
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
@@ -790,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 	/*
 	 * Torn down by someone else?
 	 */
-	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+	if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
 		action_result(pfn, "already truncated LRU", IGNORED);
 		res = 0;
 		goto out;
@@ -798,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 
 	res = -EBUSY;
 	for (ps = error_states;; ps++) {
-		if ((p->flags & ps->mask) == ps->res) {
+		if (((p->flags | lru_flag)& ps->mask) == ps->res) {
 			res = page_action(ps, p, pfn, ref);
 			break;
 		}
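
isolate_lru_page() clears PG_lru on success, so __memory_failure() latches the bit into lru_flag first and ORs it back into every later flags test; dropping the earlier "#undef lru" keeps the (1UL << PG_lru) mask macro visible this far down the file. A condensed sketch of the snapshot idiom, restating the hunks above:

	unsigned long lru_flag = p->flags & lru;	/* latch before it changes */

	if (isolate_lru_page(p)) {			/* clears PG_lru on success */
		put_page(p);
		return -EBUSY;
	}
	page_cache_release(p);				/* drop isolation's extra ref */
	/* from here on, test (p->flags | lru_flag) rather than p->flags */
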
diff --git a/mm/memory.c b/mm/memory.c
index 7e91b5f9f690..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -641,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end)
 {
+	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
@@ -654,6 +655,8 @@ again:
 	src_pte = pte_offset_map_nested(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+	orig_src_pte = src_pte;
+	orig_dst_pte = dst_pte;
 	arch_enter_lazy_mmu_mode();
 
 	do {
@@ -677,9 +680,9 @@ again:
 
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
-	pte_unmap_nested(src_pte - 1);
+	pte_unmap_nested(orig_src_pte);
 	add_mm_rss(dst_mm, rss[0], rss[1]);
-	pte_unmap_unlock(dst_pte - 1, dst_ptl);
+	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 	if (addr != end)
 		goto again;
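
pte_unmap_nested(src_pte - 1) assumed the copy loop always advanced the cursors; saving orig_src_pte/orig_dst_pte releases exactly what was mapped however the loop exits. A sketch of the save-the-cookie idiom in isolation:

	pte_t *pte, *orig_pte;

	orig_pte = pte = pte_offset_map(pmd, addr);	/* keep the mapping cookie */
	do {
		/* ... inspect or copy *pte ... */
	} while (pte++, (addr += PAGE_SIZE) != end);
	pte_unmap(orig_pte);				/* not pte - 1 */
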
@@ -1820,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		token = pmd_pgtable(*pmd);
 
 	do {
-		err = fn(pte, token, addr, data);
+		err = fn(pte++, token, addr, data);
 		if (err)
 			break;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
 
@@ -2539,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else if (PageHWPoison(page)) {
 		ret = VM_FAULT_HWPOISON;
 		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-		goto out;
+		goto out_release;
 	}
 
 	lock_page(page);
@@ -2611,6 +2614,7 @@ out_nomap:
 	pte_unmap_unlock(page_table, ptl);
 out_page:
 	unlock_page(page);
+out_release:
 	page_cache_release(page);
 	return ret;
 }
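
The HWPoison path bails out before lock_page(), so jumping to "out" leaked the swapcache reference while "out_page" would have unlocked a page that was never locked; the new out_release label drops only what is actually held. A sketch of the layered-label idiom; do_thing() and may_fail() are hypothetical:

	int do_thing(struct page *page)
	{
		int ret = -EIO;

		get_page(page);
		if (may_fail())			/* hypothetical early failure */
			goto out_release;	/* only the reference is held */
		lock_page(page);
		/* ... */
		ret = 0;
		unlock_page(page);
	out_release:
		put_page(page);
		return ret;
	}
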
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 821dee596377..2047465cd27c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
 #include <linux/pfn.h>
+#include <linux/suspend.h>
 
 #include <asm/tlbflush.h>
 
@@ -447,7 +448,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 }
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
-static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 {
 	struct pglist_data *pgdat;
 	unsigned long zones_size[MAX_NR_ZONES] = {0};
@@ -484,14 +486,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	struct resource *res;
 	int ret;
 
+	lock_system_sleep();
+
 	res = register_memory_resource(start, size);
+	ret = -EEXIST;
 	if (!res)
-		return -EEXIST;
+		goto out;
 
 	if (!node_online(nid)) {
 		pgdat = hotadd_new_pgdat(nid, start);
+		ret = -ENOMEM;
 		if (!pgdat)
-			return -ENOMEM;
+			goto out;
 		new_pgdat = 1;
 	}
 
@@ -514,7 +520,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
 		BUG_ON(ret);
 	}
 
-	return ret;
+	goto out;
+
 error:
 	/* rollback pgdat allocation and others */
 	if (new_pgdat)
@@ -522,6 +529,8 @@ error:
 	if (res)
 		release_memory_resource(res);
 
+out:
+	unlock_system_sleep();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory);
@@ -758,6 +767,8 @@ int offline_pages(unsigned long start_pfn,
 	if (!test_pages_in_a_zone(start_pfn, end_pfn))
 		return -EINVAL;
 
+	lock_system_sleep();
+
 	zone = page_zone(pfn_to_page(start_pfn));
 	node = zone_to_nid(zone);
 	nr_pages = end_pfn - start_pfn;
@@ -765,7 +776,7 @@ int offline_pages(unsigned long start_pfn,
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn);
 	if (ret)
-		return ret;
+		goto out;
 
 	arg.start_pfn = start_pfn;
 	arg.nr_pages = nr_pages;
@@ -843,6 +854,7 @@ repeat:
 	writeback_set_ratelimit();
 
 	memory_notify(MEM_OFFLINE, &arg);
+	unlock_system_sleep();
 	return 0;
 
 failed_removal:
@@ -852,6 +864,8 @@ failed_removal:
 	/* pushback to free area */
 	undo_isolate_page_range(start_pfn, end_pfn);
 
+out:
+	unlock_system_sleep();
 	return ret;
 }
 
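
lock_system_sleep() (hence the new <linux/suspend.h> include) must bracket the whole hotplug operation, which is why every early "return" in add_memory() and offline_pages() becomes a "goto out" that reaches the matching unlock. A sketch of the bracketing idiom; guarded_op(), do_work() and do_more_work() are hypothetical:

	int guarded_op(void)
	{
		int ret;

		lock_system_sleep();		/* exclude hibernation meanwhile */
		ret = do_work();		/* hypothetical payload */
		if (ret)
			goto out;
		ret = do_more_work();		/* hypothetical */
	out:
		unlock_system_sleep();
		return ret;
	}
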
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7dd9d9f80694..4545d5944243 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1024,7 +1024,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		err = migrate_prep();
 		if (err)
-			return err;
+			goto mpol_out;
 	}
 	{
 		NODEMASK_SCRATCH(scratch);
@@ -1039,10 +1039,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 			err = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
 	}
-	if (err) {
-		mpol_put(new);
-		return err;
-	}
+	if (err)
+		goto mpol_out;
+
 	vma = check_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
 
@@ -1058,9 +1057,11 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
-	}
+	} else
+		putback_lru_pages(&pagelist);
 
 	up_write(&mm->mmap_sem);
+ mpol_out:
 	mpol_put(new);
 	return err;
 }
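
check_range() moves candidate pages onto the private pagelist; if do_mbind() fails before migrate_pages() ever runs, those isolated pages would be stranded off the LRU, so the new else branch returns them with putback_lru_pages(). A sketch of the invariant; collect(), should_migrate() and migrate_isolated() are hypothetical:

	LIST_HEAD(pagelist);

	collect(&pagelist);			/* hypothetical isolation step */
	if (should_migrate())			/* hypothetical */
		migrate_isolated(&pagelist);	/* hypothetical, consumes the list */
	else
		putback_lru_pages(&pagelist);	/* never strand isolated pages */
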
diff --git a/mm/migrate.c b/mm/migrate.c
index 1a4bf4813780..7dbcb22316d2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -602,7 +602,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	struct page *newpage = get_new_page(page, private, &result);
 	int rcu_locked = 0;
 	int charge = 0;
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 
 	if (!newpage)
 		return -ENOMEM;
diff --git a/mm/nommu.c b/mm/nommu.c
index 5189b5aed8c0..9876fa0c3ad3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1362,9 +1362,11 @@ share:
 error_just_free:
 	up_write(&nommu_region_sem);
 error:
-	fput(region->vm_file);
+	if (region->vm_file)
+		fput(region->vm_file);
 	kmem_cache_free(vm_region_jar, region);
-	fput(vma->vm_file);
+	if (vma->vm_file)
+		fput(vma->vm_file);
 	if (vma->vm_flags & VM_EXECUTABLE)
 		removed_exe_file_vma(vma->vm_mm);
 	kmem_cache_free(vm_area_cachep, vma);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bf720550b44d..2bc2ac63f41e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1769,7 +1769,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 		 */
 		alloc_flags &= ~ALLOC_CPUSET;
-	} else if (unlikely(rt_task(p)))
+	} else if (unlikely(rt_task(p)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
@@ -1817,9 +1817,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
+restart:
 	wake_all_kswapd(order, zonelist, high_zoneidx);
 
-restart:
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
 	 * reclaim. Now things get more complex, so set up alloc_flags according
@@ -2183,7 +2183,7 @@ void show_free_areas(void)
 	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
 		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
 		" unevictable:%lu"
-		" dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
+		" dirty:%lu writeback:%lu unstable:%lu\n"
 		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
 		global_page_state(NR_ACTIVE_ANON),
@@ -2196,7 +2196,6 @@ void show_free_areas(void)
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
 		global_page_state(NR_UNSTABLE_NFS),
-		nr_blockdev_pages(),
 		global_page_state(NR_FREE_PAGES),
 		global_page_state(NR_SLAB_RECLAIMABLE),
 		global_page_state(NR_SLAB_UNRECLAIMABLE),
diff --git a/mm/percpu.c b/mm/percpu.c
index 6af78c1ee704..5adfc268b408 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -153,7 +153,10 @@ static int pcpu_reserved_chunk_limit;
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released.
+ * allocations are done using GFP_KERNEL with pcpu_lock released.  In
+ * general, percpu memory can't be allocated with irq off but
+ * irqsave/restore are still used in alloc path so that it can be used
+ * from early init path - sched_init() specifically.
 *
 * Free path accesses and alters only the index data structures, so it
 * can be safely called from atomic context.  When memory needs to be
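
The expanded comment explains why the hunks below switch the alloc path to the irqsave lock variants: sched_init() calls the allocator while early boot still has IRQs disabled, and spin_lock_irq() would force-enable them on unlock. A sketch of the difference:

	unsigned long flags;

	spin_lock_irq(&pcpu_lock);		/* unlock re-enables IRQs
						   unconditionally: wrong if the
						   caller had them off */
	spin_unlock_irq(&pcpu_lock);

	spin_lock_irqsave(&pcpu_lock, flags);	/* remembers the caller's state */
	spin_unlock_irqrestore(&pcpu_lock, flags); /* and restores it exactly */
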
@@ -352,62 +355,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 }
 
 /**
- * pcpu_extend_area_map - extend area map for allocation
- * @chunk: target chunk
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * @chunk: chunk of interest
  *
- * Extend area map of @chunk so that it can accomodate an allocation.
- * A single allocation can split an area into three areas, so this
- * function makes sure that @chunk->map has at least two extra slots.
+ * Determine whether area map of @chunk needs to be extended to
+ * accomodate a new allocation.
  *
  * CONTEXT:
- * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
- * if area map is extended.
+ * pcpu_lock.
  *
  * RETURNS:
- * 0 if noop, 1 if successfully extended, -errno on failure.
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
  */
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
 {
 	int new_alloc;
-	int *new;
-	size_t size;
 
-	/* has enough? */
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
-	spin_unlock_irq(&pcpu_lock);
-
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
-	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new) {
-		spin_lock_irq(&pcpu_lock);
+	return new_alloc;
+}
+
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+	int *old = NULL, *new = NULL;
+	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
+	unsigned long flags;
+
+	new = pcpu_mem_alloc(new_size);
+	if (!new)
 		return -ENOMEM;
-	}
 
-	/*
-	 * Acquire pcpu_lock and switch to new area map.  Only free
-	 * could have happened inbetween, so map_used couldn't have
-	 * grown.
-	 */
-	spin_lock_irq(&pcpu_lock);
-	BUG_ON(new_alloc < chunk->map_used + 2);
+	/* acquire pcpu_lock and switch to new area map */
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	if (new_alloc <= chunk->map_alloc)
+		goto out_unlock;
 
-	size = chunk->map_alloc * sizeof(chunk->map[0]);
-	memcpy(new, chunk->map, size);
+	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
+	memcpy(new, chunk->map, old_size);
 
 	/*
 	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
 	 * one of the first chunks and still using static map.
 	 */
 	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-		pcpu_mem_free(chunk->map, size);
+		old = chunk->map;
 
 	chunk->map_alloc = new_alloc;
 	chunk->map = new;
+	new = NULL;
+
+out_unlock:
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	/*
+	 * pcpu_mem_free() might end up calling vfree() which uses
+	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+	 */
+	pcpu_mem_free(old, old_size);
+	pcpu_mem_free(new, new_size);
+
 	return 0;
 }
 
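
The pcpu_need_to_extend()/pcpu_extend_area_map() split is the classic drop-the-lock-to-allocate pattern: decide under pcpu_lock, allocate with GFP_KERNEL unlocked, revalidate after relocking (another CPU may have extended the map first), and free whichever buffer lost outside the lock, since vfree() is not IRQ-safe. A condensed sketch; struct thing and grow() are hypothetical:

	struct thing { spinlock_t lock; int *map; int alloc; };	/* hypothetical */

	static int grow(struct thing *t, int want)
	{
		int *buf = kmalloc(want * sizeof(*buf), GFP_KERNEL);	/* may sleep */
		unsigned long flags;

		if (!buf)
			return -ENOMEM;
		spin_lock_irqsave(&t->lock, flags);
		if (want <= t->alloc) {			/* raced: already grown */
			spin_unlock_irqrestore(&t->lock, flags);
			kfree(buf);
			return 0;
		}
		memcpy(buf, t->map, t->alloc * sizeof(*buf));
		swap(t->map, buf);			/* publish under the lock */
		t->alloc = want;
		spin_unlock_irqrestore(&t->lock, flags);
		kfree(buf);				/* old map, freed unlocked */
		return 0;
	}
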
@@ -1046,7 +1073,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	static int warn_limit = 10;
 	struct pcpu_chunk *chunk;
 	const char *err;
-	int slot, off;
+	int slot, off, new_alloc;
+	unsigned long flags;
 
 	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
 		WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -1055,19 +1083,30 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	}
 
 	mutex_lock(&pcpu_alloc_mutex);
-	spin_lock_irq(&pcpu_lock);
+	spin_lock_irqsave(&pcpu_lock, flags);
 
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
-		if (size > chunk->contig_hint ||
-		    pcpu_extend_area_map(chunk) < 0) {
-			err = "failed to extend area map of reserved chunk";
+
+		if (size > chunk->contig_hint) {
+			err = "alloc from reserved chunk failed";
 			goto fail_unlock;
 		}
+
+		while ((new_alloc = pcpu_need_to_extend(chunk))) {
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+			if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+				err = "failed to extend area map of reserved chunk";
+				goto fail_unlock_mutex;
+			}
+			spin_lock_irqsave(&pcpu_lock, flags);
+		}
+
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
+
 		err = "alloc from reserved chunk failed";
 		goto fail_unlock;
 	}
@@ -1079,14 +1118,20 @@ restart:
 		if (size > chunk->contig_hint)
 			continue;
 
-		switch (pcpu_extend_area_map(chunk)) {
-		case 0:
-			break;
-		case 1:
-			goto restart;	/* pcpu_lock dropped, restart */
-		default:
-			err = "failed to extend area map";
-			goto fail_unlock;
+		new_alloc = pcpu_need_to_extend(chunk);
+		if (new_alloc) {
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+			if (pcpu_extend_area_map(chunk,
+						 new_alloc) < 0) {
+				err = "failed to extend area map";
+				goto fail_unlock_mutex;
+			}
+			spin_lock_irqsave(&pcpu_lock, flags);
+			/*
+			 * pcpu_lock has been dropped, need to
+			 * restart cpu_slot list walking.
+			 */
+			goto restart;
 		}
 
 		off = pcpu_alloc_area(chunk, size, align);
@@ -1096,7 +1141,7 @@ restart:
 	}
 
 	/* hmmm... no space left, create a new chunk */
-	spin_unlock_irq(&pcpu_lock);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 
 	chunk = alloc_pcpu_chunk();
 	if (!chunk) {
@@ -1104,16 +1149,16 @@ restart:
 		goto fail_unlock_mutex;
 	}
 
-	spin_lock_irq(&pcpu_lock);
+	spin_lock_irqsave(&pcpu_lock, flags);
 	pcpu_chunk_relocate(chunk, -1);
 	goto restart;
 
 area_found:
-	spin_unlock_irq(&pcpu_lock);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
-		spin_lock_irq(&pcpu_lock);
+		spin_lock_irqsave(&pcpu_lock, flags);
 		pcpu_free_area(chunk, off);
 		err = "failed to populate";
 		goto fail_unlock;
@@ -1125,7 +1170,7 @@ area_found:
 	return __addr_to_pcpu_ptr(chunk->base_addr + off);
 
 fail_unlock:
-	spin_unlock_irq(&pcpu_lock);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 fail_unlock_mutex:
 	mutex_unlock(&pcpu_alloc_mutex);
 	if (warn_limit) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a1bc6b9af9a2..9c590eef7912 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1151,8 +1151,7 @@ static int try_to_unuse(unsigned int type)
 		} else
 			retval = unuse_mm(mm, entry, page);
 
-		if (set_start_mm &&
-		    swap_count(*swap_map) < swcount) {
+		if (set_start_mm && *swap_map < swcount) {
 			mmput(new_start_mm);
 			atomic_inc(&mm->mm_users);
 			new_start_mm = mm;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 64e438898832..777af57fd8c8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -544,6 +544,16 @@ redo:
 		 */
 		lru = LRU_UNEVICTABLE;
 		add_page_to_unevictable_list(page);
+		/*
+		 * When racing with an mlock clearing (page is
+		 * unlocked), make sure that if the other thread does
+		 * not observe our setting of PG_lru and fails
+		 * isolation, we see PG_mlocked cleared below and move
+		 * the page back to the evictable list.
+		 *
+		 * The other side is TestClearPageMlocked().
+		 */
+		smp_mb();
 	}
 
 	/*
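
The new barrier pairs with the full-barrier semantics that TestClearPageMlocked(), an atomic read-modify-write, already provides on the munlock side; with both in place at least one CPU observes the other's store, so a page cannot end up both mlock-cleared and stranded on the unevictable list. A schematic of the pairing, in comment form rather than literal kernel code:

	/*
	 * CPU A (this path)            CPU B (munlock path)
	 * SetPageLRU(page)             TestClearPageMlocked(page)
	 * smp_mb()                       (atomic RMW implies a barrier)
	 * test PG_mlocked              test PG_lru, attempt isolation
	 *
	 * At least one side sees the other's update: either B's
	 * isolation rescues the page, or A moves it back below.
	 */
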
@@ -1088,7 +1098,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 	int lumpy_reclaim = 0;
 
 	while (unlikely(too_many_isolated(zone, file, sc))) {
-		congestion_wait(WRITE, HZ/10);
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		/* We are about to die and free our memory. Return now. */
 		if (fatal_signal_pending(current))
@@ -1356,7 +1366,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * IO, plus JVM can create lots of anon VM_EXEC pages,
 		 * so we ignore them here.
 		 */
-		if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
+		if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
 			list_add(&page->lru, &l_active);
 			continue;
 		}