aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-07-30 13:33:48 -0400
committerIngo Molnar <mingo@elte.hu>2008-07-30 13:33:48 -0400
commit15dd859cacf312f606f54502d1f66537a1e5c78c (patch)
treee50e125eaa6da83fa715704e53c1bde013d1ef8e /mm
parentb2d9d33412b9d13a40cd314d93ab517950fc5950 (diff)
parent6e86841d05f371b5b9b86ce76c02aaee83352298 (diff)
Merge commit 'v2.6.27-rc1' into x86/core
Conflicts: include/asm-x86/dma-mapping.h include/asm-x86/namei.h include/asm-x86/uaccess.h Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/Makefile1
-rw-r--r--mm/allocpercpu.c20
-rw-r--r--mm/filemap.c242
-rw-r--r--mm/filemap_xip.c5
-rw-r--r--mm/fremap.c3
-rw-r--r--mm/hugetlb.c28
-rw-r--r--mm/memory.c38
-rw-r--r--mm/migrate.c29
-rw-r--r--mm/mmap.c160
-rw-r--r--mm/mmu_notifier.c277
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/mremap.c6
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/page-writeback.c12
-rw-r--r--mm/page_alloc.c4
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c15
-rw-r--r--mm/shmem.c11
-rw-r--r--mm/shmem_acl.c2
-rw-r--r--mm/slab.c11
-rw-r--r--mm/slob.c7
-rw-r--r--mm/slub.c13
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap_state.c30
-rw-r--r--mm/swapfile.c10
-rw-r--r--mm/truncate.c6
-rw-r--r--mm/util.c55
-rw-r--r--mm/vmalloc.c6
-rw-r--r--mm/vmscan.c80
30 files changed, 886 insertions, 206 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index aa799007a11b..446c6588c753 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -77,6 +77,9 @@ config FLAT_NODE_MEM_MAP
77 def_bool y 77 def_bool y
78 depends on !SPARSEMEM 78 depends on !SPARSEMEM
79 79
80config HAVE_GET_USER_PAGES_FAST
81 bool
82
80# 83#
81# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's 84# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
82# to represent different areas of memory. This variable allows 85# to represent different areas of memory. This variable allows
@@ -205,3 +208,6 @@ config NR_QUICK
205config VIRT_TO_BUS 208config VIRT_TO_BUS
206 def_bool y 209 def_bool y
207 depends on !ARCH_NO_VIRT_TO_BUS 210 depends on !ARCH_NO_VIRT_TO_BUS
211
212config MMU_NOTIFIER
213 bool
diff --git a/mm/Makefile b/mm/Makefile
index 06ca2381fef1..da4ccf015aea 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 26obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
27obj-$(CONFIG_SLOB) += slob.o 27obj-$(CONFIG_SLOB) += slob.o
28obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_SLAB) += slab.o 29obj-$(CONFIG_SLAB) += slab.o
29obj-$(CONFIG_SLUB) += slub.o 30obj-$(CONFIG_SLUB) += slub.o
30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 843364594e23..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
18 * Depopulating per-cpu data for a cpu going offline would be a typical 18 * Depopulating per-cpu data for a cpu going offline would be a typical
19 * use case. You need to register a cpu hotplug handler for that purpose. 19 * use case. You need to register a cpu hotplug handler for that purpose.
20 */ 20 */
21void percpu_depopulate(void *__pdata, int cpu) 21static void percpu_depopulate(void *__pdata, int cpu)
22{ 22{
23 struct percpu_data *pdata = __percpu_disguise(__pdata); 23 struct percpu_data *pdata = __percpu_disguise(__pdata);
24 24
25 kfree(pdata->ptrs[cpu]); 25 kfree(pdata->ptrs[cpu]);
26 pdata->ptrs[cpu] = NULL; 26 pdata->ptrs[cpu] = NULL;
27} 27}
28EXPORT_SYMBOL_GPL(percpu_depopulate);
29 28
30/** 29/**
31 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's 30 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
32 * @__pdata: per-cpu data to depopulate 31 * @__pdata: per-cpu data to depopulate
33 * @mask: depopulate per-cpu data for cpu's selected through mask bits 32 * @mask: depopulate per-cpu data for cpu's selected through mask bits
34 */ 33 */
35void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) 34static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
36{ 35{
37 int cpu; 36 int cpu;
38 for_each_cpu_mask_nr(cpu, *mask) 37 for_each_cpu_mask_nr(cpu, *mask)
39 percpu_depopulate(__pdata, cpu); 38 percpu_depopulate(__pdata, cpu);
40} 39}
41EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); 40
41#define percpu_depopulate_mask(__pdata, mask) \
42 __percpu_depopulate_mask((__pdata), &(mask))
42 43
43/** 44/**
44 * percpu_populate - populate per-cpu data for given cpu 45 * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
51 * use case. You need to register a cpu hotplug handler for that purpose. 52 * use case. You need to register a cpu hotplug handler for that purpose.
52 * Per-cpu object is populated with zeroed buffer. 53 * Per-cpu object is populated with zeroed buffer.
53 */ 54 */
54void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) 55static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
55{ 56{
56 struct percpu_data *pdata = __percpu_disguise(__pdata); 57 struct percpu_data *pdata = __percpu_disguise(__pdata);
57 int node = cpu_to_node(cpu); 58 int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
68 pdata->ptrs[cpu] = kzalloc(size, gfp); 69 pdata->ptrs[cpu] = kzalloc(size, gfp);
69 return pdata->ptrs[cpu]; 70 return pdata->ptrs[cpu];
70} 71}
71EXPORT_SYMBOL_GPL(percpu_populate);
72 72
73/** 73/**
74 * percpu_populate_mask - populate per-cpu data for more cpu's 74 * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,8 +79,8 @@ EXPORT_SYMBOL_GPL(percpu_populate);
79 * 79 *
80 * Per-cpu objects are populated with zeroed buffers. 80 * Per-cpu objects are populated with zeroed buffers.
81 */ 81 */
82int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, 82static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
83 cpumask_t *mask) 83 cpumask_t *mask)
84{ 84{
85 cpumask_t populated; 85 cpumask_t populated;
86 int cpu; 86 int cpu;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
94 cpu_set(cpu, populated); 94 cpu_set(cpu, populated);
95 return 0; 95 return 0;
96} 96}
97EXPORT_SYMBOL_GPL(__percpu_populate_mask); 97
98#define percpu_populate_mask(__pdata, size, gfp, mask) \
99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
98 100
99/** 101/**
100 * percpu_alloc_mask - initial setup of per-cpu data 102 * percpu_alloc_mask - initial setup of per-cpu data
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d3ec1ffc66e..42bbc6909ba4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -109,7 +109,7 @@
109/* 109/*
110 * Remove a page from the page cache and free it. Caller has to make 110 * Remove a page from the page cache and free it. Caller has to make
111 * sure the page is locked and that nobody else uses it - or that usage 111 * sure the page is locked and that nobody else uses it - or that usage
112 * is safe. The caller must hold a write_lock on the mapping's tree_lock. 112 * is safe. The caller must hold the mapping's tree_lock.
113 */ 113 */
114void __remove_from_page_cache(struct page *page) 114void __remove_from_page_cache(struct page *page)
115{ 115{
@@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page)
141 141
142 BUG_ON(!PageLocked(page)); 142 BUG_ON(!PageLocked(page));
143 143
144 write_lock_irq(&mapping->tree_lock); 144 spin_lock_irq(&mapping->tree_lock);
145 __remove_from_page_cache(page); 145 __remove_from_page_cache(page);
146 write_unlock_irq(&mapping->tree_lock); 146 spin_unlock_irq(&mapping->tree_lock);
147} 147}
148 148
149static int sync_page(void *word) 149static int sync_page(void *word)
@@ -442,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
442} 442}
443 443
444/** 444/**
445 * add_to_page_cache - add newly allocated pagecache pages 445 * add_to_page_cache_locked - add a locked page to the pagecache
446 * @page: page to add 446 * @page: page to add
447 * @mapping: the page's address_space 447 * @mapping: the page's address_space
448 * @offset: page index 448 * @offset: page index
449 * @gfp_mask: page allocation mode 449 * @gfp_mask: page allocation mode
450 * 450 *
451 * This function is used to add newly allocated pagecache pages; 451 * This function is used to add a page to the pagecache. It must be locked.
452 * the page is new, so we can just run SetPageLocked() against it.
453 * The other page state flags were set by rmqueue().
454 *
455 * This function does not add the page to the LRU. The caller must do that. 452 * This function does not add the page to the LRU. The caller must do that.
456 */ 453 */
457int add_to_page_cache(struct page *page, struct address_space *mapping, 454int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
458 pgoff_t offset, gfp_t gfp_mask) 455 pgoff_t offset, gfp_t gfp_mask)
459{ 456{
460 int error = mem_cgroup_cache_charge(page, current->mm, 457 int error;
458
459 VM_BUG_ON(!PageLocked(page));
460
461 error = mem_cgroup_cache_charge(page, current->mm,
461 gfp_mask & ~__GFP_HIGHMEM); 462 gfp_mask & ~__GFP_HIGHMEM);
462 if (error) 463 if (error)
463 goto out; 464 goto out;
464 465
465 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 466 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
466 if (error == 0) { 467 if (error == 0) {
467 write_lock_irq(&mapping->tree_lock); 468 page_cache_get(page);
469 page->mapping = mapping;
470 page->index = offset;
471
472 spin_lock_irq(&mapping->tree_lock);
468 error = radix_tree_insert(&mapping->page_tree, offset, page); 473 error = radix_tree_insert(&mapping->page_tree, offset, page);
469 if (!error) { 474 if (likely(!error)) {
470 page_cache_get(page);
471 SetPageLocked(page);
472 page->mapping = mapping;
473 page->index = offset;
474 mapping->nrpages++; 475 mapping->nrpages++;
475 __inc_zone_page_state(page, NR_FILE_PAGES); 476 __inc_zone_page_state(page, NR_FILE_PAGES);
476 } else 477 } else {
478 page->mapping = NULL;
477 mem_cgroup_uncharge_cache_page(page); 479 mem_cgroup_uncharge_cache_page(page);
480 page_cache_release(page);
481 }
478 482
479 write_unlock_irq(&mapping->tree_lock); 483 spin_unlock_irq(&mapping->tree_lock);
480 radix_tree_preload_end(); 484 radix_tree_preload_end();
481 } else 485 } else
482 mem_cgroup_uncharge_cache_page(page); 486 mem_cgroup_uncharge_cache_page(page);
483out: 487out:
484 return error; 488 return error;
485} 489}
486EXPORT_SYMBOL(add_to_page_cache); 490EXPORT_SYMBOL(add_to_page_cache_locked);
487 491
488int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 492int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
489 pgoff_t offset, gfp_t gfp_mask) 493 pgoff_t offset, gfp_t gfp_mask)
@@ -633,15 +637,35 @@ void __lock_page_nosync(struct page *page)
633 * Is there a pagecache struct page at the given (mapping, offset) tuple? 637 * Is there a pagecache struct page at the given (mapping, offset) tuple?
634 * If yes, increment its refcount and return it; if no, return NULL. 638 * If yes, increment its refcount and return it; if no, return NULL.
635 */ 639 */
636struct page * find_get_page(struct address_space *mapping, pgoff_t offset) 640struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
637{ 641{
642 void **pagep;
638 struct page *page; 643 struct page *page;
639 644
640 read_lock_irq(&mapping->tree_lock); 645 rcu_read_lock();
641 page = radix_tree_lookup(&mapping->page_tree, offset); 646repeat:
642 if (page) 647 page = NULL;
643 page_cache_get(page); 648 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
644 read_unlock_irq(&mapping->tree_lock); 649 if (pagep) {
650 page = radix_tree_deref_slot(pagep);
651 if (unlikely(!page || page == RADIX_TREE_RETRY))
652 goto repeat;
653
654 if (!page_cache_get_speculative(page))
655 goto repeat;
656
657 /*
658 * Has the page moved?
659 * This is part of the lockless pagecache protocol. See
660 * include/linux/pagemap.h for details.
661 */
662 if (unlikely(page != *pagep)) {
663 page_cache_release(page);
664 goto repeat;
665 }
666 }
667 rcu_read_unlock();
668
645 return page; 669 return page;
646} 670}
647EXPORT_SYMBOL(find_get_page); 671EXPORT_SYMBOL(find_get_page);
@@ -656,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
656 * 680 *
657 * Returns zero if the page was not present. find_lock_page() may sleep. 681 * Returns zero if the page was not present. find_lock_page() may sleep.
658 */ 682 */
659struct page *find_lock_page(struct address_space *mapping, 683struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
660 pgoff_t offset)
661{ 684{
662 struct page *page; 685 struct page *page;
663 686
664repeat: 687repeat:
665 read_lock_irq(&mapping->tree_lock); 688 page = find_get_page(mapping, offset);
666 page = radix_tree_lookup(&mapping->page_tree, offset);
667 if (page) { 689 if (page) {
668 page_cache_get(page); 690 lock_page(page);
669 if (TestSetPageLocked(page)) { 691 /* Has the page been truncated? */
670 read_unlock_irq(&mapping->tree_lock); 692 if (unlikely(page->mapping != mapping)) {
671 __lock_page(page); 693 unlock_page(page);
672 694 page_cache_release(page);
673 /* Has the page been truncated while we slept? */ 695 goto repeat;
674 if (unlikely(page->mapping != mapping)) {
675 unlock_page(page);
676 page_cache_release(page);
677 goto repeat;
678 }
679 VM_BUG_ON(page->index != offset);
680 goto out;
681 } 696 }
697 VM_BUG_ON(page->index != offset);
682 } 698 }
683 read_unlock_irq(&mapping->tree_lock);
684out:
685 return page; 699 return page;
686} 700}
687EXPORT_SYMBOL(find_lock_page); 701EXPORT_SYMBOL(find_lock_page);
@@ -747,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
747{ 761{
748 unsigned int i; 762 unsigned int i;
749 unsigned int ret; 763 unsigned int ret;
764 unsigned int nr_found;
765
766 rcu_read_lock();
767restart:
768 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
769 (void ***)pages, start, nr_pages);
770 ret = 0;
771 for (i = 0; i < nr_found; i++) {
772 struct page *page;
773repeat:
774 page = radix_tree_deref_slot((void **)pages[i]);
775 if (unlikely(!page))
776 continue;
777 /*
778 * this can only trigger if nr_found == 1, making livelock
779 * a non issue.
780 */
781 if (unlikely(page == RADIX_TREE_RETRY))
782 goto restart;
750 783
751 read_lock_irq(&mapping->tree_lock); 784 if (!page_cache_get_speculative(page))
752 ret = radix_tree_gang_lookup(&mapping->page_tree, 785 goto repeat;
753 (void **)pages, start, nr_pages); 786
754 for (i = 0; i < ret; i++) 787 /* Has the page moved? */
755 page_cache_get(pages[i]); 788 if (unlikely(page != *((void **)pages[i]))) {
756 read_unlock_irq(&mapping->tree_lock); 789 page_cache_release(page);
790 goto repeat;
791 }
792
793 pages[ret] = page;
794 ret++;
795 }
796 rcu_read_unlock();
757 return ret; 797 return ret;
758} 798}
759 799
@@ -774,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
774{ 814{
775 unsigned int i; 815 unsigned int i;
776 unsigned int ret; 816 unsigned int ret;
817 unsigned int nr_found;
818
819 rcu_read_lock();
820restart:
821 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
822 (void ***)pages, index, nr_pages);
823 ret = 0;
824 for (i = 0; i < nr_found; i++) {
825 struct page *page;
826repeat:
827 page = radix_tree_deref_slot((void **)pages[i]);
828 if (unlikely(!page))
829 continue;
830 /*
831 * this can only trigger if nr_found == 1, making livelock
832 * a non issue.
833 */
834 if (unlikely(page == RADIX_TREE_RETRY))
835 goto restart;
777 836
778 read_lock_irq(&mapping->tree_lock); 837 if (page->mapping == NULL || page->index != index)
779 ret = radix_tree_gang_lookup(&mapping->page_tree,
780 (void **)pages, index, nr_pages);
781 for (i = 0; i < ret; i++) {
782 if (pages[i]->mapping == NULL || pages[i]->index != index)
783 break; 838 break;
784 839
785 page_cache_get(pages[i]); 840 if (!page_cache_get_speculative(page))
841 goto repeat;
842
843 /* Has the page moved? */
844 if (unlikely(page != *((void **)pages[i]))) {
845 page_cache_release(page);
846 goto repeat;
847 }
848
849 pages[ret] = page;
850 ret++;
786 index++; 851 index++;
787 } 852 }
788 read_unlock_irq(&mapping->tree_lock); 853 rcu_read_unlock();
789 return i; 854 return ret;
790} 855}
791EXPORT_SYMBOL(find_get_pages_contig); 856EXPORT_SYMBOL(find_get_pages_contig);
792 857
@@ -806,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
806{ 871{
807 unsigned int i; 872 unsigned int i;
808 unsigned int ret; 873 unsigned int ret;
874 unsigned int nr_found;
875
876 rcu_read_lock();
877restart:
878 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
879 (void ***)pages, *index, nr_pages, tag);
880 ret = 0;
881 for (i = 0; i < nr_found; i++) {
882 struct page *page;
883repeat:
884 page = radix_tree_deref_slot((void **)pages[i]);
885 if (unlikely(!page))
886 continue;
887 /*
888 * this can only trigger if nr_found == 1, making livelock
889 * a non issue.
890 */
891 if (unlikely(page == RADIX_TREE_RETRY))
892 goto restart;
893
894 if (!page_cache_get_speculative(page))
895 goto repeat;
896
897 /* Has the page moved? */
898 if (unlikely(page != *((void **)pages[i]))) {
899 page_cache_release(page);
900 goto repeat;
901 }
902
903 pages[ret] = page;
904 ret++;
905 }
906 rcu_read_unlock();
809 907
810 read_lock_irq(&mapping->tree_lock);
811 ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
812 (void **)pages, *index, nr_pages, tag);
813 for (i = 0; i < ret; i++)
814 page_cache_get(pages[i]);
815 if (ret) 908 if (ret)
816 *index = pages[ret - 1]->index + 1; 909 *index = pages[ret - 1]->index + 1;
817 read_unlock_irq(&mapping->tree_lock); 910
818 return ret; 911 return ret;
819} 912}
820EXPORT_SYMBOL(find_get_pages_tag); 913EXPORT_SYMBOL(find_get_pages_tag);
@@ -930,8 +1023,17 @@ find_page:
930 ra, filp, page, 1023 ra, filp, page,
931 index, last_index - index); 1024 index, last_index - index);
932 } 1025 }
933 if (!PageUptodate(page)) 1026 if (!PageUptodate(page)) {
934 goto page_not_up_to_date; 1027 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1028 !mapping->a_ops->is_partially_uptodate)
1029 goto page_not_up_to_date;
1030 if (TestSetPageLocked(page))
1031 goto page_not_up_to_date;
1032 if (!mapping->a_ops->is_partially_uptodate(page,
1033 desc, offset))
1034 goto page_not_up_to_date_locked;
1035 unlock_page(page);
1036 }
935page_ok: 1037page_ok:
936 /* 1038 /*
937 * i_size must be checked after we know the page is Uptodate. 1039 * i_size must be checked after we know the page is Uptodate.
@@ -1001,6 +1103,7 @@ page_not_up_to_date:
1001 if (lock_page_killable(page)) 1103 if (lock_page_killable(page))
1002 goto readpage_eio; 1104 goto readpage_eio;
1003 1105
1106page_not_up_to_date_locked:
1004 /* Did it get truncated before we got the lock? */ 1107 /* Did it get truncated before we got the lock? */
1005 if (!page->mapping) { 1108 if (!page->mapping) {
1006 unlock_page(page); 1109 unlock_page(page);
@@ -1665,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
1665 return notify_change(dentry, &newattrs); 1768 return notify_change(dentry, &newattrs);
1666} 1769}
1667 1770
1668int remove_suid(struct dentry *dentry) 1771int file_remove_suid(struct file *file)
1669{ 1772{
1773 struct dentry *dentry = file->f_path.dentry;
1670 int killsuid = should_remove_suid(dentry); 1774 int killsuid = should_remove_suid(dentry);
1671 int killpriv = security_inode_need_killpriv(dentry); 1775 int killpriv = security_inode_need_killpriv(dentry);
1672 int error = 0; 1776 int error = 0;
@@ -1680,7 +1784,7 @@ int remove_suid(struct dentry *dentry)
1680 1784
1681 return error; 1785 return error;
1682} 1786}
1683EXPORT_SYMBOL(remove_suid); 1787EXPORT_SYMBOL(file_remove_suid);
1684 1788
1685static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1789static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1686 const struct iovec *iov, size_t base, size_t bytes) 1790 const struct iovec *iov, size_t base, size_t bytes)
@@ -2436,7 +2540,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2436 if (count == 0) 2540 if (count == 0)
2437 goto out; 2541 goto out;
2438 2542
2439 err = remove_suid(file->f_path.dentry); 2543 err = file_remove_suid(file);
2440 if (err) 2544 if (err)
2441 goto out; 2545 goto out;
2442 2546
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9d..380ab402d711 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/mmu_notifier.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18#include <asm/io.h> 19#include <asm/io.h>
@@ -188,7 +189,7 @@ __xip_unmap (struct address_space * mapping,
188 if (pte) { 189 if (pte) {
189 /* Nuke the page table entry. */ 190 /* Nuke the page table entry. */
190 flush_cache_page(vma, address, pte_pfn(*pte)); 191 flush_cache_page(vma, address, pte_pfn(*pte));
191 pteval = ptep_clear_flush(vma, address, pte); 192 pteval = ptep_clear_flush_notify(vma, address, pte);
192 page_remove_rmap(page, vma); 193 page_remove_rmap(page, vma);
193 dec_mm_counter(mm, file_rss); 194 dec_mm_counter(mm, file_rss);
194 BUG_ON(pte_dirty(pteval)); 195 BUG_ON(pte_dirty(pteval));
@@ -380,7 +381,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
380 if (count == 0) 381 if (count == 0)
381 goto out_backing; 382 goto out_backing;
382 383
383 ret = remove_suid(filp->f_path.dentry); 384 ret = file_remove_suid(filp);
384 if (ret) 385 if (ret)
385 goto out_backing; 386 goto out_backing;
386 387
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7881638e4a12 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/mmu_notifier.h>
18 19
19#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
20#include <asm/cacheflush.h> 21#include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
214 spin_unlock(&mapping->i_mmap_lock); 215 spin_unlock(&mapping->i_mmap_lock);
215 } 216 }
216 217
218 mmu_notifier_invalidate_range_start(mm, start, start + size);
217 err = populate_range(mm, vma, start, size, pgoff); 219 err = populate_range(mm, vma, start, size, pgoff);
220 mmu_notifier_invalidate_range_end(mm, start, start + size);
218 if (!err && !(flags & MAP_NONBLOCK)) { 221 if (!err && !(flags & MAP_NONBLOCK)) {
219 if (unlikely(has_write_lock)) { 222 if (unlikely(has_write_lock)) {
220 downgrade_write(&mm->mmap_sem); 223 downgrade_write(&mm->mmap_sem);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8bf4ab01f86..254ce2b90158 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/sysctl.h> 10#include <linux/sysctl.h>
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/mmu_notifier.h>
12#include <linux/nodemask.h> 13#include <linux/nodemask.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 15#include <linux/mempolicy.h>
@@ -19,6 +20,7 @@
19 20
20#include <asm/page.h> 21#include <asm/page.h>
21#include <asm/pgtable.h> 22#include <asm/pgtable.h>
23#include <asm/io.h>
22 24
23#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
24#include "internal.h" 26#include "internal.h"
@@ -1026,18 +1028,6 @@ static void __init report_hugepages(void)
1026 } 1028 }
1027} 1029}
1028 1030
1029static unsigned int cpuset_mems_nr(unsigned int *array)
1030{
1031 int node;
1032 unsigned int nr = 0;
1033
1034 for_each_node_mask(node, cpuset_current_mems_allowed)
1035 nr += array[node];
1036
1037 return nr;
1038}
1039
1040#ifdef CONFIG_SYSCTL
1041#ifdef CONFIG_HIGHMEM 1031#ifdef CONFIG_HIGHMEM
1042static void try_to_free_low(struct hstate *h, unsigned long count) 1032static void try_to_free_low(struct hstate *h, unsigned long count)
1043{ 1033{
@@ -1386,6 +1376,18 @@ static int __init hugetlb_default_setup(char *s)
1386} 1376}
1387__setup("default_hugepagesz=", hugetlb_default_setup); 1377__setup("default_hugepagesz=", hugetlb_default_setup);
1388 1378
1379static unsigned int cpuset_mems_nr(unsigned int *array)
1380{
1381 int node;
1382 unsigned int nr = 0;
1383
1384 for_each_node_mask(node, cpuset_current_mems_allowed)
1385 nr += array[node];
1386
1387 return nr;
1388}
1389
1390#ifdef CONFIG_SYSCTL
1389int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1391int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1390 struct file *file, void __user *buffer, 1392 struct file *file, void __user *buffer,
1391 size_t *length, loff_t *ppos) 1393 size_t *length, loff_t *ppos)
@@ -1672,6 +1674,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1672 BUG_ON(start & ~huge_page_mask(h)); 1674 BUG_ON(start & ~huge_page_mask(h));
1673 BUG_ON(end & ~huge_page_mask(h)); 1675 BUG_ON(end & ~huge_page_mask(h));
1674 1676
1677 mmu_notifier_invalidate_range_start(mm, start, end);
1675 spin_lock(&mm->page_table_lock); 1678 spin_lock(&mm->page_table_lock);
1676 for (address = start; address < end; address += sz) { 1679 for (address = start; address < end; address += sz) {
1677 ptep = huge_pte_offset(mm, address); 1680 ptep = huge_pte_offset(mm, address);
@@ -1713,6 +1716,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1713 } 1716 }
1714 spin_unlock(&mm->page_table_lock); 1717 spin_unlock(&mm->page_table_lock);
1715 flush_tlb_range(vma, start, end); 1718 flush_tlb_range(vma, start, end);
1719 mmu_notifier_invalidate_range_end(mm, start, end);
1716 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1720 list_for_each_entry_safe(page, tmp, &page_list, lru) {
1717 list_del(&page->lru); 1721 list_del(&page->lru);
1718 put_page(page); 1722 put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index 262e3eb6601a..67f0ab9077d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h> 53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h>
54 55
55#include <asm/pgalloc.h> 56#include <asm/pgalloc.h>
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -374,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
374 * 375 *
375 * The calling function must still handle the error. 376 * The calling function must still handle the error.
376 */ 377 */
377void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) 378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
379 unsigned long vaddr)
378{ 380{
379 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 381 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
380 "vm_flags = %lx, vaddr = %lx\n", 382 "vm_flags = %lx, vaddr = %lx\n",
@@ -651,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
651 unsigned long next; 653 unsigned long next;
652 unsigned long addr = vma->vm_start; 654 unsigned long addr = vma->vm_start;
653 unsigned long end = vma->vm_end; 655 unsigned long end = vma->vm_end;
656 int ret;
654 657
655 /* 658 /*
656 * Don't copy ptes where a page fault will fill them correctly. 659 * Don't copy ptes where a page fault will fill them correctly.
@@ -666,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
666 if (is_vm_hugetlb_page(vma)) 669 if (is_vm_hugetlb_page(vma))
667 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 670 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
668 671
672 /*
673 * We need to invalidate the secondary MMU mappings only when
674 * there could be a permission downgrade on the ptes of the
675 * parent mm. And a permission downgrade will only happen if
676 * is_cow_mapping() returns true.
677 */
678 if (is_cow_mapping(vma->vm_flags))
679 mmu_notifier_invalidate_range_start(src_mm, addr, end);
680
681 ret = 0;
669 dst_pgd = pgd_offset(dst_mm, addr); 682 dst_pgd = pgd_offset(dst_mm, addr);
670 src_pgd = pgd_offset(src_mm, addr); 683 src_pgd = pgd_offset(src_mm, addr);
671 do { 684 do {
672 next = pgd_addr_end(addr, end); 685 next = pgd_addr_end(addr, end);
673 if (pgd_none_or_clear_bad(src_pgd)) 686 if (pgd_none_or_clear_bad(src_pgd))
674 continue; 687 continue;
675 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 688 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
676 vma, addr, next)) 689 vma, addr, next))) {
677 return -ENOMEM; 690 ret = -ENOMEM;
691 break;
692 }
678 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 693 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
679 return 0; 694
695 if (is_cow_mapping(vma->vm_flags))
696 mmu_notifier_invalidate_range_end(src_mm,
697 vma->vm_start, end);
698 return ret;
680} 699}
681 700
682static unsigned long zap_pte_range(struct mmu_gather *tlb, 701static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -880,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
880 unsigned long start = start_addr; 899 unsigned long start = start_addr;
881 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 900 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
882 int fullmm = (*tlbp)->fullmm; 901 int fullmm = (*tlbp)->fullmm;
902 struct mm_struct *mm = vma->vm_mm;
883 903
904 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
884 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 905 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
885 unsigned long end; 906 unsigned long end;
886 907
@@ -945,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
945 } 966 }
946 } 967 }
947out: 968out:
969 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
948 return start; /* which is now the end (or restart) address */ 970 return start; /* which is now the end (or restart) address */
949} 971}
950 972
@@ -1615,10 +1637,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1615{ 1637{
1616 pgd_t *pgd; 1638 pgd_t *pgd;
1617 unsigned long next; 1639 unsigned long next;
1618 unsigned long end = addr + size; 1640 unsigned long start = addr, end = addr + size;
1619 int err; 1641 int err;
1620 1642
1621 BUG_ON(addr >= end); 1643 BUG_ON(addr >= end);
1644 mmu_notifier_invalidate_range_start(mm, start, end);
1622 pgd = pgd_offset(mm, addr); 1645 pgd = pgd_offset(mm, addr);
1623 do { 1646 do {
1624 next = pgd_addr_end(addr, end); 1647 next = pgd_addr_end(addr, end);
@@ -1626,6 +1649,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1626 if (err) 1649 if (err)
1627 break; 1650 break;
1628 } while (pgd++, addr = next, addr != end); 1651 } while (pgd++, addr = next, addr != end);
1652 mmu_notifier_invalidate_range_end(mm, start, end);
1629 return err; 1653 return err;
1630} 1654}
1631EXPORT_SYMBOL_GPL(apply_to_page_range); 1655EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1838,7 +1862,7 @@ gotten:
1838 * seen in the presence of one thread doing SMC and another 1862 * seen in the presence of one thread doing SMC and another
1839 * thread doing COW. 1863 * thread doing COW.
1840 */ 1864 */
1841 ptep_clear_flush(vma, address, page_table); 1865 ptep_clear_flush_notify(vma, address, page_table);
1842 set_pte_at(mm, address, page_table, entry); 1866 set_pte_at(mm, address, page_table, entry);
1843 update_mmu_cache(vma, address, entry); 1867 update_mmu_cache(vma, address, entry);
1844 lru_cache_add_active(new_page); 1868 lru_cache_add_active(new_page);
diff --git a/mm/migrate.c b/mm/migrate.c
index d8c65a65c61d..153572fb60b8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
285 285
286 page = migration_entry_to_page(entry); 286 page = migration_entry_to_page(entry);
287 287
288 get_page(page); 288 /*
289 * Once radix-tree replacement of page migration started, page_count
290 * *must* be zero. And, we don't want to call wait_on_page_locked()
291 * against a page without get_page().
292 * So, we use get_page_unless_zero(), here. Even failed, page fault
293 * will occur again.
294 */
295 if (!get_page_unless_zero(page))
296 goto out;
289 pte_unmap_unlock(ptep, ptl); 297 pte_unmap_unlock(ptep, ptl);
290 wait_on_page_locked(page); 298 wait_on_page_locked(page);
291 put_page(page); 299 put_page(page);
@@ -305,6 +313,7 @@ out:
305static int migrate_page_move_mapping(struct address_space *mapping, 313static int migrate_page_move_mapping(struct address_space *mapping,
306 struct page *newpage, struct page *page) 314 struct page *newpage, struct page *page)
307{ 315{
316 int expected_count;
308 void **pslot; 317 void **pslot;
309 318
310 if (!mapping) { 319 if (!mapping) {
@@ -314,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
314 return 0; 323 return 0;
315 } 324 }
316 325
317 write_lock_irq(&mapping->tree_lock); 326 spin_lock_irq(&mapping->tree_lock);
318 327
319 pslot = radix_tree_lookup_slot(&mapping->page_tree, 328 pslot = radix_tree_lookup_slot(&mapping->page_tree,
320 page_index(page)); 329 page_index(page));
321 330
322 if (page_count(page) != 2 + !!PagePrivate(page) || 331 expected_count = 2 + !!PagePrivate(page);
332 if (page_count(page) != expected_count ||
323 (struct page *)radix_tree_deref_slot(pslot) != page) { 333 (struct page *)radix_tree_deref_slot(pslot) != page) {
324 write_unlock_irq(&mapping->tree_lock); 334 spin_unlock_irq(&mapping->tree_lock);
335 return -EAGAIN;
336 }
337
338 if (!page_freeze_refs(page, expected_count)) {
339 spin_unlock_irq(&mapping->tree_lock);
325 return -EAGAIN; 340 return -EAGAIN;
326 } 341 }
327 342
@@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
338 353
339 radix_tree_replace_slot(pslot, newpage); 354 radix_tree_replace_slot(pslot, newpage);
340 355
356 page_unfreeze_refs(page, expected_count);
341 /* 357 /*
342 * Drop cache reference from old page. 358 * Drop cache reference from old page.
343 * We know this isn't the last reference. 359 * We know this isn't the last reference.
@@ -357,10 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
357 __dec_zone_page_state(page, NR_FILE_PAGES); 373 __dec_zone_page_state(page, NR_FILE_PAGES);
358 __inc_zone_page_state(newpage, NR_FILE_PAGES); 374 __inc_zone_page_state(newpage, NR_FILE_PAGES);
359 375
360 write_unlock_irq(&mapping->tree_lock); 376 spin_unlock_irq(&mapping->tree_lock);
361 if (!PageSwapCache(newpage)) { 377 if (!PageSwapCache(newpage))
362 mem_cgroup_uncharge_cache_page(page); 378 mem_cgroup_uncharge_cache_page(page);
363 }
364 379
365 return 0; 380 return 0;
366} 381}
diff --git a/mm/mmap.c b/mm/mmap.c
index 5e0cc99e9cd5..245c3d69067b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,6 +26,7 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/cacheflush.h> 32#include <asm/cacheflush.h>
@@ -2061,6 +2062,7 @@ void exit_mmap(struct mm_struct *mm)
2061 2062
2062 /* mm's last user has gone, and its about to be pulled down */ 2063 /* mm's last user has gone, and its about to be pulled down */
2063 arch_exit_mmap(mm); 2064 arch_exit_mmap(mm);
2065 mmu_notifier_release(mm);
2064 2066
2065 lru_add_drain(); 2067 lru_add_drain();
2066 flush_cache_mm(mm); 2068 flush_cache_mm(mm);
@@ -2268,3 +2270,161 @@ int install_special_mapping(struct mm_struct *mm,
2268 2270
2269 return 0; 2271 return 0;
2270} 2272}
2273
2274static DEFINE_MUTEX(mm_all_locks_mutex);
2275
2276static void vm_lock_anon_vma(struct anon_vma *anon_vma)
2277{
2278 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2279 /*
2280 * The LSB of head.next can't change from under us
2281 * because we hold the mm_all_locks_mutex.
2282 */
2283 spin_lock(&anon_vma->lock);
2284 /*
2285 * We can safely modify head.next after taking the
2286 * anon_vma->lock. If some other vma in this mm shares
2287 * the same anon_vma we won't take it again.
2288 *
2289 * No need of atomic instructions here, head.next
2290 * can't change from under us thanks to the
2291 * anon_vma->lock.
2292 */
2293 if (__test_and_set_bit(0, (unsigned long *)
2294 &anon_vma->head.next))
2295 BUG();
2296 }
2297}
2298
2299static void vm_lock_mapping(struct address_space *mapping)
2300{
2301 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2302 /*
2303 * AS_MM_ALL_LOCKS can't change from under us because
2304 * we hold the mm_all_locks_mutex.
2305 *
2306 * Operations on ->flags have to be atomic because
2307 * even if AS_MM_ALL_LOCKS is stable thanks to the
2308 * mm_all_locks_mutex, there may be other cpus
2309 * changing other bitflags in parallel to us.
2310 */
2311 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2312 BUG();
2313 spin_lock(&mapping->i_mmap_lock);
2314 }
2315}
2316
2317/*
2318 * This operation locks against the VM for all pte/vma/mm related
2319 * operations that could ever happen on a certain mm. This includes
2320 * vmtruncate, try_to_unmap, and all page faults.
2321 *
2322 * The caller must take the mmap_sem in write mode before calling
2323 * mm_take_all_locks(). The caller isn't allowed to release the
2324 * mmap_sem until mm_drop_all_locks() returns.
2325 *
2326 * mmap_sem in write mode is required in order to block all operations
2327 * that could modify pagetables and free pages without need of
2328 * altering the vma layout (for example populate_range() with
2329 * nonlinear vmas). It's also needed in write mode to avoid new
2330 * anon_vmas to be associated with existing vmas.
2331 *
2332 * A single task can't take more than one mm_take_all_locks() in a row
2333 * or it would deadlock.
2334 *
2335 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
2336 * mapping->flags avoid to take the same lock twice, if more than one
2337 * vma in this mm is backed by the same anon_vma or address_space.
2338 *
2339 * We can take all the locks in random order because the VM code
2340 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
2341 * takes more than one of them in a row. Secondly we're protected
2342 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2343 *
2344 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
2345 * that may have to take thousand of locks.
2346 *
2347 * mm_take_all_locks() can fail if it's interrupted by signals.
2348 */
2349int mm_take_all_locks(struct mm_struct *mm)
2350{
2351 struct vm_area_struct *vma;
2352 int ret = -EINTR;
2353
2354 BUG_ON(down_read_trylock(&mm->mmap_sem));
2355
2356 mutex_lock(&mm_all_locks_mutex);
2357
2358 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2359 if (signal_pending(current))
2360 goto out_unlock;
2361 if (vma->anon_vma)
2362 vm_lock_anon_vma(vma->anon_vma);
2363 if (vma->vm_file && vma->vm_file->f_mapping)
2364 vm_lock_mapping(vma->vm_file->f_mapping);
2365 }
2366 ret = 0;
2367
2368out_unlock:
2369 if (ret)
2370 mm_drop_all_locks(mm);
2371
2372 return ret;
2373}
2374
2375static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2376{
2377 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2378 /*
2379 * The LSB of head.next can't change to 0 from under
2380 * us because we hold the mm_all_locks_mutex.
2381 *
2382 * We must however clear the bitflag before unlocking
2383 * the vma so the users using the anon_vma->head will
2384 * never see our bitflag.
2385 *
2386 * No need of atomic instructions here, head.next
2387 * can't change from under us until we release the
2388 * anon_vma->lock.
2389 */
2390 if (!__test_and_clear_bit(0, (unsigned long *)
2391 &anon_vma->head.next))
2392 BUG();
2393 spin_unlock(&anon_vma->lock);
2394 }
2395}
2396
2397static void vm_unlock_mapping(struct address_space *mapping)
2398{
2399 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2400 /*
2401 * AS_MM_ALL_LOCKS can't change to 0 from under us
2402 * because we hold the mm_all_locks_mutex.
2403 */
2404 spin_unlock(&mapping->i_mmap_lock);
2405 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2406 &mapping->flags))
2407 BUG();
2408 }
2409}
2410
2411/*
2412 * The mmap_sem cannot be released by the caller until
2413 * mm_drop_all_locks() returns.
2414 */
2415void mm_drop_all_locks(struct mm_struct *mm)
2416{
2417 struct vm_area_struct *vma;
2418
2419 BUG_ON(down_read_trylock(&mm->mmap_sem));
2420 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2421
2422 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2423 if (vma->anon_vma)
2424 vm_unlock_anon_vma(vma->anon_vma);
2425 if (vma->vm_file && vma->vm_file->f_mapping)
2426 vm_unlock_mapping(vma->vm_file->f_mapping);
2427 }
2428
2429 mutex_unlock(&mm_all_locks_mutex);
2430}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
1/*
2 * linux/mm/mmu_notifier.c
3 *
4 * Copyright (C) 2008 Qumranet, Inc.
5 * Copyright (C) 2008 SGI
6 * Christoph Lameter <clameter@sgi.com>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 */
11
12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h>
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/err.h>
17#include <linux/rcupdate.h>
18#include <linux/sched.h>
19
20/*
21 * This function can't run concurrently against mmu_notifier_register
22 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
23 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
24 * in parallel despite there being no task using this mm any more,
25 * through the vmas outside of the exit_mmap context, such as with
26 * vmtruncate. This serializes against mmu_notifier_unregister with
27 * the mmu_notifier_mm->lock in addition to RCU and it serializes
28 * against the other mmu notifiers with RCU. struct mmu_notifier_mm
29 * can't go away from under us as exit_mmap holds an mm_count pin
30 * itself.
31 */
32void __mmu_notifier_release(struct mm_struct *mm)
33{
34 struct mmu_notifier *mn;
35
36 spin_lock(&mm->mmu_notifier_mm->lock);
37 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
38 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
39 struct mmu_notifier,
40 hlist);
41 /*
42 * We arrived before mmu_notifier_unregister so
43 * mmu_notifier_unregister will do nothing other than
44 * to wait ->release to finish and
45 * mmu_notifier_unregister to return.
46 */
47 hlist_del_init_rcu(&mn->hlist);
48 /*
49 * RCU here will block mmu_notifier_unregister until
50 * ->release returns.
51 */
52 rcu_read_lock();
53 spin_unlock(&mm->mmu_notifier_mm->lock);
54 /*
55 * if ->release runs before mmu_notifier_unregister it
56 * must be handled as it's the only way for the driver
57 * to flush all existing sptes and stop the driver
58 * from establishing any more sptes before all the
59 * pages in the mm are freed.
60 */
61 if (mn->ops->release)
62 mn->ops->release(mn, mm);
63 rcu_read_unlock();
64 spin_lock(&mm->mmu_notifier_mm->lock);
65 }
66 spin_unlock(&mm->mmu_notifier_mm->lock);
67
68 /*
69 * synchronize_rcu here prevents mmu_notifier_release to
70 * return to exit_mmap (which would proceed freeing all pages
71 * in the mm) until the ->release method returns, if it was
72 * invoked by mmu_notifier_unregister.
73 *
74 * The mmu_notifier_mm can't go away from under us because one
75 * mm_count is hold by exit_mmap.
76 */
77 synchronize_rcu();
78}
79
80/*
81 * If no young bitflag is supported by the hardware, ->clear_flush_young can
82 * unmap the address and return 1 or 0 depending if the mapping previously
83 * existed or not.
84 */
85int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
86 unsigned long address)
87{
88 struct mmu_notifier *mn;
89 struct hlist_node *n;
90 int young = 0;
91
92 rcu_read_lock();
93 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
94 if (mn->ops->clear_flush_young)
95 young |= mn->ops->clear_flush_young(mn, mm, address);
96 }
97 rcu_read_unlock();
98
99 return young;
100}
101
102void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->invalidate_page)
111 mn->ops->invalidate_page(mn, mm, address);
112 }
113 rcu_read_unlock();
114}
115
116void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
117 unsigned long start, unsigned long end)
118{
119 struct mmu_notifier *mn;
120 struct hlist_node *n;
121
122 rcu_read_lock();
123 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
124 if (mn->ops->invalidate_range_start)
125 mn->ops->invalidate_range_start(mn, mm, start, end);
126 }
127 rcu_read_unlock();
128}
129
130void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
131 unsigned long start, unsigned long end)
132{
133 struct mmu_notifier *mn;
134 struct hlist_node *n;
135
136 rcu_read_lock();
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->invalidate_range_end)
139 mn->ops->invalidate_range_end(mn, mm, start, end);
140 }
141 rcu_read_unlock();
142}
143
144static int do_mmu_notifier_register(struct mmu_notifier *mn,
145 struct mm_struct *mm,
146 int take_mmap_sem)
147{
148 struct mmu_notifier_mm *mmu_notifier_mm;
149 int ret;
150
151 BUG_ON(atomic_read(&mm->mm_users) <= 0);
152
153 ret = -ENOMEM;
154 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
155 if (unlikely(!mmu_notifier_mm))
156 goto out;
157
158 if (take_mmap_sem)
159 down_write(&mm->mmap_sem);
160 ret = mm_take_all_locks(mm);
161 if (unlikely(ret))
162 goto out_cleanup;
163
164 if (!mm_has_notifiers(mm)) {
165 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
166 spin_lock_init(&mmu_notifier_mm->lock);
167 mm->mmu_notifier_mm = mmu_notifier_mm;
168 mmu_notifier_mm = NULL;
169 }
170 atomic_inc(&mm->mm_count);
171
172 /*
173 * Serialize the update against mmu_notifier_unregister. A
174 * side note: mmu_notifier_release can't run concurrently with
175 * us because we hold the mm_users pin (either implicitly as
176 * current->mm or explicitly with get_task_mm() or similar).
177 * We can't race against any other mmu notifier method either
178 * thanks to mm_take_all_locks().
179 */
180 spin_lock(&mm->mmu_notifier_mm->lock);
181 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
182 spin_unlock(&mm->mmu_notifier_mm->lock);
183
184 mm_drop_all_locks(mm);
185out_cleanup:
186 if (take_mmap_sem)
187 up_write(&mm->mmap_sem);
188 /* kfree() does nothing if mmu_notifier_mm is NULL */
189 kfree(mmu_notifier_mm);
190out:
191 BUG_ON(atomic_read(&mm->mm_users) <= 0);
192 return ret;
193}
194
195/*
196 * Must not hold mmap_sem nor any other VM related lock when calling
197 * this registration function. Must also ensure mm_users can't go down
198 * to zero while this runs to avoid races with mmu_notifier_release,
199 * so mm has to be current->mm or the mm should be pinned safely such
200 * as with get_task_mm(). If the mm is not current->mm, the mm_users
201 * pin should be released by calling mmput after mmu_notifier_register
202 * returns. mmu_notifier_unregister must be always called to
203 * unregister the notifier. mm_count is automatically pinned to allow
204 * mmu_notifier_unregister to safely run at any time later, before or
205 * after exit_mmap. ->release will always be called before exit_mmap
206 * frees the pages.
207 */
208int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
209{
210 return do_mmu_notifier_register(mn, mm, 1);
211}
212EXPORT_SYMBOL_GPL(mmu_notifier_register);
213
214/*
215 * Same as mmu_notifier_register but here the caller must hold the
216 * mmap_sem in write mode.
217 */
218int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
219{
220 return do_mmu_notifier_register(mn, mm, 0);
221}
222EXPORT_SYMBOL_GPL(__mmu_notifier_register);
223
224/* this is called after the last mmu_notifier_unregister() returned */
225void __mmu_notifier_mm_destroy(struct mm_struct *mm)
226{
227 BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
228 kfree(mm->mmu_notifier_mm);
229 mm->mmu_notifier_mm = LIST_POISON1; /* debug */
230}
231
232/*
233 * This releases the mm_count pin automatically and frees the mm
234 * structure if it was the last user of it. It serializes against
235 * running mmu notifiers with RCU and against mmu_notifier_unregister
236 * with the unregister lock + RCU. All sptes must be dropped before
237 * calling mmu_notifier_unregister. ->release or any other notifier
238 * method may be invoked concurrently with mmu_notifier_unregister,
239 * and only after mmu_notifier_unregister returned we're guaranteed
240 * that ->release or any other method can't run anymore.
241 */
242void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
243{
244 BUG_ON(atomic_read(&mm->mm_count) <= 0);
245
246 spin_lock(&mm->mmu_notifier_mm->lock);
247 if (!hlist_unhashed(&mn->hlist)) {
248 hlist_del_rcu(&mn->hlist);
249
250 /*
251 * RCU here will force exit_mmap to wait ->release to finish
252 * before freeing the pages.
253 */
254 rcu_read_lock();
255 spin_unlock(&mm->mmu_notifier_mm->lock);
256 /*
257 * exit_mmap will block in mmu_notifier_release to
258 * guarantee ->release is called before freeing the
259 * pages.
260 */
261 if (mn->ops->release)
262 mn->ops->release(mn, mm);
263 rcu_read_unlock();
264 } else
265 spin_unlock(&mm->mmu_notifier_mm->lock);
266
267 /*
268 * Wait any running method to finish, of course including
269 * ->release if it was run by mmu_notifier_relase instead of us.
270 */
271 synchronize_rcu();
272
273 BUG_ON(atomic_read(&mm->mm_count) <= 0);
274
275 mmdrop(mm);
276}
277EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abd645a3b0a0..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/pgtable.h> 26#include <asm/pgtable.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
@@ -203,10 +204,12 @@ success:
203 dirty_accountable = 1; 204 dirty_accountable = 1;
204 } 205 }
205 206
207 mmu_notifier_invalidate_range_start(mm, start, end);
206 if (is_vm_hugetlb_page(vma)) 208 if (is_vm_hugetlb_page(vma))
207 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
208 else 210 else
209 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); 211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
212 mmu_notifier_invalidate_range_end(mm, start, end);
210 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 213 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
211 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 214 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
212 return 0; 215 return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..1a7743923c8c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
18#include <linux/highmem.h> 18#include <linux/highmem.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
74 struct mm_struct *mm = vma->vm_mm; 75 struct mm_struct *mm = vma->vm_mm;
75 pte_t *old_pte, *new_pte, pte; 76 pte_t *old_pte, *new_pte, pte;
76 spinlock_t *old_ptl, *new_ptl; 77 spinlock_t *old_ptl, *new_ptl;
78 unsigned long old_start;
77 79
80 old_start = old_addr;
81 mmu_notifier_invalidate_range_start(vma->vm_mm,
82 old_start, old_end);
78 if (vma->vm_file) { 83 if (vma->vm_file) {
79 /* 84 /*
80 * Subtle point from Rajesh Venkatasubramanian: before 85 * Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
116 pte_unmap_unlock(old_pte - 1, old_ptl); 121 pte_unmap_unlock(old_pte - 1, old_ptl);
117 if (mapping) 122 if (mapping)
118 spin_unlock(&mapping->i_mmap_lock); 123 spin_unlock(&mapping->i_mmap_lock);
124 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
119} 125}
120 126
121#define LATENCY_LIMIT (64 * PAGE_SIZE) 127#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..5edccd9c9218 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/ptrace.h> 25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
@@ -745,7 +745,7 @@ static unsigned long determine_vm_flags(struct file *file,
745 * it's being traced - otherwise breakpoints set in it may interfere 745 * it's being traced - otherwise breakpoints set in it may interfere
746 * with another untraced process 746 * with another untraced process
747 */ 747 */
748 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) 748 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
749 vm_flags &= ~VM_MAYSHARE; 749 vm_flags &= ~VM_MAYSHARE;
750 750
751 return vm_flags; 751 return vm_flags;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..24de8b65fdbd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1088 if (!mapping) 1088 if (!mapping)
1089 return 1; 1089 return 1;
1090 1090
1091 write_lock_irq(&mapping->tree_lock); 1091 spin_lock_irq(&mapping->tree_lock);
1092 mapping2 = page_mapping(page); 1092 mapping2 = page_mapping(page);
1093 if (mapping2) { /* Race with truncate? */ 1093 if (mapping2) { /* Race with truncate? */
1094 BUG_ON(mapping2 != mapping); 1094 BUG_ON(mapping2 != mapping);
@@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1102 radix_tree_tag_set(&mapping->page_tree, 1102 radix_tree_tag_set(&mapping->page_tree,
1103 page_index(page), PAGECACHE_TAG_DIRTY); 1103 page_index(page), PAGECACHE_TAG_DIRTY);
1104 } 1104 }
1105 write_unlock_irq(&mapping->tree_lock); 1105 spin_unlock_irq(&mapping->tree_lock);
1106 if (mapping->host) { 1106 if (mapping->host) {
1107 /* !PageAnon && !swapper_space */ 1107 /* !PageAnon && !swapper_space */
1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
1258 struct backing_dev_info *bdi = mapping->backing_dev_info; 1258 struct backing_dev_info *bdi = mapping->backing_dev_info;
1259 unsigned long flags; 1259 unsigned long flags;
1260 1260
1261 write_lock_irqsave(&mapping->tree_lock, flags); 1261 spin_lock_irqsave(&mapping->tree_lock, flags);
1262 ret = TestClearPageWriteback(page); 1262 ret = TestClearPageWriteback(page);
1263 if (ret) { 1263 if (ret) {
1264 radix_tree_tag_clear(&mapping->page_tree, 1264 radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
1269 __bdi_writeout_inc(bdi); 1269 __bdi_writeout_inc(bdi);
1270 } 1270 }
1271 } 1271 }
1272 write_unlock_irqrestore(&mapping->tree_lock, flags); 1272 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1273 } else { 1273 } else {
1274 ret = TestClearPageWriteback(page); 1274 ret = TestClearPageWriteback(page);
1275 } 1275 }
@@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
1287 struct backing_dev_info *bdi = mapping->backing_dev_info; 1287 struct backing_dev_info *bdi = mapping->backing_dev_info;
1288 unsigned long flags; 1288 unsigned long flags;
1289 1289
1290 write_lock_irqsave(&mapping->tree_lock, flags); 1290 spin_lock_irqsave(&mapping->tree_lock, flags);
1291 ret = TestSetPageWriteback(page); 1291 ret = TestSetPageWriteback(page);
1292 if (!ret) { 1292 if (!ret) {
1293 radix_tree_tag_set(&mapping->page_tree, 1293 radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
1300 radix_tree_tag_clear(&mapping->page_tree, 1300 radix_tree_tag_clear(&mapping->page_tree,
1301 page_index(page), 1301 page_index(page),
1302 PAGECACHE_TAG_DIRTY); 1302 PAGECACHE_TAG_DIRTY);
1303 write_unlock_irqrestore(&mapping->tree_lock, flags); 1303 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1304 } else { 1304 } else {
1305 ret = TestSetPageWriteback(page); 1305 ret = TestSetPageWriteback(page);
1306 } 1306 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6da667274df5..3cf3d05b6bd4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2372,7 +2372,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2372 2372
2373#endif /* CONFIG_NUMA */ 2373#endif /* CONFIG_NUMA */
2374 2374
2375/* return values int ....just for stop_machine_run() */ 2375/* return values int ....just for stop_machine() */
2376static int __build_all_zonelists(void *dummy) 2376static int __build_all_zonelists(void *dummy)
2377{ 2377{
2378 int nid; 2378 int nid;
@@ -2397,7 +2397,7 @@ void build_all_zonelists(void)
2397 } else { 2397 } else {
2398 /* we have to stop all cpus to guarantee there is no user 2398 /* we have to stop all cpus to guarantee there is no user
2399 of zonelist */ 2399 of zonelist */
2400 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2400 stop_machine(__build_all_zonelists, NULL, NULL);
2401 /* cpuset refresh routine should be here */ 2401 /* cpuset refresh routine should be here */
2402 } 2402 }
2403 vm_total_pages = nr_free_pagecache_pages(); 2403 vm_total_pages = nr_free_pagecache_pages();
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..77e8ddf945e9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
382 if (hit_readahead_marker) { 382 if (hit_readahead_marker) {
383 pgoff_t start; 383 pgoff_t start;
384 384
385 read_lock_irq(&mapping->tree_lock); 385 rcu_read_lock();
386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); 386 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
387 read_unlock_irq(&mapping->tree_lock); 387 rcu_read_unlock();
388 388
389 if (!start || start - offset > max) 389 if (!start || start - offset > max)
390 return 0; 390 return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index abbd29f7c43f..99bc3f9cd796 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 51#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
54 55
@@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
138 anon_vma_free(anon_vma); 139 anon_vma_free(anon_vma);
139} 140}
140 141
141static void anon_vma_ctor(struct kmem_cache *cachep, void *data) 142static void anon_vma_ctor(void *data)
142{ 143{
143 struct anon_vma *anon_vma = data; 144 struct anon_vma *anon_vma = data;
144 145
@@ -287,7 +288,7 @@ static int page_referenced_one(struct page *page,
287 if (vma->vm_flags & VM_LOCKED) { 288 if (vma->vm_flags & VM_LOCKED) {
288 referenced++; 289 referenced++;
289 *mapcount = 1; /* break early from loop */ 290 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte)) 291 } else if (ptep_clear_flush_young_notify(vma, address, pte))
291 referenced++; 292 referenced++;
292 293
293 /* Pretend the page is referenced if the task has the 294 /* Pretend the page is referenced if the task has the
@@ -457,7 +458,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
457 pte_t entry; 458 pte_t entry;
458 459
459 flush_cache_page(vma, address, pte_pfn(*pte)); 460 flush_cache_page(vma, address, pte_pfn(*pte));
460 entry = ptep_clear_flush(vma, address, pte); 461 entry = ptep_clear_flush_notify(vma, address, pte);
461 entry = pte_wrprotect(entry); 462 entry = pte_wrprotect(entry);
462 entry = pte_mkclean(entry); 463 entry = pte_mkclean(entry);
463 set_pte_at(mm, address, pte, entry); 464 set_pte_at(mm, address, pte, entry);
@@ -705,14 +706,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
705 * skipped over this mm) then we should reactivate it. 706 * skipped over this mm) then we should reactivate it.
706 */ 707 */
707 if (!migration && ((vma->vm_flags & VM_LOCKED) || 708 if (!migration && ((vma->vm_flags & VM_LOCKED) ||
708 (ptep_clear_flush_young(vma, address, pte)))) { 709 (ptep_clear_flush_young_notify(vma, address, pte)))) {
709 ret = SWAP_FAIL; 710 ret = SWAP_FAIL;
710 goto out_unmap; 711 goto out_unmap;
711 } 712 }
712 713
713 /* Nuke the page table entry. */ 714 /* Nuke the page table entry. */
714 flush_cache_page(vma, address, page_to_pfn(page)); 715 flush_cache_page(vma, address, page_to_pfn(page));
715 pteval = ptep_clear_flush(vma, address, pte); 716 pteval = ptep_clear_flush_notify(vma, address, pte);
716 717
717 /* Move the dirty bit to the physical page now the pte is gone. */ 718 /* Move the dirty bit to the physical page now the pte is gone. */
718 if (pte_dirty(pteval)) 719 if (pte_dirty(pteval))
@@ -837,12 +838,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
837 page = vm_normal_page(vma, address, *pte); 838 page = vm_normal_page(vma, address, *pte);
838 BUG_ON(!page || PageAnon(page)); 839 BUG_ON(!page || PageAnon(page));
839 840
840 if (ptep_clear_flush_young(vma, address, pte)) 841 if (ptep_clear_flush_young_notify(vma, address, pte))
841 continue; 842 continue;
842 843
843 /* Nuke the page table entry. */ 844 /* Nuke the page table entry. */
844 flush_cache_page(vma, address, pte_pfn(*pte)); 845 flush_cache_page(vma, address, pte_pfn(*pte));
845 pteval = ptep_clear_flush(vma, address, pte); 846 pteval = ptep_clear_flush_notify(vma, address, pte);
846 847
847 /* If nonlinear, store the file page offset in the pte. */ 848 /* If nonlinear, store the file page offset in the pte. */
848 if (page->index != linear_page_index(vma, address)) 849 if (page->index != linear_page_index(vma, address))
diff --git a/mm/shmem.c b/mm/shmem.c
index f92fea94d037..c1e5a3b4f758 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -936,7 +936,7 @@ found:
936 spin_lock(&info->lock); 936 spin_lock(&info->lock);
937 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
938 if (ptr && ptr->val == entry.val) { 938 if (ptr && ptr->val == entry.val) {
939 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache_locked(page, inode->i_mapping,
940 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */ 941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */ 942 } else /* we must compensate for our precharge above */
@@ -1301,8 +1301,8 @@ repeat:
1301 SetPageUptodate(filepage); 1301 SetPageUptodate(filepage);
1302 set_page_dirty(filepage); 1302 set_page_dirty(filepage);
1303 swap_free(swap); 1303 swap_free(swap);
1304 } else if (!(error = add_to_page_cache( 1304 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1305 swappage, mapping, idx, GFP_NOWAIT))) { 1305 idx, GFP_NOWAIT))) {
1306 info->flags |= SHMEM_PAGEIN; 1306 info->flags |= SHMEM_PAGEIN;
1307 shmem_swp_set(info, entry, 0); 1307 shmem_swp_set(info, entry, 0);
1308 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
@@ -1513,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1513 inode->i_uid = current->fsuid; 1513 inode->i_uid = current->fsuid;
1514 inode->i_gid = current->fsgid; 1514 inode->i_gid = current->fsgid;
1515 inode->i_blocks = 0; 1515 inode->i_blocks = 0;
1516 inode->i_mapping->a_ops = &shmem_aops;
1517 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1516 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1518 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1517 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1519 inode->i_generation = get_seconds(); 1518 inode->i_generation = get_seconds();
@@ -1528,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1528 init_special_inode(inode, mode, dev); 1527 init_special_inode(inode, mode, dev);
1529 break; 1528 break;
1530 case S_IFREG: 1529 case S_IFREG:
1530 inode->i_mapping->a_ops = &shmem_aops;
1531 inode->i_op = &shmem_inode_operations; 1531 inode->i_op = &shmem_inode_operations;
1532 inode->i_fop = &shmem_file_operations; 1532 inode->i_fop = &shmem_file_operations;
1533 mpol_shared_policy_init(&info->policy, 1533 mpol_shared_policy_init(&info->policy,
@@ -1929,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1929 return error; 1929 return error;
1930 } 1930 }
1931 unlock_page(page); 1931 unlock_page(page);
1932 inode->i_mapping->a_ops = &shmem_aops;
1932 inode->i_op = &shmem_symlink_inode_operations; 1933 inode->i_op = &shmem_symlink_inode_operations;
1933 kaddr = kmap_atomic(page, KM_USER0); 1934 kaddr = kmap_atomic(page, KM_USER0);
1934 memcpy(kaddr, symname, len); 1935 memcpy(kaddr, symname, len);
@@ -2352,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode)
2352 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2353 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2353} 2354}
2354 2355
2355static void init_once(struct kmem_cache *cachep, void *foo) 2356static void init_once(void *foo)
2356{ 2357{
2357 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2358 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2358 2359
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
191 * shmem_permission - permission() inode operation 191 * shmem_permission - permission() inode operation
192 */ 192 */
193int 193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd) 194shmem_permission(struct inode *inode, int mask)
195{ 195{
196 return generic_permission(inode, mask, shmem_check_acl); 196 return generic_permission(inode, mask, shmem_check_acl);
197} 197}
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..918f04f7fef1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
406 unsigned int dflags; /* dynamic flags */ 406 unsigned int dflags; /* dynamic flags */
407 407
408 /* constructor func */ 408 /* constructor func */
409 void (*ctor)(struct kmem_cache *, void *); 409 void (*ctor)(void *obj);
410 410
411/* 5) cache creation/removal */ 411/* 5) cache creation/removal */
412 const char *name; 412 const char *name;
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2137 */ 2137 */
2138struct kmem_cache * 2138struct kmem_cache *
2139kmem_cache_create (const char *name, size_t size, size_t align, 2139kmem_cache_create (const char *name, size_t size, size_t align,
2140 unsigned long flags, 2140 unsigned long flags, void (*ctor)(void *))
2141 void (*ctor)(struct kmem_cache *, void *))
2142{ 2141{
2143 size_t left_over, slab_size, ralign; 2142 size_t left_over, slab_size, ralign;
2144 struct kmem_cache *cachep = NULL, *pc; 2143 struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2653 * They must also be threaded. 2652 * They must also be threaded.
2654 */ 2653 */
2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2654 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2656 cachep->ctor(cachep, objp + obj_offset(cachep)); 2655 cachep->ctor(objp + obj_offset(cachep));
2657 2656
2658 if (cachep->flags & SLAB_RED_ZONE) { 2657 if (cachep->flags & SLAB_RED_ZONE) {
2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2658 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2669 cachep->buffer_size / PAGE_SIZE, 0); 2668 cachep->buffer_size / PAGE_SIZE, 0);
2670#else 2669#else
2671 if (cachep->ctor) 2670 if (cachep->ctor)
2672 cachep->ctor(cachep, objp); 2671 cachep->ctor(objp);
2673#endif 2672#endif
2674 slab_bufctl(slabp)[i] = i + 1; 2673 slab_bufctl(slabp)[i] = i + 1;
2675 } 2674 }
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3093#endif 3092#endif
3094 objp += obj_offset(cachep); 3093 objp += obj_offset(cachep);
3095 if (cachep->ctor && cachep->flags & SLAB_POISON) 3094 if (cachep->ctor && cachep->flags & SLAB_POISON)
3096 cachep->ctor(cachep, objp); 3095 cachep->ctor(objp);
3097#if ARCH_SLAB_MINALIGN 3096#if ARCH_SLAB_MINALIGN
3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3097 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3098 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
diff --git a/mm/slob.c b/mm/slob.c
index de268eb7ac70..d8fbd4d1bfa7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -525,12 +525,11 @@ struct kmem_cache {
525 unsigned int size, align; 525 unsigned int size, align;
526 unsigned long flags; 526 unsigned long flags;
527 const char *name; 527 const char *name;
528 void (*ctor)(struct kmem_cache *, void *); 528 void (*ctor)(void *);
529}; 529};
530 530
531struct kmem_cache *kmem_cache_create(const char *name, size_t size, 531struct kmem_cache *kmem_cache_create(const char *name, size_t size,
532 size_t align, unsigned long flags, 532 size_t align, unsigned long flags, void (*ctor)(void *))
533 void (*ctor)(struct kmem_cache *, void *))
534{ 533{
535 struct kmem_cache *c; 534 struct kmem_cache *c;
536 535
@@ -575,7 +574,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
575 b = slob_new_page(flags, get_order(c->size), node); 574 b = slob_new_page(flags, get_order(c->size), node);
576 575
577 if (c->ctor) 576 if (c->ctor)
578 c->ctor(c, b); 577 c->ctor(b);
579 578
580 return b; 579 return b;
581} 580}
diff --git a/mm/slub.c b/mm/slub.c
index 77c21cf53ff9..b7e2cd5d82db 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
1012 1012
1013static unsigned long kmem_cache_flags(unsigned long objsize, 1013static unsigned long kmem_cache_flags(unsigned long objsize,
1014 unsigned long flags, const char *name, 1014 unsigned long flags, const char *name,
1015 void (*ctor)(struct kmem_cache *, void *)) 1015 void (*ctor)(void *))
1016{ 1016{
1017 /* 1017 /*
1018 * Enable debugging if selected on the kernel commandline. 1018 * Enable debugging if selected on the kernel commandline.
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1041static inline unsigned long kmem_cache_flags(unsigned long objsize, 1041static inline unsigned long kmem_cache_flags(unsigned long objsize,
1042 unsigned long flags, const char *name, 1042 unsigned long flags, const char *name,
1043 void (*ctor)(struct kmem_cache *, void *)) 1043 void (*ctor)(void *))
1044{ 1044{
1045 return flags; 1045 return flags;
1046} 1046}
@@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1103{ 1103{
1104 setup_object_debug(s, page, object); 1104 setup_object_debug(s, page, object);
1105 if (unlikely(s->ctor)) 1105 if (unlikely(s->ctor))
1106 s->ctor(s, object); 1106 s->ctor(object);
1107} 1107}
1108 1108
1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -2286,7 +2286,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2286static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2286static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2287 const char *name, size_t size, 2287 const char *name, size_t size,
2288 size_t align, unsigned long flags, 2288 size_t align, unsigned long flags,
2289 void (*ctor)(struct kmem_cache *, void *)) 2289 void (*ctor)(void *))
2290{ 2290{
2291 memset(s, 0, kmem_size); 2291 memset(s, 0, kmem_size);
2292 s->name = name; 2292 s->name = name;
@@ -3042,7 +3042,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3042 3042
3043static struct kmem_cache *find_mergeable(size_t size, 3043static struct kmem_cache *find_mergeable(size_t size,
3044 size_t align, unsigned long flags, const char *name, 3044 size_t align, unsigned long flags, const char *name,
3045 void (*ctor)(struct kmem_cache *, void *)) 3045 void (*ctor)(void *))
3046{ 3046{
3047 struct kmem_cache *s; 3047 struct kmem_cache *s;
3048 3048
@@ -3082,8 +3082,7 @@ static struct kmem_cache *find_mergeable(size_t size,
3082} 3082}
3083 3083
3084struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3084struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3085 size_t align, unsigned long flags, 3085 size_t align, unsigned long flags, void (*ctor)(void *))
3086 void (*ctor)(struct kmem_cache *, void *))
3087{ 3086{
3088 struct kmem_cache *s; 3087 struct kmem_cache *s;
3089 3088
diff --git a/mm/sparse.c b/mm/sparse.c
index 8ffc08990008..5d9dbbb9d39e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -377,7 +377,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
377} 377}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 379
380struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 380static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 381{
382 struct page *map; 382 struct page *map;
383 struct mem_section *ms = __nr_to_section(pnum); 383 struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..b8035b055129 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
39 39
40struct address_space swapper_space = { 40struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
@@ -56,7 +56,8 @@ static struct {
56 56
57void show_swap_cache_info(void) 57void show_swap_cache_info(void)
58{ 58{
59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
@@ -64,7 +65,7 @@ void show_swap_cache_info(void)
64} 65}
65 66
66/* 67/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
69 */ 70 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
76 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask); 78 error = radix_tree_preload(gfp_mask);
78 if (!error) { 79 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock); 80 page_cache_get(page);
81 SetPageSwapCache(page);
82 set_page_private(page, entry.val);
83
84 spin_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 85 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 86 entry.val, page);
82 if (!error) { 87 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 88 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 89 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 90 INC_CACHE_INFO(add_total);
89 } 91 }
90 write_unlock_irq(&swapper_space.tree_lock); 92 spin_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 93 radix_tree_preload_end();
94
95 if (unlikely(error)) {
96 set_page_private(page, 0UL);
97 ClearPageSwapCache(page);
98 page_cache_release(page);
99 }
92 } 100 }
93 return error; 101 return error;
94} 102}
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page)
175 183
176 entry.val = page_private(page); 184 entry.val = page_private(page);
177 185
178 write_lock_irq(&swapper_space.tree_lock); 186 spin_lock_irq(&swapper_space.tree_lock);
179 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
181 189
182 swap_free(entry); 190 swap_free(entry);
183 page_cache_release(page); 191 page_cache_release(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2f33edb8bee9..6beb6251e99d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,8 +33,8 @@
33#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <linux/swapops.h> 34#include <linux/swapops.h>
35 35
36DEFINE_SPINLOCK(swap_lock); 36static DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37static unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority; 40static int least_priority;
@@ -44,7 +44,7 @@ static const char Unused_file[] = "Unused swap file entry ";
44static const char Bad_offset[] = "Bad swap offset entry "; 44static const char Bad_offset[] = "Bad swap offset entry ";
45static const char Unused_offset[] = "Unused swap offset entry "; 45static const char Unused_offset[] = "Unused swap offset entry ";
46 46
47struct swap_list_t swap_list = {-1, -1}; 47static struct swap_list_t swap_list = {-1, -1};
48 48
49static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49static struct swap_info_struct swap_info[MAX_SWAPFILES];
50 50
@@ -369,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
369 retval = 0; 369 retval = 0;
370 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
371 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
372 write_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == 2) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
375 SetPageDirty(page); 375 SetPageDirty(page);
376 retval = 1; 376 retval = 1;
377 } 377 }
378 write_unlock_irq(&swapper_space.tree_lock); 378 spin_unlock_irq(&swapper_space.tree_lock);
379 } 379 }
380 spin_unlock(&swap_lock); 380 spin_unlock(&swap_lock);
381 381
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..e68443d74567 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -349,18 +349,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
350 return 0; 350 return 0;
351 351
352 write_lock_irq(&mapping->tree_lock); 352 spin_lock_irq(&mapping->tree_lock);
353 if (PageDirty(page)) 353 if (PageDirty(page))
354 goto failed; 354 goto failed;
355 355
356 BUG_ON(PagePrivate(page)); 356 BUG_ON(PagePrivate(page));
357 __remove_from_page_cache(page); 357 __remove_from_page_cache(page);
358 write_unlock_irq(&mapping->tree_lock); 358 spin_unlock_irq(&mapping->tree_lock);
359 ClearPageUptodate(page); 359 ClearPageUptodate(page);
360 page_cache_release(page); /* pagecache ref */ 360 page_cache_release(page); /* pagecache ref */
361 return 1; 361 return 1;
362failed: 362failed:
363 write_unlock_irq(&mapping->tree_lock); 363 spin_unlock_irq(&mapping->tree_lock);
364 return 0; 364 return 0;
365} 365}
366 366
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..9341ca77bd88 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
1#include <linux/mm.h>
1#include <linux/slab.h> 2#include <linux/slab.h>
2#include <linux/string.h> 3#include <linux/string.h>
3#include <linux/module.h> 4#include <linux/module.h>
4#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h>
5#include <asm/uaccess.h> 7#include <asm/uaccess.h>
6 8
7/** 9/**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
68EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
69 71
70/** 72/**
71 * krealloc - reallocate memory. The contents will remain unchanged. 73 * __krealloc - like krealloc() but don't free @p.
72 * @p: object to reallocate memory for. 74 * @p: object to reallocate memory for.
73 * @new_size: how many bytes of memory are required. 75 * @new_size: how many bytes of memory are required.
74 * @flags: the type of memory to allocate. 76 * @flags: the type of memory to allocate.
75 * 77 *
76 * The contents of the object pointed to are preserved up to the 78 * This function is like krealloc() except it never frees the originally
77 * lesser of the new and old sizes. If @p is %NULL, krealloc() 79 * allocated buffer. Use this if you don't want to free the buffer immediately
78 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 80 * like, for example, with RCU.
79 * %NULL pointer, the object pointed to is freed.
80 */ 81 */
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 82void *__krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 83{
83 void *ret; 84 void *ret;
84 size_t ks = 0; 85 size_t ks = 0;
85 86
86 if (unlikely(!new_size)) { 87 if (unlikely(!new_size))
87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 }
90 89
91 if (p) 90 if (p)
92 ks = ksize(p); 91 ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
95 return (void *)p; 94 return (void *)p;
96 95
97 ret = kmalloc_track_caller(new_size, flags); 96 ret = kmalloc_track_caller(new_size, flags);
98 if (ret && p) { 97 if (ret && p)
99 memcpy(ret, p, ks); 98 memcpy(ret, p, ks);
99
100 return ret;
101}
102EXPORT_SYMBOL(__krealloc);
103
104/**
105 * krealloc - reallocate memory. The contents will remain unchanged.
106 * @p: object to reallocate memory for.
107 * @new_size: how many bytes of memory are required.
108 * @flags: the type of memory to allocate.
109 *
110 * The contents of the object pointed to are preserved up to the
111 * lesser of the new and old sizes. If @p is %NULL, krealloc()
112 * behaves exactly like kmalloc(). If @size is 0 and @p is not a
113 * %NULL pointer, the object pointed to is freed.
114 */
115void *krealloc(const void *p, size_t new_size, gfp_t flags)
116{
117 void *ret;
118
119 if (unlikely(!new_size)) {
100 kfree(p); 120 kfree(p);
121 return ZERO_SIZE_PTR;
101 } 122 }
123
124 ret = __krealloc(p, new_size, flags);
125 if (ret && p != ret)
126 kfree(p);
127
102 return ret; 128 return ret;
103} 129}
104EXPORT_SYMBOL(krealloc); 130EXPORT_SYMBOL(krealloc);
@@ -136,3 +162,12 @@ char *strndup_user(const char __user *s, long n)
136 return p; 162 return p;
137} 163}
138EXPORT_SYMBOL(strndup_user); 164EXPORT_SYMBOL(strndup_user);
165
166#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
167void arch_pick_mmap_layout(struct mm_struct *mm)
168{
169 mm->mmap_base = TASK_UNMAPPED_BASE;
170 mm->get_unmapped_area = arch_get_unmapped_area;
171 mm->unmap_area = arch_unmap_area;
172}
173#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35f293816294..85b9a0d2c877 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -381,16 +381,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
381 return; 381 return;
382 382
383 if ((PAGE_SIZE-1) & (unsigned long)addr) { 383 if ((PAGE_SIZE-1) & (unsigned long)addr) {
384 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 384 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
385 WARN_ON(1);
386 return; 385 return;
387 } 386 }
388 387
389 area = remove_vm_area(addr); 388 area = remove_vm_area(addr);
390 if (unlikely(!area)) { 389 if (unlikely(!area)) {
391 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 390 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
392 addr); 391 addr);
393 WARN_ON(1);
394 return; 392 return;
395 } 393 }
396 394
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26672c6cd3ce..8f71761bc4b7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -391,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
391} 391}
392 392
393/* 393/*
394 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 394 * Same as remove_mapping, but if the page is removed from the mapping, it
395 * someone else has a ref on the page, abort and return 0. If it was 395 * gets returned with a refcount of 0.
396 * successfully detached, return 1. Assumes the caller has a single ref on
397 * this page.
398 */ 396 */
399int remove_mapping(struct address_space *mapping, struct page *page) 397static int __remove_mapping(struct address_space *mapping, struct page *page)
400{ 398{
401 BUG_ON(!PageLocked(page)); 399 BUG_ON(!PageLocked(page));
402 BUG_ON(mapping != page_mapping(page)); 400 BUG_ON(mapping != page_mapping(page));
403 401
404 write_lock_irq(&mapping->tree_lock); 402 spin_lock_irq(&mapping->tree_lock);
405 /* 403 /*
406 * The non racy check for a busy page. 404 * The non racy check for a busy page.
407 * 405 *
@@ -427,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
427 * Note that if SetPageDirty is always performed via set_page_dirty, 425 * Note that if SetPageDirty is always performed via set_page_dirty,
428 * and thus under tree_lock, then this ordering is not required. 426 * and thus under tree_lock, then this ordering is not required.
429 */ 427 */
430 if (unlikely(page_count(page) != 2)) 428 if (!page_freeze_refs(page, 2))
431 goto cannot_free; 429 goto cannot_free;
432 smp_rmb(); 430 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
433 if (unlikely(PageDirty(page))) 431 if (unlikely(PageDirty(page))) {
432 page_unfreeze_refs(page, 2);
434 goto cannot_free; 433 goto cannot_free;
434 }
435 435
436 if (PageSwapCache(page)) { 436 if (PageSwapCache(page)) {
437 swp_entry_t swap = { .val = page_private(page) }; 437 swp_entry_t swap = { .val = page_private(page) };
438 __delete_from_swap_cache(page); 438 __delete_from_swap_cache(page);
439 write_unlock_irq(&mapping->tree_lock); 439 spin_unlock_irq(&mapping->tree_lock);
440 swap_free(swap); 440 swap_free(swap);
441 __put_page(page); /* The pagecache ref */ 441 } else {
442 return 1; 442 __remove_from_page_cache(page);
443 spin_unlock_irq(&mapping->tree_lock);
443 } 444 }
444 445
445 __remove_from_page_cache(page);
446 write_unlock_irq(&mapping->tree_lock);
447 __put_page(page);
448 return 1; 446 return 1;
449 447
450cannot_free: 448cannot_free:
451 write_unlock_irq(&mapping->tree_lock); 449 spin_unlock_irq(&mapping->tree_lock);
450 return 0;
451}
452
453/*
454 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
455 * someone else has a ref on the page, abort and return 0. If it was
456 * successfully detached, return 1. Assumes the caller has a single ref on
457 * this page.
458 */
459int remove_mapping(struct address_space *mapping, struct page *page)
460{
461 if (__remove_mapping(mapping, page)) {
462 /*
463 * Unfreezing the refcount with 1 rather than 2 effectively
464 * drops the pagecache ref for us without requiring another
465 * atomic operation.
466 */
467 page_unfreeze_refs(page, 1);
468 return 1;
469 }
452 return 0; 470 return 0;
453} 471}
454 472
@@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
598 if (PagePrivate(page)) { 616 if (PagePrivate(page)) {
599 if (!try_to_release_page(page, sc->gfp_mask)) 617 if (!try_to_release_page(page, sc->gfp_mask))
600 goto activate_locked; 618 goto activate_locked;
601 if (!mapping && page_count(page) == 1) 619 if (!mapping && page_count(page) == 1) {
602 goto free_it; 620 unlock_page(page);
621 if (put_page_testzero(page))
622 goto free_it;
623 else {
624 /*
625 * rare race with speculative reference.
626 * the speculative reference will free
627 * this page shortly, so we may
628 * increment nr_reclaimed here (and
629 * leave it off the LRU).
630 */
631 nr_reclaimed++;
632 continue;
633 }
634 }
603 } 635 }
604 636
605 if (!mapping || !remove_mapping(mapping, page)) 637 if (!mapping || !__remove_mapping(mapping, page))
606 goto keep_locked; 638 goto keep_locked;
607 639
608free_it:
609 unlock_page(page); 640 unlock_page(page);
641free_it:
610 nr_reclaimed++; 642 nr_reclaimed++;
611 if (!pagevec_add(&freed_pvec, page)) 643 if (!pagevec_add(&freed_pvec, page)) {
612 __pagevec_release_nonlru(&freed_pvec); 644 __pagevec_free(&freed_pvec);
645 pagevec_reinit(&freed_pvec);
646 }
613 continue; 647 continue;
614 648
615activate_locked: 649activate_locked:
@@ -623,7 +657,7 @@ keep:
623 } 657 }
624 list_splice(&ret_pages, page_list); 658 list_splice(&ret_pages, page_list);
625 if (pagevec_count(&freed_pvec)) 659 if (pagevec_count(&freed_pvec))
626 __pagevec_release_nonlru(&freed_pvec); 660 __pagevec_free(&freed_pvec);
627 count_vm_events(PGACTIVATE, pgactivate); 661 count_vm_events(PGACTIVATE, pgactivate);
628 return nr_reclaimed; 662 return nr_reclaimed;
629} 663}