Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             21
-rw-r--r--  mm/Makefile             2
-rw-r--r--  mm/bootmem.c            1
-rw-r--r--  mm/filemap.c           24
-rw-r--r--  mm/filemap_xip.c       22
-rw-r--r--  mm/fremap.c            86
-rw-r--r--  mm/highmem.c           14
-rw-r--r--  mm/hugetlb.c          207
-rw-r--r--  mm/madvise.c            2
-rw-r--r--  mm/memory.c           993
-rw-r--r--  mm/memory_hotplug.c   138
-rw-r--r--  mm/mempolicy.c        463
-rw-r--r--  mm/mempool.c            2
-rw-r--r--  mm/mmap.c             128
-rw-r--r--  mm/mprotect.c          19
-rw-r--r--  mm/mremap.c           193
-rw-r--r--  mm/msync.c             78
-rw-r--r--  mm/nommu.c             18
-rw-r--r--  mm/page_alloc.c       240
-rw-r--r--  mm/page_io.c            6
-rw-r--r--  mm/pdflush.c           13
-rw-r--r--  mm/rmap.c             146
-rw-r--r--  mm/shmem.c             32
-rw-r--r--  mm/slab.c              13
-rw-r--r--  mm/sparse.c            99
-rw-r--r--  mm/swap.c               7
-rw-r--r--  mm/swap_state.c        11
-rw-r--r--  mm/swapfile.c          41
-rw-r--r--  mm/thrash.c             2
-rw-r--r--  mm/tiny-shmem.c         5
-rw-r--r--  mm/truncate.c          11
-rw-r--r--  mm/vmalloc.c           77
-rw-r--r--  mm/vmscan.c            14
33 files changed, 1746 insertions, 1382 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 391ffc54d136..1a4473fcb2ca 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -111,3 +111,24 @@ config SPARSEMEM_STATIC
111config SPARSEMEM_EXTREME 111config SPARSEMEM_EXTREME
112 def_bool y 112 def_bool y
113 depends on SPARSEMEM && !SPARSEMEM_STATIC 113 depends on SPARSEMEM && !SPARSEMEM_STATIC
114
115# eventually, we can have this option just 'select SPARSEMEM'
116config MEMORY_HOTPLUG
117 bool "Allow for memory hot-add"
118 depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
119
120comment "Memory hotplug is currently incompatible with Software Suspend"
121 depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
122
123# Heavily threaded applications may benefit from splitting the mm-wide
124# page_table_lock, so that faults on different parts of the user address
125# space can be handled with less contention: split it at this NR_CPUS.
126# Default to 4 for wider testing, though 8 might be more appropriate.
127# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
128# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
129#
130config SPLIT_PTLOCK_CPUS
131 int
132 default "4096" if ARM && !CPU_CACHE_VIPT
133 default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
134 default "4"
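The SPLIT_PTLOCK_CPUS value above is only a threshold; the code that consumes it lives in the mm headers, outside this diff. Roughly, once NR_CPUS reaches the threshold each page-table page carries its own spinlock in its struct page, otherwise everything falls back to the single mm->page_table_lock. The sketch below is illustrative only: the macro bodies and the ->ptl field name are assumptions made for the sketch, not quotations from the patch.

/*
 * Illustrative sketch of how the threshold might be consumed; the real
 * definitions live in the mm headers, outside this diff, and the ->ptl
 * field name is an assumption.
 */
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
#define pte_lockptr(mm, pmd)    (&pmd_page(*(pmd))->ptl)        /* per page-table page */
#else
#define pte_lockptr(mm, pmd)    (&(mm)->page_table_lock)        /* one lock per mm */
#endif

#define pte_offset_map_lock(mm, pmd, addr, ptlp)        \
({                                                      \
        spinlock_t *__ptl = pte_lockptr(mm, pmd);       \
        pte_t *__pte = pte_offset_map(pmd, addr);       \
        *(ptlp) = __ptl;                                \
        spin_lock(__ptl);                               \
        __pte;                                          \
})

#define pte_unmap_unlock(pte, ptl)      do {            \
        spin_unlock(ptl);                               \
        pte_unmap(pte);                                 \
} while (0)

Every pte walk later in this diff (copy_pte_range, zap_pte_range, install_page) takes and releases the lock through this pair, so the choice of lock stays transparent to the callers.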
diff --git a/mm/Makefile b/mm/Makefile
index 4cd69e3ce421..2fa6d2ca9f28 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o 18obj-$(CONFIG_SPARSEMEM) += sparse.o
19obj-$(CONFIG_SHMEM) += shmem.o 19obj-$(CONFIG_SHMEM) += shmem.o
20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
21 21obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
22obj-$(CONFIG_FS_XIP) += filemap_xip.o 22obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a58699b6579e..e8c567177dcf 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
305 if (j + 16 < BITS_PER_LONG) 305 if (j + 16 < BITS_PER_LONG)
306 prefetchw(page + j + 16); 306 prefetchw(page + j + 16);
307 __ClearPageReserved(page + j); 307 __ClearPageReserved(page + j);
308 set_page_count(page + j, 0);
308 } 309 }
309 __free_pages(page, order); 310 __free_pages(page, order);
310 i += BITS_PER_LONG; 311 i += BITS_PER_LONG;
diff --git a/mm/filemap.c b/mm/filemap.c
index b5346576e58d..5d6e4c2000dc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
66 * 66 *
67 * ->mmap_sem 67 * ->mmap_sem
68 * ->i_mmap_lock 68 * ->i_mmap_lock
69 * ->page_table_lock (various places, mainly in mmap.c) 69 * ->page_table_lock or pte_lock (various, mainly in memory.c)
70 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 70 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
71 * 71 *
72 * ->mmap_sem 72 * ->mmap_sem
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
86 * ->anon_vma.lock (vma_adjust) 86 * ->anon_vma.lock (vma_adjust)
87 * 87 *
88 * ->anon_vma.lock 88 * ->anon_vma.lock
89 * ->page_table_lock (anon_vma_prepare and various) 89 * ->page_table_lock or pte_lock (anon_vma_prepare and various)
90 * 90 *
91 * ->page_table_lock 91 * ->page_table_lock or pte_lock
92 * ->swap_lock (try_to_unmap_one) 92 * ->swap_lock (try_to_unmap_one)
93 * ->private_lock (try_to_unmap_one) 93 * ->private_lock (try_to_unmap_one)
94 * ->tree_lock (try_to_unmap_one) 94 * ->tree_lock (try_to_unmap_one)
@@ -152,7 +152,7 @@ static int sync_page(void *word)
152 * in the ->sync_page() methods make essential use of the 152 * in the ->sync_page() methods make essential use of the
153 * page_mapping(), merely passing the page down to the backing 153 * page_mapping(), merely passing the page down to the backing
154 * device's unplug functions when it's non-NULL, which in turn 154 * device's unplug functions when it's non-NULL, which in turn
155 * ignore it for all cases but swap, where only page->private is 155 * ignore it for all cases but swap, where only page_private(page) is
156 * of interest. When page_mapping() does go NULL, the entire 156 * of interest. When page_mapping() does go NULL, the entire
157 * call stack gracefully ignores the page and returns. 157 * call stack gracefully ignores the page and returns.
158 * -- wli 158 * -- wli
@@ -377,7 +377,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
377 * This function does not add the page to the LRU. The caller must do that. 377 * This function does not add the page to the LRU. The caller must do that.
378 */ 378 */
379int add_to_page_cache(struct page *page, struct address_space *mapping, 379int add_to_page_cache(struct page *page, struct address_space *mapping,
380 pgoff_t offset, int gfp_mask) 380 pgoff_t offset, gfp_t gfp_mask)
381{ 381{
382 int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 382 int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
383 383
@@ -401,7 +401,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
401EXPORT_SYMBOL(add_to_page_cache); 401EXPORT_SYMBOL(add_to_page_cache);
402 402
403int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 403int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
404 pgoff_t offset, int gfp_mask) 404 pgoff_t offset, gfp_t gfp_mask)
405{ 405{
406 int ret = add_to_page_cache(page, mapping, offset, gfp_mask); 406 int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
407 if (ret == 0) 407 if (ret == 0)
@@ -591,7 +591,7 @@ EXPORT_SYMBOL(find_lock_page);
591 * memory exhaustion. 591 * memory exhaustion.
592 */ 592 */
593struct page *find_or_create_page(struct address_space *mapping, 593struct page *find_or_create_page(struct address_space *mapping,
594 unsigned long index, unsigned int gfp_mask) 594 unsigned long index, gfp_t gfp_mask)
595{ 595{
596 struct page *page, *cached_page = NULL; 596 struct page *page, *cached_page = NULL;
597 int err; 597 int err;
@@ -683,7 +683,7 @@ struct page *
683grab_cache_page_nowait(struct address_space *mapping, unsigned long index) 683grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
684{ 684{
685 struct page *page = find_get_page(mapping, index); 685 struct page *page = find_get_page(mapping, index);
686 unsigned int gfp_mask; 686 gfp_t gfp_mask;
687 687
688 if (page) { 688 if (page) {
689 if (!TestSetPageLocked(page)) 689 if (!TestSetPageLocked(page))
@@ -1030,8 +1030,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1030 desc.error = 0; 1030 desc.error = 0;
1031 do_generic_file_read(filp,ppos,&desc,file_read_actor); 1031 do_generic_file_read(filp,ppos,&desc,file_read_actor);
1032 retval += desc.written; 1032 retval += desc.written;
1033 if (!retval) { 1033 if (desc.error) {
1034 retval = desc.error; 1034 retval = retval ?: desc.error;
1035 break; 1035 break;
1036 } 1036 }
1037 } 1037 }
@@ -1520,7 +1520,7 @@ repeat:
1520 page_cache_release(page); 1520 page_cache_release(page);
1521 return err; 1521 return err;
1522 } 1522 }
1523 } else { 1523 } else if (vma->vm_flags & VM_NONLINEAR) {
1524 /* No page was found just because we can't read it in now (being 1524 /* No page was found just because we can't read it in now (being
1525 * here implies nonblock != 0), but the page may exist, so set 1525 * here implies nonblock != 0), but the page may exist, so set
1526 * the PTE to fault it in later. */ 1526 * the PTE to fault it in later. */
@@ -1537,6 +1537,7 @@ repeat:
1537 1537
1538 return 0; 1538 return 0;
1539} 1539}
1540EXPORT_SYMBOL(filemap_populate);
1540 1541
1541struct vm_operations_struct generic_file_vm_ops = { 1542struct vm_operations_struct generic_file_vm_ops = {
1542 .nopage = filemap_nopage, 1543 .nopage = filemap_nopage,
@@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1555 vma->vm_ops = &generic_file_vm_ops; 1556 vma->vm_ops = &generic_file_vm_ops;
1556 return 0; 1557 return 0;
1557} 1558}
1558EXPORT_SYMBOL(filemap_populate);
1559 1559
1560/* 1560/*
1561 * This is for filesystems which do not implement ->writepage. 1561 * This is for filesystems which do not implement ->writepage.
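In the __generic_file_aio_read() hunk above, the loop now breaks out as soon as desc.error is set, but still reports the bytes already copied when there are any: "retval = retval ?: desc.error" uses the GNU "a ?: b" shorthand for "a ? a : b". A small userspace illustration of that semantics (not kernel code):

#include <stdio.h>

/*
 * Userspace illustration of "retval = retval ?: desc.error": report the
 * byte count if anything was transferred, otherwise the error code.
 * (a ?: b is the GNU shorthand for a ? a : b.)
 */
static long finish_read(long copied, long error)
{
        return copied ?: error;
}

int main(void)
{
        printf("%ld\n", finish_read(4096, -5));  /* partial read => 4096 */
        printf("%ld\n", finish_read(0, -5));     /* nothing copied => -5 (-EIO) */
        return 0;
}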
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8c199f537732..9cf687e4a29a 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,8 @@ __xip_unmap (struct address_space * mapping,
174 unsigned long address; 174 unsigned long address;
175 pte_t *pte; 175 pte_t *pte;
176 pte_t pteval; 176 pte_t pteval;
177 spinlock_t *ptl;
178 struct page *page;
177 179
178 spin_lock(&mapping->i_mmap_lock); 180 spin_lock(&mapping->i_mmap_lock);
179 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 181 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -181,19 +183,17 @@ __xip_unmap (struct address_space * mapping,
181 address = vma->vm_start + 183 address = vma->vm_start +
182 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 184 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
183 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 185 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
184 /* 186 page = ZERO_PAGE(address);
185 * We need the page_table_lock to protect us from page faults, 187 pte = page_check_address(page, mm, address, &ptl);
186 * munmap, fork, etc... 188 if (pte) {
187 */
188 pte = page_check_address(ZERO_PAGE(address), mm,
189 address);
190 if (!IS_ERR(pte)) {
191 /* Nuke the page table entry. */ 189 /* Nuke the page table entry. */
192 flush_cache_page(vma, address, pte_pfn(*pte)); 190 flush_cache_page(vma, address, pte_pfn(*pte));
193 pteval = ptep_clear_flush(vma, address, pte); 191 pteval = ptep_clear_flush(vma, address, pte);
192 page_remove_rmap(page);
193 dec_mm_counter(mm, file_rss);
194 BUG_ON(pte_dirty(pteval)); 194 BUG_ON(pte_dirty(pteval));
195 pte_unmap(pte); 195 pte_unmap_unlock(pte, ptl);
196 spin_unlock(&mm->page_table_lock); 196 page_cache_release(page);
197 } 197 }
198 } 198 }
199 spin_unlock(&mapping->i_mmap_lock); 199 spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area,
228 228
229 page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); 229 page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
230 if (!IS_ERR(page)) { 230 if (!IS_ERR(page)) {
231 return page; 231 goto out;
232 } 232 }
233 if (PTR_ERR(page) != -ENODATA) 233 if (PTR_ERR(page) != -ENODATA)
234 return NULL; 234 return NULL;
@@ -249,6 +249,8 @@ xip_file_nopage(struct vm_area_struct * area,
249 page = ZERO_PAGE(address); 249 page = ZERO_PAGE(address);
250 } 250 }
251 251
252out:
253 page_cache_get(page);
252 return page; 254 return page;
253} 255}
254 256
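__xip_unmap() above now leans on page_check_address() to find and lock the pte (returning it mapped, with *ptl pointing at the lock taken) instead of taking mm->page_table_lock by hand. The caller pattern, condensed from the hunk, looks like this; unmap_zero_page() is just a name chosen for the sketch:

/*
 * Caller pattern condensed from the __xip_unmap() hunk above:
 * page_check_address() returns the pte mapped and locked (or NULL),
 * and the caller drops both with pte_unmap_unlock().
 */
static void unmap_zero_page(struct page *page, struct mm_struct *mm,
                            struct vm_area_struct *vma, unsigned long address)
{
        spinlock_t *ptl;
        pte_t *pte = page_check_address(page, mm, address, &ptl);

        if (!pte)
                return;
        flush_cache_page(vma, address, pte_pfn(*pte));
        ptep_clear_flush(vma, address, pte);
        page_remove_rmap(page);
        dec_mm_counter(mm, file_rss);
        pte_unmap_unlock(pte, ptl);
        page_cache_release(page);
}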
diff --git a/mm/fremap.c b/mm/fremap.c
index ab23a0673c35..d862be3bc3e3 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -20,33 +20,32 @@
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, 23static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
24 unsigned long addr, pte_t *ptep) 24 unsigned long addr, pte_t *ptep)
25{ 25{
26 pte_t pte = *ptep; 26 pte_t pte = *ptep;
27 struct page *page = NULL;
27 28
28 if (pte_none(pte))
29 return;
30 if (pte_present(pte)) { 29 if (pte_present(pte)) {
31 unsigned long pfn = pte_pfn(pte); 30 unsigned long pfn = pte_pfn(pte);
32
33 flush_cache_page(vma, addr, pfn); 31 flush_cache_page(vma, addr, pfn);
34 pte = ptep_clear_flush(vma, addr, ptep); 32 pte = ptep_clear_flush(vma, addr, ptep);
35 if (pfn_valid(pfn)) { 33 if (unlikely(!pfn_valid(pfn))) {
36 struct page *page = pfn_to_page(pfn); 34 print_bad_pte(vma, pte, addr);
37 if (!PageReserved(page)) { 35 goto out;
38 if (pte_dirty(pte))
39 set_page_dirty(page);
40 page_remove_rmap(page);
41 page_cache_release(page);
42 dec_mm_counter(mm, rss);
43 }
44 } 36 }
37 page = pfn_to_page(pfn);
38 if (pte_dirty(pte))
39 set_page_dirty(page);
40 page_remove_rmap(page);
41 page_cache_release(page);
45 } else { 42 } else {
46 if (!pte_file(pte)) 43 if (!pte_file(pte))
47 free_swap_and_cache(pte_to_swp_entry(pte)); 44 free_swap_and_cache(pte_to_swp_entry(pte));
48 pte_clear(mm, addr, ptep); 45 pte_clear(mm, addr, ptep);
49 } 46 }
47out:
48 return !!page;
50} 49}
51 50
52/* 51/*
@@ -64,21 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
64 pud_t *pud; 63 pud_t *pud;
65 pgd_t *pgd; 64 pgd_t *pgd;
66 pte_t pte_val; 65 pte_t pte_val;
66 spinlock_t *ptl;
67
68 BUG_ON(vma->vm_flags & VM_RESERVED);
67 69
68 pgd = pgd_offset(mm, addr); 70 pgd = pgd_offset(mm, addr);
69 spin_lock(&mm->page_table_lock);
70
71 pud = pud_alloc(mm, pgd, addr); 71 pud = pud_alloc(mm, pgd, addr);
72 if (!pud) 72 if (!pud)
73 goto err_unlock; 73 goto out;
74
75 pmd = pmd_alloc(mm, pud, addr); 74 pmd = pmd_alloc(mm, pud, addr);
76 if (!pmd) 75 if (!pmd)
77 goto err_unlock; 76 goto out;
78 77 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
79 pte = pte_alloc_map(mm, pmd, addr);
80 if (!pte) 78 if (!pte)
81 goto err_unlock; 79 goto out;
82 80
83 /* 81 /*
84 * This page may have been truncated. Tell the 82 * This page may have been truncated. Tell the
@@ -88,29 +86,27 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
88 inode = vma->vm_file->f_mapping->host; 86 inode = vma->vm_file->f_mapping->host;
89 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 87 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
90 if (!page->mapping || page->index >= size) 88 if (!page->mapping || page->index >= size)
91 goto err_unlock; 89 goto unlock;
92 err = -ENOMEM; 90 err = -ENOMEM;
93 if (page_mapcount(page) > INT_MAX/2) 91 if (page_mapcount(page) > INT_MAX/2)
94 goto err_unlock; 92 goto unlock;
95 93
96 zap_pte(mm, vma, addr, pte); 94 if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
95 inc_mm_counter(mm, file_rss);
97 96
98 inc_mm_counter(mm,rss);
99 flush_icache_page(vma, page); 97 flush_icache_page(vma, page);
100 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 98 set_pte_at(mm, addr, pte, mk_pte(page, prot));
101 page_add_file_rmap(page); 99 page_add_file_rmap(page);
102 pte_val = *pte; 100 pte_val = *pte;
103 pte_unmap(pte);
104 update_mmu_cache(vma, addr, pte_val); 101 update_mmu_cache(vma, addr, pte_val);
105
106 err = 0; 102 err = 0;
107err_unlock: 103unlock:
108 spin_unlock(&mm->page_table_lock); 104 pte_unmap_unlock(pte, ptl);
105out:
109 return err; 106 return err;
110} 107}
111EXPORT_SYMBOL(install_page); 108EXPORT_SYMBOL(install_page);
112 109
113
114/* 110/*
115 * Install a file pte to a given virtual memory address, release any 111 * Install a file pte to a given virtual memory address, release any
116 * previously existing mapping. 112 * previously existing mapping.
@@ -124,37 +120,35 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
124 pud_t *pud; 120 pud_t *pud;
125 pgd_t *pgd; 121 pgd_t *pgd;
126 pte_t pte_val; 122 pte_t pte_val;
123 spinlock_t *ptl;
124
125 BUG_ON(vma->vm_flags & VM_RESERVED);
127 126
128 pgd = pgd_offset(mm, addr); 127 pgd = pgd_offset(mm, addr);
129 spin_lock(&mm->page_table_lock);
130
131 pud = pud_alloc(mm, pgd, addr); 128 pud = pud_alloc(mm, pgd, addr);
132 if (!pud) 129 if (!pud)
133 goto err_unlock; 130 goto out;
134
135 pmd = pmd_alloc(mm, pud, addr); 131 pmd = pmd_alloc(mm, pud, addr);
136 if (!pmd) 132 if (!pmd)
137 goto err_unlock; 133 goto out;
138 134 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
139 pte = pte_alloc_map(mm, pmd, addr);
140 if (!pte) 135 if (!pte)
141 goto err_unlock; 136 goto out;
142 137
143 zap_pte(mm, vma, addr, pte); 138 if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
139 update_hiwater_rss(mm);
140 dec_mm_counter(mm, file_rss);
141 }
144 142
145 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); 143 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
146 pte_val = *pte; 144 pte_val = *pte;
147 pte_unmap(pte);
148 update_mmu_cache(vma, addr, pte_val); 145 update_mmu_cache(vma, addr, pte_val);
149 spin_unlock(&mm->page_table_lock); 146 pte_unmap_unlock(pte, ptl);
150 return 0; 147 err = 0;
151 148out:
152err_unlock:
153 spin_unlock(&mm->page_table_lock);
154 return err; 149 return err;
155} 150}
156 151
157
158/*** 152/***
159 * sys_remap_file_pages - remap arbitrary pages of a shared backing store 153 * sys_remap_file_pages - remap arbitrary pages of a shared backing store
160 * file within an existing vma. 154 * file within an existing vma.
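Both install_page() and install_file_pte() now follow the pattern this series establishes everywhere: allocate the upper page-table levels without holding any mm-wide lock, take the per-page-table lock together with the pte mapping via pte_alloc_map_lock(), and drop both with pte_unmap_unlock(). Reduced to its skeleton (set_one_pte() is a hypothetical helper for the sketch, not a kernel function):

/*
 * Skeleton of the locking pattern used by install_page() and
 * install_file_pte() above; set_one_pte() is hypothetical.
 */
static int set_one_pte(struct mm_struct *mm, unsigned long addr, pte_t val)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        spinlock_t *ptl;

        pud = pud_alloc(mm, pgd, addr);         /* no mm-wide lock held */
        if (!pud)
                return -ENOMEM;
        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return -ENOMEM;
        pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); /* map + lock */
        if (!pte)
                return -ENOMEM;
        set_pte_at(mm, addr, pte, val);
        pte_unmap_unlock(pte, ptl);             /* unmap + unlock */
        return 0;
}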
diff --git a/mm/highmem.c b/mm/highmem.c
index 90e1861e2da0..ce2e7e8bbfa7 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,11 +30,9 @@
30 30
31static mempool_t *page_pool, *isa_page_pool; 31static mempool_t *page_pool, *isa_page_pool;
32 32
33static void *page_pool_alloc(gfp_t gfp_mask, void *data) 33static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data)
34{ 34{
35 unsigned int gfp = gfp_mask | (unsigned int) (long) data; 35 return alloc_page(gfp_mask | GFP_DMA);
36
37 return alloc_page(gfp);
38} 36}
39 37
40static void page_pool_free(void *page, void *data) 38static void page_pool_free(void *page, void *data)
@@ -51,6 +49,12 @@ static void page_pool_free(void *page, void *data)
51 * n means that there are (n-1) current users of it. 49 * n means that there are (n-1) current users of it.
52 */ 50 */
53#ifdef CONFIG_HIGHMEM 51#ifdef CONFIG_HIGHMEM
52
53static void *page_pool_alloc(gfp_t gfp_mask, void *data)
54{
55 return alloc_page(gfp_mask);
56}
57
54static int pkmap_count[LAST_PKMAP]; 58static int pkmap_count[LAST_PKMAP];
55static unsigned int last_pkmap_nr; 59static unsigned int last_pkmap_nr;
56static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); 60static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -267,7 +271,7 @@ int init_emergency_isa_pool(void)
267 if (isa_page_pool) 271 if (isa_page_pool)
268 return 0; 272 return 0;
269 273
270 isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA); 274 isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL);
271 if (!isa_page_pool) 275 if (!isa_page_pool)
272 BUG(); 276 BUG();
273 277
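The bounce-buffer pools stop smuggling __GFP_DMA through mempool's opaque pool_data pointer; the ISA pool simply gets its own allocator callback and passes NULL as pool_data. The calling convention as used in the hunk above, sketched with ISA_POOL_SIZE as in the original and error handling simplified:

/*
 * mempool calling convention as used above: a dedicated allocator that
 * ORs in GFP_DMA, with pool_data left NULL (sketch only; error handling
 * simplified relative to the kernel code).
 */
static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data)
{
        return alloc_page(gfp_mask | GFP_DMA);
}

static void page_pool_free(void *page, void *data)
{
        __free_page(page);
}

static mempool_t *isa_page_pool;

static int init_isa_pool(void)
{
        isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa,
                                       page_pool_free, NULL);
        return isa_page_pool ? 0 : -ENOMEM;
}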
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61d380678030..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -277,19 +277,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
277 unsigned long addr; 277 unsigned long addr;
278 278
279 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 279 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
280 src_pte = huge_pte_offset(src, addr);
281 if (!src_pte)
282 continue;
280 dst_pte = huge_pte_alloc(dst, addr); 283 dst_pte = huge_pte_alloc(dst, addr);
281 if (!dst_pte) 284 if (!dst_pte)
282 goto nomem; 285 goto nomem;
286 spin_lock(&dst->page_table_lock);
283 spin_lock(&src->page_table_lock); 287 spin_lock(&src->page_table_lock);
284 src_pte = huge_pte_offset(src, addr); 288 if (!pte_none(*src_pte)) {
285 if (src_pte && !pte_none(*src_pte)) {
286 entry = *src_pte; 289 entry = *src_pte;
287 ptepage = pte_page(entry); 290 ptepage = pte_page(entry);
288 get_page(ptepage); 291 get_page(ptepage);
289 add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); 292 add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
290 set_huge_pte_at(dst, addr, dst_pte, entry); 293 set_huge_pte_at(dst, addr, dst_pte, entry);
291 } 294 }
292 spin_unlock(&src->page_table_lock); 295 spin_unlock(&src->page_table_lock);
296 spin_unlock(&dst->page_table_lock);
293 } 297 }
294 return 0; 298 return 0;
295 299
@@ -310,12 +314,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
310 BUG_ON(start & ~HPAGE_MASK); 314 BUG_ON(start & ~HPAGE_MASK);
311 BUG_ON(end & ~HPAGE_MASK); 315 BUG_ON(end & ~HPAGE_MASK);
312 316
317 spin_lock(&mm->page_table_lock);
318
319 /* Update high watermark before we lower rss */
320 update_hiwater_rss(mm);
321
313 for (address = start; address < end; address += HPAGE_SIZE) { 322 for (address = start; address < end; address += HPAGE_SIZE) {
314 ptep = huge_pte_offset(mm, address); 323 ptep = huge_pte_offset(mm, address);
315 if (! ptep) 324 if (!ptep)
316 /* This can happen on truncate, or if an
317 * mmap() is aborted due to an error before
318 * the prefault */
319 continue; 325 continue;
320 326
321 pte = huge_ptep_get_and_clear(mm, address, ptep); 327 pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -324,96 +330,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
324 330
325 page = pte_page(pte); 331 page = pte_page(pte);
326 put_page(page); 332 put_page(page);
327 add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE)); 333 add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
328 } 334 }
329 flush_tlb_range(vma, start, end);
330}
331
332void zap_hugepage_range(struct vm_area_struct *vma,
333 unsigned long start, unsigned long length)
334{
335 struct mm_struct *mm = vma->vm_mm;
336 335
337 spin_lock(&mm->page_table_lock);
338 unmap_hugepage_range(vma, start, start + length);
339 spin_unlock(&mm->page_table_lock); 336 spin_unlock(&mm->page_table_lock);
337 flush_tlb_range(vma, start, end);
340} 338}
341 339
342int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) 340static struct page *find_lock_huge_page(struct address_space *mapping,
341 unsigned long idx)
343{ 342{
344 struct mm_struct *mm = current->mm; 343 struct page *page;
345 unsigned long addr; 344 int err;
346 int ret = 0; 345 struct inode *inode = mapping->host;
347 346 unsigned long size;
348 WARN_ON(!is_vm_hugetlb_page(vma)); 347
349 BUG_ON(vma->vm_start & ~HPAGE_MASK); 348retry:
350 BUG_ON(vma->vm_end & ~HPAGE_MASK); 349 page = find_lock_page(mapping, idx);
351 350 if (page)
352 hugetlb_prefault_arch_hook(mm); 351 goto out;
353 352
354 spin_lock(&mm->page_table_lock); 353 /* Check to make sure the mapping hasn't been truncated */
355 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 354 size = i_size_read(inode) >> HPAGE_SHIFT;
356 unsigned long idx; 355 if (idx >= size)
357 pte_t *pte = huge_pte_alloc(mm, addr); 356 goto out;
358 struct page *page; 357
359 358 if (hugetlb_get_quota(mapping))
360 if (!pte) { 359 goto out;
361 ret = -ENOMEM; 360 page = alloc_huge_page();
362 goto out; 361 if (!page) {
363 } 362 hugetlb_put_quota(mapping);
363 goto out;
364 }
364 365
365 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) 366 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
366 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); 367 if (err) {
367 page = find_get_page(mapping, idx); 368 put_page(page);
368 if (!page) { 369 hugetlb_put_quota(mapping);
369 /* charge the fs quota first */ 370 if (err == -EEXIST)
370 if (hugetlb_get_quota(mapping)) { 371 goto retry;
371 ret = -ENOMEM; 372 page = NULL;
372 goto out;
373 }
374 page = alloc_huge_page();
375 if (!page) {
376 hugetlb_put_quota(mapping);
377 ret = -ENOMEM;
378 goto out;
379 }
380 ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
381 if (! ret) {
382 unlock_page(page);
383 } else {
384 hugetlb_put_quota(mapping);
385 free_huge_page(page);
386 goto out;
387 }
388 }
389 add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
390 set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
391 } 373 }
392out: 374out:
393 spin_unlock(&mm->page_table_lock); 375 return page;
394 return ret;
395} 376}
396 377
397/*
398 * On ia64 at least, it is possible to receive a hugetlb fault from a
399 * stale zero entry left in the TLB from earlier hardware prefetching.
400 * Low-level arch code should already have flushed the stale entry as
401 * part of its fault handling, but we do need to accept this minor fault
402 * and return successfully. Whereas the "normal" case is that this is
403 * an access to a hugetlb page which has been truncated off since mmap.
404 */
405int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 378int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
406 unsigned long address, int write_access) 379 unsigned long address, int write_access)
407{ 380{
408 int ret = VM_FAULT_SIGBUS; 381 int ret = VM_FAULT_SIGBUS;
382 unsigned long idx;
383 unsigned long size;
409 pte_t *pte; 384 pte_t *pte;
385 struct page *page;
386 struct address_space *mapping;
387
388 pte = huge_pte_alloc(mm, address);
389 if (!pte)
390 goto out;
391
392 mapping = vma->vm_file->f_mapping;
393 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
394 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
395
396 /*
397 * Use page lock to guard against racing truncation
398 * before we get page_table_lock.
399 */
400 page = find_lock_huge_page(mapping, idx);
401 if (!page)
402 goto out;
410 403
411 spin_lock(&mm->page_table_lock); 404 spin_lock(&mm->page_table_lock);
412 pte = huge_pte_offset(mm, address); 405 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
413 if (pte && !pte_none(*pte)) 406 if (idx >= size)
414 ret = VM_FAULT_MINOR; 407 goto backout;
408
409 ret = VM_FAULT_MINOR;
410 if (!pte_none(*pte))
411 goto backout;
412
413 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
414 set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
415 spin_unlock(&mm->page_table_lock); 415 spin_unlock(&mm->page_table_lock);
416 unlock_page(page);
417out:
416 return ret; 418 return ret;
419
420backout:
421 spin_unlock(&mm->page_table_lock);
422 hugetlb_put_quota(mapping);
423 unlock_page(page);
424 put_page(page);
425 goto out;
417} 426}
418 427
419int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 428int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -423,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
423 unsigned long vpfn, vaddr = *position; 432 unsigned long vpfn, vaddr = *position;
424 int remainder = *length; 433 int remainder = *length;
425 434
426 BUG_ON(!is_vm_hugetlb_page(vma));
427
428 vpfn = vaddr/PAGE_SIZE; 435 vpfn = vaddr/PAGE_SIZE;
429 spin_lock(&mm->page_table_lock); 436 spin_lock(&mm->page_table_lock);
430 while (vaddr < vma->vm_end && remainder) { 437 while (vaddr < vma->vm_end && remainder) {
438 pte_t *pte;
439 struct page *page;
431 440
432 if (pages) { 441 /*
433 pte_t *pte; 442 * Some archs (sparc64, sh*) have multiple pte_ts to
434 struct page *page; 443 * each hugepage. We have to make * sure we get the
435 444 * first, for the page indexing below to work.
436 /* Some archs (sparc64, sh*) have multiple 445 */
437 * pte_ts to each hugepage. We have to make 446 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
438 * sure we get the first, for the page
439 * indexing below to work. */
440 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
441
442 /* the hugetlb file might have been truncated */
443 if (!pte || pte_none(*pte)) {
444 remainder = 0;
445 if (!i)
446 i = -EFAULT;
447 break;
448 }
449 447
450 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; 448 if (!pte || pte_none(*pte)) {
449 int ret;
451 450
452 WARN_ON(!PageCompound(page)); 451 spin_unlock(&mm->page_table_lock);
452 ret = hugetlb_fault(mm, vma, vaddr, 0);
453 spin_lock(&mm->page_table_lock);
454 if (ret == VM_FAULT_MINOR)
455 continue;
456
457 remainder = 0;
458 if (!i)
459 i = -EFAULT;
460 break;
461 }
453 462
463 if (pages) {
464 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
454 get_page(page); 465 get_page(page);
455 pages[i] = page; 466 pages[i] = page;
456 } 467 }
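find_lock_huge_page() above uses the standard optimistic page-cache insertion loop: look the page up, allocate on a miss, try add_to_page_cache(), and go back to the lookup when -EEXIST signals that a racing fault inserted it first. Stripped of the quota and truncation checks, the control flow reduces to the sketch below (find_or_add_huge_page() is a name chosen for illustration):

/*
 * Control-flow skeleton of find_lock_huge_page() above, with the quota
 * and i_size checks omitted for clarity.
 */
static struct page *find_or_add_huge_page(struct address_space *mapping,
                                          unsigned long idx)
{
        struct page *page;
        int err;

retry:
        page = find_lock_page(mapping, idx);    /* fast path: already cached */
        if (page)
                return page;

        page = alloc_huge_page();               /* miss: allocate a huge page */
        if (!page)
                return NULL;

        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
        if (err) {
                put_page(page);
                if (err == -EEXIST)             /* racing fault won: retry lookup */
                        goto retry;
                return NULL;
        }
        return page;    /* add_to_page_cache() returns it locked */
}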
diff --git a/mm/madvise.c b/mm/madvise.c
index 20e075d1c64c..17aaf3e16449 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
126 unsigned long start, unsigned long end) 126 unsigned long start, unsigned long end)
127{ 127{
128 *prev = vma; 128 *prev = vma;
129 if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) 129 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
130 return -EINVAL; 130 return -EINVAL;
131 131
132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) { 132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index 1db40e935e55..0f60baf6f69b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
114{ 114{
115 struct page *page = pmd_page(*pmd); 115 struct page *page = pmd_page(*pmd);
116 pmd_clear(pmd); 116 pmd_clear(pmd);
117 pte_lock_deinit(page);
117 pte_free_tlb(tlb, page); 118 pte_free_tlb(tlb, page);
118 dec_page_state(nr_page_table_pages); 119 dec_page_state(nr_page_table_pages);
119 tlb->mm->nr_ptes--; 120 tlb->mm->nr_ptes--;
@@ -249,7 +250,7 @@ void free_pgd_range(struct mmu_gather **tlb,
249 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 250 free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
250 } while (pgd++, addr = next, addr != end); 251 } while (pgd++, addr = next, addr != end);
251 252
252 if (!tlb_is_full_mm(*tlb)) 253 if (!(*tlb)->fullmm)
253 flush_tlb_pgtables((*tlb)->mm, start, end); 254 flush_tlb_pgtables((*tlb)->mm, start, end);
254} 255}
255 256
@@ -260,6 +261,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
260 struct vm_area_struct *next = vma->vm_next; 261 struct vm_area_struct *next = vma->vm_next;
261 unsigned long addr = vma->vm_start; 262 unsigned long addr = vma->vm_start;
262 263
264 /*
265 * Hide vma from rmap and vmtruncate before freeing pgtables
266 */
267 anon_vma_unlink(vma);
268 unlink_file_vma(vma);
269
263 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { 270 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
264 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 271 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
265 floor, next? next->vm_start: ceiling); 272 floor, next? next->vm_start: ceiling);
@@ -272,6 +279,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
272 HPAGE_SIZE)) { 279 HPAGE_SIZE)) {
273 vma = next; 280 vma = next;
274 next = vma->vm_next; 281 next = vma->vm_next;
282 anon_vma_unlink(vma);
283 unlink_file_vma(vma);
275 } 284 }
276 free_pgd_range(tlb, addr, vma->vm_end, 285 free_pgd_range(tlb, addr, vma->vm_end,
277 floor, next? next->vm_start: ceiling); 286 floor, next? next->vm_start: ceiling);
@@ -280,72 +289,78 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
280 } 289 }
281} 290}
282 291
283pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, 292int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
284 unsigned long address)
285{ 293{
286 if (!pmd_present(*pmd)) { 294 struct page *new = pte_alloc_one(mm, address);
287 struct page *new; 295 if (!new)
288 296 return -ENOMEM;
289 spin_unlock(&mm->page_table_lock); 297
290 new = pte_alloc_one(mm, address); 298 pte_lock_init(new);
291 spin_lock(&mm->page_table_lock); 299 spin_lock(&mm->page_table_lock);
292 if (!new) 300 if (pmd_present(*pmd)) { /* Another has populated it */
293 return NULL; 301 pte_lock_deinit(new);
294 /* 302 pte_free(new);
295 * Because we dropped the lock, we should re-check the 303 } else {
296 * entry, as somebody else could have populated it..
297 */
298 if (pmd_present(*pmd)) {
299 pte_free(new);
300 goto out;
301 }
302 mm->nr_ptes++; 304 mm->nr_ptes++;
303 inc_page_state(nr_page_table_pages); 305 inc_page_state(nr_page_table_pages);
304 pmd_populate(mm, pmd, new); 306 pmd_populate(mm, pmd, new);
305 } 307 }
306out: 308 spin_unlock(&mm->page_table_lock);
307 return pte_offset_map(pmd, address); 309 return 0;
308} 310}
309 311
310pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 312int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
311{ 313{
312 if (!pmd_present(*pmd)) { 314 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
313 pte_t *new; 315 if (!new)
316 return -ENOMEM;
314 317
315 spin_unlock(&mm->page_table_lock); 318 spin_lock(&init_mm.page_table_lock);
316 new = pte_alloc_one_kernel(mm, address); 319 if (pmd_present(*pmd)) /* Another has populated it */
317 spin_lock(&mm->page_table_lock); 320 pte_free_kernel(new);
318 if (!new) 321 else
319 return NULL; 322 pmd_populate_kernel(&init_mm, pmd, new);
323 spin_unlock(&init_mm.page_table_lock);
324 return 0;
325}
320 326
321 /* 327static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
322 * Because we dropped the lock, we should re-check the 328{
323 * entry, as somebody else could have populated it.. 329 if (file_rss)
324 */ 330 add_mm_counter(mm, file_rss, file_rss);
325 if (pmd_present(*pmd)) { 331 if (anon_rss)
326 pte_free_kernel(new); 332 add_mm_counter(mm, anon_rss, anon_rss);
327 goto out; 333}
328 } 334
329 pmd_populate_kernel(mm, pmd, new); 335/*
330 } 336 * This function is called to print an error when a pte in a
331out: 337 * !VM_RESERVED region is found pointing to an invalid pfn (which
332 return pte_offset_kernel(pmd, address); 338 * is an error.
339 *
340 * The calling function must still handle the error.
341 */
342void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
343{
344 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
345 "vm_flags = %lx, vaddr = %lx\n",
346 (long long)pte_val(pte),
347 (vma->vm_mm == current->mm ? current->comm : "???"),
348 vma->vm_flags, vaddr);
349 dump_stack();
333} 350}
334 351
335/* 352/*
336 * copy one vm_area from one task to the other. Assumes the page tables 353 * copy one vm_area from one task to the other. Assumes the page tables
337 * already present in the new task to be cleared in the whole range 354 * already present in the new task to be cleared in the whole range
338 * covered by this vma. 355 * covered by this vma.
339 *
340 * dst->page_table_lock is held on entry and exit,
341 * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
342 */ 356 */
343 357
344static inline void 358static inline void
345copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 359copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
346 pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, 360 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
347 unsigned long addr) 361 unsigned long addr, int *rss)
348{ 362{
363 unsigned long vm_flags = vma->vm_flags;
349 pte_t pte = *src_pte; 364 pte_t pte = *src_pte;
350 struct page *page; 365 struct page *page;
351 unsigned long pfn; 366 unsigned long pfn;
@@ -357,29 +372,32 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
357 /* make sure dst_mm is on swapoff's mmlist. */ 372 /* make sure dst_mm is on swapoff's mmlist. */
358 if (unlikely(list_empty(&dst_mm->mmlist))) { 373 if (unlikely(list_empty(&dst_mm->mmlist))) {
359 spin_lock(&mmlist_lock); 374 spin_lock(&mmlist_lock);
360 list_add(&dst_mm->mmlist, &src_mm->mmlist); 375 if (list_empty(&dst_mm->mmlist))
376 list_add(&dst_mm->mmlist,
377 &src_mm->mmlist);
361 spin_unlock(&mmlist_lock); 378 spin_unlock(&mmlist_lock);
362 } 379 }
363 } 380 }
364 set_pte_at(dst_mm, addr, dst_pte, pte); 381 goto out_set_pte;
365 return;
366 } 382 }
367 383
368 pfn = pte_pfn(pte); 384 /* If the region is VM_RESERVED, the mapping is not
369 /* the pte points outside of valid memory, the 385 * mapped via rmap - duplicate the pte as is.
370 * mapping is assumed to be good, meaningful
371 * and not mapped via rmap - duplicate the
372 * mapping as is.
373 */ 386 */
374 page = NULL; 387 if (vm_flags & VM_RESERVED)
375 if (pfn_valid(pfn)) 388 goto out_set_pte;
376 page = pfn_to_page(pfn);
377 389
378 if (!page || PageReserved(page)) { 390 pfn = pte_pfn(pte);
379 set_pte_at(dst_mm, addr, dst_pte, pte); 391 /* If the pte points outside of valid memory but
380 return; 392 * the region is not VM_RESERVED, we have a problem.
393 */
394 if (unlikely(!pfn_valid(pfn))) {
395 print_bad_pte(vma, pte, addr);
396 goto out_set_pte; /* try to do something sane */
381 } 397 }
382 398
399 page = pfn_to_page(pfn);
400
383 /* 401 /*
384 * If it's a COW mapping, write protect it both 402 * If it's a COW mapping, write protect it both
385 * in the parent and the child 403 * in the parent and the child
@@ -397,11 +415,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
397 pte = pte_mkclean(pte); 415 pte = pte_mkclean(pte);
398 pte = pte_mkold(pte); 416 pte = pte_mkold(pte);
399 get_page(page); 417 get_page(page);
400 inc_mm_counter(dst_mm, rss);
401 if (PageAnon(page))
402 inc_mm_counter(dst_mm, anon_rss);
403 set_pte_at(dst_mm, addr, dst_pte, pte);
404 page_dup_rmap(page); 418 page_dup_rmap(page);
419 rss[!!PageAnon(page)]++;
420
421out_set_pte:
422 set_pte_at(dst_mm, addr, dst_pte, pte);
405} 423}
406 424
407static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 425static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -409,38 +427,44 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
409 unsigned long addr, unsigned long end) 427 unsigned long addr, unsigned long end)
410{ 428{
411 pte_t *src_pte, *dst_pte; 429 pte_t *src_pte, *dst_pte;
412 unsigned long vm_flags = vma->vm_flags; 430 spinlock_t *src_ptl, *dst_ptl;
413 int progress; 431 int progress = 0;
432 int rss[2];
414 433
415again: 434again:
416 dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); 435 rss[1] = rss[0] = 0;
436 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
417 if (!dst_pte) 437 if (!dst_pte)
418 return -ENOMEM; 438 return -ENOMEM;
419 src_pte = pte_offset_map_nested(src_pmd, addr); 439 src_pte = pte_offset_map_nested(src_pmd, addr);
440 src_ptl = pte_lockptr(src_mm, src_pmd);
441 spin_lock(src_ptl);
420 442
421 progress = 0;
422 spin_lock(&src_mm->page_table_lock);
423 do { 443 do {
424 /* 444 /*
425 * We are holding two locks at this point - either of them 445 * We are holding two locks at this point - either of them
426 * could generate latencies in another task on another CPU. 446 * could generate latencies in another task on another CPU.
427 */ 447 */
428 if (progress >= 32 && (need_resched() || 448 if (progress >= 32) {
429 need_lockbreak(&src_mm->page_table_lock) || 449 progress = 0;
430 need_lockbreak(&dst_mm->page_table_lock))) 450 if (need_resched() ||
431 break; 451 need_lockbreak(src_ptl) ||
452 need_lockbreak(dst_ptl))
453 break;
454 }
432 if (pte_none(*src_pte)) { 455 if (pte_none(*src_pte)) {
433 progress++; 456 progress++;
434 continue; 457 continue;
435 } 458 }
436 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); 459 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
437 progress += 8; 460 progress += 8;
438 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 461 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
439 spin_unlock(&src_mm->page_table_lock);
440 462
463 spin_unlock(src_ptl);
441 pte_unmap_nested(src_pte - 1); 464 pte_unmap_nested(src_pte - 1);
442 pte_unmap(dst_pte - 1); 465 add_mm_rss(dst_mm, rss[0], rss[1]);
443 cond_resched_lock(&dst_mm->page_table_lock); 466 pte_unmap_unlock(dst_pte - 1, dst_ptl);
467 cond_resched();
444 if (addr != end) 468 if (addr != end)
445 goto again; 469 goto again;
446 return 0; 470 return 0;
@@ -525,24 +549,30 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
525 return 0; 549 return 0;
526} 550}
527 551
528static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 552static void zap_pte_range(struct mmu_gather *tlb,
553 struct vm_area_struct *vma, pmd_t *pmd,
529 unsigned long addr, unsigned long end, 554 unsigned long addr, unsigned long end,
530 struct zap_details *details) 555 struct zap_details *details)
531{ 556{
557 struct mm_struct *mm = tlb->mm;
532 pte_t *pte; 558 pte_t *pte;
559 spinlock_t *ptl;
560 int file_rss = 0;
561 int anon_rss = 0;
533 562
534 pte = pte_offset_map(pmd, addr); 563 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
535 do { 564 do {
536 pte_t ptent = *pte; 565 pte_t ptent = *pte;
537 if (pte_none(ptent)) 566 if (pte_none(ptent))
538 continue; 567 continue;
539 if (pte_present(ptent)) { 568 if (pte_present(ptent)) {
540 struct page *page = NULL; 569 struct page *page = NULL;
541 unsigned long pfn = pte_pfn(ptent); 570 if (!(vma->vm_flags & VM_RESERVED)) {
542 if (pfn_valid(pfn)) { 571 unsigned long pfn = pte_pfn(ptent);
543 page = pfn_to_page(pfn); 572 if (unlikely(!pfn_valid(pfn)))
544 if (PageReserved(page)) 573 print_bad_pte(vma, ptent, addr);
545 page = NULL; 574 else
575 page = pfn_to_page(pfn);
546 } 576 }
547 if (unlikely(details) && page) { 577 if (unlikely(details) && page) {
548 /* 578 /*
@@ -562,7 +592,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
562 page->index > details->last_index)) 592 page->index > details->last_index))
563 continue; 593 continue;
564 } 594 }
565 ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, 595 ptent = ptep_get_and_clear_full(mm, addr, pte,
566 tlb->fullmm); 596 tlb->fullmm);
567 tlb_remove_tlb_entry(tlb, pte, addr); 597 tlb_remove_tlb_entry(tlb, pte, addr);
568 if (unlikely(!page)) 598 if (unlikely(!page))
@@ -570,15 +600,17 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
570 if (unlikely(details) && details->nonlinear_vma 600 if (unlikely(details) && details->nonlinear_vma
571 && linear_page_index(details->nonlinear_vma, 601 && linear_page_index(details->nonlinear_vma,
572 addr) != page->index) 602 addr) != page->index)
573 set_pte_at(tlb->mm, addr, pte, 603 set_pte_at(mm, addr, pte,
574 pgoff_to_pte(page->index)); 604 pgoff_to_pte(page->index));
575 if (pte_dirty(ptent))
576 set_page_dirty(page);
577 if (PageAnon(page)) 605 if (PageAnon(page))
578 dec_mm_counter(tlb->mm, anon_rss); 606 anon_rss--;
579 else if (pte_young(ptent)) 607 else {
580 mark_page_accessed(page); 608 if (pte_dirty(ptent))
581 tlb->freed++; 609 set_page_dirty(page);
610 if (pte_young(ptent))
611 mark_page_accessed(page);
612 file_rss--;
613 }
582 page_remove_rmap(page); 614 page_remove_rmap(page);
583 tlb_remove_page(tlb, page); 615 tlb_remove_page(tlb, page);
584 continue; 616 continue;
@@ -591,12 +623,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
591 continue; 623 continue;
592 if (!pte_file(ptent)) 624 if (!pte_file(ptent))
593 free_swap_and_cache(pte_to_swp_entry(ptent)); 625 free_swap_and_cache(pte_to_swp_entry(ptent));
594 pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); 626 pte_clear_full(mm, addr, pte, tlb->fullmm);
595 } while (pte++, addr += PAGE_SIZE, addr != end); 627 } while (pte++, addr += PAGE_SIZE, addr != end);
596 pte_unmap(pte - 1); 628
629 add_mm_rss(mm, file_rss, anon_rss);
630 pte_unmap_unlock(pte - 1, ptl);
597} 631}
598 632
599static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, 633static inline void zap_pmd_range(struct mmu_gather *tlb,
634 struct vm_area_struct *vma, pud_t *pud,
600 unsigned long addr, unsigned long end, 635 unsigned long addr, unsigned long end,
601 struct zap_details *details) 636 struct zap_details *details)
602{ 637{
@@ -608,11 +643,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
608 next = pmd_addr_end(addr, end); 643 next = pmd_addr_end(addr, end);
609 if (pmd_none_or_clear_bad(pmd)) 644 if (pmd_none_or_clear_bad(pmd))
610 continue; 645 continue;
611 zap_pte_range(tlb, pmd, addr, next, details); 646 zap_pte_range(tlb, vma, pmd, addr, next, details);
612 } while (pmd++, addr = next, addr != end); 647 } while (pmd++, addr = next, addr != end);
613} 648}
614 649
615static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 650static inline void zap_pud_range(struct mmu_gather *tlb,
651 struct vm_area_struct *vma, pgd_t *pgd,
616 unsigned long addr, unsigned long end, 652 unsigned long addr, unsigned long end,
617 struct zap_details *details) 653 struct zap_details *details)
618{ 654{
@@ -624,7 +660,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
624 next = pud_addr_end(addr, end); 660 next = pud_addr_end(addr, end);
625 if (pud_none_or_clear_bad(pud)) 661 if (pud_none_or_clear_bad(pud))
626 continue; 662 continue;
627 zap_pmd_range(tlb, pud, addr, next, details); 663 zap_pmd_range(tlb, vma, pud, addr, next, details);
628 } while (pud++, addr = next, addr != end); 664 } while (pud++, addr = next, addr != end);
629} 665}
630 666
@@ -645,7 +681,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
645 next = pgd_addr_end(addr, end); 681 next = pgd_addr_end(addr, end);
646 if (pgd_none_or_clear_bad(pgd)) 682 if (pgd_none_or_clear_bad(pgd))
647 continue; 683 continue;
648 zap_pud_range(tlb, pgd, addr, next, details); 684 zap_pud_range(tlb, vma, pgd, addr, next, details);
649 } while (pgd++, addr = next, addr != end); 685 } while (pgd++, addr = next, addr != end);
650 tlb_end_vma(tlb, vma); 686 tlb_end_vma(tlb, vma);
651} 687}
@@ -660,7 +696,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
660/** 696/**
661 * unmap_vmas - unmap a range of memory covered by a list of vma's 697 * unmap_vmas - unmap a range of memory covered by a list of vma's
662 * @tlbp: address of the caller's struct mmu_gather 698 * @tlbp: address of the caller's struct mmu_gather
663 * @mm: the controlling mm_struct
664 * @vma: the starting vma 699 * @vma: the starting vma
665 * @start_addr: virtual address at which to start unmapping 700 * @start_addr: virtual address at which to start unmapping
666 * @end_addr: virtual address at which to end unmapping 701 * @end_addr: virtual address at which to end unmapping
@@ -669,10 +704,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
669 * 704 *
670 * Returns the end address of the unmapping (restart addr if interrupted). 705 * Returns the end address of the unmapping (restart addr if interrupted).
671 * 706 *
672 * Unmap all pages in the vma list. Called under page_table_lock. 707 * Unmap all pages in the vma list.
673 * 708 *
674 * We aim to not hold page_table_lock for too long (for scheduling latency 709 * We aim to not hold locks for too long (for scheduling latency reasons).
675 * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to 710 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
676 * return the ending mmu_gather to the caller. 711 * return the ending mmu_gather to the caller.
677 * 712 *
678 * Only addresses between `start' and `end' will be unmapped. 713 * Only addresses between `start' and `end' will be unmapped.
@@ -684,7 +719,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
684 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 719 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
685 * drops the lock and schedules. 720 * drops the lock and schedules.
686 */ 721 */
687unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, 722unsigned long unmap_vmas(struct mmu_gather **tlbp,
688 struct vm_area_struct *vma, unsigned long start_addr, 723 struct vm_area_struct *vma, unsigned long start_addr,
689 unsigned long end_addr, unsigned long *nr_accounted, 724 unsigned long end_addr, unsigned long *nr_accounted,
690 struct zap_details *details) 725 struct zap_details *details)
@@ -694,7 +729,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
694 int tlb_start_valid = 0; 729 int tlb_start_valid = 0;
695 unsigned long start = start_addr; 730 unsigned long start = start_addr;
696 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 731 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
697 int fullmm = tlb_is_full_mm(*tlbp); 732 int fullmm = (*tlbp)->fullmm;
698 733
699 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 734 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
700 unsigned long end; 735 unsigned long end;
@@ -734,19 +769,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
734 tlb_finish_mmu(*tlbp, tlb_start, start); 769 tlb_finish_mmu(*tlbp, tlb_start, start);
735 770
736 if (need_resched() || 771 if (need_resched() ||
737 need_lockbreak(&mm->page_table_lock) ||
738 (i_mmap_lock && need_lockbreak(i_mmap_lock))) { 772 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
739 if (i_mmap_lock) { 773 if (i_mmap_lock) {
740 /* must reset count of rss freed */ 774 *tlbp = NULL;
741 *tlbp = tlb_gather_mmu(mm, fullmm);
742 goto out; 775 goto out;
743 } 776 }
744 spin_unlock(&mm->page_table_lock);
745 cond_resched(); 777 cond_resched();
746 spin_lock(&mm->page_table_lock);
747 } 778 }
748 779
749 *tlbp = tlb_gather_mmu(mm, fullmm); 780 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
750 tlb_start_valid = 0; 781 tlb_start_valid = 0;
751 zap_bytes = ZAP_BLOCK_SIZE; 782 zap_bytes = ZAP_BLOCK_SIZE;
752 } 783 }
@@ -770,123 +801,93 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
770 unsigned long end = address + size; 801 unsigned long end = address + size;
771 unsigned long nr_accounted = 0; 802 unsigned long nr_accounted = 0;
772 803
773 if (is_vm_hugetlb_page(vma)) {
774 zap_hugepage_range(vma, address, size);
775 return end;
776 }
777
778 lru_add_drain(); 804 lru_add_drain();
779 spin_lock(&mm->page_table_lock);
780 tlb = tlb_gather_mmu(mm, 0); 805 tlb = tlb_gather_mmu(mm, 0);
781 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); 806 update_hiwater_rss(mm);
782 tlb_finish_mmu(tlb, address, end); 807 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
783 spin_unlock(&mm->page_table_lock); 808 if (tlb)
809 tlb_finish_mmu(tlb, address, end);
784 return end; 810 return end;
785} 811}
786 812
787/* 813/*
788 * Do a quick page-table lookup for a single page. 814 * Do a quick page-table lookup for a single page.
789 * mm->page_table_lock must be held.
790 */ 815 */
791static struct page *__follow_page(struct mm_struct *mm, unsigned long address, 816struct page *follow_page(struct mm_struct *mm, unsigned long address,
792 int read, int write, int accessed) 817 unsigned int flags)
793{ 818{
794 pgd_t *pgd; 819 pgd_t *pgd;
795 pud_t *pud; 820 pud_t *pud;
796 pmd_t *pmd; 821 pmd_t *pmd;
797 pte_t *ptep, pte; 822 pte_t *ptep, pte;
823 spinlock_t *ptl;
798 unsigned long pfn; 824 unsigned long pfn;
799 struct page *page; 825 struct page *page;
800 826
801 page = follow_huge_addr(mm, address, write); 827 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
802 if (! IS_ERR(page)) 828 if (!IS_ERR(page)) {
803 return page; 829 BUG_ON(flags & FOLL_GET);
830 goto out;
831 }
804 832
833 page = NULL;
805 pgd = pgd_offset(mm, address); 834 pgd = pgd_offset(mm, address);
806 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 835 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
807 goto out; 836 goto no_page_table;
808 837
809 pud = pud_offset(pgd, address); 838 pud = pud_offset(pgd, address);
810 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 839 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
811 goto out; 840 goto no_page_table;
812 841
813 pmd = pmd_offset(pud, address); 842 pmd = pmd_offset(pud, address);
814 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 843 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
844 goto no_page_table;
845
846 if (pmd_huge(*pmd)) {
847 BUG_ON(flags & FOLL_GET);
848 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
815 goto out; 849 goto out;
816 if (pmd_huge(*pmd)) 850 }
817 return follow_huge_pmd(mm, address, pmd, write);
818 851
819 ptep = pte_offset_map(pmd, address); 852 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
820 if (!ptep) 853 if (!ptep)
821 goto out; 854 goto out;
822 855
823 pte = *ptep; 856 pte = *ptep;
824 pte_unmap(ptep); 857 if (!pte_present(pte))
825 if (pte_present(pte)) { 858 goto unlock;
826 if (write && !pte_write(pte)) 859 if ((flags & FOLL_WRITE) && !pte_write(pte))
827 goto out; 860 goto unlock;
828 if (read && !pte_read(pte)) 861 pfn = pte_pfn(pte);
829 goto out; 862 if (!pfn_valid(pfn))
830 pfn = pte_pfn(pte); 863 goto unlock;
831 if (pfn_valid(pfn)) { 864
832 page = pfn_to_page(pfn); 865 page = pfn_to_page(pfn);
833 if (accessed) { 866 if (flags & FOLL_GET)
834 if (write && !pte_dirty(pte) &&!PageDirty(page)) 867 get_page(page);
835 set_page_dirty(page); 868 if (flags & FOLL_TOUCH) {
836 mark_page_accessed(page); 869 if ((flags & FOLL_WRITE) &&
837 } 870 !pte_dirty(pte) && !PageDirty(page))
838 return page; 871 set_page_dirty(page);
839 } 872 mark_page_accessed(page);
840 } 873 }
841 874unlock:
875 pte_unmap_unlock(ptep, ptl);
842out: 876out:
843 return NULL; 877 return page;
844}
845
846inline struct page *
847follow_page(struct mm_struct *mm, unsigned long address, int write)
848{
849 return __follow_page(mm, address, 0, write, 1);
850}
851
852/*
853 * check_user_page_readable() can be called frm niterrupt context by oprofile,
854 * so we need to avoid taking any non-irq-safe locks
855 */
856int check_user_page_readable(struct mm_struct *mm, unsigned long address)
857{
858 return __follow_page(mm, address, 1, 0, 0) != NULL;
859}
860EXPORT_SYMBOL(check_user_page_readable);
861
862static inline int
863untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
864 unsigned long address)
865{
866 pgd_t *pgd;
867 pud_t *pud;
868 pmd_t *pmd;
869
870 /* Check if the vma is for an anonymous mapping. */
871 if (vma->vm_ops && vma->vm_ops->nopage)
872 return 0;
873
874 /* Check if page directory entry exists. */
875 pgd = pgd_offset(mm, address);
876 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
877 return 1;
878
879 pud = pud_offset(pgd, address);
880 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
881 return 1;
882
883 /* Check if page middle directory entry exists. */
884 pmd = pmd_offset(pud, address);
885 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
886 return 1;
887 878
888 /* There is a pte slot for 'address' in 'mm'. */ 879no_page_table:
889 return 0; 880 /*
881 * When core dumping an enormous anonymous area that nobody
882 * has touched so far, we don't want to allocate page tables.
883 */
884 if (flags & FOLL_ANON) {
885 page = ZERO_PAGE(address);
886 if (flags & FOLL_GET)
887 get_page(page);
888 BUG_ON(flags & FOLL_WRITE);
889 }
890 return page;
890} 891}
891 892
892int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 893int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +895,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
894 struct page **pages, struct vm_area_struct **vmas) 895 struct page **pages, struct vm_area_struct **vmas)
895{ 896{
896 int i; 897 int i;
897 unsigned int flags; 898 unsigned int vm_flags;
898 899
899 /* 900 /*
900 * Require read or write permissions. 901 * Require read or write permissions.
901 * If 'force' is set, we only require the "MAY" flags. 902 * If 'force' is set, we only require the "MAY" flags.
902 */ 903 */
903 flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 904 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
904 flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 905 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
905 i = 0; 906 i = 0;
906 907
907 do { 908 do {
908 struct vm_area_struct * vma; 909 struct vm_area_struct *vma;
910 unsigned int foll_flags;
909 911
910 vma = find_extend_vma(mm, start); 912 vma = find_extend_vma(mm, start);
911 if (!vma && in_gate_area(tsk, start)) { 913 if (!vma && in_gate_area(tsk, start)) {
@@ -945,8 +947,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
945 continue; 947 continue;
946 } 948 }
947 949
948 if (!vma || (vma->vm_flags & VM_IO) 950 if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
949 || !(flags & vma->vm_flags)) 951 || !(vm_flags & vma->vm_flags))
950 return i ? : -EFAULT; 952 return i ? : -EFAULT;
951 953
952 if (is_vm_hugetlb_page(vma)) { 954 if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +956,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
954 &start, &len, i); 956 &start, &len, i);
955 continue; 957 continue;
956 } 958 }
957 spin_lock(&mm->page_table_lock); 959
960 foll_flags = FOLL_TOUCH;
961 if (pages)
962 foll_flags |= FOLL_GET;
963 if (!write && !(vma->vm_flags & VM_LOCKED) &&
964 (!vma->vm_ops || !vma->vm_ops->nopage))
965 foll_flags |= FOLL_ANON;
966
958 do { 967 do {
959 int write_access = write;
960 struct page *page; 968 struct page *page;
961 969
962 cond_resched_lock(&mm->page_table_lock); 970 if (write)
963 while (!(page = follow_page(mm, start, write_access))) { 971 foll_flags |= FOLL_WRITE;
964 int ret;
965
966 /*
967 * Shortcut for anonymous pages. We don't want
968				 * to force the creation of page tables for
969 * insanely big anonymously mapped areas that
970 * nobody touched so far. This is important
971 * for doing a core dump for these mappings.
972 */
973 if (!write && untouched_anonymous_page(mm,vma,start)) {
974 page = ZERO_PAGE(start);
975 break;
976 }
977 spin_unlock(&mm->page_table_lock);
978 ret = __handle_mm_fault(mm, vma, start, write_access);
979 972
973 cond_resched();
974 while (!(page = follow_page(mm, start, foll_flags))) {
975 int ret;
976 ret = __handle_mm_fault(mm, vma, start,
977 foll_flags & FOLL_WRITE);
980 /* 978 /*
981 * The VM_FAULT_WRITE bit tells us that do_wp_page has 979 * The VM_FAULT_WRITE bit tells us that do_wp_page has
982 * broken COW when necessary, even if maybe_mkwrite 980 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +982,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
984 * subsequent page lookups as if they were reads. 982 * subsequent page lookups as if they were reads.
985 */ 983 */
986 if (ret & VM_FAULT_WRITE) 984 if (ret & VM_FAULT_WRITE)
987 write_access = 0; 985 foll_flags &= ~FOLL_WRITE;
988 986
989 switch (ret & ~VM_FAULT_WRITE) { 987 switch (ret & ~VM_FAULT_WRITE) {
990 case VM_FAULT_MINOR: 988 case VM_FAULT_MINOR:
@@ -1000,13 +998,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1000 default: 998 default:
1001 BUG(); 999 BUG();
1002 } 1000 }
1003 spin_lock(&mm->page_table_lock);
1004 } 1001 }
1005 if (pages) { 1002 if (pages) {
1006 pages[i] = page; 1003 pages[i] = page;
1007 flush_dcache_page(page); 1004 flush_dcache_page(page);
1008 if (!PageReserved(page))
1009 page_cache_get(page);
1010 } 1005 }
1011 if (vmas) 1006 if (vmas)
1012 vmas[i] = vma; 1007 vmas[i] = vma;
@@ -1014,7 +1009,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1014 start += PAGE_SIZE; 1009 start += PAGE_SIZE;
1015 len--; 1010 len--;
1016 } while (len && start < vma->vm_end); 1011 } while (len && start < vma->vm_end);
1017 spin_unlock(&mm->page_table_lock);
1018 } while (len); 1012 } while (len);
1019 return i; 1013 return i;
1020} 1014}
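A short usage sketch, assuming a caller elsewhere in the kernel (the function name pin_user_buffer is hypothetical): the calling convention is unchanged by this rework — mmap_sem is still held for read, and every returned page carries a reference (now taken inside follow_page() via FOLL_GET) that the caller must drop:

static int pin_user_buffer(unsigned long start, int nr_pages,
			   struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int i, got;

	down_read(&mm->mmap_sem);
	got = get_user_pages(current, mm, start, nr_pages,
			     1 /* write */, 0 /* force */, pages, NULL);
	up_read(&mm->mmap_sem);

	if (got <= 0)
		return got;

	/* ... access pages[0..got-1] ... */

	for (i = 0; i < got; i++)
		page_cache_release(pages[i]);
	return got;
}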
@@ -1024,16 +1018,21 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1024 unsigned long addr, unsigned long end, pgprot_t prot) 1018 unsigned long addr, unsigned long end, pgprot_t prot)
1025{ 1019{
1026 pte_t *pte; 1020 pte_t *pte;
1021 spinlock_t *ptl;
1027 1022
1028 pte = pte_alloc_map(mm, pmd, addr); 1023 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1029 if (!pte) 1024 if (!pte)
1030 return -ENOMEM; 1025 return -ENOMEM;
1031 do { 1026 do {
1032 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); 1027 struct page *page = ZERO_PAGE(addr);
1028 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1029 page_cache_get(page);
1030 page_add_file_rmap(page);
1031 inc_mm_counter(mm, file_rss);
1033 BUG_ON(!pte_none(*pte)); 1032 BUG_ON(!pte_none(*pte));
1034 set_pte_at(mm, addr, pte, zero_pte); 1033 set_pte_at(mm, addr, pte, zero_pte);
1035 } while (pte++, addr += PAGE_SIZE, addr != end); 1034 } while (pte++, addr += PAGE_SIZE, addr != end);
1036 pte_unmap(pte - 1); 1035 pte_unmap_unlock(pte - 1, ptl);
1037 return 0; 1036 return 0;
1038} 1037}
1039 1038
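This hunk and the remap_pte_range/zeromap_page_range hunks below all switch from pte_alloc_map() plus the mm-wide page_table_lock to pte_alloc_map_lock(), which maps the pte page and takes its (possibly split) lock in one step. A sketch of the resulting walker shape, with the per-pte work left as a placeholder:

static int walk_pte_range(struct mm_struct *mm, pmd_t *pmd,
			  unsigned long addr, unsigned long end)
{
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	do {
		/* inspect or modify *pte; ptl is held throughout */
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}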
@@ -1083,14 +1082,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
1083 BUG_ON(addr >= end); 1082 BUG_ON(addr >= end);
1084 pgd = pgd_offset(mm, addr); 1083 pgd = pgd_offset(mm, addr);
1085 flush_cache_range(vma, addr, end); 1084 flush_cache_range(vma, addr, end);
1086 spin_lock(&mm->page_table_lock);
1087 do { 1085 do {
1088 next = pgd_addr_end(addr, end); 1086 next = pgd_addr_end(addr, end);
1089 err = zeromap_pud_range(mm, pgd, addr, next, prot); 1087 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1090 if (err) 1088 if (err)
1091 break; 1089 break;
1092 } while (pgd++, addr = next, addr != end); 1090 } while (pgd++, addr = next, addr != end);
1093 spin_unlock(&mm->page_table_lock);
1094 return err; 1091 return err;
1095} 1092}
1096 1093
@@ -1104,17 +1101,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1104 unsigned long pfn, pgprot_t prot) 1101 unsigned long pfn, pgprot_t prot)
1105{ 1102{
1106 pte_t *pte; 1103 pte_t *pte;
1104 spinlock_t *ptl;
1107 1105
1108 pte = pte_alloc_map(mm, pmd, addr); 1106 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1109 if (!pte) 1107 if (!pte)
1110 return -ENOMEM; 1108 return -ENOMEM;
1111 do { 1109 do {
1112 BUG_ON(!pte_none(*pte)); 1110 BUG_ON(!pte_none(*pte));
1113 if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) 1111 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1114 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1115 pfn++; 1112 pfn++;
1116 } while (pte++, addr += PAGE_SIZE, addr != end); 1113 } while (pte++, addr += PAGE_SIZE, addr != end);
1117 pte_unmap(pte - 1); 1114 pte_unmap_unlock(pte - 1, ptl);
1118 return 0; 1115 return 0;
1119} 1116}
1120 1117
@@ -1173,8 +1170,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1173 * rest of the world about it: 1170 * rest of the world about it:
1174 * VM_IO tells people not to look at these pages 1171 * VM_IO tells people not to look at these pages
1175 * (accesses can have side effects). 1172 * (accesses can have side effects).
1176 * VM_RESERVED tells swapout not to try to touch 1173 * VM_RESERVED tells the core MM not to "manage" these pages
1177 * this region. 1174 * (e.g. refcount, mapcount, try to swap them out).
1178 */ 1175 */
1179 vma->vm_flags |= VM_IO | VM_RESERVED; 1176 vma->vm_flags |= VM_IO | VM_RESERVED;
1180 1177
@@ -1182,7 +1179,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1182 pfn -= addr >> PAGE_SHIFT; 1179 pfn -= addr >> PAGE_SHIFT;
1183 pgd = pgd_offset(mm, addr); 1180 pgd = pgd_offset(mm, addr);
1184 flush_cache_range(vma, addr, end); 1181 flush_cache_range(vma, addr, end);
1185 spin_lock(&mm->page_table_lock);
1186 do { 1182 do {
1187 next = pgd_addr_end(addr, end); 1183 next = pgd_addr_end(addr, end);
1188 err = remap_pud_range(mm, pgd, addr, next, 1184 err = remap_pud_range(mm, pgd, addr, next,
@@ -1190,12 +1186,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1190 if (err) 1186 if (err)
1191 break; 1187 break;
1192 } while (pgd++, addr = next, addr != end); 1188 } while (pgd++, addr = next, addr != end);
1193 spin_unlock(&mm->page_table_lock);
1194 return err; 1189 return err;
1195} 1190}
1196EXPORT_SYMBOL(remap_pfn_range); 1191EXPORT_SYMBOL(remap_pfn_range);
1197 1192
1198/* 1193/*
1194 * handle_pte_fault chooses page fault handler according to an entry
1195 * which was read non-atomically. Before making any commitment, on
1196 * those architectures or configurations (e.g. i386 with PAE) which
1197 * might give a mix of unmatched parts, do_swap_page and do_file_page
1198 * must check under lock before unmapping the pte and proceeding
1199 * (but do_wp_page is only called after already making such a check;
1200 * and do_anonymous_page and do_no_page can safely check later on).
1201 */
1202static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1203 pte_t *page_table, pte_t orig_pte)
1204{
1205 int same = 1;
1206#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1207 if (sizeof(pte_t) > sizeof(unsigned long)) {
1208 spinlock_t *ptl = pte_lockptr(mm, pmd);
1209 spin_lock(ptl);
1210 same = pte_same(*page_table, orig_pte);
1211 spin_unlock(ptl);
1212 }
1213#endif
1214 pte_unmap(page_table);
1215 return same;
1216}
1217
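A sketch of the calling pattern the comment above describes, distilled from the do_swap_page() and do_file_page() hunks below (the handler body is elided, and the function name is illustrative): validate the non-atomically read orig_pte first, drop the pte mapping, and only retake the lock with pte_offset_map_lock() when ready to commit, re-checking pte_same() in case another thread handled the fault meanwhile.

static int example_pte_fault(struct mm_struct *mm, pmd_t *pmd,
			     unsigned long address, pte_t *page_table,
			     pte_t orig_pte)
{
	spinlock_t *ptl;

	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		return VM_FAULT_MINOR;		/* raced with another fault */

	/* ... may sleep here: allocate memory, start I/O, etc ... */

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		/* commit: set_pte_at(), rmap and rss accounting, ... */
	}
	pte_unmap_unlock(page_table, ptl);
	return VM_FAULT_MINOR;
}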
1218/*
1199 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when 1219 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
1200 * servicing faults for write access. In the normal case, we always want 1220 * servicing faults for write access. In the normal case, we always want
1201 * pte_mkwrite. But get_user_pages can cause write faults for mappings 1221 * pte_mkwrite. But get_user_pages can cause write faults for mappings
@@ -1209,28 +1229,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1209} 1229}
1210 1230
1211/* 1231/*
1212 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
1213 */
1214static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
1215 pte_t *page_table)
1216{
1217 pte_t entry;
1218
1219 entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
1220 vma);
1221 ptep_establish(vma, address, page_table, entry);
1222 update_mmu_cache(vma, address, entry);
1223 lazy_mmu_prot_update(entry);
1224}
1225
1226/*
1227 * This routine handles present pages, when users try to write 1232 * This routine handles present pages, when users try to write
1228 * to a shared page. It is done by copying the page to a new address 1233 * to a shared page. It is done by copying the page to a new address
1229 * and decrementing the shared-page counter for the old page. 1234 * and decrementing the shared-page counter for the old page.
1230 * 1235 *
1231 * Goto-purists beware: the only reason for goto's here is that it results
1232 * in better assembly code.. The "default" path will see no jumps at all.
1233 *
1234 * Note that this routine assumes that the protection checks have been 1236 * Note that this routine assumes that the protection checks have been
1235 * done by the caller (the low-level page fault routine in most cases). 1237 * done by the caller (the low-level page fault routine in most cases).
1236 * Thus we can safely just mark it writable once we've done any necessary 1238 * Thus we can safely just mark it writable once we've done any necessary
@@ -1240,28 +1242,28 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page
1240 * change only once the write actually happens. This avoids a few races, 1242 * change only once the write actually happens. This avoids a few races,
1241 * and potentially makes it more efficient. 1243 * and potentially makes it more efficient.
1242 * 1244 *
1243 * We hold the mm semaphore and the page_table_lock on entry and exit 1245 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1244 * with the page_table_lock released. 1246 * but allow concurrent faults), with pte both mapped and locked.
1247 * We return with mmap_sem still held, but pte unmapped and unlocked.
1245 */ 1248 */
1246static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, 1249static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1247 unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) 1250 unsigned long address, pte_t *page_table, pmd_t *pmd,
1251 spinlock_t *ptl, pte_t orig_pte)
1248{ 1252{
1249 struct page *old_page, *new_page; 1253 struct page *old_page, *new_page;
1250 unsigned long pfn = pte_pfn(pte); 1254 unsigned long pfn = pte_pfn(orig_pte);
1251 pte_t entry; 1255 pte_t entry;
1252 int ret; 1256 int ret = VM_FAULT_MINOR;
1257
1258 BUG_ON(vma->vm_flags & VM_RESERVED);
1253 1259
1254 if (unlikely(!pfn_valid(pfn))) { 1260 if (unlikely(!pfn_valid(pfn))) {
1255 /* 1261 /*
1256 * This should really halt the system so it can be debugged or 1262 * Page table corrupted: show pte and kill process.
1257 * at least the kernel stops what it's doing before it corrupts
1258 * data, but for the moment just pretend this is OOM.
1259 */ 1263 */
1260 pte_unmap(page_table); 1264 print_bad_pte(vma, orig_pte, address);
1261 printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", 1265 ret = VM_FAULT_OOM;
1262 address); 1266 goto unlock;
1263 spin_unlock(&mm->page_table_lock);
1264 return VM_FAULT_OOM;
1265 } 1267 }
1266 old_page = pfn_to_page(pfn); 1268 old_page = pfn_to_page(pfn);
1267 1269
@@ -1270,52 +1272,51 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1270 unlock_page(old_page); 1272 unlock_page(old_page);
1271 if (reuse) { 1273 if (reuse) {
1272 flush_cache_page(vma, address, pfn); 1274 flush_cache_page(vma, address, pfn);
1273 entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), 1275 entry = pte_mkyoung(orig_pte);
1274 vma); 1276 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1275 ptep_set_access_flags(vma, address, page_table, entry, 1); 1277 ptep_set_access_flags(vma, address, page_table, entry, 1);
1276 update_mmu_cache(vma, address, entry); 1278 update_mmu_cache(vma, address, entry);
1277 lazy_mmu_prot_update(entry); 1279 lazy_mmu_prot_update(entry);
1278 pte_unmap(page_table); 1280 ret |= VM_FAULT_WRITE;
1279 spin_unlock(&mm->page_table_lock); 1281 goto unlock;
1280 return VM_FAULT_MINOR|VM_FAULT_WRITE;
1281 } 1282 }
1282 } 1283 }
1283 pte_unmap(page_table);
1284 1284
1285 /* 1285 /*
1286 * Ok, we need to copy. Oh, well.. 1286 * Ok, we need to copy. Oh, well..
1287 */ 1287 */
1288 if (!PageReserved(old_page)) 1288 page_cache_get(old_page);
1289 page_cache_get(old_page); 1289 pte_unmap_unlock(page_table, ptl);
1290 spin_unlock(&mm->page_table_lock);
1291 1290
1292 if (unlikely(anon_vma_prepare(vma))) 1291 if (unlikely(anon_vma_prepare(vma)))
1293 goto no_new_page; 1292 goto oom;
1294 if (old_page == ZERO_PAGE(address)) { 1293 if (old_page == ZERO_PAGE(address)) {
1295 new_page = alloc_zeroed_user_highpage(vma, address); 1294 new_page = alloc_zeroed_user_highpage(vma, address);
1296 if (!new_page) 1295 if (!new_page)
1297 goto no_new_page; 1296 goto oom;
1298 } else { 1297 } else {
1299 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); 1298 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1300 if (!new_page) 1299 if (!new_page)
1301 goto no_new_page; 1300 goto oom;
1302 copy_user_highpage(new_page, old_page, address); 1301 copy_user_highpage(new_page, old_page, address);
1303 } 1302 }
1303
1304 /* 1304 /*
1305 * Re-check the pte - we dropped the lock 1305 * Re-check the pte - we dropped the lock
1306 */ 1306 */
1307 ret = VM_FAULT_MINOR; 1307 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1308 spin_lock(&mm->page_table_lock); 1308 if (likely(pte_same(*page_table, orig_pte))) {
1309 page_table = pte_offset_map(pmd, address); 1309 page_remove_rmap(old_page);
1310 if (likely(pte_same(*page_table, pte))) { 1310 if (!PageAnon(old_page)) {
1311 if (PageAnon(old_page)) 1311 inc_mm_counter(mm, anon_rss);
1312 dec_mm_counter(mm, anon_rss); 1312 dec_mm_counter(mm, file_rss);
1313 if (PageReserved(old_page)) 1313 }
1314 inc_mm_counter(mm, rss);
1315 else
1316 page_remove_rmap(old_page);
1317 flush_cache_page(vma, address, pfn); 1314 flush_cache_page(vma, address, pfn);
1318 break_cow(vma, new_page, address, page_table); 1315 entry = mk_pte(new_page, vma->vm_page_prot);
1316 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1317 ptep_establish(vma, address, page_table, entry);
1318 update_mmu_cache(vma, address, entry);
1319 lazy_mmu_prot_update(entry);
1319 lru_cache_add_active(new_page); 1320 lru_cache_add_active(new_page);
1320 page_add_anon_rmap(new_page, vma, address); 1321 page_add_anon_rmap(new_page, vma, address);
1321 1322
@@ -1323,13 +1324,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1323 new_page = old_page; 1324 new_page = old_page;
1324 ret |= VM_FAULT_WRITE; 1325 ret |= VM_FAULT_WRITE;
1325 } 1326 }
1326 pte_unmap(page_table);
1327 page_cache_release(new_page); 1327 page_cache_release(new_page);
1328 page_cache_release(old_page); 1328 page_cache_release(old_page);
1329 spin_unlock(&mm->page_table_lock); 1329unlock:
1330 pte_unmap_unlock(page_table, ptl);
1330 return ret; 1331 return ret;
1331 1332oom:
1332no_new_page:
1333 page_cache_release(old_page); 1333 page_cache_release(old_page);
1334 return VM_FAULT_OOM; 1334 return VM_FAULT_OOM;
1335} 1335}
@@ -1399,13 +1399,6 @@ again:
1399 1399
1400 restart_addr = zap_page_range(vma, start_addr, 1400 restart_addr = zap_page_range(vma, start_addr,
1401 end_addr - start_addr, details); 1401 end_addr - start_addr, details);
1402
1403 /*
1404 * We cannot rely on the break test in unmap_vmas:
1405 * on the one hand, we don't want to restart our loop
1406 * just because that broke out for the page_table_lock;
1407 * on the other hand, it does no test when vma is small.
1408 */
1409 need_break = need_resched() || 1402 need_break = need_resched() ||
1410 need_lockbreak(details->i_mmap_lock); 1403 need_lockbreak(details->i_mmap_lock);
1411 1404
@@ -1654,38 +1647,37 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
1654} 1647}
1655 1648
1656/* 1649/*
1657 * We hold the mm semaphore and the page_table_lock on entry and 1650 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1658 * should release the pagetable lock on exit.. 1651 * but allow concurrent faults), and pte mapped but not yet locked.
1652 * We return with mmap_sem still held, but pte unmapped and unlocked.
1659 */ 1653 */
1660static int do_swap_page(struct mm_struct * mm, 1654static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1661 struct vm_area_struct * vma, unsigned long address, 1655 unsigned long address, pte_t *page_table, pmd_t *pmd,
1662 pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) 1656 int write_access, pte_t orig_pte)
1663{ 1657{
1658 spinlock_t *ptl;
1664 struct page *page; 1659 struct page *page;
1665 swp_entry_t entry = pte_to_swp_entry(orig_pte); 1660 swp_entry_t entry;
1666 pte_t pte; 1661 pte_t pte;
1667 int ret = VM_FAULT_MINOR; 1662 int ret = VM_FAULT_MINOR;
1668 1663
1669 pte_unmap(page_table); 1664 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
1670 spin_unlock(&mm->page_table_lock); 1665 goto out;
1666
1667 entry = pte_to_swp_entry(orig_pte);
1671 page = lookup_swap_cache(entry); 1668 page = lookup_swap_cache(entry);
1672 if (!page) { 1669 if (!page) {
1673 swapin_readahead(entry, address, vma); 1670 swapin_readahead(entry, address, vma);
1674 page = read_swap_cache_async(entry, vma, address); 1671 page = read_swap_cache_async(entry, vma, address);
1675 if (!page) { 1672 if (!page) {
1676 /* 1673 /*
1677 * Back out if somebody else faulted in this pte while 1674 * Back out if somebody else faulted in this pte
1678 * we released the page table lock. 1675 * while we released the pte lock.
1679 */ 1676 */
1680 spin_lock(&mm->page_table_lock); 1677 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1681 page_table = pte_offset_map(pmd, address);
1682 if (likely(pte_same(*page_table, orig_pte))) 1678 if (likely(pte_same(*page_table, orig_pte)))
1683 ret = VM_FAULT_OOM; 1679 ret = VM_FAULT_OOM;
1684 else 1680 goto unlock;
1685 ret = VM_FAULT_MINOR;
1686 pte_unmap(page_table);
1687 spin_unlock(&mm->page_table_lock);
1688 goto out;
1689 } 1681 }
1690 1682
1691 /* Had to read the page from swap area: Major fault */ 1683 /* Had to read the page from swap area: Major fault */
@@ -1698,15 +1690,11 @@ static int do_swap_page(struct mm_struct * mm,
1698 lock_page(page); 1690 lock_page(page);
1699 1691
1700 /* 1692 /*
1701 * Back out if somebody else faulted in this pte while we 1693 * Back out if somebody else already faulted in this pte.
1702 * released the page table lock.
1703 */ 1694 */
1704 spin_lock(&mm->page_table_lock); 1695 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1705 page_table = pte_offset_map(pmd, address); 1696 if (unlikely(!pte_same(*page_table, orig_pte)))
1706 if (unlikely(!pte_same(*page_table, orig_pte))) {
1707 ret = VM_FAULT_MINOR;
1708 goto out_nomap; 1697 goto out_nomap;
1709 }
1710 1698
1711 if (unlikely(!PageUptodate(page))) { 1699 if (unlikely(!PageUptodate(page))) {
1712 ret = VM_FAULT_SIGBUS; 1700 ret = VM_FAULT_SIGBUS;
@@ -1715,7 +1703,7 @@ static int do_swap_page(struct mm_struct * mm,
1715 1703
1716 /* The page isn't present yet, go ahead with the fault. */ 1704 /* The page isn't present yet, go ahead with the fault. */
1717 1705
1718 inc_mm_counter(mm, rss); 1706 inc_mm_counter(mm, anon_rss);
1719 pte = mk_pte(page, vma->vm_page_prot); 1707 pte = mk_pte(page, vma->vm_page_prot);
1720 if (write_access && can_share_swap_page(page)) { 1708 if (write_access && can_share_swap_page(page)) {
1721 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 1709 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1733,7 +1721,7 @@ static int do_swap_page(struct mm_struct * mm,
1733 1721
1734 if (write_access) { 1722 if (write_access) {
1735 if (do_wp_page(mm, vma, address, 1723 if (do_wp_page(mm, vma, address,
1736 page_table, pmd, pte) == VM_FAULT_OOM) 1724 page_table, pmd, ptl, pte) == VM_FAULT_OOM)
1737 ret = VM_FAULT_OOM; 1725 ret = VM_FAULT_OOM;
1738 goto out; 1726 goto out;
1739 } 1727 }
@@ -1741,74 +1729,76 @@ static int do_swap_page(struct mm_struct * mm,
1741 /* No need to invalidate - it was non-present before */ 1729 /* No need to invalidate - it was non-present before */
1742 update_mmu_cache(vma, address, pte); 1730 update_mmu_cache(vma, address, pte);
1743 lazy_mmu_prot_update(pte); 1731 lazy_mmu_prot_update(pte);
1744 pte_unmap(page_table); 1732unlock:
1745 spin_unlock(&mm->page_table_lock); 1733 pte_unmap_unlock(page_table, ptl);
1746out: 1734out:
1747 return ret; 1735 return ret;
1748out_nomap: 1736out_nomap:
1749 pte_unmap(page_table); 1737 pte_unmap_unlock(page_table, ptl);
1750 spin_unlock(&mm->page_table_lock);
1751 unlock_page(page); 1738 unlock_page(page);
1752 page_cache_release(page); 1739 page_cache_release(page);
1753 goto out; 1740 return ret;
1754} 1741}
1755 1742
1756/* 1743/*
1757 * We are called with the MM semaphore and page_table_lock 1744 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1758 * spinlock held to protect against concurrent faults in 1745 * but allow concurrent faults), and pte mapped but not yet locked.
1759 * multithreaded programs. 1746 * We return with mmap_sem still held, but pte unmapped and unlocked.
1760 */ 1747 */
1761static int 1748static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1762do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 1749 unsigned long address, pte_t *page_table, pmd_t *pmd,
1763 pte_t *page_table, pmd_t *pmd, int write_access, 1750 int write_access)
1764 unsigned long addr)
1765{ 1751{
1752 struct page *page;
1753 spinlock_t *ptl;
1766 pte_t entry; 1754 pte_t entry;
1767 struct page * page = ZERO_PAGE(addr);
1768
1769 /* Read-only mapping of ZERO_PAGE. */
1770 entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
1771 1755
1772 /* ..except if it's a write access */
1773 if (write_access) { 1756 if (write_access) {
1774 /* Allocate our own private page. */ 1757 /* Allocate our own private page. */
1775 pte_unmap(page_table); 1758 pte_unmap(page_table);
1776 spin_unlock(&mm->page_table_lock);
1777 1759
1778 if (unlikely(anon_vma_prepare(vma))) 1760 if (unlikely(anon_vma_prepare(vma)))
1779 goto no_mem; 1761 goto oom;
1780 page = alloc_zeroed_user_highpage(vma, addr); 1762 page = alloc_zeroed_user_highpage(vma, address);
1781 if (!page) 1763 if (!page)
1782 goto no_mem; 1764 goto oom;
1783 1765
1784 spin_lock(&mm->page_table_lock); 1766 entry = mk_pte(page, vma->vm_page_prot);
1785 page_table = pte_offset_map(pmd, addr); 1767 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1786 1768
1787 if (!pte_none(*page_table)) { 1769 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1788 pte_unmap(page_table); 1770 if (!pte_none(*page_table))
1789 page_cache_release(page); 1771 goto release;
1790 spin_unlock(&mm->page_table_lock); 1772 inc_mm_counter(mm, anon_rss);
1791 goto out;
1792 }
1793 inc_mm_counter(mm, rss);
1794 entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
1795 vma->vm_page_prot)),
1796 vma);
1797 lru_cache_add_active(page); 1773 lru_cache_add_active(page);
1798 SetPageReferenced(page); 1774 SetPageReferenced(page);
1799 page_add_anon_rmap(page, vma, addr); 1775 page_add_anon_rmap(page, vma, address);
1776 } else {
1777 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1778 page = ZERO_PAGE(address);
1779 page_cache_get(page);
1780 entry = mk_pte(page, vma->vm_page_prot);
1781
1782 ptl = pte_lockptr(mm, pmd);
1783 spin_lock(ptl);
1784 if (!pte_none(*page_table))
1785 goto release;
1786 inc_mm_counter(mm, file_rss);
1787 page_add_file_rmap(page);
1800 } 1788 }
1801 1789
1802 set_pte_at(mm, addr, page_table, entry); 1790 set_pte_at(mm, address, page_table, entry);
1803 pte_unmap(page_table);
1804 1791
1805 /* No need to invalidate - it was non-present before */ 1792 /* No need to invalidate - it was non-present before */
1806 update_mmu_cache(vma, addr, entry); 1793 update_mmu_cache(vma, address, entry);
1807 lazy_mmu_prot_update(entry); 1794 lazy_mmu_prot_update(entry);
1808 spin_unlock(&mm->page_table_lock); 1795unlock:
1809out: 1796 pte_unmap_unlock(page_table, ptl);
1810 return VM_FAULT_MINOR; 1797 return VM_FAULT_MINOR;
1811no_mem: 1798release:
1799 page_cache_release(page);
1800 goto unlock;
1801oom:
1812 return VM_FAULT_OOM; 1802 return VM_FAULT_OOM;
1813} 1803}
1814 1804
@@ -1821,25 +1811,23 @@ no_mem:
1821 * As this is called only for pages that do not currently exist, we 1811 * As this is called only for pages that do not currently exist, we
1822 * do not need to flush old virtual caches or the TLB. 1812 * do not need to flush old virtual caches or the TLB.
1823 * 1813 *
1824 * This is called with the MM semaphore held and the page table 1814 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1825 * spinlock held. Exit with the spinlock released. 1815 * but allow concurrent faults), and pte mapped but not yet locked.
1816 * We return with mmap_sem still held, but pte unmapped and unlocked.
1826 */ 1817 */
1827static int 1818static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1828do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1819 unsigned long address, pte_t *page_table, pmd_t *pmd,
1829 unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) 1820 int write_access)
1830{ 1821{
1831 struct page * new_page; 1822 spinlock_t *ptl;
1823 struct page *new_page;
1832 struct address_space *mapping = NULL; 1824 struct address_space *mapping = NULL;
1833 pte_t entry; 1825 pte_t entry;
1834 unsigned int sequence = 0; 1826 unsigned int sequence = 0;
1835 int ret = VM_FAULT_MINOR; 1827 int ret = VM_FAULT_MINOR;
1836 int anon = 0; 1828 int anon = 0;
1837 1829
1838 if (!vma->vm_ops || !vma->vm_ops->nopage)
1839 return do_anonymous_page(mm, vma, page_table,
1840 pmd, write_access, address);
1841 pte_unmap(page_table); 1830 pte_unmap(page_table);
1842 spin_unlock(&mm->page_table_lock);
1843 1831
1844 if (vma->vm_file) { 1832 if (vma->vm_file) {
1845 mapping = vma->vm_file->f_mapping; 1833 mapping = vma->vm_file->f_mapping;
@@ -1847,7 +1835,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1847 smp_rmb(); /* serializes i_size against truncate_count */ 1835 smp_rmb(); /* serializes i_size against truncate_count */
1848 } 1836 }
1849retry: 1837retry:
1850 cond_resched();
1851 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); 1838 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
1852 /* 1839 /*
1853 * No smp_rmb is needed here as long as there's a full 1840 * No smp_rmb is needed here as long as there's a full
@@ -1880,19 +1867,20 @@ retry:
1880 anon = 1; 1867 anon = 1;
1881 } 1868 }
1882 1869
1883 spin_lock(&mm->page_table_lock); 1870 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1884 /* 1871 /*
1885 * For a file-backed vma, someone could have truncated or otherwise 1872 * For a file-backed vma, someone could have truncated or otherwise
1886 * invalidated this page. If unmap_mapping_range got called, 1873 * invalidated this page. If unmap_mapping_range got called,
1887 * retry getting the page. 1874 * retry getting the page.
1888 */ 1875 */
1889 if (mapping && unlikely(sequence != mapping->truncate_count)) { 1876 if (mapping && unlikely(sequence != mapping->truncate_count)) {
1890 sequence = mapping->truncate_count; 1877 pte_unmap_unlock(page_table, ptl);
1891 spin_unlock(&mm->page_table_lock);
1892 page_cache_release(new_page); 1878 page_cache_release(new_page);
1879 cond_resched();
1880 sequence = mapping->truncate_count;
1881 smp_rmb();
1893 goto retry; 1882 goto retry;
1894 } 1883 }
1895 page_table = pte_offset_map(pmd, address);
1896 1884
1897 /* 1885 /*
1898 * This silly early PAGE_DIRTY setting removes a race 1886 * This silly early PAGE_DIRTY setting removes a race
@@ -1906,68 +1894,67 @@ retry:
1906 */ 1894 */
1907 /* Only go through if we didn't race with anybody else... */ 1895 /* Only go through if we didn't race with anybody else... */
1908 if (pte_none(*page_table)) { 1896 if (pte_none(*page_table)) {
1909 if (!PageReserved(new_page))
1910 inc_mm_counter(mm, rss);
1911
1912 flush_icache_page(vma, new_page); 1897 flush_icache_page(vma, new_page);
1913 entry = mk_pte(new_page, vma->vm_page_prot); 1898 entry = mk_pte(new_page, vma->vm_page_prot);
1914 if (write_access) 1899 if (write_access)
1915 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1900 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1916 set_pte_at(mm, address, page_table, entry); 1901 set_pte_at(mm, address, page_table, entry);
1917 if (anon) { 1902 if (anon) {
1903 inc_mm_counter(mm, anon_rss);
1918 lru_cache_add_active(new_page); 1904 lru_cache_add_active(new_page);
1919 page_add_anon_rmap(new_page, vma, address); 1905 page_add_anon_rmap(new_page, vma, address);
1920 } else 1906 } else if (!(vma->vm_flags & VM_RESERVED)) {
1907 inc_mm_counter(mm, file_rss);
1921 page_add_file_rmap(new_page); 1908 page_add_file_rmap(new_page);
1922 pte_unmap(page_table); 1909 }
1923 } else { 1910 } else {
1924 /* One of our sibling threads was faster, back out. */ 1911 /* One of our sibling threads was faster, back out. */
1925 pte_unmap(page_table);
1926 page_cache_release(new_page); 1912 page_cache_release(new_page);
1927 spin_unlock(&mm->page_table_lock); 1913 goto unlock;
1928 goto out;
1929 } 1914 }
1930 1915
1931 /* no need to invalidate: a not-present page shouldn't be cached */ 1916 /* no need to invalidate: a not-present page shouldn't be cached */
1932 update_mmu_cache(vma, address, entry); 1917 update_mmu_cache(vma, address, entry);
1933 lazy_mmu_prot_update(entry); 1918 lazy_mmu_prot_update(entry);
1934 spin_unlock(&mm->page_table_lock); 1919unlock:
1935out: 1920 pte_unmap_unlock(page_table, ptl);
1936 return ret; 1921 return ret;
1937oom: 1922oom:
1938 page_cache_release(new_page); 1923 page_cache_release(new_page);
1939 ret = VM_FAULT_OOM; 1924 return VM_FAULT_OOM;
1940 goto out;
1941} 1925}
1942 1926
1943/* 1927/*
1944 * Fault of a previously existing named mapping. Repopulate the pte 1928 * Fault of a previously existing named mapping. Repopulate the pte
1945 * from the encoded file_pte if possible. This enables swappable 1929 * from the encoded file_pte if possible. This enables swappable
1946 * nonlinear vmas. 1930 * nonlinear vmas.
1931 *
1932 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1933 * but allow concurrent faults), and pte mapped but not yet locked.
1934 * We return with mmap_sem still held, but pte unmapped and unlocked.
1947 */ 1935 */
1948static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, 1936static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
1949 unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) 1937 unsigned long address, pte_t *page_table, pmd_t *pmd,
1938 int write_access, pte_t orig_pte)
1950{ 1939{
1951 unsigned long pgoff; 1940 pgoff_t pgoff;
1952 int err; 1941 int err;
1953 1942
1954 BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); 1943 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
1955 /* 1944 return VM_FAULT_MINOR;
1956 * Fall back to the linear mapping if the fs does not support
1957 * ->populate:
1958 */
1959 if (!vma->vm_ops->populate ||
1960 (write_access && !(vma->vm_flags & VM_SHARED))) {
1961 pte_clear(mm, address, pte);
1962 return do_no_page(mm, vma, address, write_access, pte, pmd);
1963 }
1964
1965 pgoff = pte_to_pgoff(*pte);
1966 1945
1967 pte_unmap(pte); 1946 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
1968 spin_unlock(&mm->page_table_lock); 1947 /*
1948 * Page table corrupted: show pte and kill process.
1949 */
1950 print_bad_pte(vma, orig_pte, address);
1951 return VM_FAULT_OOM;
1952 }
1953	/* We can then assume vma->vm_ops && vma->vm_ops->populate */
1969 1954
1970 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); 1955 pgoff = pte_to_pgoff(orig_pte);
1956 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
1957 vma->vm_page_prot, pgoff, 0);
1971 if (err == -ENOMEM) 1958 if (err == -ENOMEM)
1972 return VM_FAULT_OOM; 1959 return VM_FAULT_OOM;
1973 if (err) 1960 if (err)
@@ -1984,56 +1971,68 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
1984 * with external mmu caches can use to update those (ie the Sparc or 1971 * with external mmu caches can use to update those (ie the Sparc or
1985 * PowerPC hashed page tables that act as extended TLBs). 1972 * PowerPC hashed page tables that act as extended TLBs).
1986 * 1973 *
1987 * Note the "page_table_lock". It is to protect against kswapd removing 1974 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1988 * pages from under us. Note that kswapd only ever _removes_ pages, never 1975 * but allow concurrent faults), and pte mapped but not yet locked.
1989 * adds them. As such, once we have noticed that the page is not present, 1976 * We return with mmap_sem still held, but pte unmapped and unlocked.
1990 * we can drop the lock early.
1991 *
1992 * The adding of pages is protected by the MM semaphore (which we hold),
1993 * so we don't need to worry about a page being suddenly been added into
1994 * our VM.
1995 *
1996 * We enter with the pagetable spinlock held, we are supposed to
1997 * release it when done.
1998 */ 1977 */
1999static inline int handle_pte_fault(struct mm_struct *mm, 1978static inline int handle_pte_fault(struct mm_struct *mm,
2000 struct vm_area_struct * vma, unsigned long address, 1979 struct vm_area_struct *vma, unsigned long address,
2001 int write_access, pte_t *pte, pmd_t *pmd) 1980 pte_t *pte, pmd_t *pmd, int write_access)
2002{ 1981{
2003 pte_t entry; 1982 pte_t entry;
1983 pte_t old_entry;
1984 spinlock_t *ptl;
2004 1985
2005 entry = *pte; 1986 old_entry = entry = *pte;
2006 if (!pte_present(entry)) { 1987 if (!pte_present(entry)) {
2007 /* 1988 if (pte_none(entry)) {
2008 * If it truly wasn't present, we know that kswapd 1989 if (!vma->vm_ops || !vma->vm_ops->nopage)
2009 * and the PTE updates will not touch it later. So 1990 return do_anonymous_page(mm, vma, address,
2010 * drop the lock. 1991 pte, pmd, write_access);
2011 */ 1992 return do_no_page(mm, vma, address,
2012 if (pte_none(entry)) 1993 pte, pmd, write_access);
2013 return do_no_page(mm, vma, address, write_access, pte, pmd); 1994 }
2014 if (pte_file(entry)) 1995 if (pte_file(entry))
2015 return do_file_page(mm, vma, address, write_access, pte, pmd); 1996 return do_file_page(mm, vma, address,
2016 return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); 1997 pte, pmd, write_access, entry);
1998 return do_swap_page(mm, vma, address,
1999 pte, pmd, write_access, entry);
2017 } 2000 }
2018 2001
2002 ptl = pte_lockptr(mm, pmd);
2003 spin_lock(ptl);
2004 if (unlikely(!pte_same(*pte, entry)))
2005 goto unlock;
2019 if (write_access) { 2006 if (write_access) {
2020 if (!pte_write(entry)) 2007 if (!pte_write(entry))
2021 return do_wp_page(mm, vma, address, pte, pmd, entry); 2008 return do_wp_page(mm, vma, address,
2009 pte, pmd, ptl, entry);
2022 entry = pte_mkdirty(entry); 2010 entry = pte_mkdirty(entry);
2023 } 2011 }
2024 entry = pte_mkyoung(entry); 2012 entry = pte_mkyoung(entry);
2025 ptep_set_access_flags(vma, address, pte, entry, write_access); 2013 if (!pte_same(old_entry, entry)) {
2026 update_mmu_cache(vma, address, entry); 2014 ptep_set_access_flags(vma, address, pte, entry, write_access);
2027 lazy_mmu_prot_update(entry); 2015 update_mmu_cache(vma, address, entry);
2028 pte_unmap(pte); 2016 lazy_mmu_prot_update(entry);
2029 spin_unlock(&mm->page_table_lock); 2017 } else {
2018 /*
2019 * This is needed only for protection faults but the arch code
2020 * is not yet telling us if this is a protection fault or not.
2021 * This still avoids useless tlb flushes for .text page faults
2022 * with threads.
2023 */
2024 if (write_access)
2025 flush_tlb_page(vma, address);
2026 }
2027unlock:
2028 pte_unmap_unlock(pte, ptl);
2030 return VM_FAULT_MINOR; 2029 return VM_FAULT_MINOR;
2031} 2030}
2032 2031
2033/* 2032/*
2034 * By the time we get here, we already hold the mm semaphore 2033 * By the time we get here, we already hold the mm semaphore
2035 */ 2034 */
2036int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, 2035int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2037 unsigned long address, int write_access) 2036 unsigned long address, int write_access)
2038{ 2037{
2039 pgd_t *pgd; 2038 pgd_t *pgd;
@@ -2048,100 +2047,66 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
2048 if (unlikely(is_vm_hugetlb_page(vma))) 2047 if (unlikely(is_vm_hugetlb_page(vma)))
2049 return hugetlb_fault(mm, vma, address, write_access); 2048 return hugetlb_fault(mm, vma, address, write_access);
2050 2049
2051 /*
2052 * We need the page table lock to synchronize with kswapd
2053 * and the SMP-safe atomic PTE updates.
2054 */
2055 pgd = pgd_offset(mm, address); 2050 pgd = pgd_offset(mm, address);
2056 spin_lock(&mm->page_table_lock);
2057
2058 pud = pud_alloc(mm, pgd, address); 2051 pud = pud_alloc(mm, pgd, address);
2059 if (!pud) 2052 if (!pud)
2060 goto oom; 2053 return VM_FAULT_OOM;
2061
2062 pmd = pmd_alloc(mm, pud, address); 2054 pmd = pmd_alloc(mm, pud, address);
2063 if (!pmd) 2055 if (!pmd)
2064 goto oom; 2056 return VM_FAULT_OOM;
2065
2066 pte = pte_alloc_map(mm, pmd, address); 2057 pte = pte_alloc_map(mm, pmd, address);
2067 if (!pte) 2058 if (!pte)
2068 goto oom; 2059 return VM_FAULT_OOM;
2069
2070 return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
2071 2060
2072 oom: 2061 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2073 spin_unlock(&mm->page_table_lock);
2074 return VM_FAULT_OOM;
2075} 2062}
2076 2063
2077#ifndef __PAGETABLE_PUD_FOLDED 2064#ifndef __PAGETABLE_PUD_FOLDED
2078/* 2065/*
2079 * Allocate page upper directory. 2066 * Allocate page upper directory.
2080 * 2067 * We've already handled the fast-path in-line.
2081 * We've already handled the fast-path in-line, and we own the
2082 * page table lock.
2083 */ 2068 */
2084pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 2069int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2085{ 2070{
2086 pud_t *new; 2071 pud_t *new = pud_alloc_one(mm, address);
2087
2088 spin_unlock(&mm->page_table_lock);
2089 new = pud_alloc_one(mm, address);
2090 spin_lock(&mm->page_table_lock);
2091 if (!new) 2072 if (!new)
2092 return NULL; 2073 return -ENOMEM;
2093 2074
2094 /* 2075 spin_lock(&mm->page_table_lock);
2095 * Because we dropped the lock, we should re-check the 2076 if (pgd_present(*pgd)) /* Another has populated it */
2096 * entry, as somebody else could have populated it..
2097 */
2098 if (pgd_present(*pgd)) {
2099 pud_free(new); 2077 pud_free(new);
2100 goto out; 2078 else
2101 } 2079 pgd_populate(mm, pgd, new);
2102 pgd_populate(mm, pgd, new); 2080 spin_unlock(&mm->page_table_lock);
2103 out: 2081 return 0;
2104 return pud_offset(pgd, address);
2105} 2082}
2106#endif /* __PAGETABLE_PUD_FOLDED */ 2083#endif /* __PAGETABLE_PUD_FOLDED */
2107 2084
2108#ifndef __PAGETABLE_PMD_FOLDED 2085#ifndef __PAGETABLE_PMD_FOLDED
2109/* 2086/*
2110 * Allocate page middle directory. 2087 * Allocate page middle directory.
2111 * 2088 * We've already handled the fast-path in-line.
2112 * We've already handled the fast-path in-line, and we own the
2113 * page table lock.
2114 */ 2089 */
2115pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 2090int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2116{ 2091{
2117 pmd_t *new; 2092 pmd_t *new = pmd_alloc_one(mm, address);
2118
2119 spin_unlock(&mm->page_table_lock);
2120 new = pmd_alloc_one(mm, address);
2121 spin_lock(&mm->page_table_lock);
2122 if (!new) 2093 if (!new)
2123 return NULL; 2094 return -ENOMEM;
2124 2095
2125 /* 2096 spin_lock(&mm->page_table_lock);
2126 * Because we dropped the lock, we should re-check the
2127 * entry, as somebody else could have populated it..
2128 */
2129#ifndef __ARCH_HAS_4LEVEL_HACK 2097#ifndef __ARCH_HAS_4LEVEL_HACK
2130 if (pud_present(*pud)) { 2098 if (pud_present(*pud)) /* Another has populated it */
2131 pmd_free(new); 2099 pmd_free(new);
2132 goto out; 2100 else
2133 } 2101 pud_populate(mm, pud, new);
2134 pud_populate(mm, pud, new);
2135#else 2102#else
2136 if (pgd_present(*pud)) { 2103 if (pgd_present(*pud)) /* Another has populated it */
2137 pmd_free(new); 2104 pmd_free(new);
2138 goto out; 2105 else
2139 } 2106 pgd_populate(mm, pud, new);
2140 pgd_populate(mm, pud, new);
2141#endif /* __ARCH_HAS_4LEVEL_HACK */ 2107#endif /* __ARCH_HAS_4LEVEL_HACK */
2142 2108 spin_unlock(&mm->page_table_lock);
2143 out: 2109 return 0;
2144 return pmd_offset(pud, address);
2145} 2110}
2146#endif /* __PAGETABLE_PMD_FOLDED */ 2111#endif /* __PAGETABLE_PMD_FOLDED */
2147 2112
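__pud_alloc() and __pmd_alloc() now return 0 or -ENOMEM instead of a pointer, taking and dropping page_table_lock internally. A sketch of the fast-path wrapper this implies (the matching include/linux/mm.h change is outside this mm/ diff, so treat the exact shape as an assumption):

static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd,
			       unsigned long address)
{
	if (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))
		return NULL;			/* -ENOMEM in the slow path */
	return pud_offset(pgd, address);
}

pmd_alloc() would follow the same pattern against pud_none() and __pmd_alloc().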
@@ -2206,22 +2171,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2206 2171
2207EXPORT_SYMBOL(vmalloc_to_pfn); 2172EXPORT_SYMBOL(vmalloc_to_pfn);
2208 2173
2209/*
2210 * update_mem_hiwater
2211 * - update per process rss and vm high water data
2212 */
2213void update_mem_hiwater(struct task_struct *tsk)
2214{
2215 if (tsk->mm) {
2216 unsigned long rss = get_mm_counter(tsk->mm, rss);
2217
2218 if (tsk->mm->hiwater_rss < rss)
2219 tsk->mm->hiwater_rss = rss;
2220 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
2221 tsk->mm->hiwater_vm = tsk->mm->total_vm;
2222 }
2223}
2224
2225#if !defined(__HAVE_ARCH_GATE_AREA) 2174#if !defined(__HAVE_ARCH_GATE_AREA)
2226 2175
2227#if defined(AT_SYSINFO_EHDR) 2176#if defined(AT_SYSINFO_EHDR)
@@ -2233,7 +2182,7 @@ static int __init gate_vma_init(void)
2233 gate_vma.vm_start = FIXADDR_USER_START; 2182 gate_vma.vm_start = FIXADDR_USER_START;
2234 gate_vma.vm_end = FIXADDR_USER_END; 2183 gate_vma.vm_end = FIXADDR_USER_END;
2235 gate_vma.vm_page_prot = PAGE_READONLY; 2184 gate_vma.vm_page_prot = PAGE_READONLY;
2236 gate_vma.vm_flags = 0; 2185 gate_vma.vm_flags = VM_RESERVED;
2237 return 0; 2186 return 0;
2238} 2187}
2239__initcall(gate_vma_init); 2188__initcall(gate_vma_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
new file mode 100644
index 000000000000..431a64f021c0
--- /dev/null
+++ b/mm/memory_hotplug.c
@@ -0,0 +1,138 @@
1/*
2 * linux/mm/memory_hotplug.c
3 *
4 * Copyright (C)
5 */
6
7#include <linux/config.h>
8#include <linux/stddef.h>
9#include <linux/mm.h>
10#include <linux/swap.h>
11#include <linux/interrupt.h>
12#include <linux/pagemap.h>
13#include <linux/bootmem.h>
14#include <linux/compiler.h>
15#include <linux/module.h>
16#include <linux/pagevec.h>
17#include <linux/slab.h>
18#include <linux/sysctl.h>
19#include <linux/cpu.h>
20#include <linux/memory.h>
21#include <linux/memory_hotplug.h>
22#include <linux/highmem.h>
23#include <linux/vmalloc.h>
24
25#include <asm/tlbflush.h>
26
27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
28 unsigned long size);
29static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
30{
31 struct pglist_data *pgdat = zone->zone_pgdat;
32 int nr_pages = PAGES_PER_SECTION;
33 int nid = pgdat->node_id;
34 int zone_type;
35
36 zone_type = zone - pgdat->node_zones;
37 memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
38 zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
39}
40
41extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 int nr_pages);
43static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
44{
45 struct pglist_data *pgdat = zone->zone_pgdat;
46 int nr_pages = PAGES_PER_SECTION;
47 int ret;
48
49 ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
50
51 if (ret < 0)
52 return ret;
53
54 __add_zone(zone, phys_start_pfn);
55 return register_new_memory(__pfn_to_section(phys_start_pfn));
56}
57
58/*
59 * Reasonably generic function for adding memory. It is
60 * expected that archs that support memory hotplug will
61 * call this function after deciding the zone to which to
62 * add the new pages.
63 */
64int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
65 unsigned long nr_pages)
66{
67 unsigned long i;
68 int err = 0;
69
70 for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
71 err = __add_section(zone, phys_start_pfn + i);
72
73 if (err)
74 break;
75 }
76
77 return err;
78}
79
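As the comment above __add_pages() says, an architecture is expected to pick the target zone and then call it. A hedged sketch of such an arch-side caller (arch_add_memory() and the zone choice are illustrative assumptions, not part of this file):

int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	/* sections are created here and brought up later via online_pages() */
	return __add_pages(zone, start_pfn, nr_pages);
}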
80static void grow_zone_span(struct zone *zone,
81 unsigned long start_pfn, unsigned long end_pfn)
82{
83 unsigned long old_zone_end_pfn;
84
85 zone_span_writelock(zone);
86
87 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
88 if (start_pfn < zone->zone_start_pfn)
89 zone->zone_start_pfn = start_pfn;
90
91 if (end_pfn > old_zone_end_pfn)
92 zone->spanned_pages = end_pfn - zone->zone_start_pfn;
93
94 zone_span_writeunlock(zone);
95}
96
97static void grow_pgdat_span(struct pglist_data *pgdat,
98 unsigned long start_pfn, unsigned long end_pfn)
99{
100 unsigned long old_pgdat_end_pfn =
101 pgdat->node_start_pfn + pgdat->node_spanned_pages;
102
103 if (start_pfn < pgdat->node_start_pfn)
104 pgdat->node_start_pfn = start_pfn;
105
106 if (end_pfn > old_pgdat_end_pfn)
107		pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn;
108}
109
110int online_pages(unsigned long pfn, unsigned long nr_pages)
111{
112 unsigned long i;
113 unsigned long flags;
114 unsigned long onlined_pages = 0;
115 struct zone *zone;
116
117 /*
118 * This doesn't need a lock to do pfn_to_page().
119 * The section can't be removed here because of the
120 * memory_block->state_sem.
121 */
122 zone = page_zone(pfn_to_page(pfn));
123 pgdat_resize_lock(zone->zone_pgdat, &flags);
124 grow_zone_span(zone, pfn, pfn + nr_pages);
125 grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
126 pgdat_resize_unlock(zone->zone_pgdat, &flags);
127
128 for (i = 0; i < nr_pages; i++) {
129 struct page *page = pfn_to_page(pfn + i);
130 online_page(page);
131 onlined_pages++;
132 }
133 zone->present_pages += onlined_pages;
134
135 setup_per_zone_pages_min();
136
137 return 0;
138}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 37af443eb094..5abc57c2b8bd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
2 * Simple NUMA memory policy for the Linux kernel. 2 * Simple NUMA memory policy for the Linux kernel.
3 * 3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs. 4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
5 * Subject to the GNU Public License, version 2. 6 * Subject to the GNU Public License, version 2.
6 * 7 *
7 * NUMA policy allows the user to give hints in which node(s) memory should 8 * NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
17 * offset into the backing object or offset into the mapping 18 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a process counter 19 * for anonymous memory. For process policy a process counter
19 * is used. 20 * is used.
21 *
20 * bind Only allocate memory on a specific set of nodes, 22 * bind Only allocate memory on a specific set of nodes,
21 * no fallback. 23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
22 * preferred Try a specific node first before normal fallback. 28 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation 29 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default, 30 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default 31 * but useful to set in a VMA when you have a non default
26 * process policy. 32 * process policy.
33 *
27 * default Allocate on the local node first, or when on a VMA 34 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did 35 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default. 36 * in a NUMA aware kernel and still does by, ahem, default.
@@ -93,23 +100,10 @@ struct mempolicy default_policy = {
93 .policy = MPOL_DEFAULT, 100 .policy = MPOL_DEFAULT,
94}; 101};
95 102
96/* Check if all specified nodes are online */
97static int nodes_online(unsigned long *nodes)
98{
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
100
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
103 set_bit(0, online2);
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
105 return -EINVAL;
106 return 0;
107}
108
109/* Do sanity checking on a policy */ 103/* Do sanity checking on a policy */
110static int mpol_check_policy(int mode, unsigned long *nodes) 104static int mpol_check_policy(int mode, nodemask_t *nodes)
111{ 105{
112 int empty = bitmap_empty(nodes, MAX_NUMNODES); 106 int empty = nodes_empty(*nodes);
113 107
114 switch (mode) { 108 switch (mode) {
115 case MPOL_DEFAULT: 109 case MPOL_DEFAULT:
@@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes)
124 return -EINVAL; 118 return -EINVAL;
125 break; 119 break;
126 } 120 }
127 return nodes_online(nodes); 121 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
128}
129
130/* Copy a node mask from user space. */
131static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
133{
134 unsigned long k;
135 unsigned long nlongs;
136 unsigned long endmask;
137
138 --maxnode;
139 bitmap_zero(nodes, MAX_NUMNODES);
140 if (maxnode == 0 || !nmask)
141 return 0;
142
143 nlongs = BITS_TO_LONGS(maxnode);
144 if ((maxnode % BITS_PER_LONG) == 0)
145 endmask = ~0UL;
146 else
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
148
149 /* When the user specified more nodes than supported just check
150 if the non supported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
153 return -EINVAL;
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 unsigned long t;
156 if (get_user(t, nmask + k))
157 return -EFAULT;
158 if (k == nlongs - 1) {
159 if (t & endmask)
160 return -EINVAL;
161 } else if (t)
162 return -EINVAL;
163 }
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
165 endmask = ~0UL;
166 }
167
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 return -EFAULT;
170 nodes[nlongs-1] &= endmask;
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
176} 122}
177
178/* Generate a custom zonelist for the BIND policy. */ 123/* Generate a custom zonelist for the BIND policy. */
179static struct zonelist *bind_zonelist(unsigned long *nodes) 124static struct zonelist *bind_zonelist(nodemask_t *nodes)
180{ 125{
181 struct zonelist *zl; 126 struct zonelist *zl;
182 int num, max, nd; 127 int num, max, nd;
183 128
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); 129 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); 130 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
186 if (!zl) 131 if (!zl)
187 return NULL; 132 return NULL;
188 num = 0; 133 num = 0;
189 for (nd = find_first_bit(nodes, MAX_NUMNODES); 134 for_each_node_mask(nd, *nodes) {
190 nd < MAX_NUMNODES;
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
192 int k; 135 int k;
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) { 136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k]; 137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes)
199 policy_zone = k; 142 policy_zone = k;
200 } 143 }
201 } 144 }
202 BUG_ON(num >= max);
203 zl->zones[num] = NULL; 145 zl->zones[num] = NULL;
204 return zl; 146 return zl;
205} 147}
206 148
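The mempolicy conversion from raw unsigned long bitmaps to nodemask_t leans on the accessors from include/linux/nodemask.h used throughout these hunks. A small self-contained sketch (the function itself exists only for illustration):

static int nodemask_example(void)
{
	nodemask_t nodes = NODE_MASK_NONE;
	int nd, count = 0;

	node_set(0, nodes);
	if (nodes_empty(nodes) || !node_isset(0, nodes))
		return -EINVAL;

	for_each_node_mask(nd, nodes)	/* replaces the find_*_bit loops */
		count++;

	return (count == nodes_weight(nodes)) ? first_node(nodes) : -EINVAL;
}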
207/* Create a new policy */ 149/* Create a new policy */
208static struct mempolicy *mpol_new(int mode, unsigned long *nodes) 150static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
209{ 151{
210 struct mempolicy *policy; 152 struct mempolicy *policy;
211 153
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); 154 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
213 if (mode == MPOL_DEFAULT) 155 if (mode == MPOL_DEFAULT)
214 return NULL; 156 return NULL;
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 157 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
218 atomic_set(&policy->refcnt, 1); 160 atomic_set(&policy->refcnt, 1);
219 switch (mode) { 161 switch (mode) {
220 case MPOL_INTERLEAVE: 162 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); 163 policy->v.nodes = *nodes;
222 break; 164 break;
223 case MPOL_PREFERRED: 165 case MPOL_PREFERRED:
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); 166 policy->v.preferred_node = first_node(*nodes);
225 if (policy->v.preferred_node >= MAX_NUMNODES) 167 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1; 168 policy->v.preferred_node = -1;
227 break; 169 break;
@@ -238,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
238} 180}
239 181
240/* Ensure all existing pages follow the policy. */ 182/* Ensure all existing pages follow the policy. */
241static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, 183static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
242 unsigned long addr, unsigned long end, unsigned long *nodes) 184 unsigned long addr, unsigned long end, nodemask_t *nodes)
243{ 185{
244 pte_t *orig_pte; 186 pte_t *orig_pte;
245 pte_t *pte; 187 pte_t *pte;
188 spinlock_t *ptl;
246 189
247 spin_lock(&mm->page_table_lock); 190 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
248 orig_pte = pte = pte_offset_map(pmd, addr);
249 do { 191 do {
250 unsigned long pfn; 192 unsigned long pfn;
251 unsigned int nid; 193 unsigned int nid;
@@ -253,19 +195,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
253 if (!pte_present(*pte)) 195 if (!pte_present(*pte))
254 continue; 196 continue;
255 pfn = pte_pfn(*pte); 197 pfn = pte_pfn(*pte);
256 if (!pfn_valid(pfn)) 198 if (!pfn_valid(pfn)) {
199 print_bad_pte(vma, *pte, addr);
257 continue; 200 continue;
201 }
258 nid = pfn_to_nid(pfn); 202 nid = pfn_to_nid(pfn);
259 if (!test_bit(nid, nodes)) 203 if (!node_isset(nid, *nodes))
260 break; 204 break;
261 } while (pte++, addr += PAGE_SIZE, addr != end); 205 } while (pte++, addr += PAGE_SIZE, addr != end);
262 pte_unmap(orig_pte); 206 pte_unmap_unlock(orig_pte, ptl);
263 spin_unlock(&mm->page_table_lock);
264 return addr != end; 207 return addr != end;
265} 208}
266 209
267static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, 210static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
268 unsigned long addr, unsigned long end, unsigned long *nodes) 211 unsigned long addr, unsigned long end, nodemask_t *nodes)
269{ 212{
270 pmd_t *pmd; 213 pmd_t *pmd;
271 unsigned long next; 214 unsigned long next;
@@ -275,14 +218,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
275 next = pmd_addr_end(addr, end); 218 next = pmd_addr_end(addr, end);
276 if (pmd_none_or_clear_bad(pmd)) 219 if (pmd_none_or_clear_bad(pmd))
277 continue; 220 continue;
278 if (check_pte_range(mm, pmd, addr, next, nodes)) 221 if (check_pte_range(vma, pmd, addr, next, nodes))
279 return -EIO; 222 return -EIO;
280 } while (pmd++, addr = next, addr != end); 223 } while (pmd++, addr = next, addr != end);
281 return 0; 224 return 0;
282} 225}
283 226
284static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, 227static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
285 unsigned long addr, unsigned long end, unsigned long *nodes) 228 unsigned long addr, unsigned long end, nodemask_t *nodes)
286{ 229{
287 pud_t *pud; 230 pud_t *pud;
288 unsigned long next; 231 unsigned long next;
@@ -292,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
292 next = pud_addr_end(addr, end); 235 next = pud_addr_end(addr, end);
293 if (pud_none_or_clear_bad(pud)) 236 if (pud_none_or_clear_bad(pud))
294 continue; 237 continue;
295 if (check_pmd_range(mm, pud, addr, next, nodes)) 238 if (check_pmd_range(vma, pud, addr, next, nodes))
296 return -EIO; 239 return -EIO;
297 } while (pud++, addr = next, addr != end); 240 } while (pud++, addr = next, addr != end);
298 return 0; 241 return 0;
299} 242}
300 243
301static inline int check_pgd_range(struct mm_struct *mm, 244static inline int check_pgd_range(struct vm_area_struct *vma,
302 unsigned long addr, unsigned long end, unsigned long *nodes) 245 unsigned long addr, unsigned long end, nodemask_t *nodes)
303{ 246{
304 pgd_t *pgd; 247 pgd_t *pgd;
305 unsigned long next; 248 unsigned long next;
306 249
307 pgd = pgd_offset(mm, addr); 250 pgd = pgd_offset(vma->vm_mm, addr);
308 do { 251 do {
309 next = pgd_addr_end(addr, end); 252 next = pgd_addr_end(addr, end);
310 if (pgd_none_or_clear_bad(pgd)) 253 if (pgd_none_or_clear_bad(pgd))
311 continue; 254 continue;
312 if (check_pud_range(mm, pgd, addr, next, nodes)) 255 if (check_pud_range(vma, pgd, addr, next, nodes))
313 return -EIO; 256 return -EIO;
314 } while (pgd++, addr = next, addr != end); 257 } while (pgd++, addr = next, addr != end);
315 return 0; 258 return 0;
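The check_pte/pmd/pud/pgd_range() helpers above follow the kernel's standard four-level walk: each level clamps its sub-range with the matching *_addr_end() helper and descends, so only the leaf loop ever touches PTEs. A rough user-space sketch of the same clamping idiom, using a hypothetical two-level table and made-up shift constants rather than the real paging macros:

#include <stdio.h>

/* Illustrative constants only: 4 KiB pages, 512 entries per table level. */
#define PAGE_SHIFT   12
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define PMD_SHIFT    (PAGE_SHIFT + 9)
#define PMD_SIZE     (1UL << PMD_SHIFT)
#define PMD_MASK     (~(PMD_SIZE - 1))

/* Clamp the end of this level's sub-range, in the style of pmd_addr_end(). */
static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
{
    unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;
    return (boundary - 1 < end - 1) ? boundary : end;
}

static void walk_pte_level(unsigned long addr, unsigned long end)
{
    do {
        printf("  pte for %#lx\n", addr);
    } while (addr += PAGE_SIZE, addr != end);
}

static void walk_pmd_level(unsigned long addr, unsigned long end)
{
    unsigned long next;

    do {
        next = pmd_addr_end(addr, end);
        printf("pmd covering %#lx-%#lx\n", addr, next);
        walk_pte_level(addr, next);
    } while (addr = next, addr != end);
}

int main(void)
{
    /* Walk a deliberately non-PMD-aligned range. */
    walk_pmd_level(0x1ff000, 0x203000);
    return 0;
}

Walking the misaligned range shows how the first and last chunks are trimmed to the PMD boundaries, which is exactly what lets the real walkers recurse without ever crossing a table entry's span.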
@@ -318,7 +261,7 @@ static inline int check_pgd_range(struct mm_struct *mm,
318/* Step 1: check the range */ 261/* Step 1: check the range */
319static struct vm_area_struct * 262static struct vm_area_struct *
320check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 263check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
321 unsigned long *nodes, unsigned long flags) 264 nodemask_t *nodes, unsigned long flags)
322{ 265{
323 int err; 266 int err;
324 struct vm_area_struct *first, *vma, *prev; 267 struct vm_area_struct *first, *vma, *prev;
@@ -326,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
326 first = find_vma(mm, start); 269 first = find_vma(mm, start);
327 if (!first) 270 if (!first)
328 return ERR_PTR(-EFAULT); 271 return ERR_PTR(-EFAULT);
272 if (first->vm_flags & VM_RESERVED)
273 return ERR_PTR(-EACCES);
329 prev = NULL; 274 prev = NULL;
330 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 275 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
331 if (!vma->vm_next && vma->vm_end < end) 276 if (!vma->vm_next && vma->vm_end < end)
@@ -338,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
338 endvma = end; 283 endvma = end;
339 if (vma->vm_start > start) 284 if (vma->vm_start > start)
340 start = vma->vm_start; 285 start = vma->vm_start;
341 err = check_pgd_range(vma->vm_mm, 286 err = check_pgd_range(vma, start, endvma, nodes);
342 start, endvma, nodes);
343 if (err) { 287 if (err) {
344 first = ERR_PTR(err); 288 first = ERR_PTR(err);
345 break; 289 break;
@@ -393,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
393 return err; 337 return err;
394} 338}
395 339
396/* Change policy for a memory range */ 340static int contextualize_policy(int mode, nodemask_t *nodes)
397asmlinkage long sys_mbind(unsigned long start, unsigned long len, 341{
398 unsigned long mode, 342 if (!nodes)
399 unsigned long __user *nmask, unsigned long maxnode, 343 return 0;
400 unsigned flags) 344
345 /* Update current mems_allowed */
346 cpuset_update_current_mems_allowed();
347 /* Ignore nodes not set in current->mems_allowed */
348 cpuset_restrict_to_mems_allowed(nodes->bits);
349 return mpol_check_policy(mode, nodes);
350}
351
352long do_mbind(unsigned long start, unsigned long len,
353 unsigned long mode, nodemask_t *nmask, unsigned long flags)
401{ 354{
402 struct vm_area_struct *vma; 355 struct vm_area_struct *vma;
403 struct mm_struct *mm = current->mm; 356 struct mm_struct *mm = current->mm;
404 struct mempolicy *new; 357 struct mempolicy *new;
405 unsigned long end; 358 unsigned long end;
406 DECLARE_BITMAP(nodes, MAX_NUMNODES);
407 int err; 359 int err;
408 360
409 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) 361 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -418,20 +370,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
418 return -EINVAL; 370 return -EINVAL;
419 if (end == start) 371 if (end == start)
420 return 0; 372 return 0;
421 373 if (mpol_check_policy(mode, nmask))
422 err = get_nodes(nodes, nmask, maxnode, mode); 374 return -EINVAL;
423 if (err) 375 new = mpol_new(mode, nmask);
424 return err;
425
426 new = mpol_new(mode, nodes);
427 if (IS_ERR(new)) 376 if (IS_ERR(new))
428 return PTR_ERR(new); 377 return PTR_ERR(new);
429 378
430 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, 379 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
431 		mode,nodes[0]); 380 		mode,nodes_addr(*nmask)[0]);
432 381
433 down_write(&mm->mmap_sem); 382 down_write(&mm->mmap_sem);
434 vma = check_range(mm, start, end, nodes, flags); 383 vma = check_range(mm, start, end, nmask, flags);
435 err = PTR_ERR(vma); 384 err = PTR_ERR(vma);
436 if (!IS_ERR(vma)) 385 if (!IS_ERR(vma))
437 err = mbind_range(vma, start, end, new); 386 err = mbind_range(vma, start, end, new);
@@ -441,50 +390,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
441} 390}
442 391
443/* Set the process memory policy */ 392/* Set the process memory policy */
444asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 393long do_set_mempolicy(int mode, nodemask_t *nodes)
445 unsigned long maxnode)
446{ 394{
447 int err;
448 struct mempolicy *new; 395 struct mempolicy *new;
449 DECLARE_BITMAP(nodes, MAX_NUMNODES);
450 396
451 if (mode < 0 || mode > MPOL_MAX) 397 if (contextualize_policy(mode, nodes))
452 return -EINVAL; 398 return -EINVAL;
453 err = get_nodes(nodes, nmask, maxnode, mode);
454 if (err)
455 return err;
456 new = mpol_new(mode, nodes); 399 new = mpol_new(mode, nodes);
457 if (IS_ERR(new)) 400 if (IS_ERR(new))
458 return PTR_ERR(new); 401 return PTR_ERR(new);
459 mpol_free(current->mempolicy); 402 mpol_free(current->mempolicy);
460 current->mempolicy = new; 403 current->mempolicy = new;
461 if (new && new->policy == MPOL_INTERLEAVE) 404 if (new && new->policy == MPOL_INTERLEAVE)
462 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); 405 current->il_next = first_node(new->v.nodes);
463 return 0; 406 return 0;
464} 407}
465 408
466/* Fill a zone bitmap for a policy */ 409/* Fill a zone bitmap for a policy */
467static void get_zonemask(struct mempolicy *p, unsigned long *nodes) 410static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
468{ 411{
469 int i; 412 int i;
470 413
471 bitmap_zero(nodes, MAX_NUMNODES); 414 nodes_clear(*nodes);
472 switch (p->policy) { 415 switch (p->policy) {
473 case MPOL_BIND: 416 case MPOL_BIND:
474 for (i = 0; p->v.zonelist->zones[i]; i++) 417 for (i = 0; p->v.zonelist->zones[i]; i++)
475 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); 418 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
419 *nodes);
476 break; 420 break;
477 case MPOL_DEFAULT: 421 case MPOL_DEFAULT:
478 break; 422 break;
479 case MPOL_INTERLEAVE: 423 case MPOL_INTERLEAVE:
480 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); 424 *nodes = p->v.nodes;
481 break; 425 break;
482 case MPOL_PREFERRED: 426 case MPOL_PREFERRED:
483 /* or use current node instead of online map? */ 427 /* or use current node instead of online map? */
484 if (p->v.preferred_node < 0) 428 if (p->v.preferred_node < 0)
485 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); 429 *nodes = node_online_map;
486 else 430 else
487 __set_bit(p->v.preferred_node, nodes); 431 node_set(p->v.preferred_node, *nodes);
488 break; 432 break;
489 default: 433 default:
490 BUG(); 434 BUG();
@@ -504,37 +448,18 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
504 return err; 448 return err;
505} 449}
506 450
507/* Copy a kernel node mask to user space */
508static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
509 void *nodes, unsigned nbytes)
510{
511 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
512
513 if (copy > nbytes) {
514 if (copy > PAGE_SIZE)
515 return -EINVAL;
516 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
517 return -EFAULT;
518 copy = nbytes;
519 }
520 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
521}
522
523/* Retrieve NUMA policy */ 451/* Retrieve NUMA policy */
524asmlinkage long sys_get_mempolicy(int __user *policy, 452long do_get_mempolicy(int *policy, nodemask_t *nmask,
525 unsigned long __user *nmask, 453 unsigned long addr, unsigned long flags)
526 unsigned long maxnode,
527 unsigned long addr, unsigned long flags)
528{ 454{
529 int err, pval; 455 int err;
530 struct mm_struct *mm = current->mm; 456 struct mm_struct *mm = current->mm;
531 struct vm_area_struct *vma = NULL; 457 struct vm_area_struct *vma = NULL;
532 struct mempolicy *pol = current->mempolicy; 458 struct mempolicy *pol = current->mempolicy;
533 459
460 cpuset_update_current_mems_allowed();
534 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 461 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
535 return -EINVAL; 462 return -EINVAL;
536 if (nmask != NULL && maxnode < MAX_NUMNODES)
537 return -EINVAL;
538 if (flags & MPOL_F_ADDR) { 463 if (flags & MPOL_F_ADDR) {
539 down_read(&mm->mmap_sem); 464 down_read(&mm->mmap_sem);
540 vma = find_vma_intersection(mm, addr, addr+1); 465 vma = find_vma_intersection(mm, addr, addr+1);
@@ -557,31 +482,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
557 err = lookup_node(mm, addr); 482 err = lookup_node(mm, addr);
558 if (err < 0) 483 if (err < 0)
559 goto out; 484 goto out;
560 pval = err; 485 *policy = err;
561 } else if (pol == current->mempolicy && 486 } else if (pol == current->mempolicy &&
562 pol->policy == MPOL_INTERLEAVE) { 487 pol->policy == MPOL_INTERLEAVE) {
563 pval = current->il_next; 488 *policy = current->il_next;
564 } else { 489 } else {
565 err = -EINVAL; 490 err = -EINVAL;
566 goto out; 491 goto out;
567 } 492 }
568 } else 493 } else
569 pval = pol->policy; 494 *policy = pol->policy;
570 495
571 if (vma) { 496 if (vma) {
572 up_read(&current->mm->mmap_sem); 497 up_read(&current->mm->mmap_sem);
573 vma = NULL; 498 vma = NULL;
574 } 499 }
575 500
576 if (policy && put_user(pval, policy))
577 return -EFAULT;
578
579 err = 0; 501 err = 0;
580 if (nmask) { 502 if (nmask)
581 DECLARE_BITMAP(nodes, MAX_NUMNODES); 503 get_zonemask(pol, nmask);
582 get_zonemask(pol, nodes);
583 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
584 }
585 504
586 out: 505 out:
587 if (vma) 506 if (vma)
@@ -589,6 +508,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
589 return err; 508 return err;
590} 509}
591 510
511/*
512 * User space interface with variable sized bitmaps for nodelists.
513 */
514
515/* Copy a node mask from user space. */
516static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
517 unsigned long maxnode)
518{
519 unsigned long k;
520 unsigned long nlongs;
521 unsigned long endmask;
522
523 --maxnode;
524 nodes_clear(*nodes);
525 if (maxnode == 0 || !nmask)
526 return 0;
527
528 nlongs = BITS_TO_LONGS(maxnode);
529 if ((maxnode % BITS_PER_LONG) == 0)
530 endmask = ~0UL;
531 else
532 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
533
534 /* When the user specified more nodes than supported just check
535 if the non supported part is all zero. */
536 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
537 if (nlongs > PAGE_SIZE/sizeof(long))
538 return -EINVAL;
539 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
540 unsigned long t;
541 if (get_user(t, nmask + k))
542 return -EFAULT;
543 if (k == nlongs - 1) {
544 if (t & endmask)
545 return -EINVAL;
546 } else if (t)
547 return -EINVAL;
548 }
549 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
550 endmask = ~0UL;
551 }
552
553 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
554 return -EFAULT;
555 nodes_addr(*nodes)[nlongs-1] &= endmask;
556 return 0;
557}
558
559/* Copy a kernel node mask to user space */
560static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
561 nodemask_t *nodes)
562{
563 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
564 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
565
566 if (copy > nbytes) {
567 if (copy > PAGE_SIZE)
568 return -EINVAL;
569 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
570 return -EFAULT;
571 copy = nbytes;
572 }
573 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
574}
575
576asmlinkage long sys_mbind(unsigned long start, unsigned long len,
577 unsigned long mode,
578 unsigned long __user *nmask, unsigned long maxnode,
579 unsigned flags)
580{
581 nodemask_t nodes;
582 int err;
583
584 err = get_nodes(&nodes, nmask, maxnode);
585 if (err)
586 return err;
587 return do_mbind(start, len, mode, &nodes, flags);
588}
589
590/* Set the process memory policy */
591asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
592 unsigned long maxnode)
593{
594 int err;
595 nodemask_t nodes;
596
597 if (mode < 0 || mode > MPOL_MAX)
598 return -EINVAL;
599 err = get_nodes(&nodes, nmask, maxnode);
600 if (err)
601 return err;
602 return do_set_mempolicy(mode, &nodes);
603}
604
605/* Retrieve NUMA policy */
606asmlinkage long sys_get_mempolicy(int __user *policy,
607 unsigned long __user *nmask,
608 unsigned long maxnode,
609 unsigned long addr, unsigned long flags)
610{
611 int err, pval;
612 nodemask_t nodes;
613
614 if (nmask != NULL && maxnode < MAX_NUMNODES)
615 return -EINVAL;
616
617 err = do_get_mempolicy(&pval, &nodes, addr, flags);
618
619 if (err)
620 return err;
621
622 if (policy && put_user(pval, policy))
623 return -EFAULT;
624
625 if (nmask)
626 err = copy_nodes_to_user(nmask, maxnode, &nodes);
627
628 return err;
629}
630
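With do_mbind(), do_set_mempolicy() and do_get_mempolicy() factored out, the three syscalls above are reduced to converting the user's variable-width node bitmap into a fixed-size nodemask_t. A user-space approximation of the get_nodes() checks (illustrative names only; copy_from_user() is stood in for by memcpy(), and the PAGE_SIZE cap by an arbitrary constant):

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG    (8 * (int)sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define MAX_NODES        64                  /* stand-in for MAX_NUMNODES */
#define MAX_USER_LONGS   512                 /* stand-in for PAGE_SIZE/sizeof(long) */

/* Accept a caller-sized node bitmap of 'maxnode' bits: bits the kernel does
 * not support must be clear, and the partial tail word is masked off. */
static int copy_node_mask(unsigned long *dst, const unsigned long *src,
                          unsigned long maxnode)
{
    unsigned long nlongs, endmask, k;

    memset(dst, 0, BITS_TO_LONGS(MAX_NODES) * sizeof(*dst));
    if (maxnode <= 1 || !src)
        return 0;
    --maxnode;                               /* number of meaningful bits */

    nlongs = BITS_TO_LONGS(maxnode);
    endmask = (maxnode % BITS_PER_LONG) ?
              (1UL << (maxnode % BITS_PER_LONG)) - 1 : ~0UL;

    if (nlongs > BITS_TO_LONGS(MAX_NODES)) {
        if (nlongs > MAX_USER_LONGS)
            return -EINVAL;
        /* Excess words must be all zero, or the request is rejected. */
        for (k = BITS_TO_LONGS(MAX_NODES); k < nlongs; k++) {
            unsigned long t = src[k];

            if (k == nlongs - 1)
                t &= endmask;
            if (t)
                return -EINVAL;
        }
        nlongs = BITS_TO_LONGS(MAX_NODES);
        endmask = ~0UL;
    }

    memcpy(dst, src, nlongs * sizeof(*dst));
    dst[nlongs - 1] &= endmask;
    return 0;
}

int main(void)
{
    unsigned long user[2] = { 0x5, 0 };      /* nodes 0 and 2 requested */
    unsigned long mask[BITS_TO_LONGS(MAX_NODES)];

    printf("ret=%d mask=%#lx\n",
           copy_node_mask(mask, user, 2 * BITS_PER_LONG), mask[0]);
    return 0;
}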
592#ifdef CONFIG_COMPAT 631#ifdef CONFIG_COMPAT
593 632
594asmlinkage long compat_sys_get_mempolicy(int __user *policy, 633asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -649,15 +688,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
649 long err = 0; 688 long err = 0;
650 unsigned long __user *nm = NULL; 689 unsigned long __user *nm = NULL;
651 unsigned long nr_bits, alloc_size; 690 unsigned long nr_bits, alloc_size;
652 DECLARE_BITMAP(bm, MAX_NUMNODES); 691 nodemask_t bm;
653 692
654 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 693 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
655 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 694 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
656 695
657 if (nmask) { 696 if (nmask) {
658 err = compat_get_bitmap(bm, nmask, nr_bits); 697 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
659 nm = compat_alloc_user_space(alloc_size); 698 nm = compat_alloc_user_space(alloc_size);
660 err |= copy_to_user(nm, bm, alloc_size); 699 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
661 } 700 }
662 701
663 if (err) 702 if (err)
@@ -676,7 +715,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
676 715
677 if (vma) { 716 if (vma) {
678 if (vma->vm_ops && vma->vm_ops->get_policy) 717 if (vma->vm_ops && vma->vm_ops->get_policy)
679 pol = vma->vm_ops->get_policy(vma, addr); 718 pol = vma->vm_ops->get_policy(vma, addr);
680 else if (vma->vm_policy && 719 else if (vma->vm_policy &&
681 vma->vm_policy->policy != MPOL_DEFAULT) 720 vma->vm_policy->policy != MPOL_DEFAULT)
682 pol = vma->vm_policy; 721 pol = vma->vm_policy;
@@ -700,7 +739,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
700 case MPOL_BIND: 739 case MPOL_BIND:
701 /* Lower zones don't get a policy applied */ 740 /* Lower zones don't get a policy applied */
702 /* Careful: current->mems_allowed might have moved */ 741 /* Careful: current->mems_allowed might have moved */
703 if ((gfp & GFP_ZONEMASK) >= policy_zone) 742 if (gfp_zone(gfp) >= policy_zone)
704 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) 743 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
705 return policy->v.zonelist; 744 return policy->v.zonelist;
706 /*FALL THROUGH*/ 745 /*FALL THROUGH*/
@@ -712,7 +751,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
712 nd = 0; 751 nd = 0;
713 BUG(); 752 BUG();
714 } 753 }
715 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK); 754 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
716} 755}
717 756
718/* Do dynamic interleaving for a process */ 757/* Do dynamic interleaving for a process */
@@ -722,10 +761,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
722 struct task_struct *me = current; 761 struct task_struct *me = current;
723 762
724 nid = me->il_next; 763 nid = me->il_next;
725 BUG_ON(nid >= MAX_NUMNODES); 764 next = next_node(nid, policy->v.nodes);
726 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
727 if (next >= MAX_NUMNODES) 765 if (next >= MAX_NUMNODES)
728 next = find_first_bit(policy->v.nodes, MAX_NUMNODES); 766 next = first_node(policy->v.nodes);
729 me->il_next = next; 767 me->il_next = next;
730 return nid; 768 return nid;
731} 769}
@@ -734,30 +772,28 @@ static unsigned interleave_nodes(struct mempolicy *policy)
734static unsigned offset_il_node(struct mempolicy *pol, 772static unsigned offset_il_node(struct mempolicy *pol,
735 struct vm_area_struct *vma, unsigned long off) 773 struct vm_area_struct *vma, unsigned long off)
736{ 774{
737 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); 775 unsigned nnodes = nodes_weight(pol->v.nodes);
738 unsigned target = (unsigned)off % nnodes; 776 unsigned target = (unsigned)off % nnodes;
739 int c; 777 int c;
740 int nid = -1; 778 int nid = -1;
741 779
742 c = 0; 780 c = 0;
743 do { 781 do {
744 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); 782 nid = next_node(nid, pol->v.nodes);
745 c++; 783 c++;
746 } while (c <= target); 784 } while (c <= target);
747 BUG_ON(nid >= MAX_NUMNODES);
748 BUG_ON(!test_bit(nid, pol->v.nodes));
749 return nid; 785 return nid;
750} 786}
751 787
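interleave_nodes() above picks the next allowed node round-robin, wrapping with first_node(), while offset_il_node() picks the node whose rank in the mask equals the page offset modulo the mask's weight, so a given page offset always maps to the same node. Both selections reduce to bit scanning; a small user-space sketch over a plain 64-bit mask (hypothetical helpers, not the nodemask API):

#include <stdio.h>

#define MAX_NODES 64

/* Next set bit strictly after 'nid', or MAX_NODES if there is none. */
static int next_node_in_mask(int nid, unsigned long mask)
{
    for (int i = nid + 1; i < MAX_NODES; i++)
        if (mask & (1UL << i))
            return i;
    return MAX_NODES;
}

static int first_node_in_mask(unsigned long mask)
{
    return next_node_in_mask(-1, mask);
}

/* Round-robin choice: advance 'il_next' and wrap, like interleave_nodes(). */
static int interleave_pick(int *il_next, unsigned long mask)
{
    int nid = *il_next;
    int next = next_node_in_mask(nid, mask);

    if (next >= MAX_NODES)
        next = first_node_in_mask(mask);
    *il_next = next;
    return nid;
}

/* Offset-based choice: the (off % weight)-th set bit, like offset_il_node(). */
static int offset_pick(unsigned long mask, unsigned long off)
{
    unsigned target = off % __builtin_popcountl(mask);
    int nid = -1;

    for (unsigned c = 0; c <= target; c++)
        nid = next_node_in_mask(nid, mask);
    return nid;
}

int main(void)
{
    unsigned long mask = 0x2c;               /* nodes 2, 3 and 5 */
    int il_next = first_node_in_mask(mask);

    for (int i = 0; i < 5; i++)
        printf("round-robin -> node %d\n", interleave_pick(&il_next, mask));
    for (unsigned long off = 0; off < 5; off++)
        printf("offset %lu -> node %d\n", off, offset_pick(mask, off));
    return 0;
}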
752/* Allocate a page in interleaved policy. 788/* Allocate a page in interleaved policy.
753 Own path because it needs to do special accounting. */ 789 Own path because it needs to do special accounting. */
754static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) 790static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
791 unsigned nid)
755{ 792{
756 struct zonelist *zl; 793 struct zonelist *zl;
757 struct page *page; 794 struct page *page;
758 795
759 BUG_ON(!node_online(nid)); 796 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
760 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
761 page = __alloc_pages(gfp, order, zl); 797 page = __alloc_pages(gfp, order, zl);
762 if (page && page_zone(page) == zl->zones[0]) { 798 if (page && page_zone(page) == zl->zones[0]) {
763 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; 799 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
@@ -799,8 +835,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
799 unsigned nid; 835 unsigned nid;
800 if (vma) { 836 if (vma) {
801 unsigned long off; 837 unsigned long off;
802 BUG_ON(addr >= vma->vm_end);
803 BUG_ON(addr < vma->vm_start);
804 off = vma->vm_pgoff; 838 off = vma->vm_pgoff;
805 off += (addr - vma->vm_start) >> PAGE_SHIFT; 839 off += (addr - vma->vm_start) >> PAGE_SHIFT;
806 nid = offset_il_node(pol, vma, off); 840 nid = offset_il_node(pol, vma, off);
@@ -878,7 +912,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
878 case MPOL_DEFAULT: 912 case MPOL_DEFAULT:
879 return 1; 913 return 1;
880 case MPOL_INTERLEAVE: 914 case MPOL_INTERLEAVE:
881 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); 915 return nodes_equal(a->v.nodes, b->v.nodes);
882 case MPOL_PREFERRED: 916 case MPOL_PREFERRED:
883 return a->v.preferred_node == b->v.preferred_node; 917 return a->v.preferred_node == b->v.preferred_node;
884 case MPOL_BIND: { 918 case MPOL_BIND: {
@@ -1117,7 +1151,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
1117 PDprintk("set_shared_policy %lx sz %lu %d %lx\n", 1151 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1118 vma->vm_pgoff, 1152 vma->vm_pgoff,
1119 sz, npol? npol->policy : -1, 1153 sz, npol? npol->policy : -1,
1120 npol ? npol->v.nodes[0] : -1); 1154 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1121 1155
1122 if (npol) { 1156 if (npol) {
1123 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 1157 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1164,14 +1198,75 @@ void __init numa_policy_init(void)
1164 /* Set interleaving policy for system init. This way not all 1198 /* Set interleaving policy for system init. This way not all
1165 the data structures allocated at system boot end up in node zero. */ 1199 the data structures allocated at system boot end up in node zero. */
1166 1200
1167 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), 1201 if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1168 MAX_NUMNODES) < 0)
1169 printk("numa_policy_init: interleaving failed\n"); 1202 printk("numa_policy_init: interleaving failed\n");
1170} 1203}
1171 1204
1172/* Reset policy of current process to default. 1205/* Reset policy of current process to default */
1173 * Assumes fs == KERNEL_DS */
1174void numa_default_policy(void) 1206void numa_default_policy(void)
1175{ 1207{
1176 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); 1208 do_set_mempolicy(MPOL_DEFAULT, NULL);
1209}
1210
1211/* Migrate a policy to a different set of nodes */
1212static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1213 const nodemask_t *new)
1214{
1215 nodemask_t tmp;
1216
1217 if (!pol)
1218 return;
1219
1220 switch (pol->policy) {
1221 case MPOL_DEFAULT:
1222 break;
1223 case MPOL_INTERLEAVE:
1224 nodes_remap(tmp, pol->v.nodes, *old, *new);
1225 pol->v.nodes = tmp;
1226 current->il_next = node_remap(current->il_next, *old, *new);
1227 break;
1228 case MPOL_PREFERRED:
1229 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1230 *old, *new);
1231 break;
1232 case MPOL_BIND: {
1233 nodemask_t nodes;
1234 struct zone **z;
1235 struct zonelist *zonelist;
1236
1237 nodes_clear(nodes);
1238 for (z = pol->v.zonelist->zones; *z; z++)
1239 node_set((*z)->zone_pgdat->node_id, nodes);
1240 nodes_remap(tmp, nodes, *old, *new);
1241 nodes = tmp;
1242
1243 zonelist = bind_zonelist(&nodes);
1244
1245 /* If no mem, then zonelist is NULL and we keep old zonelist.
1246 * If that old zonelist has no remaining mems_allowed nodes,
1247 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1248 */
1249
1250 if (zonelist) {
1251 /* Good - got mem - substitute new zonelist */
1252 kfree(pol->v.zonelist);
1253 pol->v.zonelist = zonelist;
1254 }
1255 break;
1256 }
1257 default:
1258 BUG();
1259 break;
1260 }
1261}
1262
1263/*
1264 * Someone moved this task to different nodes. Fixup mempolicies.
1265 *
1266 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1267 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1268 */
1269void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1270{
1271 rebind_policy(current->mempolicy, old, new);
1177} 1272}
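rebind_policy() above relies on node_remap()/nodes_remap() semantics: a node's position in the old mask (its rank among the set bits) decides which node of the new mask it becomes. A user-space sketch of that rank-preserving translation, using hypothetical helpers on a single 64-bit mask (the kernel additionally wraps the rank modulo the new mask's weight):

#include <stdio.h>

#define MAX_NODES 64

/* Rank of 'node' among the set bits of 'mask' (0 for the lowest set bit). */
static int node_rank(unsigned long mask, int node)
{
    return __builtin_popcountl(mask & ((1UL << node) - 1));
}

/* The set bit of 'mask' with the given rank, or -1 if out of range. */
static int node_at_rank(unsigned long mask, int rank)
{
    for (int i = 0; i < MAX_NODES; i++)
        if ((mask & (1UL << i)) && rank-- == 0)
            return i;
    return -1;
}

/* node_remap()-style translation: preserve the node's rank across masks.
 * Nodes not in the old set stay as they are; a too-small new mask yields -1
 * here, where the kernel would wrap the rank instead. */
static int remap_node(int node, unsigned long old, unsigned long new)
{
    if (!(old & (1UL << node)))
        return node;
    return node_at_rank(new, node_rank(old, node));
}

int main(void)
{
    unsigned long old = 0x0f;                /* nodes 0-3 */
    unsigned long new = 0xf0;                /* nodes 4-7 */

    for (int n = 0; n < 4; n++)
        printf("node %d -> node %d\n", n, remap_node(n, old, new));
    return 0;
}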
diff --git a/mm/mempool.c b/mm/mempool.c
index 9e377ea700b2..1a99b80480d3 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -205,7 +205,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
205 void *element; 205 void *element;
206 unsigned long flags; 206 unsigned long flags;
207 wait_queue_t wait; 207 wait_queue_t wait;
208 unsigned int gfp_temp; 208 gfp_t gfp_temp;
209 209
210 might_sleep_if(gfp_mask & __GFP_WAIT); 210 might_sleep_if(gfp_mask & __GFP_WAIT);
211 211
diff --git a/mm/mmap.c b/mm/mmap.c
index fa11d91242e8..320dda1778c3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -181,26 +181,36 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
181} 181}
182 182
183/* 183/*
184 * Remove one vm structure and free it. 184 * Unlink a file-based vm structure from its prio_tree, to hide
185 * vma from rmap and vmtruncate before freeing its page tables.
185 */ 186 */
186static void remove_vm_struct(struct vm_area_struct *vma) 187void unlink_file_vma(struct vm_area_struct *vma)
187{ 188{
188 struct file *file = vma->vm_file; 189 struct file *file = vma->vm_file;
189 190
190 might_sleep();
191 if (file) { 191 if (file) {
192 struct address_space *mapping = file->f_mapping; 192 struct address_space *mapping = file->f_mapping;
193 spin_lock(&mapping->i_mmap_lock); 193 spin_lock(&mapping->i_mmap_lock);
194 __remove_shared_vm_struct(vma, file, mapping); 194 __remove_shared_vm_struct(vma, file, mapping);
195 spin_unlock(&mapping->i_mmap_lock); 195 spin_unlock(&mapping->i_mmap_lock);
196 } 196 }
197}
198
199/*
200 * Close a vm structure and free it, returning the next.
201 */
202static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
203{
204 struct vm_area_struct *next = vma->vm_next;
205
206 might_sleep();
197 if (vma->vm_ops && vma->vm_ops->close) 207 if (vma->vm_ops && vma->vm_ops->close)
198 vma->vm_ops->close(vma); 208 vma->vm_ops->close(vma);
199 if (file) 209 if (vma->vm_file)
200 fput(file); 210 fput(vma->vm_file);
201 anon_vma_unlink(vma);
202 mpol_free(vma_policy(vma)); 211 mpol_free(vma_policy(vma));
203 kmem_cache_free(vm_area_cachep, vma); 212 kmem_cache_free(vm_area_cachep, vma);
213 return next;
204} 214}
205 215
206asmlinkage unsigned long sys_brk(unsigned long brk) 216asmlinkage unsigned long sys_brk(unsigned long brk)
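remove_vma() hands back the next VMA, so remove_vma_list() and exit_mmap() further down in this patch can walk the whole mapping list as a single "while (vma) vma = remove_vma(vma);" loop. The same destroy-and-return-next idiom on an ordinary singly linked list, as a small user-space sketch (hypothetical struct, not vm_area_struct):

#include <stdio.h>
#include <stdlib.h>

struct area {
    unsigned long start, end;
    struct area *next;
};

/* Tear down one node and hand back its successor, so the caller can write
 * the whole walk as: while (a) a = remove_area(a); */
static struct area *remove_area(struct area *a)
{
    struct area *next = a->next;

    printf("removing [%#lx, %#lx)\n", a->start, a->end);
    free(a);
    return next;
}

static struct area *new_area(unsigned long start, unsigned long end,
                             struct area *next)
{
    struct area *a = malloc(sizeof(*a));

    a->start = start;
    a->end = end;
    a->next = next;
    return a;
}

int main(void)
{
    struct area *list = new_area(0x1000, 0x2000,
                        new_area(0x4000, 0x8000,
                        new_area(0x9000, 0xa000, NULL)));

    while (list)
        list = remove_area(list);
    return 0;
}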
@@ -832,7 +842,7 @@ none:
832} 842}
833 843
834#ifdef CONFIG_PROC_FS 844#ifdef CONFIG_PROC_FS
835void __vm_stat_account(struct mm_struct *mm, unsigned long flags, 845void vm_stat_account(struct mm_struct *mm, unsigned long flags,
836 struct file *file, long pages) 846 struct file *file, long pages)
837{ 847{
838 const unsigned long stack_flags 848 const unsigned long stack_flags
@@ -1070,6 +1080,17 @@ munmap_back:
1070 error = file->f_op->mmap(file, vma); 1080 error = file->f_op->mmap(file, vma);
1071 if (error) 1081 if (error)
1072 goto unmap_and_free_vma; 1082 goto unmap_and_free_vma;
1083 if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
1084 == (VM_WRITE | VM_RESERVED)) {
1085 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
1086 "PROT_WRITE mmap of VM_RESERVED memory, which "
1087 "is deprecated. Please report this to "
1088 "linux-kernel@vger.kernel.org\n",current->comm);
1089 if (vma->vm_ops && vma->vm_ops->close)
1090 vma->vm_ops->close(vma);
1091 error = -EACCES;
1092 goto unmap_and_free_vma;
1093 }
1073 } else if (vm_flags & VM_SHARED) { 1094 } else if (vm_flags & VM_SHARED) {
1074 error = shmem_zero_setup(vma); 1095 error = shmem_zero_setup(vma);
1075 if (error) 1096 if (error)
@@ -1110,7 +1131,7 @@ munmap_back:
1110 } 1131 }
1111out: 1132out:
1112 mm->total_vm += len >> PAGE_SHIFT; 1133 mm->total_vm += len >> PAGE_SHIFT;
1113 __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1134 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1114 if (vm_flags & VM_LOCKED) { 1135 if (vm_flags & VM_LOCKED) {
1115 mm->locked_vm += len >> PAGE_SHIFT; 1136 mm->locked_vm += len >> PAGE_SHIFT;
1116 make_pages_present(addr, addr + len); 1137 make_pages_present(addr, addr + len);
@@ -1475,15 +1496,19 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
1475 mm->total_vm += grow; 1496 mm->total_vm += grow;
1476 if (vma->vm_flags & VM_LOCKED) 1497 if (vma->vm_flags & VM_LOCKED)
1477 mm->locked_vm += grow; 1498 mm->locked_vm += grow;
1478 __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
1479 return 0; 1500 return 0;
1480} 1501}
1481 1502
1482#ifdef CONFIG_STACK_GROWSUP 1503#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
1483/* 1504/*
1484 * vma is the first one with address > vma->vm_end. Have to extend vma. 1505 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
1506 * vma is the last one with address > vma->vm_end. Have to extend vma.
1485 */ 1507 */
1486int expand_stack(struct vm_area_struct * vma, unsigned long address) 1508#ifdef CONFIG_STACK_GROWSUP
1509static inline
1510#endif
1511int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1487{ 1512{
1488 int error; 1513 int error;
1489 1514
@@ -1521,6 +1546,13 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address)
1521 anon_vma_unlock(vma); 1546 anon_vma_unlock(vma);
1522 return error; 1547 return error;
1523} 1548}
1549#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
1550
1551#ifdef CONFIG_STACK_GROWSUP
1552int expand_stack(struct vm_area_struct *vma, unsigned long address)
1553{
1554 return expand_upwards(vma, address);
1555}
1524 1556
1525struct vm_area_struct * 1557struct vm_area_struct *
1526find_extend_vma(struct mm_struct *mm, unsigned long addr) 1558find_extend_vma(struct mm_struct *mm, unsigned long addr)
@@ -1603,36 +1635,24 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1603} 1635}
1604#endif 1636#endif
1605 1637
1606/* Normal function to fix up a mapping
1607 * This function is the default for when an area has no specific
1608 * function. This may be used as part of a more specific routine.
1609 *
1610 * By the time this function is called, the area struct has been
1611 * removed from the process mapping list.
1612 */
1613static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
1614{
1615 size_t len = area->vm_end - area->vm_start;
1616
1617 area->vm_mm->total_vm -= len >> PAGE_SHIFT;
1618 if (area->vm_flags & VM_LOCKED)
1619 area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
1620 vm_stat_unaccount(area);
1621 remove_vm_struct(area);
1622}
1623
1624/* 1638/*
1625 * Update the VMA and inode share lists. 1639 * Ok - we have the memory areas we should free on the vma list,
1626 *
1627 * Ok - we have the memory areas we should free on the 'free' list,
1628 * so release them, and do the vma updates. 1640 * so release them, and do the vma updates.
1641 *
1642 * Called with the mm semaphore held.
1629 */ 1643 */
1630static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 1644static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1631{ 1645{
1646 /* Update high watermark before we lower total_vm */
1647 update_hiwater_vm(mm);
1632 do { 1648 do {
1633 struct vm_area_struct *next = vma->vm_next; 1649 long nrpages = vma_pages(vma);
1634 unmap_vma(mm, vma); 1650
1635 vma = next; 1651 mm->total_vm -= nrpages;
1652 if (vma->vm_flags & VM_LOCKED)
1653 mm->locked_vm -= nrpages;
1654 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1655 vma = remove_vma(vma);
1636 } while (vma); 1656 } while (vma);
1637 validate_mm(mm); 1657 validate_mm(mm);
1638} 1658}
@@ -1651,14 +1671,13 @@ static void unmap_region(struct mm_struct *mm,
1651 unsigned long nr_accounted = 0; 1671 unsigned long nr_accounted = 0;
1652 1672
1653 lru_add_drain(); 1673 lru_add_drain();
1654 spin_lock(&mm->page_table_lock);
1655 tlb = tlb_gather_mmu(mm, 0); 1674 tlb = tlb_gather_mmu(mm, 0);
1656 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); 1675 update_hiwater_rss(mm);
1676 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1657 vm_unacct_memory(nr_accounted); 1677 vm_unacct_memory(nr_accounted);
1658 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1678 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1659 next? next->vm_start: 0); 1679 next? next->vm_start: 0);
1660 tlb_finish_mmu(tlb, start, end); 1680 tlb_finish_mmu(tlb, start, end);
1661 spin_unlock(&mm->page_table_lock);
1662} 1681}
1663 1682
1664/* 1683/*
@@ -1799,7 +1818,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1799 unmap_region(mm, vma, prev, start, end); 1818 unmap_region(mm, vma, prev, start, end);
1800 1819
1801 /* Fix up all other VM information */ 1820 /* Fix up all other VM information */
1802 unmap_vma_list(mm, vma); 1821 remove_vma_list(mm, vma);
1803 1822
1804 return 0; 1823 return 0;
1805} 1824}
@@ -1821,7 +1840,7 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len)
1821 1840
1822static inline void verify_mm_writelocked(struct mm_struct *mm) 1841static inline void verify_mm_writelocked(struct mm_struct *mm)
1823{ 1842{
1824#ifdef CONFIG_DEBUG_KERNEL 1843#ifdef CONFIG_DEBUG_VM
1825 if (unlikely(down_read_trylock(&mm->mmap_sem))) { 1844 if (unlikely(down_read_trylock(&mm->mmap_sem))) {
1826 WARN_ON(1); 1845 WARN_ON(1);
1827 up_read(&mm->mmap_sem); 1846 up_read(&mm->mmap_sem);
@@ -1933,34 +1952,21 @@ void exit_mmap(struct mm_struct *mm)
1933 unsigned long end; 1952 unsigned long end;
1934 1953
1935 lru_add_drain(); 1954 lru_add_drain();
1936
1937 spin_lock(&mm->page_table_lock);
1938
1939 flush_cache_mm(mm); 1955 flush_cache_mm(mm);
1940 tlb = tlb_gather_mmu(mm, 1); 1956 tlb = tlb_gather_mmu(mm, 1);
1957 /* Don't update_hiwater_rss(mm) here, do_exit already did */
1941 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 1958 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1942 end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); 1959 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
1943 vm_unacct_memory(nr_accounted); 1960 vm_unacct_memory(nr_accounted);
1944 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 1961 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
1945 tlb_finish_mmu(tlb, 0, end); 1962 tlb_finish_mmu(tlb, 0, end);
1946 1963
1947 mm->mmap = mm->mmap_cache = NULL;
1948 mm->mm_rb = RB_ROOT;
1949 set_mm_counter(mm, rss, 0);
1950 mm->total_vm = 0;
1951 mm->locked_vm = 0;
1952
1953 spin_unlock(&mm->page_table_lock);
1954
1955 /* 1964 /*
1956 * Walk the list again, actually closing and freeing it 1965 * Walk the list again, actually closing and freeing it,
1957 * without holding any MM locks. 1966 * with preemption enabled, without holding any MM locks.
1958 */ 1967 */
1959 while (vma) { 1968 while (vma)
1960 struct vm_area_struct *next = vma->vm_next; 1969 vma = remove_vma(vma);
1961 remove_vm_struct(vma);
1962 vma = next;
1963 }
1964 1970
1965 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 1971 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
1966} 1972}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 57577f63b305..17a2b52b753b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
29 unsigned long addr, unsigned long end, pgprot_t newprot) 29 unsigned long addr, unsigned long end, pgprot_t newprot)
30{ 30{
31 pte_t *pte; 31 pte_t *pte;
32 spinlock_t *ptl;
32 33
33 pte = pte_offset_map(pmd, addr); 34 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
34 do { 35 do {
35 if (pte_present(*pte)) { 36 if (pte_present(*pte)) {
36 pte_t ptent; 37 pte_t ptent;
@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
44 lazy_mmu_prot_update(ptent); 45 lazy_mmu_prot_update(ptent);
45 } 46 }
46 } while (pte++, addr += PAGE_SIZE, addr != end); 47 } while (pte++, addr += PAGE_SIZE, addr != end);
47 pte_unmap(pte - 1); 48 pte_unmap_unlock(pte - 1, ptl);
48} 49}
49 50
50static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 51static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma,
88 BUG_ON(addr >= end); 89 BUG_ON(addr >= end);
89 pgd = pgd_offset(mm, addr); 90 pgd = pgd_offset(mm, addr);
90 flush_cache_range(vma, addr, end); 91 flush_cache_range(vma, addr, end);
91 spin_lock(&mm->page_table_lock);
92 do { 92 do {
93 next = pgd_addr_end(addr, end); 93 next = pgd_addr_end(addr, end);
94 if (pgd_none_or_clear_bad(pgd)) 94 if (pgd_none_or_clear_bad(pgd))
@@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma,
96 change_pud_range(mm, pgd, addr, next, newprot); 96 change_pud_range(mm, pgd, addr, next, newprot);
97 } while (pgd++, addr = next, addr != end); 97 } while (pgd++, addr = next, addr != end);
98 flush_tlb_range(vma, start, end); 98 flush_tlb_range(vma, start, end);
99 spin_unlock(&mm->page_table_lock);
100} 99}
101 100
102static int 101static int
@@ -125,6 +124,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
125 * a MAP_NORESERVE private mapping to writable will now reserve. 124 * a MAP_NORESERVE private mapping to writable will now reserve.
126 */ 125 */
127 if (newflags & VM_WRITE) { 126 if (newflags & VM_WRITE) {
127 if (oldflags & VM_RESERVED) {
128 BUG_ON(oldflags & VM_WRITE);
129 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
130 "PROT_WRITE mprotect of VM_RESERVED memory, "
131 "which is deprecated. Please report this to "
132 "linux-kernel@vger.kernel.org\n",current->comm);
133 return -EACCES;
134 }
128 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { 135 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
129 charged = nrpages; 136 charged = nrpages;
130 if (security_vm_enough_memory(charged)) 137 if (security_vm_enough_memory(charged))
@@ -168,8 +175,8 @@ success:
168 vma->vm_flags = newflags; 175 vma->vm_flags = newflags;
169 vma->vm_page_prot = newprot; 176 vma->vm_page_prot = newprot;
170 change_protection(vma, start, end, newprot); 177 change_protection(vma, start, end, newprot);
171 __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 178 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
172 __vm_stat_account(mm, newflags, vma->vm_file, nrpages); 179 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
173 return 0; 180 return 0;
174 181
175fail: 182fail:
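change_pte_range() now takes only the lock that covers the page table being walked, via pte_offset_map_lock(), instead of the mm-wide page_table_lock, so with per-page-table locks faults elsewhere in the address space no longer serialize against an mprotect(). The contention argument in miniature, as a user-space pthread sketch with one mutex per bucket (illustrative only, no kernel locking API):

#include <pthread.h>
#include <stdio.h>

#define NBUCKETS 16
#define NTHREADS 4
#define NOPS     100000

/* One lock per bucket, analogous to one spinlock per page table rather
 * than a single mm-wide page_table_lock. */
static struct {
    pthread_mutex_t lock;
    long count;
} bucket[NBUCKETS];

static void *worker(void *arg)
{
    long id = (long)arg;

    for (int i = 0; i < NOPS; i++) {
        int b = (id + i) % NBUCKETS;

        /* Contention is limited to threads touching the same bucket. */
        pthread_mutex_lock(&bucket[b].lock);
        bucket[b].count++;
        pthread_mutex_unlock(&bucket[b].lock);
    }
    return NULL;
}

int main(void)
{
    pthread_t tid[NTHREADS];
    long total = 0;

    for (int i = 0; i < NBUCKETS; i++)
        pthread_mutex_init(&bucket[i].lock, NULL);
    for (long i = 0; i < NTHREADS; i++)
        pthread_create(&tid[i], NULL, worker, (void *)i);
    for (int i = 0; i < NTHREADS; i++)
        pthread_join(tid[i], NULL);

    for (int i = 0; i < NBUCKETS; i++)
        total += bucket[i].count;
    printf("total=%ld (expected %d)\n", total, NTHREADS * NOPS);
    return 0;
}

Build with cc -pthread. Replacing the per-bucket mutexes with a single shared one would make all four workers serialize on every increment, which is the situation the split locking avoids.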
diff --git a/mm/mremap.c b/mm/mremap.c
index f343fc73a8bd..b535438c363c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,35 +22,7 @@
22#include <asm/cacheflush.h> 22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h> 23#include <asm/tlbflush.h>
24 24
25static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) 25static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
26{
27 pgd_t *pgd;
28 pud_t *pud;
29 pmd_t *pmd;
30 pte_t *pte = NULL;
31
32 pgd = pgd_offset(mm, addr);
33 if (pgd_none_or_clear_bad(pgd))
34 goto end;
35
36 pud = pud_offset(pgd, addr);
37 if (pud_none_or_clear_bad(pud))
38 goto end;
39
40 pmd = pmd_offset(pud, addr);
41 if (pmd_none_or_clear_bad(pmd))
42 goto end;
43
44 pte = pte_offset_map_nested(pmd, addr);
45 if (pte_none(*pte)) {
46 pte_unmap_nested(pte);
47 pte = NULL;
48 }
49end:
50 return pte;
51}
52
53static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
54{ 26{
55 pgd_t *pgd; 27 pgd_t *pgd;
56 pud_t *pud; 28 pud_t *pud;
@@ -68,35 +40,39 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
68 if (pmd_none_or_clear_bad(pmd)) 40 if (pmd_none_or_clear_bad(pmd))
69 return NULL; 41 return NULL;
70 42
71 return pte_offset_map(pmd, addr); 43 return pmd;
72} 44}
73 45
74static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) 46static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
75{ 47{
76 pgd_t *pgd; 48 pgd_t *pgd;
77 pud_t *pud; 49 pud_t *pud;
78 pmd_t *pmd; 50 pmd_t *pmd;
79 pte_t *pte = NULL;
80 51
81 pgd = pgd_offset(mm, addr); 52 pgd = pgd_offset(mm, addr);
82
83 pud = pud_alloc(mm, pgd, addr); 53 pud = pud_alloc(mm, pgd, addr);
84 if (!pud) 54 if (!pud)
85 return NULL; 55 return NULL;
56
86 pmd = pmd_alloc(mm, pud, addr); 57 pmd = pmd_alloc(mm, pud, addr);
87 if (pmd) 58 if (!pmd)
88 pte = pte_alloc_map(mm, pmd, addr); 59 return NULL;
89 return pte; 60
61 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
62 return NULL;
63
64 return pmd;
90} 65}
91 66
92static int 67static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
93move_one_page(struct vm_area_struct *vma, unsigned long old_addr, 68 unsigned long old_addr, unsigned long old_end,
94 struct vm_area_struct *new_vma, unsigned long new_addr) 69 struct vm_area_struct *new_vma, pmd_t *new_pmd,
70 unsigned long new_addr)
95{ 71{
96 struct address_space *mapping = NULL; 72 struct address_space *mapping = NULL;
97 struct mm_struct *mm = vma->vm_mm; 73 struct mm_struct *mm = vma->vm_mm;
98 int error = 0; 74 pte_t *old_pte, *new_pte, pte;
99 pte_t *src, *dst; 75 spinlock_t *old_ptl, *new_ptl;
100 76
101 if (vma->vm_file) { 77 if (vma->vm_file) {
102 /* 78 /*
@@ -111,74 +87,69 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
111 new_vma->vm_truncate_count != vma->vm_truncate_count) 87 new_vma->vm_truncate_count != vma->vm_truncate_count)
112 new_vma->vm_truncate_count = 0; 88 new_vma->vm_truncate_count = 0;
113 } 89 }
114 spin_lock(&mm->page_table_lock);
115 90
116 src = get_one_pte_map_nested(mm, old_addr); 91 /*
117 if (src) { 92 * We don't have to worry about the ordering of src and dst
118 /* 93 * pte locks because exclusive mmap_sem prevents deadlock.
119 * Look to see whether alloc_one_pte_map needs to perform a 94 */
120 * memory allocation. If it does then we need to drop the 95 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
121 * atomic kmap 96 new_pte = pte_offset_map_nested(new_pmd, new_addr);
122 */ 97 new_ptl = pte_lockptr(mm, new_pmd);
123 dst = get_one_pte_map(mm, new_addr); 98 if (new_ptl != old_ptl)
124 if (unlikely(!dst)) { 99 spin_lock(new_ptl);
125 pte_unmap_nested(src); 100
126 if (mapping) 101 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
127 spin_unlock(&mapping->i_mmap_lock); 102 new_pte++, new_addr += PAGE_SIZE) {
128 dst = alloc_one_pte_map(mm, new_addr); 103 if (pte_none(*old_pte))
129 if (mapping && !spin_trylock(&mapping->i_mmap_lock)) { 104 continue;
130 spin_unlock(&mm->page_table_lock); 105 pte = ptep_clear_flush(vma, old_addr, old_pte);
131 spin_lock(&mapping->i_mmap_lock); 106 /* ZERO_PAGE can be dependant on virtual addr */
132 spin_lock(&mm->page_table_lock); 107 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
133 } 108 set_pte_at(mm, new_addr, new_pte, pte);
134 src = get_one_pte_map_nested(mm, old_addr);
135 }
136 /*
137 * Since alloc_one_pte_map can drop and re-acquire
138 * page_table_lock, we should re-check the src entry...
139 */
140 if (src) {
141 if (dst) {
142 pte_t pte;
143 pte = ptep_clear_flush(vma, old_addr, src);
144
145 /* ZERO_PAGE can be dependant on virtual addr */
146 pte = move_pte(pte, new_vma->vm_page_prot,
147 old_addr, new_addr);
148 set_pte_at(mm, new_addr, dst, pte);
149 } else
150 error = -ENOMEM;
151 pte_unmap_nested(src);
152 }
153 if (dst)
154 pte_unmap(dst);
155 } 109 }
156 spin_unlock(&mm->page_table_lock); 110
111 if (new_ptl != old_ptl)
112 spin_unlock(new_ptl);
113 pte_unmap_nested(new_pte - 1);
114 pte_unmap_unlock(old_pte - 1, old_ptl);
157 if (mapping) 115 if (mapping)
158 spin_unlock(&mapping->i_mmap_lock); 116 spin_unlock(&mapping->i_mmap_lock);
159 return error;
160} 117}
161 118
119#define LATENCY_LIMIT (64 * PAGE_SIZE)
120
162static unsigned long move_page_tables(struct vm_area_struct *vma, 121static unsigned long move_page_tables(struct vm_area_struct *vma,
163 unsigned long old_addr, struct vm_area_struct *new_vma, 122 unsigned long old_addr, struct vm_area_struct *new_vma,
164 unsigned long new_addr, unsigned long len) 123 unsigned long new_addr, unsigned long len)
165{ 124{
166 unsigned long offset; 125 unsigned long extent, next, old_end;
126 pmd_t *old_pmd, *new_pmd;
167 127
168 flush_cache_range(vma, old_addr, old_addr + len); 128 old_end = old_addr + len;
129 flush_cache_range(vma, old_addr, old_end);
169 130
170 /* 131 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
171 * This is not the clever way to do this, but we're taking the
172 * easy way out on the assumption that most remappings will be
173 * only a few pages.. This also makes error recovery easier.
174 */
175 for (offset = 0; offset < len; offset += PAGE_SIZE) {
176 if (move_one_page(vma, old_addr + offset,
177 new_vma, new_addr + offset) < 0)
178 break;
179 cond_resched(); 132 cond_resched();
133 next = (old_addr + PMD_SIZE) & PMD_MASK;
134 if (next - 1 > old_end)
135 next = old_end;
136 extent = next - old_addr;
137 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
138 if (!old_pmd)
139 continue;
140 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
141 if (!new_pmd)
142 break;
143 next = (new_addr + PMD_SIZE) & PMD_MASK;
144 if (extent > next - new_addr)
145 extent = next - new_addr;
146 if (extent > LATENCY_LIMIT)
147 extent = LATENCY_LIMIT;
148 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
149 new_vma, new_pmd, new_addr);
180 } 150 }
181 return offset; 151
152 return len + old_addr - old_end; /* how much done */
182} 153}
183 154
184static unsigned long move_vma(struct vm_area_struct *vma, 155static unsigned long move_vma(struct vm_area_struct *vma,
@@ -191,6 +162,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
191 unsigned long new_pgoff; 162 unsigned long new_pgoff;
192 unsigned long moved_len; 163 unsigned long moved_len;
193 unsigned long excess = 0; 164 unsigned long excess = 0;
165 unsigned long hiwater_vm;
194 int split = 0; 166 int split = 0;
195 167
196 /* 168 /*
@@ -229,17 +201,24 @@ static unsigned long move_vma(struct vm_area_struct *vma,
229 } 201 }
230 202
231 /* 203 /*
232 * if we failed to move page tables we still do total_vm increment 204 * If we failed to move page tables we still do total_vm increment
233 * since do_munmap() will decrement it by old_len == new_len 205 * since do_munmap() will decrement it by old_len == new_len.
206 *
207 * Since total_vm is about to be raised artificially high for a
208 * moment, we need to restore high watermark afterwards: if stats
209 * are taken meanwhile, total_vm and hiwater_vm appear too high.
210 * If this were a serious issue, we'd add a flag to do_munmap().
234 */ 211 */
212 hiwater_vm = mm->hiwater_vm;
235 mm->total_vm += new_len >> PAGE_SHIFT; 213 mm->total_vm += new_len >> PAGE_SHIFT;
236 __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 214 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
237 215
238 if (do_munmap(mm, old_addr, old_len) < 0) { 216 if (do_munmap(mm, old_addr, old_len) < 0) {
239 /* OOM: unable to split vma, just get accounts right */ 217 /* OOM: unable to split vma, just get accounts right */
240 vm_unacct_memory(excess >> PAGE_SHIFT); 218 vm_unacct_memory(excess >> PAGE_SHIFT);
241 excess = 0; 219 excess = 0;
242 } 220 }
221 mm->hiwater_vm = hiwater_vm;
243 222
244 /* Restore VM_ACCOUNT if one or two pieces of vma left */ 223 /* Restore VM_ACCOUNT if one or two pieces of vma left */
245 if (excess) { 224 if (excess) {
@@ -269,6 +248,7 @@ unsigned long do_mremap(unsigned long addr,
269 unsigned long old_len, unsigned long new_len, 248 unsigned long old_len, unsigned long new_len,
270 unsigned long flags, unsigned long new_addr) 249 unsigned long flags, unsigned long new_addr)
271{ 250{
251 struct mm_struct *mm = current->mm;
272 struct vm_area_struct *vma; 252 struct vm_area_struct *vma;
273 unsigned long ret = -EINVAL; 253 unsigned long ret = -EINVAL;
274 unsigned long charged = 0; 254 unsigned long charged = 0;
@@ -309,7 +289,7 @@ unsigned long do_mremap(unsigned long addr,
309 if ((addr <= new_addr) && (addr+old_len) > new_addr) 289 if ((addr <= new_addr) && (addr+old_len) > new_addr)
310 goto out; 290 goto out;
311 291
312 ret = do_munmap(current->mm, new_addr, new_len); 292 ret = do_munmap(mm, new_addr, new_len);
313 if (ret) 293 if (ret)
314 goto out; 294 goto out;
315 } 295 }
@@ -320,7 +300,7 @@ unsigned long do_mremap(unsigned long addr,
320 * do_munmap does all the needed commit accounting 300 * do_munmap does all the needed commit accounting
321 */ 301 */
322 if (old_len >= new_len) { 302 if (old_len >= new_len) {
323 ret = do_munmap(current->mm, addr+new_len, old_len - new_len); 303 ret = do_munmap(mm, addr+new_len, old_len - new_len);
324 if (ret && old_len != new_len) 304 if (ret && old_len != new_len)
325 goto out; 305 goto out;
326 ret = addr; 306 ret = addr;
@@ -333,7 +313,7 @@ unsigned long do_mremap(unsigned long addr,
333 * Ok, we need to grow.. or relocate. 313 * Ok, we need to grow.. or relocate.
334 */ 314 */
335 ret = -EFAULT; 315 ret = -EFAULT;
336 vma = find_vma(current->mm, addr); 316 vma = find_vma(mm, addr);
337 if (!vma || vma->vm_start > addr) 317 if (!vma || vma->vm_start > addr)
338 goto out; 318 goto out;
339 if (is_vm_hugetlb_page(vma)) { 319 if (is_vm_hugetlb_page(vma)) {
@@ -349,14 +329,14 @@ unsigned long do_mremap(unsigned long addr,
349 } 329 }
350 if (vma->vm_flags & VM_LOCKED) { 330 if (vma->vm_flags & VM_LOCKED) {
351 unsigned long locked, lock_limit; 331 unsigned long locked, lock_limit;
352 locked = current->mm->locked_vm << PAGE_SHIFT; 332 locked = mm->locked_vm << PAGE_SHIFT;
353 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 333 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
354 locked += new_len - old_len; 334 locked += new_len - old_len;
355 ret = -EAGAIN; 335 ret = -EAGAIN;
356 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 336 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
357 goto out; 337 goto out;
358 } 338 }
359 if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) { 339 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
360 ret = -ENOMEM; 340 ret = -ENOMEM;
361 goto out; 341 goto out;
362 } 342 }
@@ -383,11 +363,10 @@ unsigned long do_mremap(unsigned long addr,
383 vma_adjust(vma, vma->vm_start, 363 vma_adjust(vma, vma->vm_start,
384 addr + new_len, vma->vm_pgoff, NULL); 364 addr + new_len, vma->vm_pgoff, NULL);
385 365
386 current->mm->total_vm += pages; 366 mm->total_vm += pages;
387 __vm_stat_account(vma->vm_mm, vma->vm_flags, 367 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
388 vma->vm_file, pages);
389 if (vma->vm_flags & VM_LOCKED) { 368 if (vma->vm_flags & VM_LOCKED) {
390 current->mm->locked_vm += pages; 369 mm->locked_vm += pages;
391 make_pages_present(addr + old_len, 370 make_pages_present(addr + old_len,
392 addr + new_len); 371 addr + new_len);
393 } 372 }
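move_page_tables() now advances in PMD-sized bites: each extent is clamped to the next PMD boundary of both the source and the destination address and to LATENCY_LIMIT, with a cond_resched() each time round. The extent arithmetic in isolation, as a runnable user-space sketch (made-up constants, no page tables):

#include <stdio.h>

#define PMD_SIZE       (1UL << 21)           /* 2 MiB, illustrative */
#define PMD_MASK       (~(PMD_SIZE - 1))
#define LATENCY_LIMIT  (64 * 4096UL)         /* 64 pages, as in the patch */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

/* Mirror the extent computation in move_page_tables(): never cross a PMD
 * boundary on either side, never exceed LATENCY_LIMIT per step. */
static void walk_extents(unsigned long old_addr, unsigned long new_addr,
                         unsigned long len)
{
    unsigned long old_end = old_addr + len;

    while (old_addr < old_end) {
        unsigned long next, extent;

        next = (old_addr + PMD_SIZE) & PMD_MASK;
        extent = min_ul(next, old_end) - old_addr;

        next = (new_addr + PMD_SIZE) & PMD_MASK;
        extent = min_ul(extent, next - new_addr);
        extent = min_ul(extent, LATENCY_LIMIT);

        printf("move %#lx -> %#lx, %lu KiB\n",
               old_addr, new_addr, extent >> 10);

        old_addr += extent;
        new_addr += extent;
    }
}

int main(void)
{
    /* Old and new ranges deliberately misaligned against each other. */
    walk_extents(0x10000000, 0x20030000, 4UL << 20);
    return 0;
}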
diff --git a/mm/msync.c b/mm/msync.c
index d0f5a1bce7cb..0e040e9c39d8 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -17,40 +17,48 @@
17#include <asm/pgtable.h> 17#include <asm/pgtable.h>
18#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
19 19
20/* 20static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
21 * Called with mm->page_table_lock held to protect against other
22 * threads/the swapper from ripping pte's out from under us.
23 */
24
25static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
26 unsigned long addr, unsigned long end) 21 unsigned long addr, unsigned long end)
27{ 22{
28 pte_t *pte; 23 pte_t *pte;
24 spinlock_t *ptl;
25 int progress = 0;
29 26
30 pte = pte_offset_map(pmd, addr); 27again:
28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
31 do { 29 do {
32 unsigned long pfn; 30 unsigned long pfn;
33 struct page *page; 31 struct page *page;
34 32
33 if (progress >= 64) {
34 progress = 0;
35 if (need_resched() || need_lockbreak(ptl))
36 break;
37 }
38 progress++;
35 if (!pte_present(*pte)) 39 if (!pte_present(*pte))
36 continue; 40 continue;
37 if (!pte_maybe_dirty(*pte)) 41 if (!pte_maybe_dirty(*pte))
38 continue; 42 continue;
39 pfn = pte_pfn(*pte); 43 pfn = pte_pfn(*pte);
40 if (!pfn_valid(pfn)) 44 if (unlikely(!pfn_valid(pfn))) {
45 print_bad_pte(vma, *pte, addr);
41 continue; 46 continue;
47 }
42 page = pfn_to_page(pfn); 48 page = pfn_to_page(pfn);
43 if (PageReserved(page))
44 continue;
45 49
46 if (ptep_clear_flush_dirty(vma, addr, pte) || 50 if (ptep_clear_flush_dirty(vma, addr, pte) ||
47 page_test_and_clear_dirty(page)) 51 page_test_and_clear_dirty(page))
48 set_page_dirty(page); 52 set_page_dirty(page);
53 progress += 3;
49 } while (pte++, addr += PAGE_SIZE, addr != end); 54 } while (pte++, addr += PAGE_SIZE, addr != end);
50 pte_unmap(pte - 1); 55 pte_unmap_unlock(pte - 1, ptl);
56 cond_resched();
57 if (addr != end)
58 goto again;
51} 59}
52 60
53static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, 61static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
54 unsigned long addr, unsigned long end) 62 unsigned long addr, unsigned long end)
55{ 63{
56 pmd_t *pmd; 64 pmd_t *pmd;
@@ -61,11 +69,11 @@ static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
61 next = pmd_addr_end(addr, end); 69 next = pmd_addr_end(addr, end);
62 if (pmd_none_or_clear_bad(pmd)) 70 if (pmd_none_or_clear_bad(pmd))
63 continue; 71 continue;
64 sync_pte_range(vma, pmd, addr, next); 72 msync_pte_range(vma, pmd, addr, next);
65 } while (pmd++, addr = next, addr != end); 73 } while (pmd++, addr = next, addr != end);
66} 74}
67 75
68static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 76static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
69 unsigned long addr, unsigned long end) 77 unsigned long addr, unsigned long end)
70{ 78{
71 pud_t *pud; 79 pud_t *pud;
@@ -76,58 +84,34 @@ static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
76 next = pud_addr_end(addr, end); 84 next = pud_addr_end(addr, end);
77 if (pud_none_or_clear_bad(pud)) 85 if (pud_none_or_clear_bad(pud))
78 continue; 86 continue;
79 sync_pmd_range(vma, pud, addr, next); 87 msync_pmd_range(vma, pud, addr, next);
80 } while (pud++, addr = next, addr != end); 88 } while (pud++, addr = next, addr != end);
81} 89}
82 90
83static void sync_page_range(struct vm_area_struct *vma, 91static void msync_page_range(struct vm_area_struct *vma,
84 unsigned long addr, unsigned long end) 92 unsigned long addr, unsigned long end)
85{ 93{
86 struct mm_struct *mm = vma->vm_mm;
87 pgd_t *pgd; 94 pgd_t *pgd;
88 unsigned long next; 95 unsigned long next;
89 96
90 /* For hugepages we can't go walking the page table normally, 97 /* For hugepages we can't go walking the page table normally,
91 * but that's ok, hugetlbfs is memory based, so we don't need 98 * but that's ok, hugetlbfs is memory based, so we don't need
92 * to do anything more on an msync() */ 99 * to do anything more on an msync().
93 if (is_vm_hugetlb_page(vma)) 100 * Can't do anything with VM_RESERVED regions either.
101 */
102 if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
94 return; 103 return;
95 104
96 BUG_ON(addr >= end); 105 BUG_ON(addr >= end);
97 pgd = pgd_offset(mm, addr); 106 pgd = pgd_offset(vma->vm_mm, addr);
98 flush_cache_range(vma, addr, end); 107 flush_cache_range(vma, addr, end);
99 spin_lock(&mm->page_table_lock);
100 do { 108 do {
101 next = pgd_addr_end(addr, end); 109 next = pgd_addr_end(addr, end);
102 if (pgd_none_or_clear_bad(pgd)) 110 if (pgd_none_or_clear_bad(pgd))
103 continue; 111 continue;
104 sync_pud_range(vma, pgd, addr, next); 112 msync_pud_range(vma, pgd, addr, next);
105 } while (pgd++, addr = next, addr != end); 113 } while (pgd++, addr = next, addr != end);
106 spin_unlock(&mm->page_table_lock);
107}
108
109#ifdef CONFIG_PREEMPT
110static inline void filemap_sync(struct vm_area_struct *vma,
111 unsigned long addr, unsigned long end)
112{
113 const size_t chunk = 64 * 1024; /* bytes */
114 unsigned long next;
115
116 do {
117 next = addr + chunk;
118 if (next > end || next < addr)
119 next = end;
120 sync_page_range(vma, addr, next);
121 cond_resched();
122 } while (addr = next, addr != end);
123}
124#else
125static inline void filemap_sync(struct vm_area_struct *vma,
126 unsigned long addr, unsigned long end)
127{
128 sync_page_range(vma, addr, end);
129} 114}
130#endif
131 115
132/* 116/*
133 * MS_SYNC syncs the entire file - including mappings. 117 * MS_SYNC syncs the entire file - including mappings.
@@ -150,7 +134,7 @@ static int msync_interval(struct vm_area_struct *vma,
150 return -EBUSY; 134 return -EBUSY;
151 135
152 if (file && (vma->vm_flags & VM_SHARED)) { 136 if (file && (vma->vm_flags & VM_SHARED)) {
153 filemap_sync(vma, addr, end); 137 msync_page_range(vma, addr, end);
154 138
155 if (flags & MS_SYNC) { 139 if (flags & MS_SYNC) {
156 struct address_space *mapping = file->f_mapping; 140 struct address_space *mapping = file->f_mapping;
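msync_pte_range() above does its own latency breaking: a progress counter charges dirty entries more heavily, and once it reaches 64 the function drops the PTE lock, calls cond_resched(), and jumps back to the address it stopped at. The resume-where-you-left-off shape of that loop, as a user-space sketch with the lock and reschedule reduced to comments:

#include <stdio.h>

#define NITEMS          1000
#define PROGRESS_LIMIT  64

static int items[NITEMS];

/* Process items[start..NITEMS) under a pretend lock, but bail out every
 * PROGRESS_LIMIT units of work so the lock is never held for too long. */
static int process_batch(int start, int *done)
{
    int progress = 0;
    int i;

    /* lock(); */
    for (i = start; i < NITEMS; i++) {
        if (progress >= PROGRESS_LIMIT)
            break;                       /* give others a chance */
        progress++;
        if (items[i] & 1)
            progress += 3;               /* "dirty" entries cost more */
        items[i] = 0;
    }
    /* unlock(); */

    *done = i;
    return i < NITEMS;                   /* more work left? */
}

int main(void)
{
    int pos = 0, batches = 0;

    for (int i = 0; i < NITEMS; i++)
        items[i] = i;

    while (process_batch(pos, &pos)) {   /* the "goto again" equivalent */
        batches++;
        /* cond_resched() would go here */
    }
    printf("finished in %d batches\n", batches + 1);
    return 0;
}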
diff --git a/mm/nommu.c b/mm/nommu.c
index 0ef241ae3763..d1e076a487cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
931 realalloc -= kobjsize(vml); 931 realalloc -= kobjsize(vml);
932 askedalloc -= sizeof(*vml); 932 askedalloc -= sizeof(*vml);
933 kfree(vml); 933 kfree(vml);
934
935 update_hiwater_vm(mm);
934 mm->total_vm -= len >> PAGE_SHIFT; 936 mm->total_vm -= len >> PAGE_SHIFT;
935 937
936#ifdef DEBUG 938#ifdef DEBUG
@@ -1047,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1047 1049
1048EXPORT_SYMBOL(find_vma); 1050EXPORT_SYMBOL(find_vma);
1049 1051
1050struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) 1052struct page *follow_page(struct mm_struct *mm, unsigned long address,
1053 unsigned int foll_flags)
1051{ 1054{
1052 return NULL; 1055 return NULL;
1053} 1056}
@@ -1078,19 +1081,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1078{ 1081{
1079} 1082}
1080 1083
1081void update_mem_hiwater(struct task_struct *tsk)
1082{
1083 unsigned long rss;
1084
1085 if (likely(tsk->mm)) {
1086 rss = get_mm_counter(tsk->mm, rss);
1087 if (tsk->mm->hiwater_rss < rss)
1088 tsk->mm->hiwater_rss = rss;
1089 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
1090 tsk->mm->hiwater_vm = tsk->mm->total_vm;
1091 }
1092}
1093
1094void unmap_mapping_range(struct address_space *mapping, 1084void unmap_mapping_range(struct address_space *mapping,
1095 loff_t const holebegin, loff_t const holelen, 1085 loff_t const holebegin, loff_t const holelen,
1096 int even_cows) 1086 int even_cows)
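update_hiwater_vm() in the hunk above records the peak of total_vm before the counter is decremented. A toy sketch of the high-watermark idea, with invented struct and field names:

#include <stdio.h>

struct vm_stats {
        unsigned long total_vm;         /* current mapped pages */
        unsigned long hiwater_vm;       /* peak ever observed   */
};

static void update_hiwater(struct vm_stats *s)
{
        if (s->hiwater_vm < s->total_vm)
                s->hiwater_vm = s->total_vm;
}

int main(void)
{
        struct vm_stats s = { 0, 0 };

        s.total_vm = 100; update_hiwater(&s);
        s.total_vm = 40;  update_hiwater(&s);   /* peak stays at 100 */
        printf("current=%lu peak=%lu\n", s.total_vm, s.hiwater_vm);
        return 0;
}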
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e1d3d77f4aee..2dbdd98426fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -33,6 +33,7 @@
33#include <linux/sysctl.h> 33#include <linux/sysctl.h>
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/cpuset.h> 35#include <linux/cpuset.h>
36#include <linux/memory_hotplug.h>
36#include <linux/nodemask.h> 37#include <linux/nodemask.h>
37#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
38 39
@@ -78,21 +79,44 @@ int min_free_kbytes = 1024;
78unsigned long __initdata nr_kernel_pages; 79unsigned long __initdata nr_kernel_pages;
79unsigned long __initdata nr_all_pages; 80unsigned long __initdata nr_all_pages;
80 81
82static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
83{
84 int ret = 0;
85 unsigned seq;
86 unsigned long pfn = page_to_pfn(page);
87
88 do {
89 seq = zone_span_seqbegin(zone);
90 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
91 ret = 1;
92 else if (pfn < zone->zone_start_pfn)
93 ret = 1;
94 } while (zone_span_seqretry(zone, seq));
95
96 return ret;
97}
98
99static int page_is_consistent(struct zone *zone, struct page *page)
100{
101#ifdef CONFIG_HOLES_IN_ZONE
102 if (!pfn_valid(page_to_pfn(page)))
103 return 0;
104#endif
105 if (zone != page_zone(page))
106 return 0;
107
108 return 1;
109}
81/* 110/*
82 * Temporary debugging check for pages not lying within a given zone. 111 * Temporary debugging check for pages not lying within a given zone.
83 */ 112 */
84static int bad_range(struct zone *zone, struct page *page) 113static int bad_range(struct zone *zone, struct page *page)
85{ 114{
86 if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) 115 if (page_outside_zone_boundaries(zone, page))
87 return 1; 116 return 1;
88 if (page_to_pfn(page) < zone->zone_start_pfn) 117 if (!page_is_consistent(zone, page))
89 return 1;
90#ifdef CONFIG_HOLES_IN_ZONE
91 if (!pfn_valid(page_to_pfn(page)))
92 return 1;
93#endif
94 if (zone != page_zone(page))
95 return 1; 118 return 1;
119
96 return 0; 120 return 0;
97} 121}
98 122
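page_outside_zone_boundaries() above reads the zone's span inside a sequence-counter retry loop, so a concurrent resize cannot hand it a torn start/size pair. A simplified user-space analogue of that reader/writer protocol (the kernel's zone_span_seq*() helpers wrap the same idea; real concurrent code would also need the span fields themselves accessed atomically):

#include <stdatomic.h>
#include <stdio.h>

struct span {
        atomic_uint   seq;      /* even = stable, odd = update in progress */
        unsigned long start, pages;
};

/* Writer side: make the counter odd, update, make it even again. */
static void span_update(struct span *s, unsigned long start, unsigned long pages)
{
        atomic_fetch_add(&s->seq, 1);
        s->start = start;
        s->pages = pages;
        atomic_fetch_add(&s->seq, 1);
}

/* Reader side: retry if the counter was odd or changed under us. */
static int pfn_outside_span(struct span *s, unsigned long pfn)
{
        unsigned int seq;
        int ret;

        do {
                seq = atomic_load(&s->seq);
                ret = pfn < s->start || pfn >= s->start + s->pages;
        } while ((seq & 1) || seq != atomic_load(&s->seq));

        return ret;
}

int main(void)
{
        struct span s;

        atomic_init(&s.seq, 0);
        s.start = 0;
        s.pages = 0;
        span_update(&s, 1000, 800);
        printf("pfn 1700 outside span? %d\n", pfn_outside_span(&s, 1700));
        return 0;
}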
@@ -114,7 +138,8 @@ static void bad_page(const char *function, struct page *page)
114 1 << PG_reclaim | 138 1 << PG_reclaim |
115 1 << PG_slab | 139 1 << PG_slab |
116 1 << PG_swapcache | 140 1 << PG_swapcache |
117 1 << PG_writeback); 141 1 << PG_writeback |
142 1 << PG_reserved );
118 set_page_count(page, 0); 143 set_page_count(page, 0);
119 reset_page_mapcount(page); 144 reset_page_mapcount(page);
120 page->mapping = NULL; 145 page->mapping = NULL;
@@ -153,7 +178,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
153 struct page *p = page + i; 178 struct page *p = page + i;
154 179
155 SetPageCompound(p); 180 SetPageCompound(p);
156 p->private = (unsigned long)page; 181 set_page_private(p, (unsigned long)page);
157 } 182 }
158} 183}
159 184
@@ -173,7 +198,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
173 198
174 if (!PageCompound(p)) 199 if (!PageCompound(p))
175 bad_page(__FUNCTION__, page); 200 bad_page(__FUNCTION__, page);
176 if (p->private != (unsigned long)page) 201 if (page_private(p) != (unsigned long)page)
177 bad_page(__FUNCTION__, page); 202 bad_page(__FUNCTION__, page);
178 ClearPageCompound(p); 203 ClearPageCompound(p);
179 } 204 }
@@ -186,18 +211,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
186 * So, we don't need atomic page->flags operations here. 211 * So, we don't need atomic page->flags operations here.
187 */ 212 */
188static inline unsigned long page_order(struct page *page) { 213static inline unsigned long page_order(struct page *page) {
189 return page->private; 214 return page_private(page);
190} 215}
191 216
192static inline void set_page_order(struct page *page, int order) { 217static inline void set_page_order(struct page *page, int order) {
193 page->private = order; 218 set_page_private(page, order);
194 __SetPagePrivate(page); 219 __SetPagePrivate(page);
195} 220}
196 221
197static inline void rmv_page_order(struct page *page) 222static inline void rmv_page_order(struct page *page)
198{ 223{
199 __ClearPagePrivate(page); 224 __ClearPagePrivate(page);
200 page->private = 0; 225 set_page_private(page, 0);
201} 226}
202 227
203/* 228/*
@@ -237,14 +262,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
237 * (a) the buddy is free && 262 * (a) the buddy is free &&
238 * (b) the buddy is on the buddy system && 263 * (b) the buddy is on the buddy system &&
239 * (c) a page and its buddy have the same order. 264 * (c) a page and its buddy have the same order.
240 * for recording page's order, we use page->private and PG_private. 265 * for recording page's order, we use page_private(page) and PG_private.
241 * 266 *
242 */ 267 */
243static inline int page_is_buddy(struct page *page, int order) 268static inline int page_is_buddy(struct page *page, int order)
244{ 269{
245 if (PagePrivate(page) && 270 if (PagePrivate(page) &&
246 (page_order(page) == order) && 271 (page_order(page) == order) &&
247 !PageReserved(page) &&
248 page_count(page) == 0) 272 page_count(page) == 0)
249 return 1; 273 return 1;
250 return 0; 274 return 0;
@@ -264,7 +288,7 @@ static inline int page_is_buddy(struct page *page, int order)
264 * parts of the VM system. 288 * parts of the VM system.
265 * At each level, we keep a list of pages, which are heads of continuous 289 * At each level, we keep a list of pages, which are heads of continuous
266 * free pages of length of (1 << order) and marked with PG_Private.Page's 290 * free pages of length of (1 << order) and marked with PG_Private.Page's
267 * order is recorded in page->private field. 291 * order is recorded in page_private(page) field.
268 * So when we are allocating or freeing one, we can derive the state of the 292 * So when we are allocating or freeing one, we can derive the state of the
269 * other. That is, if we allocate a small block, and both were 293 * other. That is, if we allocate a small block, and both were
270 * free, the remainder of the region must be split into blocks. 294 * free, the remainder of the region must be split into blocks.
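The buddy comments above note that a free block's order now lives in page_private() of its head page. A toy version of that bookkeeping, together with the classic XOR trick for locating a block's buddy index (names invented for the demo):

#include <stdio.h>

struct toy_page {
        unsigned long private;  /* holds the order while the block is free */
};

static void set_order(struct toy_page *p, unsigned long order) { p->private = order; }
static unsigned long get_order_of(struct toy_page *p)          { return p->private; }

/* Index of the buddy of the block starting at page_idx, for a given order. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

int main(void)
{
        struct toy_page p;

        set_order(&p, 3);
        printf("order=%lu, buddy of index 8 at order 3 is %lu\n",
               get_order_of(&p), buddy_index(8, 3));    /* prints 0 */
        return 0;
}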
@@ -327,7 +351,8 @@ static inline void free_pages_check(const char *function, struct page *page)
327 1 << PG_reclaim | 351 1 << PG_reclaim |
328 1 << PG_slab | 352 1 << PG_slab |
329 1 << PG_swapcache | 353 1 << PG_swapcache |
330 1 << PG_writeback ))) 354 1 << PG_writeback |
355 1 << PG_reserved )))
331 bad_page(function, page); 356 bad_page(function, page);
332 if (PageDirty(page)) 357 if (PageDirty(page))
333 __ClearPageDirty(page); 358 __ClearPageDirty(page);
@@ -455,13 +480,14 @@ static void prep_new_page(struct page *page, int order)
455 1 << PG_reclaim | 480 1 << PG_reclaim |
456 1 << PG_slab | 481 1 << PG_slab |
457 1 << PG_swapcache | 482 1 << PG_swapcache |
458 1 << PG_writeback ))) 483 1 << PG_writeback |
484 1 << PG_reserved )))
459 bad_page(__FUNCTION__, page); 485 bad_page(__FUNCTION__, page);
460 486
461 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 487 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
462 1 << PG_referenced | 1 << PG_arch_1 | 488 1 << PG_referenced | 1 << PG_arch_1 |
463 1 << PG_checked | 1 << PG_mappedtodisk); 489 1 << PG_checked | 1 << PG_mappedtodisk);
464 page->private = 0; 490 set_page_private(page, 0);
465 set_page_refs(page, order); 491 set_page_refs(page, order);
466 kernel_map_pages(page, 1 << order, 1); 492 kernel_map_pages(page, 1 << order, 1);
467} 493}
@@ -734,7 +760,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
734 * of the allocation. 760 * of the allocation.
735 */ 761 */
736int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 762int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
737 int classzone_idx, int can_try_harder, int gfp_high) 763 int classzone_idx, int can_try_harder, gfp_t gfp_high)
738{ 764{
739 /* free_pages my go negative - that's OK */ 765 /* free_pages my go negative - that's OK */
740 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 766 long min = mark, free_pages = z->free_pages - (1 << order) + 1;
@@ -777,7 +803,7 @@ struct page * fastcall
777__alloc_pages(gfp_t gfp_mask, unsigned int order, 803__alloc_pages(gfp_t gfp_mask, unsigned int order,
778 struct zonelist *zonelist) 804 struct zonelist *zonelist)
779{ 805{
780 const int wait = gfp_mask & __GFP_WAIT; 806 const gfp_t wait = gfp_mask & __GFP_WAIT;
781 struct zone **zones, *z; 807 struct zone **zones, *z;
782 struct page *page; 808 struct page *page;
783 struct reclaim_state reclaim_state; 809 struct reclaim_state reclaim_state;
@@ -996,7 +1022,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
996 * get_zeroed_page() returns a 32-bit address, which cannot represent 1022 * get_zeroed_page() returns a 32-bit address, which cannot represent
997 * a highmem page 1023 * a highmem page
998 */ 1024 */
999 BUG_ON(gfp_mask & __GFP_HIGHMEM); 1025 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1000 1026
1001 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1027 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1002 if (page) 1028 if (page)
@@ -1016,7 +1042,7 @@ void __pagevec_free(struct pagevec *pvec)
1016 1042
1017fastcall void __free_pages(struct page *page, unsigned int order) 1043fastcall void __free_pages(struct page *page, unsigned int order)
1018{ 1044{
1019 if (!PageReserved(page) && put_page_testzero(page)) { 1045 if (put_page_testzero(page)) {
1020 if (order == 0) 1046 if (order == 0)
1021 free_hot_page(page); 1047 free_hot_page(page);
1022 else 1048 else
@@ -1089,7 +1115,7 @@ static unsigned int nr_free_zone_pages(int offset)
1089 */ 1115 */
1090unsigned int nr_free_buffer_pages(void) 1116unsigned int nr_free_buffer_pages(void)
1091{ 1117{
1092 return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); 1118 return nr_free_zone_pages(gfp_zone(GFP_USER));
1093} 1119}
1094 1120
1095/* 1121/*
@@ -1097,7 +1123,7 @@ unsigned int nr_free_buffer_pages(void)
1097 */ 1123 */
1098unsigned int nr_free_pagecache_pages(void) 1124unsigned int nr_free_pagecache_pages(void)
1099{ 1125{
1100 return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); 1126 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1101} 1127}
1102 1128
1103#ifdef CONFIG_HIGHMEM 1129#ifdef CONFIG_HIGHMEM
@@ -1305,12 +1331,9 @@ void show_free_areas(void)
1305 } else 1331 } else
1306 printk("\n"); 1332 printk("\n");
1307 1333
1308 for (cpu = 0; cpu < NR_CPUS; ++cpu) { 1334 for_each_cpu(cpu) {
1309 struct per_cpu_pageset *pageset; 1335 struct per_cpu_pageset *pageset;
1310 1336
1311 if (!cpu_possible(cpu))
1312 continue;
1313
1314 pageset = zone_pcp(zone, cpu); 1337 pageset = zone_pcp(zone, cpu);
1315 1338
1316 for (temperature = 0; temperature < 2; temperature++) 1339 for (temperature = 0; temperature < 2; temperature++)
@@ -1428,6 +1451,16 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
1428 return j; 1451 return j;
1429} 1452}
1430 1453
1454static inline int highest_zone(int zone_bits)
1455{
1456 int res = ZONE_NORMAL;
1457 if (zone_bits & (__force int)__GFP_HIGHMEM)
1458 res = ZONE_HIGHMEM;
1459 if (zone_bits & (__force int)__GFP_DMA)
1460 res = ZONE_DMA;
1461 return res;
1462}
1463
1431#ifdef CONFIG_NUMA 1464#ifdef CONFIG_NUMA
1432#define MAX_NODE_LOAD (num_online_nodes()) 1465#define MAX_NODE_LOAD (num_online_nodes())
1433static int __initdata node_load[MAX_NUMNODES]; 1466static int __initdata node_load[MAX_NUMNODES];
@@ -1524,11 +1557,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
1524 zonelist = pgdat->node_zonelists + i; 1557 zonelist = pgdat->node_zonelists + i;
1525 for (j = 0; zonelist->zones[j] != NULL; j++); 1558 for (j = 0; zonelist->zones[j] != NULL; j++);
1526 1559
1527 k = ZONE_NORMAL; 1560 k = highest_zone(i);
1528 if (i & __GFP_HIGHMEM)
1529 k = ZONE_HIGHMEM;
1530 if (i & __GFP_DMA)
1531 k = ZONE_DMA;
1532 1561
1533 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1562 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1534 zonelist->zones[j] = NULL; 1563 zonelist->zones[j] = NULL;
@@ -1549,12 +1578,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
1549 zonelist = pgdat->node_zonelists + i; 1578 zonelist = pgdat->node_zonelists + i;
1550 1579
1551 j = 0; 1580 j = 0;
1552 k = ZONE_NORMAL; 1581 k = highest_zone(i);
1553 if (i & __GFP_HIGHMEM)
1554 k = ZONE_HIGHMEM;
1555 if (i & __GFP_DMA)
1556 k = ZONE_DMA;
1557
1558 j = build_zonelists_node(pgdat, zonelist, j, k); 1582 j = build_zonelists_node(pgdat, zonelist, j, k);
1559 /* 1583 /*
1560 * Now we build the zonelist so that it contains the zones 1584 * Now we build the zonelist so that it contains the zones
@@ -1659,7 +1683,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1659 * up by free_all_bootmem() once the early boot process is 1683 * up by free_all_bootmem() once the early boot process is
1660 * done. Non-atomic initialization, single-pass. 1684 * done. Non-atomic initialization, single-pass.
1661 */ 1685 */
1662void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1686void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1663 unsigned long start_pfn) 1687 unsigned long start_pfn)
1664{ 1688{
1665 struct page *page; 1689 struct page *page;
@@ -1673,7 +1697,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1673 continue; 1697 continue;
1674 page = pfn_to_page(pfn); 1698 page = pfn_to_page(pfn);
1675 set_page_links(page, zone, nid, pfn); 1699 set_page_links(page, zone, nid, pfn);
1676 set_page_count(page, 0); 1700 set_page_count(page, 1);
1677 reset_page_mapcount(page); 1701 reset_page_mapcount(page);
1678 SetPageReserved(page); 1702 SetPageReserved(page);
1679 INIT_LIST_HEAD(&page->lru); 1703 INIT_LIST_HEAD(&page->lru);
@@ -1720,29 +1744,29 @@ static int __devinit zone_batchsize(struct zone *zone)
1720 1744
1721 /* 1745 /*
1722 * The per-cpu-pages pools are set to around 1000th of the 1746 * The per-cpu-pages pools are set to around 1000th of the
1723 * size of the zone. But no more than 1/4 of a meg - there's 1747 * size of the zone. But no more than 1/2 of a meg.
1724 * no point in going beyond the size of L2 cache.
1725 * 1748 *
1726 * OK, so we don't know how big the cache is. So guess. 1749 * OK, so we don't know how big the cache is. So guess.
1727 */ 1750 */
1728 batch = zone->present_pages / 1024; 1751 batch = zone->present_pages / 1024;
1729 if (batch * PAGE_SIZE > 256 * 1024) 1752 if (batch * PAGE_SIZE > 512 * 1024)
1730 batch = (256 * 1024) / PAGE_SIZE; 1753 batch = (512 * 1024) / PAGE_SIZE;
1731 batch /= 4; /* We effectively *= 4 below */ 1754 batch /= 4; /* We effectively *= 4 below */
1732 if (batch < 1) 1755 if (batch < 1)
1733 batch = 1; 1756 batch = 1;
1734 1757
1735 /* 1758 /*
1736 * Clamp the batch to a 2^n - 1 value. Having a power 1759 * We will be trying to allocate bigger chunks of contiguous
1737 * of 2 value was found to be more likely to have 1760 * memory of the order of fls(batch). This should result in
1738 * suboptimal cache aliasing properties in some cases. 1761 * better cache coloring.
1739 * 1762 *
1740 * For example if 2 tasks are alternately allocating 1763 * A sanity check also to ensure that batch is still in limits.
1741 * batches of pages, one task can end up with a lot
1742 * of pages of one half of the possible page colors
1743 * and the other with pages of the other colors.
1744 */ 1764 */
1745 batch = (1 << fls(batch + batch/2)) - 1; 1765 batch = (1 << fls(batch + batch/2));
1766
1767 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
1768 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
1769
1746 return batch; 1770 return batch;
1747} 1771}
1748 1772
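The reworked zone_batchsize() above rounds the per-cpu batch up to a power of two with fls(). The same arithmetic reproduced in user space, with __builtin_clzl standing in for fls() and the final sanity clamp omitted:

#include <stdio.h>

static int fls_ul(unsigned long x)
{
        return x ? (int)(sizeof(long) * 8 - __builtin_clzl(x)) : 0;
}

static unsigned long batchsize(unsigned long present_pages, unsigned long page_size)
{
        unsigned long batch = present_pages / 1024;     /* ~1/1000th of the zone */

        if (batch * page_size > 512 * 1024)             /* but no more than 1/2 meg */
                batch = (512 * 1024) / page_size;
        batch /= 4;                                     /* effectively *= 4 later */
        if (batch < 1)
                batch = 1;

        return 1UL << fls_ul(batch + batch / 2);        /* round up to 2^n */
}

int main(void)
{
        printf("batch for 262144 pages of 4 KiB: %lu\n", batchsize(262144, 4096));
        return 0;
}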
@@ -1754,7 +1778,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1754 1778
1755 pcp = &p->pcp[0]; /* hot */ 1779 pcp = &p->pcp[0]; /* hot */
1756 pcp->count = 0; 1780 pcp->count = 0;
1757 pcp->low = 2 * batch; 1781 pcp->low = 0;
1758 pcp->high = 6 * batch; 1782 pcp->high = 6 * batch;
1759 pcp->batch = max(1UL, 1 * batch); 1783 pcp->batch = max(1UL, 1 * batch);
1760 INIT_LIST_HEAD(&pcp->list); 1784 INIT_LIST_HEAD(&pcp->list);
@@ -1763,7 +1787,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1763 pcp->count = 0; 1787 pcp->count = 0;
1764 pcp->low = 0; 1788 pcp->low = 0;
1765 pcp->high = 2 * batch; 1789 pcp->high = 2 * batch;
1766 pcp->batch = max(1UL, 1 * batch); 1790 pcp->batch = max(1UL, batch/2);
1767 INIT_LIST_HEAD(&pcp->list); 1791 INIT_LIST_HEAD(&pcp->list);
1768} 1792}
1769 1793
@@ -1872,6 +1896,60 @@ void __init setup_per_cpu_pageset()
1872 1896
1873#endif 1897#endif
1874 1898
1899static __devinit
1900void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1901{
1902 int i;
1903 struct pglist_data *pgdat = zone->zone_pgdat;
1904
1905 /*
1906 * The per-page waitqueue mechanism uses hashed waitqueues
1907 * per zone.
1908 */
1909 zone->wait_table_size = wait_table_size(zone_size_pages);
1910 zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
1911 zone->wait_table = (wait_queue_head_t *)
1912 alloc_bootmem_node(pgdat, zone->wait_table_size
1913 * sizeof(wait_queue_head_t));
1914
1915 for(i = 0; i < zone->wait_table_size; ++i)
1916 init_waitqueue_head(zone->wait_table + i);
1917}
1918
1919static __devinit void zone_pcp_init(struct zone *zone)
1920{
1921 int cpu;
1922 unsigned long batch = zone_batchsize(zone);
1923
1924 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1925#ifdef CONFIG_NUMA
1926 /* Early boot. Slab allocator not functional yet */
1927 zone->pageset[cpu] = &boot_pageset[cpu];
1928 setup_pageset(&boot_pageset[cpu],0);
1929#else
1930 setup_pageset(zone_pcp(zone,cpu), batch);
1931#endif
1932 }
1933 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1934 zone->name, zone->present_pages, batch);
1935}
1936
1937static __devinit void init_currently_empty_zone(struct zone *zone,
1938 unsigned long zone_start_pfn, unsigned long size)
1939{
1940 struct pglist_data *pgdat = zone->zone_pgdat;
1941
1942 zone_wait_table_init(zone, size);
1943 pgdat->nr_zones = zone_idx(zone) + 1;
1944
1945 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1946 zone->zone_start_pfn = zone_start_pfn;
1947
1948 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
1949
1950 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1951}
1952
1875/* 1953/*
1876 * Set up the zone data structures: 1954 * Set up the zone data structures:
1877 * - mark all pages reserved 1955 * - mark all pages reserved
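zone_wait_table_init() above sizes a hash table of per-zone wait queues. A sketch of hashing a pointer into a power-of-two table of slots; the multiplier and table size here are arbitrary, not the kernel's wait_table_size()/wait_table_bits() values:

#include <stdio.h>
#include <stdint.h>

#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1U << WAIT_TABLE_BITS)

static unsigned int wait_slot(const void *key)
{
        uintptr_t v = (uintptr_t)key;

        v *= 0x9E3779B97F4A7C15ULL;     /* Fibonacci-style hashing constant */
        return (unsigned int)(v >> (sizeof(v) * 8 - WAIT_TABLE_BITS));
}

int main(void)
{
        int x, y;

        printf("slot(&x)=%u slot(&y)=%u of %u\n",
               wait_slot(&x), wait_slot(&y), WAIT_TABLE_SIZE);
        return 0;
}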
@@ -1881,10 +1959,11 @@ void __init setup_per_cpu_pageset()
1881static void __init free_area_init_core(struct pglist_data *pgdat, 1959static void __init free_area_init_core(struct pglist_data *pgdat,
1882 unsigned long *zones_size, unsigned long *zholes_size) 1960 unsigned long *zones_size, unsigned long *zholes_size)
1883{ 1961{
1884 unsigned long i, j; 1962 unsigned long j;
1885 int cpu, nid = pgdat->node_id; 1963 int nid = pgdat->node_id;
1886 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1964 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1887 1965
1966 pgdat_resize_init(pgdat);
1888 pgdat->nr_zones = 0; 1967 pgdat->nr_zones = 0;
1889 init_waitqueue_head(&pgdat->kswapd_wait); 1968 init_waitqueue_head(&pgdat->kswapd_wait);
1890 pgdat->kswapd_max_order = 0; 1969 pgdat->kswapd_max_order = 0;
@@ -1892,7 +1971,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1892 for (j = 0; j < MAX_NR_ZONES; j++) { 1971 for (j = 0; j < MAX_NR_ZONES; j++) {
1893 struct zone *zone = pgdat->node_zones + j; 1972 struct zone *zone = pgdat->node_zones + j;
1894 unsigned long size, realsize; 1973 unsigned long size, realsize;
1895 unsigned long batch;
1896 1974
1897 realsize = size = zones_size[j]; 1975 realsize = size = zones_size[j];
1898 if (zholes_size) 1976 if (zholes_size)
@@ -1907,24 +1985,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1907 zone->name = zone_names[j]; 1985 zone->name = zone_names[j];
1908 spin_lock_init(&zone->lock); 1986 spin_lock_init(&zone->lock);
1909 spin_lock_init(&zone->lru_lock); 1987 spin_lock_init(&zone->lru_lock);
1988 zone_seqlock_init(zone);
1910 zone->zone_pgdat = pgdat; 1989 zone->zone_pgdat = pgdat;
1911 zone->free_pages = 0; 1990 zone->free_pages = 0;
1912 1991
1913 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1992 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1914 1993
1915 batch = zone_batchsize(zone); 1994 zone_pcp_init(zone);
1916
1917 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1918#ifdef CONFIG_NUMA
1919 /* Early boot. Slab allocator not functional yet */
1920 zone->pageset[cpu] = &boot_pageset[cpu];
1921 setup_pageset(&boot_pageset[cpu],0);
1922#else
1923 setup_pageset(zone_pcp(zone,cpu), batch);
1924#endif
1925 }
1926 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1927 zone_names[j], realsize, batch);
1928 INIT_LIST_HEAD(&zone->active_list); 1995 INIT_LIST_HEAD(&zone->active_list);
1929 INIT_LIST_HEAD(&zone->inactive_list); 1996 INIT_LIST_HEAD(&zone->inactive_list);
1930 zone->nr_scan_active = 0; 1997 zone->nr_scan_active = 0;
@@ -1935,32 +2002,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1935 if (!size) 2002 if (!size)
1936 continue; 2003 continue;
1937 2004
1938 /*
1939 * The per-page waitqueue mechanism uses hashed waitqueues
1940 * per zone.
1941 */
1942 zone->wait_table_size = wait_table_size(size);
1943 zone->wait_table_bits =
1944 wait_table_bits(zone->wait_table_size);
1945 zone->wait_table = (wait_queue_head_t *)
1946 alloc_bootmem_node(pgdat, zone->wait_table_size
1947 * sizeof(wait_queue_head_t));
1948
1949 for(i = 0; i < zone->wait_table_size; ++i)
1950 init_waitqueue_head(zone->wait_table + i);
1951
1952 pgdat->nr_zones = j+1;
1953
1954 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1955 zone->zone_start_pfn = zone_start_pfn;
1956
1957 memmap_init(size, nid, j, zone_start_pfn);
1958
1959 zonetable_add(zone, nid, j, zone_start_pfn, size); 2005 zonetable_add(zone, nid, j, zone_start_pfn, size);
1960 2006 init_currently_empty_zone(zone, zone_start_pfn, size);
1961 zone_start_pfn += size; 2007 zone_start_pfn += size;
1962
1963 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1964 } 2008 }
1965} 2009}
1966 2010
@@ -2360,7 +2404,7 @@ static void setup_per_zone_lowmem_reserve(void)
2360 * that the pages_{min,low,high} values for each zone are set correctly 2404 * that the pages_{min,low,high} values for each zone are set correctly
2361 * with respect to min_free_kbytes. 2405 * with respect to min_free_kbytes.
2362 */ 2406 */
2363static void setup_per_zone_pages_min(void) 2407void setup_per_zone_pages_min(void)
2364{ 2408{
2365 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 2409 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
2366 unsigned long lowmem_pages = 0; 2410 unsigned long lowmem_pages = 0;
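setup_per_zone_pages_min(), now exported above, converts the min_free_kbytes tunable into pages with a single shift. The same conversion spelled out, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* 4 KiB pages assumed */

int main(void)
{
        unsigned long min_free_kbytes = 1024;
        /* one page is 2^(PAGE_SHIFT-10) KiB, so shift by that many bits */
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);

        printf("%lu kB -> %lu pages\n", min_free_kbytes, pages_min);  /* 256 */
        return 0;
}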
diff --git a/mm/page_io.c b/mm/page_io.c
index 330e00d6db00..bb2b0d53889c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
91 unlock_page(page); 91 unlock_page(page);
92 goto out; 92 goto out;
93 } 93 }
94 bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); 94 bio = get_swap_bio(GFP_NOIO, page_private(page), page,
95 end_swap_bio_write);
95 if (bio == NULL) { 96 if (bio == NULL) {
96 set_page_dirty(page); 97 set_page_dirty(page);
97 unlock_page(page); 98 unlock_page(page);
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page)
115 116
116 BUG_ON(!PageLocked(page)); 117 BUG_ON(!PageLocked(page));
117 ClearPageUptodate(page); 118 ClearPageUptodate(page);
118 bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); 119 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
120 end_swap_bio_read);
119 if (bio == NULL) { 121 if (bio == NULL) {
120 unlock_page(page); 122 unlock_page(page);
121 ret = -ENOMEM; 123 ret = -ENOMEM;
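swap_writepage() and swap_readpage() above now read the swap entry through page_private(). A toy encoding that packs a device type and an offset into one unsigned long; the real swp_entry_t layout is architecture-specific and not reproduced here:

#include <stdio.h>

#define TYPE_BITS 5                     /* arbitrary split for the demo */

static unsigned long make_entry(unsigned int type, unsigned long offset)
{
        return ((unsigned long)type << (sizeof(long) * 8 - TYPE_BITS)) | offset;
}

static unsigned int entry_type(unsigned long e)
{
        return (unsigned int)(e >> (sizeof(long) * 8 - TYPE_BITS));
}

static unsigned long entry_offset(unsigned long e)
{
        return e & ((1UL << (sizeof(long) * 8 - TYPE_BITS)) - 1);
}

int main(void)
{
        unsigned long e = make_entry(2, 12345);

        printf("type=%u offset=%lu\n", entry_type(e), entry_offset(e));
        return 0;
}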
diff --git a/mm/pdflush.c b/mm/pdflush.c
index d6781951267e..52822c98c489 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -20,6 +20,7 @@
20#include <linux/fs.h> // Needed by writeback.h 20#include <linux/fs.h> // Needed by writeback.h
21#include <linux/writeback.h> // Prototypes pdflush_operation() 21#include <linux/writeback.h> // Prototypes pdflush_operation()
22#include <linux/kthread.h> 22#include <linux/kthread.h>
23#include <linux/cpuset.h>
23 24
24 25
25/* 26/*
@@ -170,12 +171,24 @@ static int __pdflush(struct pdflush_work *my_work)
170static int pdflush(void *dummy) 171static int pdflush(void *dummy)
171{ 172{
172 struct pdflush_work my_work; 173 struct pdflush_work my_work;
174 cpumask_t cpus_allowed;
173 175
174 /* 176 /*
175 * pdflush can spend a lot of time doing encryption via dm-crypt. We 177 * pdflush can spend a lot of time doing encryption via dm-crypt. We
176 * don't want to do that at keventd's priority. 178 * don't want to do that at keventd's priority.
177 */ 179 */
178 set_user_nice(current, 0); 180 set_user_nice(current, 0);
181
182 /*
183 * Some configs put our parent kthread in a limited cpuset,
184 * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL.
185 * Our needs are more modest - cut back to our cpusets cpus_allowed.
186 * This is needed as pdflush's are dynamically created and destroyed.
187 * The boottime pdflush's are easily placed w/o these 2 lines.
188 */
189 cpus_allowed = cpuset_cpus_allowed(current);
190 set_cpus_allowed(current, cpus_allowed);
191
179 return __pdflush(&my_work); 192 return __pdflush(&my_work);
180} 193}
181 194
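The pdflush change above narrows the thread's cpus_allowed back to its cpuset. The closest user-space analogue is sched_setaffinity(); this demo hard-codes CPU 0 purely for illustration:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t allowed;

        CPU_ZERO(&allowed);
        CPU_SET(0, &allowed);           /* pretend cpu 0 is our "cpuset" */

        if (sched_setaffinity(0, sizeof(allowed), &allowed) != 0) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("now confined to cpu 0\n");
        return 0;
}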
diff --git a/mm/rmap.c b/mm/rmap.c
index 450f5241b5a5..914d04b98bee 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,7 @@
32 * page->flags PG_locked (lock_page) 32 * page->flags PG_locked (lock_page)
33 * mapping->i_mmap_lock 33 * mapping->i_mmap_lock
34 * anon_vma->lock 34 * anon_vma->lock
35 * mm->page_table_lock 35 * mm->page_table_lock or pte_lock
36 * zone->lru_lock (in mark_page_accessed) 36 * zone->lru_lock (in mark_page_accessed)
37 * swap_lock (in swap_duplicate, swap_info_get) 37 * swap_lock (in swap_duplicate, swap_info_get)
38 * mmlist_lock (in mmput, drain_mmlist and others) 38 * mmlist_lock (in mmput, drain_mmlist and others)
@@ -244,37 +244,44 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
244/* 244/*
245 * Check that @page is mapped at @address into @mm. 245 * Check that @page is mapped at @address into @mm.
246 * 246 *
247 * On success returns with mapped pte and locked mm->page_table_lock. 247 * On success returns with pte mapped and locked.
248 */ 248 */
249pte_t *page_check_address(struct page *page, struct mm_struct *mm, 249pte_t *page_check_address(struct page *page, struct mm_struct *mm,
250 unsigned long address) 250 unsigned long address, spinlock_t **ptlp)
251{ 251{
252 pgd_t *pgd; 252 pgd_t *pgd;
253 pud_t *pud; 253 pud_t *pud;
254 pmd_t *pmd; 254 pmd_t *pmd;
255 pte_t *pte; 255 pte_t *pte;
256 spinlock_t *ptl;
256 257
257 /*
258 * We need the page_table_lock to protect us from page faults,
259 * munmap, fork, etc...
260 */
261 spin_lock(&mm->page_table_lock);
262 pgd = pgd_offset(mm, address); 258 pgd = pgd_offset(mm, address);
263 if (likely(pgd_present(*pgd))) { 259 if (!pgd_present(*pgd))
264 pud = pud_offset(pgd, address); 260 return NULL;
265 if (likely(pud_present(*pud))) { 261
266 pmd = pmd_offset(pud, address); 262 pud = pud_offset(pgd, address);
267 if (likely(pmd_present(*pmd))) { 263 if (!pud_present(*pud))
268 pte = pte_offset_map(pmd, address); 264 return NULL;
269 if (likely(pte_present(*pte) && 265
270 page_to_pfn(page) == pte_pfn(*pte))) 266 pmd = pmd_offset(pud, address);
271 return pte; 267 if (!pmd_present(*pmd))
272 pte_unmap(pte); 268 return NULL;
273 } 269
274 } 270 pte = pte_offset_map(pmd, address);
271 /* Make a quick check before getting the lock */
272 if (!pte_present(*pte)) {
273 pte_unmap(pte);
274 return NULL;
275 }
276
277 ptl = pte_lockptr(mm, pmd);
278 spin_lock(ptl);
279 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
280 *ptlp = ptl;
281 return pte;
275 } 282 }
276 spin_unlock(&mm->page_table_lock); 283 pte_unmap_unlock(pte, ptl);
277 return ERR_PTR(-ENOENT); 284 return NULL;
278} 285}
279 286
280/* 287/*
@@ -287,24 +294,28 @@ static int page_referenced_one(struct page *page,
287 struct mm_struct *mm = vma->vm_mm; 294 struct mm_struct *mm = vma->vm_mm;
288 unsigned long address; 295 unsigned long address;
289 pte_t *pte; 296 pte_t *pte;
297 spinlock_t *ptl;
290 int referenced = 0; 298 int referenced = 0;
291 299
292 address = vma_address(page, vma); 300 address = vma_address(page, vma);
293 if (address == -EFAULT) 301 if (address == -EFAULT)
294 goto out; 302 goto out;
295 303
296 pte = page_check_address(page, mm, address); 304 pte = page_check_address(page, mm, address, &ptl);
297 if (!IS_ERR(pte)) { 305 if (!pte)
298 if (ptep_clear_flush_young(vma, address, pte)) 306 goto out;
299 referenced++;
300 307
301 if (mm != current->mm && !ignore_token && has_swap_token(mm)) 308 if (ptep_clear_flush_young(vma, address, pte))
302 referenced++; 309 referenced++;
303 310
304 (*mapcount)--; 311 /* Pretend the page is referenced if the task has the
305 pte_unmap(pte); 312 swap token and is in the middle of a page fault. */
306 spin_unlock(&mm->page_table_lock); 313 if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
307 } 314 rwsem_is_locked(&mm->mmap_sem))
315 referenced++;
316
317 (*mapcount)--;
318 pte_unmap_unlock(pte, ptl);
308out: 319out:
309 return referenced; 320 return referenced;
310} 321}
@@ -434,15 +445,11 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
434 * @vma: the vm area in which the mapping is added 445 * @vma: the vm area in which the mapping is added
435 * @address: the user virtual address mapped 446 * @address: the user virtual address mapped
436 * 447 *
437 * The caller needs to hold the mm->page_table_lock. 448 * The caller needs to hold the pte lock.
438 */ 449 */
439void page_add_anon_rmap(struct page *page, 450void page_add_anon_rmap(struct page *page,
440 struct vm_area_struct *vma, unsigned long address) 451 struct vm_area_struct *vma, unsigned long address)
441{ 452{
442 BUG_ON(PageReserved(page));
443
444 inc_mm_counter(vma->vm_mm, anon_rss);
445
446 if (atomic_inc_and_test(&page->_mapcount)) { 453 if (atomic_inc_and_test(&page->_mapcount)) {
447 struct anon_vma *anon_vma = vma->anon_vma; 454 struct anon_vma *anon_vma = vma->anon_vma;
448 455
@@ -461,13 +468,12 @@ void page_add_anon_rmap(struct page *page,
461 * page_add_file_rmap - add pte mapping to a file page 468 * page_add_file_rmap - add pte mapping to a file page
462 * @page: the page to add the mapping to 469 * @page: the page to add the mapping to
463 * 470 *
464 * The caller needs to hold the mm->page_table_lock. 471 * The caller needs to hold the pte lock.
465 */ 472 */
466void page_add_file_rmap(struct page *page) 473void page_add_file_rmap(struct page *page)
467{ 474{
468 BUG_ON(PageAnon(page)); 475 BUG_ON(PageAnon(page));
469 if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) 476 BUG_ON(!pfn_valid(page_to_pfn(page)));
470 return;
471 477
472 if (atomic_inc_and_test(&page->_mapcount)) 478 if (atomic_inc_and_test(&page->_mapcount))
473 inc_page_state(nr_mapped); 479 inc_page_state(nr_mapped);
@@ -477,12 +483,10 @@ void page_add_file_rmap(struct page *page)
477 * page_remove_rmap - take down pte mapping from a page 483 * page_remove_rmap - take down pte mapping from a page
478 * @page: page to remove mapping from 484 * @page: page to remove mapping from
479 * 485 *
480 * Caller needs to hold the mm->page_table_lock. 486 * The caller needs to hold the pte lock.
481 */ 487 */
482void page_remove_rmap(struct page *page) 488void page_remove_rmap(struct page *page)
483{ 489{
484 BUG_ON(PageReserved(page));
485
486 if (atomic_add_negative(-1, &page->_mapcount)) { 490 if (atomic_add_negative(-1, &page->_mapcount)) {
487 BUG_ON(page_mapcount(page) < 0); 491 BUG_ON(page_mapcount(page) < 0);
488 /* 492 /*
@@ -510,14 +514,15 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
510 unsigned long address; 514 unsigned long address;
511 pte_t *pte; 515 pte_t *pte;
512 pte_t pteval; 516 pte_t pteval;
517 spinlock_t *ptl;
513 int ret = SWAP_AGAIN; 518 int ret = SWAP_AGAIN;
514 519
515 address = vma_address(page, vma); 520 address = vma_address(page, vma);
516 if (address == -EFAULT) 521 if (address == -EFAULT)
517 goto out; 522 goto out;
518 523
519 pte = page_check_address(page, mm, address); 524 pte = page_check_address(page, mm, address, &ptl);
520 if (IS_ERR(pte)) 525 if (!pte)
521 goto out; 526 goto out;
522 527
523 /* 528 /*
@@ -541,8 +546,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
541 if (pte_dirty(pteval)) 546 if (pte_dirty(pteval))
542 set_page_dirty(page); 547 set_page_dirty(page);
543 548
549 /* Update high watermark before we lower rss */
550 update_hiwater_rss(mm);
551
544 if (PageAnon(page)) { 552 if (PageAnon(page)) {
545 swp_entry_t entry = { .val = page->private }; 553 swp_entry_t entry = { .val = page_private(page) };
546 /* 554 /*
547 * Store the swap location in the pte. 555 * Store the swap location in the pte.
548 * See handle_pte_fault() ... 556 * See handle_pte_fault() ...
@@ -551,21 +559,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
551 swap_duplicate(entry); 559 swap_duplicate(entry);
552 if (list_empty(&mm->mmlist)) { 560 if (list_empty(&mm->mmlist)) {
553 spin_lock(&mmlist_lock); 561 spin_lock(&mmlist_lock);
554 list_add(&mm->mmlist, &init_mm.mmlist); 562 if (list_empty(&mm->mmlist))
563 list_add(&mm->mmlist, &init_mm.mmlist);
555 spin_unlock(&mmlist_lock); 564 spin_unlock(&mmlist_lock);
556 } 565 }
557 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 566 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
558 BUG_ON(pte_file(*pte)); 567 BUG_ON(pte_file(*pte));
559 dec_mm_counter(mm, anon_rss); 568 dec_mm_counter(mm, anon_rss);
560 } 569 } else
570 dec_mm_counter(mm, file_rss);
561 571
562 dec_mm_counter(mm, rss);
563 page_remove_rmap(page); 572 page_remove_rmap(page);
564 page_cache_release(page); 573 page_cache_release(page);
565 574
566out_unmap: 575out_unmap:
567 pte_unmap(pte); 576 pte_unmap_unlock(pte, ptl);
568 spin_unlock(&mm->page_table_lock);
569out: 577out:
570 return ret; 578 return ret;
571} 579}
@@ -599,19 +607,14 @@ static void try_to_unmap_cluster(unsigned long cursor,
599 pgd_t *pgd; 607 pgd_t *pgd;
600 pud_t *pud; 608 pud_t *pud;
601 pmd_t *pmd; 609 pmd_t *pmd;
602 pte_t *pte, *original_pte; 610 pte_t *pte;
603 pte_t pteval; 611 pte_t pteval;
612 spinlock_t *ptl;
604 struct page *page; 613 struct page *page;
605 unsigned long address; 614 unsigned long address;
606 unsigned long end; 615 unsigned long end;
607 unsigned long pfn; 616 unsigned long pfn;
608 617
609 /*
610 * We need the page_table_lock to protect us from page faults,
611 * munmap, fork, etc...
612 */
613 spin_lock(&mm->page_table_lock);
614
615 address = (vma->vm_start + cursor) & CLUSTER_MASK; 618 address = (vma->vm_start + cursor) & CLUSTER_MASK;
616 end = address + CLUSTER_SIZE; 619 end = address + CLUSTER_SIZE;
617 if (address < vma->vm_start) 620 if (address < vma->vm_start)
@@ -621,30 +624,33 @@ static void try_to_unmap_cluster(unsigned long cursor,
621 624
622 pgd = pgd_offset(mm, address); 625 pgd = pgd_offset(mm, address);
623 if (!pgd_present(*pgd)) 626 if (!pgd_present(*pgd))
624 goto out_unlock; 627 return;
625 628
626 pud = pud_offset(pgd, address); 629 pud = pud_offset(pgd, address);
627 if (!pud_present(*pud)) 630 if (!pud_present(*pud))
628 goto out_unlock; 631 return;
629 632
630 pmd = pmd_offset(pud, address); 633 pmd = pmd_offset(pud, address);
631 if (!pmd_present(*pmd)) 634 if (!pmd_present(*pmd))
632 goto out_unlock; 635 return;
636
637 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
633 638
634 for (original_pte = pte = pte_offset_map(pmd, address); 639 /* Update high watermark before we lower rss */
635 address < end; pte++, address += PAGE_SIZE) { 640 update_hiwater_rss(mm);
636 641
642 for (; address < end; pte++, address += PAGE_SIZE) {
637 if (!pte_present(*pte)) 643 if (!pte_present(*pte))
638 continue; 644 continue;
639 645
640 pfn = pte_pfn(*pte); 646 pfn = pte_pfn(*pte);
641 if (!pfn_valid(pfn)) 647 if (unlikely(!pfn_valid(pfn))) {
648 print_bad_pte(vma, *pte, address);
642 continue; 649 continue;
650 }
643 651
644 page = pfn_to_page(pfn); 652 page = pfn_to_page(pfn);
645 BUG_ON(PageAnon(page)); 653 BUG_ON(PageAnon(page));
646 if (PageReserved(page))
647 continue;
648 654
649 if (ptep_clear_flush_young(vma, address, pte)) 655 if (ptep_clear_flush_young(vma, address, pte))
650 continue; 656 continue;
@@ -663,13 +669,10 @@ static void try_to_unmap_cluster(unsigned long cursor,
663 669
664 page_remove_rmap(page); 670 page_remove_rmap(page);
665 page_cache_release(page); 671 page_cache_release(page);
666 dec_mm_counter(mm, rss); 672 dec_mm_counter(mm, file_rss);
667 (*mapcount)--; 673 (*mapcount)--;
668 } 674 }
669 675 pte_unmap_unlock(pte - 1, ptl);
670 pte_unmap(original_pte);
671out_unlock:
672 spin_unlock(&mm->page_table_lock);
673} 676}
674 677
675static int try_to_unmap_anon(struct page *page) 678static int try_to_unmap_anon(struct page *page)
@@ -806,7 +809,6 @@ int try_to_unmap(struct page *page)
806{ 809{
807 int ret; 810 int ret;
808 811
809 BUG_ON(PageReserved(page));
810 BUG_ON(!PageLocked(page)); 812 BUG_ON(!PageLocked(page));
811 813
812 if (PageAnon(page)) 814 if (PageAnon(page))
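page_check_address() above does a cheap unlocked check of the pte, then takes the pte lock and re-checks before handing the mapping back to its caller. The same check / lock / re-check shape, sketched with a pthread mutex guarding a flag:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int present = 1;                 /* the state we want to act on */

/* Returns 1 with 'lock' held if the condition still holds, else 0. */
static int check_and_lock(void)
{
        if (!present)                   /* quick check, no lock taken */
                return 0;

        pthread_mutex_lock(&lock);
        if (present)                    /* re-check now that we hold the lock */
                return 1;
        pthread_mutex_unlock(&lock);
        return 0;
}

int main(void)
{
        if (check_and_lock()) {
                printf("still present, doing work under the lock\n");
                pthread_mutex_unlock(&lock);
        }
        return 0;
}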
diff --git a/mm/shmem.c b/mm/shmem.c
index ea064d89cda9..dc25565a61e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -71,9 +71,6 @@
71/* Pretend that each entry is of this size in directory's i_size */ 71/* Pretend that each entry is of this size in directory's i_size */
72#define BOGO_DIRENT_SIZE 20 72#define BOGO_DIRENT_SIZE 20
73 73
74/* Keep swapped page count in private field of indirect struct page */
75#define nr_swapped private
76
77/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 74/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
78enum sgp_type { 75enum sgp_type {
79 SGP_QUICK, /* don't try more than file page cache lookup */ 76 SGP_QUICK, /* don't try more than file page cache lookup */
@@ -85,7 +82,7 @@ enum sgp_type {
85static int shmem_getpage(struct inode *inode, unsigned long idx, 82static int shmem_getpage(struct inode *inode, unsigned long idx,
86 struct page **pagep, enum sgp_type sgp, int *type); 83 struct page **pagep, enum sgp_type sgp, int *type);
87 84
88static inline struct page *shmem_dir_alloc(unsigned int gfp_mask) 85static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
89{ 86{
90 /* 87 /*
91 * The above definition of ENTRIES_PER_PAGE, and the use of 88 * The above definition of ENTRIES_PER_PAGE, and the use of
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
324 321
325 entry->val = value; 322 entry->val = value;
326 info->swapped += incdec; 323 info->swapped += incdec;
327 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) 324 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
328 kmap_atomic_to_page(entry)->nr_swapped += incdec; 325 struct page *page = kmap_atomic_to_page(entry);
326 set_page_private(page, page_private(page) + incdec);
327 }
329} 328}
330 329
331/* 330/*
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
368 367
369 spin_unlock(&info->lock); 368 spin_unlock(&info->lock);
370 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); 369 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
371 if (page) { 370 if (page)
372 page->nr_swapped = 0; 371 set_page_private(page, 0);
373 }
374 spin_lock(&info->lock); 372 spin_lock(&info->lock);
375 373
376 if (!page) { 374 if (!page) {
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode)
561 diroff = 0; 559 diroff = 0;
562 } 560 }
563 subdir = dir[diroff]; 561 subdir = dir[diroff];
564 if (subdir && subdir->nr_swapped) { 562 if (subdir && page_private(subdir)) {
565 size = limit - idx; 563 size = limit - idx;
566 if (size > ENTRIES_PER_PAGE) 564 if (size > ENTRIES_PER_PAGE)
567 size = ENTRIES_PER_PAGE; 565 size = ENTRIES_PER_PAGE;
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode)
572 nr_swaps_freed += freed; 570 nr_swaps_freed += freed;
573 if (offset) 571 if (offset)
574 spin_lock(&info->lock); 572 spin_lock(&info->lock);
575 subdir->nr_swapped -= freed; 573 set_page_private(subdir, page_private(subdir) - freed);
576 if (offset) 574 if (offset)
577 spin_unlock(&info->lock); 575 spin_unlock(&info->lock);
578 BUG_ON(subdir->nr_swapped > offset); 576 BUG_ON(page_private(subdir) > offset);
579 } 577 }
580 if (offset) 578 if (offset)
581 offset = 0; 579 offset = 0;
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
743 dir = shmem_dir_map(subdir); 741 dir = shmem_dir_map(subdir);
744 } 742 }
745 subdir = *dir; 743 subdir = *dir;
746 if (subdir && subdir->nr_swapped) { 744 if (subdir && page_private(subdir)) {
747 ptr = shmem_swp_map(subdir); 745 ptr = shmem_swp_map(subdir);
748 size = limit - idx; 746 size = limit - idx;
749 if (size > ENTRIES_PER_PAGE) 747 if (size > ENTRIES_PER_PAGE)
@@ -898,7 +896,7 @@ struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
898} 896}
899 897
900static struct page * 898static struct page *
901shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info, 899shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
902 unsigned long idx) 900 unsigned long idx)
903{ 901{
904 struct vm_area_struct pvma; 902 struct vm_area_struct pvma;
@@ -1201,7 +1199,7 @@ static int shmem_populate(struct vm_area_struct *vma,
1201 page_cache_release(page); 1199 page_cache_release(page);
1202 return err; 1200 return err;
1203 } 1201 }
1204 } else { 1202 } else if (vma->vm_flags & VM_NONLINEAR) {
1205 /* No page was found just because we can't read it in 1203 /* No page was found just because we can't read it in
1206 * now (being here implies nonblock != 0), but the page 1204 * now (being here implies nonblock != 0), but the page
1207 * may exist, so set the PTE to fault it in later. */ 1205 * may exist, so set the PTE to fault it in later. */
@@ -1506,8 +1504,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1506 */ 1504 */
1507 if (!offset) 1505 if (!offset)
1508 mark_page_accessed(page); 1506 mark_page_accessed(page);
1509 } else 1507 } else {
1510 page = ZERO_PAGE(0); 1508 page = ZERO_PAGE(0);
1509 page_cache_get(page);
1510 }
1511 1511
1512 /* 1512 /*
1513 * Ok, we have the page, and it's up-to-date, so 1513 * Ok, we have the page, and it's up-to-date, so
diff --git a/mm/slab.c b/mm/slab.c
index d05c678bceb3..22bfb0b2ac8b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -386,7 +386,7 @@ struct kmem_cache_s {
386 unsigned int gfporder; 386 unsigned int gfporder;
387 387
388 /* force GFP flags, e.g. GFP_DMA */ 388 /* force GFP flags, e.g. GFP_DMA */
389 unsigned int gfpflags; 389 gfp_t gfpflags;
390 390
391 size_t colour; /* cache colouring range */ 391 size_t colour; /* cache colouring range */
392 unsigned int colour_off; /* colour offset */ 392 unsigned int colour_off; /* colour offset */
@@ -2117,7 +2117,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
2117 slabp->free = 0; 2117 slabp->free = 0;
2118} 2118}
2119 2119
2120static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags) 2120static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags)
2121{ 2121{
2122 if (flags & SLAB_DMA) { 2122 if (flags & SLAB_DMA) {
2123 if (!(cachep->gfpflags & GFP_DMA)) 2123 if (!(cachep->gfpflags & GFP_DMA))
@@ -2152,7 +2152,7 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2152 struct slab *slabp; 2152 struct slab *slabp;
2153 void *objp; 2153 void *objp;
2154 size_t offset; 2154 size_t offset;
2155 unsigned int local_flags; 2155 gfp_t local_flags;
2156 unsigned long ctor_flags; 2156 unsigned long ctor_flags;
2157 struct kmem_list3 *l3; 2157 struct kmem_list3 *l3;
2158 2158
@@ -2419,6 +2419,7 @@ retry:
2419 next = slab_bufctl(slabp)[slabp->free]; 2419 next = slab_bufctl(slabp)[slabp->free];
2420#if DEBUG 2420#if DEBUG
2421 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2421 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2422 WARN_ON(numa_node_id() != slabp->nodeid);
2422#endif 2423#endif
2423 slabp->free = next; 2424 slabp->free = next;
2424 } 2425 }
@@ -2546,7 +2547,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2546/* 2547/*
2547 * A interface to enable slab creation on nodeid 2548 * A interface to enable slab creation on nodeid
2548 */ 2549 */
2549static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid) 2550static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2550{ 2551{
2551 struct list_head *entry; 2552 struct list_head *entry;
2552 struct slab *slabp; 2553 struct slab *slabp;
@@ -2633,8 +2634,10 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2633 check_spinlock_acquired_node(cachep, node); 2634 check_spinlock_acquired_node(cachep, node);
2634 check_slabp(cachep, slabp); 2635 check_slabp(cachep, slabp);
2635 2636
2636
2637#if DEBUG 2637#if DEBUG
2638 /* Verify that the slab belongs to the intended node */
2639 WARN_ON(slabp->nodeid != node);
2640
2638 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2641 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2639 printk(KERN_ERR "slab: double free detected in cache " 2642 printk(KERN_ERR "slab: double free detected in cache "
2640 "'%s', objp %p\n", cachep->name, objp); 2643 "'%s', objp %p\n", cachep->name, objp);
diff --git a/mm/sparse.c b/mm/sparse.c
index 347249a4917a..72079b538e2d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,8 +5,10 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/highmem.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/vmalloc.h>
10#include <asm/dma.h> 12#include <asm/dma.h>
11 13
12/* 14/*
@@ -72,6 +74,31 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
72} 74}
73#endif 75#endif
74 76
77/*
78 * Although written for the SPARSEMEM_EXTREME case, this happens
79 * to also work for the flat array case because
80 * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
81 */
82int __section_nr(struct mem_section* ms)
83{
84 unsigned long root_nr;
85 struct mem_section* root;
86
87 for (root_nr = 0;
88 root_nr < NR_MEM_SECTIONS;
89 root_nr += SECTIONS_PER_ROOT) {
90 root = __nr_to_section(root_nr);
91
92 if (!root)
93 continue;
94
95 if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
96 break;
97 }
98
99 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
100}
101
75/* Record a memory area against a node. */ 102/* Record a memory area against a node. */
76void memory_present(int nid, unsigned long start, unsigned long end) 103void memory_present(int nid, unsigned long start, unsigned long end)
77{ 104{
@@ -162,6 +189,45 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
162 return NULL; 189 return NULL;
163} 190}
164 191
192static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
193{
194 struct page *page, *ret;
195 unsigned long memmap_size = sizeof(struct page) * nr_pages;
196
197 page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
198 if (page)
199 goto got_map_page;
200
201 ret = vmalloc(memmap_size);
202 if (ret)
203 goto got_map_ptr;
204
205 return NULL;
206got_map_page:
207 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
208got_map_ptr:
209 memset(ret, 0, memmap_size);
210
211 return ret;
212}
213
214static int vaddr_in_vmalloc_area(void *addr)
215{
216 if (addr >= (void *)VMALLOC_START &&
217 addr < (void *)VMALLOC_END)
218 return 1;
219 return 0;
220}
221
222static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
223{
224 if (vaddr_in_vmalloc_area(memmap))
225 vfree(memmap);
226 else
227 free_pages((unsigned long)memmap,
228 get_order(sizeof(struct page) * nr_pages));
229}
230
165/* 231/*
166 * Allocate the accumulated non-linear sections, allocate a mem_map 232 * Allocate the accumulated non-linear sections, allocate a mem_map
167 * for each and record the physical to section mapping. 233 * for each and record the physical to section mapping.
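__kmalloc_section_memmap() above tries a page allocation first and falls back to vmalloc(), and __kfree_section_memmap() dispatches on the address range to pick the matching free. A user-space analogue using a static pool plus malloc():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char pool[4096];
static int pool_used;

static void *section_alloc(size_t size)
{
        void *p;

        if (!pool_used && size <= sizeof(pool)) {       /* "contiguous" fast path */
                pool_used = 1;
                p = pool;
        } else {
                p = malloc(size);                       /* fallback path */
                if (!p)
                        return NULL;
        }
        memset(p, 0, size);
        return p;
}

static void section_free(void *p)
{
        if ((char *)p >= pool && (char *)p < pool + sizeof(pool))
                pool_used = 0;                          /* came from the pool */
        else
                free(p);                                /* came from malloc() */
}

int main(void)
{
        void *a = section_alloc(1024);  /* served from the pool */
        void *b = section_alloc(1024);  /* served by the fallback */

        section_free(a);
        section_free(b);
        printf("allocated and freed via both paths\n");
        return 0;
}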
@@ -187,14 +253,37 @@ void sparse_init(void)
187 * set. If this is <=0, then that means that the passed-in 253 * set. If this is <=0, then that means that the passed-in
188 * map was not consumed and must be freed. 254 * map was not consumed and must be freed.
189 */ 255 */
190int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map) 256int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
257 int nr_pages)
191{ 258{
192 struct mem_section *ms = __pfn_to_section(start_pfn); 259 unsigned long section_nr = pfn_to_section_nr(start_pfn);
260 struct pglist_data *pgdat = zone->zone_pgdat;
261 struct mem_section *ms;
262 struct page *memmap;
263 unsigned long flags;
264 int ret;
193 265
194 if (ms->section_mem_map & SECTION_MARKED_PRESENT) 266 /*
195 return -EEXIST; 267 * no locking for this, because it does its own
268 * plus, it does a kmalloc
269 */
270 sparse_index_init(section_nr, pgdat->node_id);
271 memmap = __kmalloc_section_memmap(nr_pages);
272
273 pgdat_resize_lock(pgdat, &flags);
196 274
275 ms = __pfn_to_section(start_pfn);
276 if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
277 ret = -EEXIST;
278 goto out;
279 }
197 ms->section_mem_map |= SECTION_MARKED_PRESENT; 280 ms->section_mem_map |= SECTION_MARKED_PRESENT;
198 281
199 return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map); 282 ret = sparse_init_one_section(ms, section_nr, memmap);
283
284 if (ret <= 0)
285 __kfree_section_memmap(memmap, nr_pages);
286out:
287 pgdat_resize_unlock(pgdat, &flags);
288 return ret;
200} 289}
diff --git a/mm/swap.c b/mm/swap.c
index 7771d2803f62..96387e20184a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,7 +39,7 @@ int page_cluster;
39void put_page(struct page *page) 39void put_page(struct page *page)
40{ 40{
41 if (unlikely(PageCompound(page))) { 41 if (unlikely(PageCompound(page))) {
42 page = (struct page *)page->private; 42 page = (struct page *)page_private(page);
43 if (put_page_testzero(page)) { 43 if (put_page_testzero(page)) {
44 void (*dtor)(struct page *page); 44 void (*dtor)(struct page *page);
45 45
@@ -48,7 +48,7 @@ void put_page(struct page *page)
48 } 48 }
49 return; 49 return;
50 } 50 }
51 if (!PageReserved(page) && put_page_testzero(page)) 51 if (put_page_testzero(page))
52 __page_cache_release(page); 52 __page_cache_release(page);
53} 53}
54EXPORT_SYMBOL(put_page); 54EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
215 struct page *page = pages[i]; 215 struct page *page = pages[i];
216 struct zone *pagezone; 216 struct zone *pagezone;
217 217
218 if (PageReserved(page) || !put_page_testzero(page)) 218 if (!put_page_testzero(page))
219 continue; 219 continue;
220 220
221 pagezone = page_zone(page); 221 pagezone = page_zone(page);
@@ -270,7 +270,6 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
270 struct pagevec pages_to_free; 270 struct pagevec pages_to_free;
271 271
272 pagevec_init(&pages_to_free, pvec->cold); 272 pagevec_init(&pages_to_free, pvec->cold);
273 pages_to_free.cold = pvec->cold;
274 for (i = 0; i < pagevec_count(pvec); i++) { 273 for (i = 0; i < pagevec_count(pvec); i++) {
275 struct page *page = pvec->pages[i]; 274 struct page *page = pvec->pages[i];
276 275
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 132164f7d0a7..dfd9a46755b8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
83 page_cache_get(page); 83 page_cache_get(page);
84 SetPageLocked(page); 84 SetPageLocked(page);
85 SetPageSwapCache(page); 85 SetPageSwapCache(page);
86 page->private = entry.val; 86 set_page_private(page, entry.val);
87 total_swapcache_pages++; 87 total_swapcache_pages++;
88 pagecache_acct(1); 88 pagecache_acct(1);
89 } 89 }
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page)
126 BUG_ON(PageWriteback(page)); 126 BUG_ON(PageWriteback(page));
127 BUG_ON(PagePrivate(page)); 127 BUG_ON(PagePrivate(page));
128 128
129 radix_tree_delete(&swapper_space.page_tree, page->private); 129 radix_tree_delete(&swapper_space.page_tree, page_private(page));
130 page->private = 0; 130 set_page_private(page, 0);
131 ClearPageSwapCache(page); 131 ClearPageSwapCache(page);
132 total_swapcache_pages--; 132 total_swapcache_pages--;
133 pagecache_acct(-1); 133 pagecache_acct(-1);
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page)
197{ 197{
198 swp_entry_t entry; 198 swp_entry_t entry;
199 199
200 entry.val = page->private; 200 entry.val = page_private(page);
201 201
202 write_lock_irq(&swapper_space.tree_lock); 202 write_lock_irq(&swapper_space.tree_lock);
203 __delete_from_swap_cache(page); 203 __delete_from_swap_cache(page);
@@ -259,8 +259,7 @@ static inline void free_swap_cache(struct page *page)
259 259
260/* 260/*
261 * Perform a free_page(), also freeing any swap cache associated with 261 * Perform a free_page(), also freeing any swap cache associated with
262 * this page if it is the last user of the page. Can not do a lock_page, 262 * this page if it is the last user of the page.
263 * as we are holding the page_table_lock spinlock.
264 */ 263 */
265void free_page_and_swap_cache(struct page *page) 264void free_page_and_swap_cache(struct page *page)
266{ 265{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1dcaeda039f4..8970c0b74194 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
61 swp_entry_t entry; 61 swp_entry_t entry;
62 62
63 down_read(&swap_unplug_sem); 63 down_read(&swap_unplug_sem);
64 entry.val = page->private; 64 entry.val = page_private(page);
65 if (PageSwapCache(page)) { 65 if (PageSwapCache(page)) {
66 struct block_device *bdev = swap_info[swp_type(entry)].bdev; 66 struct block_device *bdev = swap_info[swp_type(entry)].bdev;
67 struct backing_dev_info *bdi; 67 struct backing_dev_info *bdi;
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
69 /* 69 /*
70 * If the page is removed from swapcache from under us (with a 70 * If the page is removed from swapcache from under us (with a
71 * racy try_to_unuse/swapoff) we need an additional reference 71 * racy try_to_unuse/swapoff) we need an additional reference
72 * count to avoid reading garbage from page->private above. If 72 * count to avoid reading garbage from page_private(page) above.
73 * the WARN_ON triggers during a swapoff it maybe the race 73 * If the WARN_ON triggers during a swapoff it maybe the race
74 * condition and it's harmless. However if it triggers without 74 * condition and it's harmless. However if it triggers without
75 * swapoff it signals a problem. 75 * swapoff it signals a problem.
76 */ 76 */
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page)
294 struct swap_info_struct *p; 294 struct swap_info_struct *p;
295 swp_entry_t entry; 295 swp_entry_t entry;
296 296
297 entry.val = page->private; 297 entry.val = page_private(page);
298 p = swap_info_get(entry); 298 p = swap_info_get(entry);
299 if (p) { 299 if (p) {
300 /* Subtract the 1 for the swap cache itself */ 300 /* Subtract the 1 for the swap cache itself */
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page)
339 if (page_count(page) != 2) /* 2: us + cache */ 339 if (page_count(page) != 2) /* 2: us + cache */
340 return 0; 340 return 0;
341 341
342 entry.val = page->private; 342 entry.val = page_private(page);
343 p = swap_info_get(entry); 343 p = swap_info_get(entry);
344 if (!p) 344 if (!p)
345 return 0; 345 return 0;
@@ -398,17 +398,14 @@ void free_swap_and_cache(swp_entry_t entry)
398} 398}
399 399
400/* 400/*
401 * Always set the resulting pte to be nowrite (the same as COW pages 401 * No need to decide whether this PTE shares the swap entry with others,
402 * after one process has exited). We don't know just how many PTEs will 402 * just let do_wp_page work it out if a write is requested later - to
403 * share this swap entry, so be cautious and let do_wp_page work out 403 * force COW, vm_page_prot omits write permission from any private vma.
404 * what to do if a write is requested later.
405 *
406 * vma->vm_mm->page_table_lock is held.
407 */ 404 */
408static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, 405static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
409 unsigned long addr, swp_entry_t entry, struct page *page) 406 unsigned long addr, swp_entry_t entry, struct page *page)
410{ 407{
411 inc_mm_counter(vma->vm_mm, rss); 408 inc_mm_counter(vma->vm_mm, anon_rss);
412 get_page(page); 409 get_page(page);
413 set_pte_at(vma->vm_mm, addr, pte, 410 set_pte_at(vma->vm_mm, addr, pte,
414 pte_mkold(mk_pte(page, vma->vm_page_prot))); 411 pte_mkold(mk_pte(page, vma->vm_page_prot)));
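
Editor's note: the rewritten comment above leans on how vm_page_prot is derived. A brief sketch of that derivation, assuming the usual protection_map[] layout in mm/mmap.c; the helper name is invented for illustration.

/*
 * vm_page_prot is looked up from protection_map[] by the low
 * VM_READ/VM_WRITE/VM_EXEC/VM_SHARED bits; the private (non-shared)
 * half of that table omits hardware write permission, so a pte built
 * with mk_pte(page, vma->vm_page_prot) in a private vma is read-only
 * and the first write faults into do_wp_page(), which performs COW.
 */
static inline pgprot_t private_vma_prot(unsigned long vm_flags)        /* hypothetical */
{
        return protection_map[vm_flags &
                              (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];
}
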
@@ -425,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
425 unsigned long addr, unsigned long end, 422 unsigned long addr, unsigned long end,
426 swp_entry_t entry, struct page *page) 423 swp_entry_t entry, struct page *page)
427{ 424{
428 pte_t *pte;
429 pte_t swp_pte = swp_entry_to_pte(entry); 425 pte_t swp_pte = swp_entry_to_pte(entry);
426 pte_t *pte;
427 spinlock_t *ptl;
428 int found = 0;
430 429
431 pte = pte_offset_map(pmd, addr); 430 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
432 do { 431 do {
433 /* 432 /*
434 * swapoff spends a _lot_ of time in this loop! 433 * swapoff spends a _lot_ of time in this loop!
435 * Test inline before going to call unuse_pte. 434 * Test inline before going to call unuse_pte.
436 */ 435 */
437 if (unlikely(pte_same(*pte, swp_pte))) { 436 if (unlikely(pte_same(*pte, swp_pte))) {
438 unuse_pte(vma, pte, addr, entry, page); 437 unuse_pte(vma, pte++, addr, entry, page);
439 pte_unmap(pte); 438 found = 1;
440 return 1; 439 break;
441 } 440 }
442 } while (pte++, addr += PAGE_SIZE, addr != end); 441 } while (pte++, addr += PAGE_SIZE, addr != end);
443 pte_unmap(pte - 1); 442 pte_unmap_unlock(pte - 1, ptl);
444 return 0; 443 return found;
445} 444}
446 445
447static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 446static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
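
Editor's note: the unuse_pte_range() hunk above is the standard conversion to the split page table lock: pte_offset_map_lock() replaces pte_offset_map() plus the mm-wide page_table_lock. A minimal sketch of the resulting walk pattern, assuming a made-up caller; only the locking calls are taken from the patch.

static void walk_pte_range(struct mm_struct *mm, pmd_t *pmd,
                           unsigned long addr, unsigned long end)
{
        spinlock_t *ptl;
        pte_t *pte;

        /* maps the page table and takes its (possibly per-table) lock */
        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        do {
                /* examine or modify *pte here, under ptl */
        } while (pte++, addr += PAGE_SIZE, addr != end);
        /* drops the lock and the kmap in one go */
        pte_unmap_unlock(pte - 1, ptl);
}
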
@@ -523,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm,
523 down_read(&mm->mmap_sem); 522 down_read(&mm->mmap_sem);
524 lock_page(page); 523 lock_page(page);
525 } 524 }
526 spin_lock(&mm->page_table_lock);
527 for (vma = mm->mmap; vma; vma = vma->vm_next) { 525 for (vma = mm->mmap; vma; vma = vma->vm_next) {
528 if (vma->anon_vma && unuse_vma(vma, entry, page)) 526 if (vma->anon_vma && unuse_vma(vma, entry, page))
529 break; 527 break;
530 } 528 }
531 spin_unlock(&mm->page_table_lock);
532 up_read(&mm->mmap_sem); 529 up_read(&mm->mmap_sem);
533 /* 530 /*
534 * Currently unuse_mm cannot fail, but leave error handling 531 * Currently unuse_mm cannot fail, but leave error handling
@@ -1045,7 +1042,7 @@ int page_queue_congested(struct page *page)
1045 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ 1042 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1046 1043
1047 if (PageSwapCache(page)) { 1044 if (PageSwapCache(page)) {
1048 swp_entry_t entry = { .val = page->private }; 1045 swp_entry_t entry = { .val = page_private(page) };
1049 struct swap_info_struct *sis; 1046 struct swap_info_struct *sis;
1050 1047
1051 sis = get_swap_info_struct(swp_type(entry)); 1048 sis = get_swap_info_struct(swp_type(entry));
diff --git a/mm/thrash.c b/mm/thrash.c
index 11461f7ad830..eff3c18c33a1 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -19,7 +19,7 @@ static unsigned long swap_token_check;
19struct mm_struct * swap_token_mm = &init_mm; 19struct mm_struct * swap_token_mm = &init_mm;
20 20
21#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) 21#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
22#define SWAP_TOKEN_TIMEOUT 0 22#define SWAP_TOKEN_TIMEOUT (300 * HZ)
23/* 23/*
24 * Currently disabled; Needs further code to work at HZ * 300. 24 * Currently disabled; Needs further code to work at HZ * 300.
25 */ 25 */
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index c13a2161bca2..b58abcf44ed6 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -31,11 +31,14 @@ static struct vfsmount *shm_mnt;
31 31
32static int __init init_tmpfs(void) 32static int __init init_tmpfs(void)
33{ 33{
34 register_filesystem(&tmpfs_fs_type); 34 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
35
35#ifdef CONFIG_TMPFS 36#ifdef CONFIG_TMPFS
36 devfs_mk_dir("shm"); 37 devfs_mk_dir("shm");
37#endif 38#endif
38 shm_mnt = kern_mount(&tmpfs_fs_type); 39 shm_mnt = kern_mount(&tmpfs_fs_type);
40 BUG_ON(IS_ERR(shm_mnt));
41
39 return 0; 42 return 0;
40} 43}
41module_init(init_tmpfs) 44module_init(init_tmpfs)
diff --git a/mm/truncate.c b/mm/truncate.c
index 60c8764bfac2..29c18f68dc35 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -13,18 +13,9 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/pagevec.h> 14#include <linux/pagevec.h>
15#include <linux/buffer_head.h> /* grr. try_to_release_page, 15#include <linux/buffer_head.h> /* grr. try_to_release_page,
16 block_invalidatepage */ 16 do_invalidatepage */
17 17
18 18
19static int do_invalidatepage(struct page *page, unsigned long offset)
20{
21 int (*invalidatepage)(struct page *, unsigned long);
22 invalidatepage = page->mapping->a_ops->invalidatepage;
23 if (invalidatepage == NULL)
24 invalidatepage = block_invalidatepage;
25 return (*invalidatepage)(page, offset);
26}
27
28static inline void truncate_partial_page(struct page *page, unsigned partial) 19static inline void truncate_partial_page(struct page *page, unsigned partial)
29{ 20{
30 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); 21 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1150229b6366..54a90e83cb31 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5,6 +5,7 @@
5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 6 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
7 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 7 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
8 * Numa awareness, Christoph Lameter, SGI, June 2005
8 */ 9 */
9 10
10#include <linux/mm.h> 11#include <linux/mm.h>
@@ -88,7 +89,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
88{ 89{
89 pte_t *pte; 90 pte_t *pte;
90 91
91 pte = pte_alloc_kernel(&init_mm, pmd, addr); 92 pte = pte_alloc_kernel(pmd, addr);
92 if (!pte) 93 if (!pte)
93 return -ENOMEM; 94 return -ENOMEM;
94 do { 95 do {
@@ -146,20 +147,18 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
146 147
147 BUG_ON(addr >= end); 148 BUG_ON(addr >= end);
148 pgd = pgd_offset_k(addr); 149 pgd = pgd_offset_k(addr);
149 spin_lock(&init_mm.page_table_lock);
150 do { 150 do {
151 next = pgd_addr_end(addr, end); 151 next = pgd_addr_end(addr, end);
152 err = vmap_pud_range(pgd, addr, next, prot, pages); 152 err = vmap_pud_range(pgd, addr, next, prot, pages);
153 if (err) 153 if (err)
154 break; 154 break;
155 } while (pgd++, addr = next, addr != end); 155 } while (pgd++, addr = next, addr != end);
156 spin_unlock(&init_mm.page_table_lock);
157 flush_cache_vmap((unsigned long) area->addr, end); 156 flush_cache_vmap((unsigned long) area->addr, end);
158 return err; 157 return err;
159} 158}
160 159
161struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 160struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
162 unsigned long start, unsigned long end) 161 unsigned long start, unsigned long end, int node)
163{ 162{
164 struct vm_struct **p, *tmp, *area; 163 struct vm_struct **p, *tmp, *area;
165 unsigned long align = 1; 164 unsigned long align = 1;
@@ -178,7 +177,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
178 addr = ALIGN(start, align); 177 addr = ALIGN(start, align);
179 size = PAGE_ALIGN(size); 178 size = PAGE_ALIGN(size);
180 179
181 area = kmalloc(sizeof(*area), GFP_KERNEL); 180 area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
182 if (unlikely(!area)) 181 if (unlikely(!area))
183 return NULL; 182 return NULL;
184 183
@@ -231,6 +230,12 @@ out:
231 return NULL; 230 return NULL;
232} 231}
233 232
233struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
234 unsigned long start, unsigned long end)
235{
236 return __get_vm_area_node(size, flags, start, end, -1);
237}
238
234/** 239/**
235 * get_vm_area - reserve a contiguous kernel virtual area 240 * get_vm_area - reserve a contiguous kernel virtual area
236 * 241 *
@@ -246,6 +251,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
246 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); 251 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
247} 252}
248 253
254struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
255{
256 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
257}
258
249/* Caller must hold vmlist_lock */ 259/* Caller must hold vmlist_lock */
250struct vm_struct *__remove_vm_area(void *addr) 260struct vm_struct *__remove_vm_area(void *addr)
251{ 261{
@@ -342,7 +352,6 @@ void vfree(void *addr)
342 BUG_ON(in_interrupt()); 352 BUG_ON(in_interrupt());
343 __vunmap(addr, 1); 353 __vunmap(addr, 1);
344} 354}
345
346EXPORT_SYMBOL(vfree); 355EXPORT_SYMBOL(vfree);
347 356
348/** 357/**
@@ -360,7 +369,6 @@ void vunmap(void *addr)
360 BUG_ON(in_interrupt()); 369 BUG_ON(in_interrupt());
361 __vunmap(addr, 0); 370 __vunmap(addr, 0);
362} 371}
363
364EXPORT_SYMBOL(vunmap); 372EXPORT_SYMBOL(vunmap);
365 373
366/** 374/**
@@ -392,10 +400,10 @@ void *vmap(struct page **pages, unsigned int count,
392 400
393 return area->addr; 401 return area->addr;
394} 402}
395
396EXPORT_SYMBOL(vmap); 403EXPORT_SYMBOL(vmap);
397 404
398void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 405void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
406 pgprot_t prot, int node)
399{ 407{
400 struct page **pages; 408 struct page **pages;
401 unsigned int nr_pages, array_size, i; 409 unsigned int nr_pages, array_size, i;
@@ -406,9 +414,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
406 area->nr_pages = nr_pages; 414 area->nr_pages = nr_pages;
407 /* Please note that the recursion is strictly bounded. */ 415 /* Please note that the recursion is strictly bounded. */
408 if (array_size > PAGE_SIZE) 416 if (array_size > PAGE_SIZE)
409 pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL); 417 pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
410 else 418 else
411 pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); 419 pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
412 area->pages = pages; 420 area->pages = pages;
413 if (!area->pages) { 421 if (!area->pages) {
414 remove_vm_area(area->addr); 422 remove_vm_area(area->addr);
@@ -418,7 +426,10 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
418 memset(area->pages, 0, array_size); 426 memset(area->pages, 0, array_size);
419 427
420 for (i = 0; i < area->nr_pages; i++) { 428 for (i = 0; i < area->nr_pages; i++) {
421 area->pages[i] = alloc_page(gfp_mask); 429 if (node < 0)
430 area->pages[i] = alloc_page(gfp_mask);
431 else
432 area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
422 if (unlikely(!area->pages[i])) { 433 if (unlikely(!area->pages[i])) {
423 /* Successfully allocated i pages, free them in __vunmap() */ 434 /* Successfully allocated i pages, free them in __vunmap() */
424 area->nr_pages = i; 435 area->nr_pages = i;
@@ -435,18 +446,25 @@ fail:
435 return NULL; 446 return NULL;
436} 447}
437 448
449void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
450{
451 return __vmalloc_area_node(area, gfp_mask, prot, -1);
452}
453
438/** 454/**
439 * __vmalloc - allocate virtually contiguous memory 455 * __vmalloc_node - allocate virtually contiguous memory
440 * 456 *
441 * @size: allocation size 457 * @size: allocation size
442 * @gfp_mask: flags for the page level allocator 458 * @gfp_mask: flags for the page level allocator
443 * @prot: protection mask for the allocated pages 459 * @prot: protection mask for the allocated pages
 460 * @node: node to use for allocation or -1
444 * 461 *
445 * Allocate enough pages to cover @size from the page level 462 * Allocate enough pages to cover @size from the page level
446 * allocator with @gfp_mask flags. Map them into contiguous 463 * allocator with @gfp_mask flags. Map them into contiguous
447 * kernel virtual space, using a pagetable protection of @prot. 464 * kernel virtual space, using a pagetable protection of @prot.
448 */ 465 */
449void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 466void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
467 int node)
450{ 468{
451 struct vm_struct *area; 469 struct vm_struct *area;
452 470
@@ -454,13 +472,18 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
454 if (!size || (size >> PAGE_SHIFT) > num_physpages) 472 if (!size || (size >> PAGE_SHIFT) > num_physpages)
455 return NULL; 473 return NULL;
456 474
457 area = get_vm_area(size, VM_ALLOC); 475 area = get_vm_area_node(size, VM_ALLOC, node);
458 if (!area) 476 if (!area)
459 return NULL; 477 return NULL;
460 478
461 return __vmalloc_area(area, gfp_mask, prot); 479 return __vmalloc_area_node(area, gfp_mask, prot, node);
462} 480}
481EXPORT_SYMBOL(__vmalloc_node);
463 482
483void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
484{
485 return __vmalloc_node(size, gfp_mask, prot, -1);
486}
464EXPORT_SYMBOL(__vmalloc); 487EXPORT_SYMBOL(__vmalloc);
465 488
466/** 489/**
@@ -478,9 +501,26 @@ void *vmalloc(unsigned long size)
478{ 501{
479 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); 502 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
480} 503}
481
482EXPORT_SYMBOL(vmalloc); 504EXPORT_SYMBOL(vmalloc);
483 505
506/**
507 * vmalloc_node - allocate memory on a specific node
508 *
509 * @size: allocation size
 510 * @node: NUMA node
511 *
512 * Allocate enough pages to cover @size from the page level
513 * allocator and map them into contiguous kernel virtual space.
514 *
 515 * For tight control over the page level allocator and protection flags
516 * use __vmalloc() instead.
517 */
518void *vmalloc_node(unsigned long size, int node)
519{
520 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
521}
522EXPORT_SYMBOL(vmalloc_node);
523
484#ifndef PAGE_KERNEL_EXEC 524#ifndef PAGE_KERNEL_EXEC
485# define PAGE_KERNEL_EXEC PAGE_KERNEL 525# define PAGE_KERNEL_EXEC PAGE_KERNEL
486#endif 526#endif
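
Editor's note: the hunks above add a node-aware allocation path (__get_vm_area_node(), __vmalloc_area_node(), __vmalloc_node()) with vmalloc_node() as the public entry point. A hedged usage sketch; the module, buffer size and symbol names below are invented for illustration, only vmalloc_node() and vfree() come from the patch.

#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/topology.h>

static void *scratch;

static int __init scratch_init(void)
{
        /* place the buffer on the node this CPU belongs to */
        scratch = vmalloc_node(64 * 1024, numa_node_id());
        return scratch ? 0 : -ENOMEM;
}

static void __exit scratch_exit(void)
{
        vfree(scratch);
}

module_init(scratch_init);
module_exit(scratch_exit);
MODULE_LICENSE("GPL");
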
@@ -515,7 +555,6 @@ void *vmalloc_32(unsigned long size)
515{ 555{
516 return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); 556 return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
517} 557}
518
519EXPORT_SYMBOL(vmalloc_32); 558EXPORT_SYMBOL(vmalloc_32);
520 559
521long vread(char *buf, char *addr, unsigned long count) 560long vread(char *buf, char *addr, unsigned long count)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 64f9570cff56..135bf8ca96ee 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -70,7 +70,7 @@ struct scan_control {
70 unsigned int priority; 70 unsigned int priority;
71 71
72 /* This context's GFP mask */ 72 /* This context's GFP mask */
73 unsigned int gfp_mask; 73 gfp_t gfp_mask;
74 74
75 int may_writepage; 75 int may_writepage;
76 76
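
Editor's note: the unsigned int -> gfp_t conversions in mm/vmscan.c follow the tree-wide switch to the __bitwise gfp_t type, which lets sparse catch places where an allocation mask and a plain integer get mixed up. A small illustrative sketch; grab_buffer() is made up, not kernel code.

#include <linux/slab.h>

static void *grab_buffer(size_t len, gfp_t gfp)
{
        return kmalloc(len, gfp);
}

/*
 * grab_buffer(len, GFP_KERNEL) is fine; grab_buffer(len, 0x10) now
 * earns a sparse warning instead of silently passing a bogus mask.
 */
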
@@ -186,7 +186,7 @@ EXPORT_SYMBOL(remove_shrinker);
186 * 186 *
187 * Returns the number of slab objects which we shrunk. 187 * Returns the number of slab objects which we shrunk.
188 */ 188 */
189static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, 189static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
190 unsigned long lru_pages) 190 unsigned long lru_pages)
191{ 191{
192 struct shrinker *shrinker; 192 struct shrinker *shrinker;
@@ -417,7 +417,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
417 * Anonymous process memory has backing store? 417 * Anonymous process memory has backing store?
418 * Try to allocate it some swap space here. 418 * Try to allocate it some swap space here.
419 */ 419 */
420 if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) { 420 if (PageAnon(page) && !PageSwapCache(page)) {
421 if (!sc->may_swap)
422 goto keep_locked;
421 if (!add_to_swap(page)) 423 if (!add_to_swap(page))
422 goto activate_locked; 424 goto activate_locked;
423 } 425 }
@@ -519,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
519 521
520#ifdef CONFIG_SWAP 522#ifdef CONFIG_SWAP
521 if (PageSwapCache(page)) { 523 if (PageSwapCache(page)) {
522 swp_entry_t swap = { .val = page->private }; 524 swp_entry_t swap = { .val = page_private(page) };
523 __delete_from_swap_cache(page); 525 __delete_from_swap_cache(page);
524 write_unlock_irq(&mapping->tree_lock); 526 write_unlock_irq(&mapping->tree_lock);
525 swap_free(swap); 527 swap_free(swap);
@@ -926,7 +928,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
926 * holds filesystem locks which prevent writeout this might not work, and the 928 * holds filesystem locks which prevent writeout this might not work, and the
927 * allocation attempt will fail. 929 * allocation attempt will fail.
928 */ 930 */
929int try_to_free_pages(struct zone **zones, unsigned int gfp_mask) 931int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
930{ 932{
931 int priority; 933 int priority;
932 int ret = 0; 934 int ret = 0;
@@ -1338,7 +1340,7 @@ module_init(kswapd_init)
1338/* 1340/*
1339 * Try to free up some pages from this zone through reclaim. 1341 * Try to free up some pages from this zone through reclaim.
1340 */ 1342 */
1341int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) 1343int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1342{ 1344{
1343 struct scan_control sc; 1345 struct scan_control sc;
1344 int nr_pages = 1 << order; 1346 int nr_pages = 1 << order;