author     Nick Piggin <nickpiggin@yahoo.com.au>      2005-10-29 21:16:12 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>      2005-10-30 00:40:39 -0400
commit     b5810039a54e5babf428e9a1e89fc1940fabff11 (patch)
tree       835836cb527ec9bd525f93eb7e016f3dfb8c8ae2
parent     f9c98d0287de42221c624482fd4f8d485c98ab22 (diff)
[PATCH] core remove PageReserved
Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality.

PageReserved special casing is removed from get_page and put_page.

All setting and clearing of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount-based freeing of Reserved pages.

MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and it can be reintroduced in future if required (Hugh has a proof of concept).

Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit, can be trivially removed.

Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed (a generic page_is_ram() should work).

A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and thus mapcounted and counted towards shared rss). These writes to the struct page could cause excessive cacheline bouncing on big systems. There are a number of ways this could be addressed if it is an issue.

Signed-off-by: Nick Piggin <npiggin@suse.de>

Refcount bug fix for filemap_xip.c

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
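For readers skimming the diff, the core rule change is small. The following is a sketch condensed from the include/linux/mm.h, mm/swap.c and mm/memory.c hunks below (a condensation, not the literal patch): page lifetimes go back to being governed purely by the refcount, and a vma whose pages must not be refcounted or rmapped says so with VM_RESERVED instead of relying on per-page PageReserved tests.

	/* put_page() no longer special-cases reserved pages... */
	static inline void put_page(struct page *page)
	{
		/* old: if (!PageReserved(page) && put_page_testzero(page)) */
		if (put_page_testzero(page))
			__page_cache_release(page);
	}

	/* ...and copy_one_pte() keys off the vma instead of the page: */
	if (vm_flags & VM_RESERVED) {
		/* Not mapped via rmap - duplicate the pte as is. */
		goto out_set_pte;
	}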
-rw-r--r--  arch/ppc64/kernel/vdso.c     12
-rw-r--r--  arch/sparc/mm/generic.c       3
-rw-r--r--  arch/sparc64/mm/generic.c     3
-rw-r--r--  drivers/scsi/sg.c            12
-rw-r--r--  drivers/scsi/st.c            10
-rw-r--r--  fs/direct-io.c                4
-rw-r--r--  include/linux/mm.h            5
-rw-r--r--  kernel/power/swsusp.c        25
-rw-r--r--  mm/bootmem.c                  1
-rw-r--r--  mm/filemap_xip.c             11
-rw-r--r--  mm/fremap.c                  23
-rw-r--r--  mm/madvise.c                  2
-rw-r--r--  mm/memory.c                 131
-rw-r--r--  mm/mempolicy.c               29
-rw-r--r--  mm/mmap.c                    11
-rw-r--r--  mm/mprotect.c                 8
-rw-r--r--  mm/msync.c                   17
-rw-r--r--  mm/page_alloc.c              14
-rw-r--r--  mm/rmap.c                    14
-rw-r--r--  mm/shmem.c                    4
-rw-r--r--  mm/swap.c                     4
-rw-r--r--  sound/core/pcm_native.c       9
22 files changed, 218 insertions, 134 deletions
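Since ZERO_PAGE is now refcounted and rmap-managed, every site that installs the zero page into a pte takes a reference plus file-rmap and file_rss accounting, and every site that tears one down drops them again. The snippet below is condensed from the zeromap_pte_range() hunk in mm/memory.c further down (a sketch of the pattern, not the literal patch); the matching teardown in __xip_unmap() (mm/filemap_xip.c) does page_remove_rmap(), dec_mm_counter(mm, file_rss) and page_cache_release(), which is the filemap_xip.c refcount fix mentioned in the changelog.

	struct page *page = ZERO_PAGE(addr);
	pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));

	page_cache_get(page);		/* zero page is refcounted now */
	page_add_file_rmap(page);	/* ...and mapcounted via rmap */
	inc_mm_counter(mm, file_rss);	/* ...and charged to shared rss */
	BUG_ON(!pte_none(*pte));
	set_pte_at(mm, addr, pte, zero_pte);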
diff --git a/arch/ppc64/kernel/vdso.c b/arch/ppc64/kernel/vdso.c
index efa985f05aca..4aacf521e3e4 100644
--- a/arch/ppc64/kernel/vdso.c
+++ b/arch/ppc64/kernel/vdso.c
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
176 return NOPAGE_SIGBUS; 176 return NOPAGE_SIGBUS;
177 177
178 /* 178 /*
179 * Last page is systemcfg, special handling here, no get_page() a 179 * Last page is systemcfg.
180 * this is a reserved page
181 */ 180 */
182 if ((vma->vm_end - address) <= PAGE_SIZE) 181 if ((vma->vm_end - address) <= PAGE_SIZE)
183 return virt_to_page(systemcfg); 182 pg = virt_to_page(systemcfg);
183 else
184 pg = virt_to_page(vbase + offset);
184 185
185 pg = virt_to_page(vbase + offset);
186 get_page(pg); 186 get_page(pg);
187 DBG(" ->page count: %d\n", page_count(pg)); 187 DBG(" ->page count: %d\n", page_count(pg));
188 188
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
259 * gettimeofday will be totally dead. It's fine to use that for setting 259 * gettimeofday will be totally dead. It's fine to use that for setting
260 * breakpoints in the vDSO code pages though 260 * breakpoints in the vDSO code pages though
261 */ 261 */
262 vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 262 vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED;
263 vma->vm_flags |= mm->def_flags; 263 vma->vm_flags |= mm->def_flags;
264 vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; 264 vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
265 vma->vm_ops = &vdso_vmops; 265 vma->vm_ops = &vdso_vmops;
@@ -603,6 +603,8 @@ void __init vdso_init(void)
603 ClearPageReserved(pg); 603 ClearPageReserved(pg);
604 get_page(pg); 604 get_page(pg);
605 } 605 }
606
607 get_page(virt_to_page(systemcfg));
606} 608}
607 609
608int in_gate_area_no_task(unsigned long addr) 610int in_gate_area_no_task(unsigned long addr)
diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c
index 20ccb957fb77..659c9a71f867 100644
--- a/arch/sparc/mm/generic.c
+++ b/arch/sparc/mm/generic.c
@@ -73,6 +73,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
73 int space = GET_IOSPACE(pfn); 73 int space = GET_IOSPACE(pfn);
74 unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; 74 unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
75 75
76 /* See comment in mm/memory.c remap_pfn_range */
77 vma->vm_flags |= VM_IO | VM_RESERVED;
78
76 prot = __pgprot(pg_iobits); 79 prot = __pgprot(pg_iobits);
77 offset -= from; 80 offset -= from;
78 dir = pgd_offset(mm, from); 81 dir = pgd_offset(mm, from);
diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c
index c954d91f01d0..afc01cec701f 100644
--- a/arch/sparc64/mm/generic.c
+++ b/arch/sparc64/mm/generic.c
@@ -127,6 +127,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
127 int space = GET_IOSPACE(pfn); 127 int space = GET_IOSPACE(pfn);
128 unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; 128 unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
129 129
130 /* See comment in mm/memory.c remap_pfn_range */
131 vma->vm_flags |= VM_IO | VM_RESERVED;
132
130 prot = __pgprot(pg_iobits); 133 prot = __pgprot(pg_iobits);
131 offset -= from; 134 offset -= from;
132 dir = pgd_offset(mm, from); 135 dir = pgd_offset(mm, from);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 861e51375d70..2d30b46806bf 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1886,13 +1886,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
1886 int i; 1886 int i;
1887 1887
1888 for (i=0; i < nr_pages; i++) { 1888 for (i=0; i < nr_pages; i++) {
1889 if (dirtied && !PageReserved(sgl[i].page)) 1889 struct page *page = sgl[i].page;
1890 SetPageDirty(sgl[i].page); 1890
1891 /* unlock_page(sgl[i].page); */ 1891 /* XXX: just for debug. Remove when PageReserved is removed */
1892 BUG_ON(PageReserved(page));
1893 if (dirtied)
1894 SetPageDirty(page);
1895 /* unlock_page(page); */
1892 /* FIXME: cache flush missing for rw==READ 1896 /* FIXME: cache flush missing for rw==READ
1893 * FIXME: call the correct reference counting function 1897 * FIXME: call the correct reference counting function
1894 */ 1898 */
1895 page_cache_release(sgl[i].page); 1899 page_cache_release(page);
1896 } 1900 }
1897 1901
1898 return 0; 1902 return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 5eb54d8019b4..da9766283bd7 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
4526 int i; 4526 int i;
4527 4527
4528 for (i=0; i < nr_pages; i++) { 4528 for (i=0; i < nr_pages; i++) {
4529 if (dirtied && !PageReserved(sgl[i].page)) 4529 struct page *page = sgl[i].page;
4530 SetPageDirty(sgl[i].page); 4530
4531 /* XXX: just for debug. Remove when PageReserved is removed */
4532 BUG_ON(PageReserved(page));
4533 if (dirtied)
4534 SetPageDirty(page);
4531 /* FIXME: cache flush missing for rw==READ 4535 /* FIXME: cache flush missing for rw==READ
4532 * FIXME: call the correct reference counting function 4536 * FIXME: call the correct reference counting function
4533 */ 4537 */
4534 page_cache_release(sgl[i].page); 4538 page_cache_release(page);
4535 } 4539 }
4536 4540
4537 return 0; 4541 return 0;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0d06097bc995..3931e7f1e6bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
162 up_read(&current->mm->mmap_sem); 162 up_read(&current->mm->mmap_sem);
163 163
164 if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { 164 if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
165 struct page *page = ZERO_PAGE(dio->curr_user_address);
165 /* 166 /*
166 * A memory fault, but the filesystem has some outstanding 167 * A memory fault, but the filesystem has some outstanding
167 * mapped blocks. We need to use those blocks up to avoid 168 * mapped blocks. We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
169 */ 170 */
170 if (dio->page_errors == 0) 171 if (dio->page_errors == 0)
171 dio->page_errors = ret; 172 dio->page_errors = ret;
172 dio->pages[0] = ZERO_PAGE(dio->curr_user_address); 173 page_cache_get(page);
174 dio->pages[0] = page;
173 dio->head = 0; 175 dio->head = 0;
174 dio->tail = 1; 176 dio->tail = 1;
175 ret = 0; 177 ret = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0c64484d8ae0..da42093250c3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp);
157 157
158#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ 158#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
159#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ 159#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
160#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ 160#define VM_RESERVED 0x00080000 /* Pages managed in a special way */
161#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ 161#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
162#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 162#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
163#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ 163#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
@@ -338,7 +338,7 @@ static inline void get_page(struct page *page)
338 338
339static inline void put_page(struct page *page) 339static inline void put_page(struct page *page)
340{ 340{
341 if (!PageReserved(page) && put_page_testzero(page)) 341 if (put_page_testzero(page))
342 __page_cache_release(page); 342 __page_cache_release(page);
343} 343}
344 344
@@ -723,6 +723,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
723 723
724int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, 724int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
725 int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); 725 int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
726void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
726 727
727int __set_page_dirty_buffers(struct page *page); 728int __set_page_dirty_buffers(struct page *page);
728int __set_page_dirty_nobuffers(struct page *page); 729int __set_page_dirty_nobuffers(struct page *page);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 10bc5ec496d7..016504ccfccf 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone)
578 continue; 578 continue;
579 page = pfn_to_page(pfn); 579 page = pfn_to_page(pfn);
580 /* 580 /*
581 * This condition results from rvmalloc() sans vmalloc_32() 581 * PageReserved results from rvmalloc() sans vmalloc_32()
582 * and architectural memory reservations. This should be 582 * and architectural memory reservations.
583 * corrected eventually when the cases giving rise to this 583 *
584 * are better understood. 584 * rvmalloc should not cause this, because all implementations
585 * appear to always be using vmalloc_32 on architectures with
586 * highmem. This is a good thing, because we would like to save
587 * rvmalloc pages.
588 *
589 * It appears to be triggered by pages which do not point to
590 * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
591 * which sets PageReserved if the page does not point to valid
592 * RAM.
593 *
594 * XXX: must remove usage of PageReserved!
585 */ 595 */
586 if (PageReserved(page)) { 596 if (PageReserved(page))
587 printk("highmem reserved page?!\n");
588 continue; 597 continue;
589 }
590 BUG_ON(PageNosave(page)); 598 BUG_ON(PageNosave(page));
591 if (PageNosaveFree(page)) 599 if (PageNosaveFree(page))
592 continue; 600 continue;
@@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn)
672 return 0; 680 return 0;
673 681
674 page = pfn_to_page(pfn); 682 page = pfn_to_page(pfn);
675 BUG_ON(PageReserved(page) && PageNosave(page));
676 if (PageNosave(page)) 683 if (PageNosave(page))
677 return 0; 684 return 0;
678 if (PageReserved(page) && pfn_is_nosave(pfn)) { 685 if (pfn_is_nosave(pfn)) {
679 pr_debug("[nosave pfn 0x%lx]", pfn); 686 pr_debug("[nosave pfn 0x%lx]", pfn);
680 return 0; 687 return 0;
681 } 688 }
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a58699b6579e..e8c567177dcf 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
305 if (j + 16 < BITS_PER_LONG) 305 if (j + 16 < BITS_PER_LONG)
306 prefetchw(page + j + 16); 306 prefetchw(page + j + 16);
307 __ClearPageReserved(page + j); 307 __ClearPageReserved(page + j);
308 set_page_count(page + j, 0);
308 } 309 }
309 __free_pages(page, order); 310 __free_pages(page, order);
310 i += BITS_PER_LONG; 311 i += BITS_PER_LONG;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8c199f537732..9354ee279b13 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapping,
174 unsigned long address; 174 unsigned long address;
175 pte_t *pte; 175 pte_t *pte;
176 pte_t pteval; 176 pte_t pteval;
177 struct page *page = ZERO_PAGE(address);
177 178
178 spin_lock(&mapping->i_mmap_lock); 179 spin_lock(&mapping->i_mmap_lock);
179 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 180 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -185,15 +186,17 @@ __xip_unmap (struct address_space * mapping,
185 * We need the page_table_lock to protect us from page faults, 186 * We need the page_table_lock to protect us from page faults,
186 * munmap, fork, etc... 187 * munmap, fork, etc...
187 */ 188 */
188 pte = page_check_address(ZERO_PAGE(address), mm, 189 pte = page_check_address(page, mm, address);
189 address);
190 if (!IS_ERR(pte)) { 190 if (!IS_ERR(pte)) {
191 /* Nuke the page table entry. */ 191 /* Nuke the page table entry. */
192 flush_cache_page(vma, address, pte_pfn(*pte)); 192 flush_cache_page(vma, address, pte_pfn(*pte));
193 pteval = ptep_clear_flush(vma, address, pte); 193 pteval = ptep_clear_flush(vma, address, pte);
194 page_remove_rmap(page);
195 dec_mm_counter(mm, file_rss);
194 BUG_ON(pte_dirty(pteval)); 196 BUG_ON(pte_dirty(pteval));
195 pte_unmap(pte); 197 pte_unmap(pte);
196 spin_unlock(&mm->page_table_lock); 198 spin_unlock(&mm->page_table_lock);
199 page_cache_release(page);
197 } 200 }
198 } 201 }
199 spin_unlock(&mapping->i_mmap_lock); 202 spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +231,7 @@ xip_file_nopage(struct vm_area_struct * area,
228 231
229 page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); 232 page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
230 if (!IS_ERR(page)) { 233 if (!IS_ERR(page)) {
231 return page; 234 goto out;
232 } 235 }
233 if (PTR_ERR(page) != -ENODATA) 236 if (PTR_ERR(page) != -ENODATA)
234 return NULL; 237 return NULL;
@@ -249,6 +252,8 @@ xip_file_nopage(struct vm_area_struct * area,
249 page = ZERO_PAGE(address); 252 page = ZERO_PAGE(address);
250 } 253 }
251 254
255out:
256 page_cache_get(page);
252 return page; 257 return page;
253} 258}
254 259
diff --git a/mm/fremap.c b/mm/fremap.c
index fd7f2a17ff3e..224cc1598b35 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -29,19 +29,20 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
29 return; 29 return;
30 if (pte_present(pte)) { 30 if (pte_present(pte)) {
31 unsigned long pfn = pte_pfn(pte); 31 unsigned long pfn = pte_pfn(pte);
32 struct page *page;
32 33
33 flush_cache_page(vma, addr, pfn); 34 flush_cache_page(vma, addr, pfn);
34 pte = ptep_clear_flush(vma, addr, ptep); 35 pte = ptep_clear_flush(vma, addr, ptep);
35 if (pfn_valid(pfn)) { 36 if (unlikely(!pfn_valid(pfn))) {
36 struct page *page = pfn_to_page(pfn); 37 print_bad_pte(vma, pte, addr);
37 if (!PageReserved(page)) { 38 return;
38 if (pte_dirty(pte))
39 set_page_dirty(page);
40 page_remove_rmap(page);
41 page_cache_release(page);
42 dec_mm_counter(mm, file_rss);
43 }
44 } 39 }
40 page = pfn_to_page(pfn);
41 if (pte_dirty(pte))
42 set_page_dirty(page);
43 page_remove_rmap(page);
44 page_cache_release(page);
45 dec_mm_counter(mm, file_rss);
45 } else { 46 } else {
46 if (!pte_file(pte)) 47 if (!pte_file(pte))
47 free_swap_and_cache(pte_to_swp_entry(pte)); 48 free_swap_and_cache(pte_to_swp_entry(pte));
@@ -65,6 +66,8 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
65 pgd_t *pgd; 66 pgd_t *pgd;
66 pte_t pte_val; 67 pte_t pte_val;
67 68
69 BUG_ON(vma->vm_flags & VM_RESERVED);
70
68 pgd = pgd_offset(mm, addr); 71 pgd = pgd_offset(mm, addr);
69 spin_lock(&mm->page_table_lock); 72 spin_lock(&mm->page_table_lock);
70 73
@@ -125,6 +128,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
125 pgd_t *pgd; 128 pgd_t *pgd;
126 pte_t pte_val; 129 pte_t pte_val;
127 130
131 BUG_ON(vma->vm_flags & VM_RESERVED);
132
128 pgd = pgd_offset(mm, addr); 133 pgd = pgd_offset(mm, addr);
129 spin_lock(&mm->page_table_lock); 134 spin_lock(&mm->page_table_lock);
130 135
diff --git a/mm/madvise.c b/mm/madvise.c
index 20e075d1c64c..17aaf3e16449 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
126 unsigned long start, unsigned long end) 126 unsigned long start, unsigned long end)
127{ 127{
128 *prev = vma; 128 *prev = vma;
129 if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) 129 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
130 return -EINVAL; 130 return -EINVAL;
131 131
132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) { 132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index da642b5528fa..e83f9440bb66 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -343,6 +343,23 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
343#define NO_RSS 2 /* Increment neither file_rss nor anon_rss */ 343#define NO_RSS 2 /* Increment neither file_rss nor anon_rss */
344 344
345/* 345/*
346 * This function is called to print an error when a pte in a
347 * !VM_RESERVED region is found pointing to an invalid pfn (which
348 * is an error.
349 *
350 * The calling function must still handle the error.
351 */
352void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
353{
354 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
355 "vm_flags = %lx, vaddr = %lx\n",
356 (long long)pte_val(pte),
357 (vma->vm_mm == current->mm ? current->comm : "???"),
358 vma->vm_flags, vaddr);
359 dump_stack();
360}
361
362/*
346 * copy one vm_area from one task to the other. Assumes the page tables 363 * copy one vm_area from one task to the other. Assumes the page tables
347 * already present in the new task to be cleared in the whole range 364 * already present in the new task to be cleared in the whole range
348 * covered by this vma. 365 * covered by this vma.
@@ -353,9 +370,10 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
353 370
354static inline int 371static inline int
355copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 372copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
356 pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, 373 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
357 unsigned long addr) 374 unsigned long addr)
358{ 375{
376 unsigned long vm_flags = vma->vm_flags;
359 pte_t pte = *src_pte; 377 pte_t pte = *src_pte;
360 struct page *page; 378 struct page *page;
361 unsigned long pfn; 379 unsigned long pfn;
@@ -375,18 +393,22 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
375 goto out_set_pte; 393 goto out_set_pte;
376 } 394 }
377 395
396 /* If the region is VM_RESERVED, the mapping is not
397 * mapped via rmap - duplicate the pte as is.
398 */
399 if (vm_flags & VM_RESERVED)
400 goto out_set_pte;
401
378 pfn = pte_pfn(pte); 402 pfn = pte_pfn(pte);
379 /* the pte points outside of valid memory, the 403 /* If the pte points outside of valid memory but
380 * mapping is assumed to be good, meaningful 404 * the region is not VM_RESERVED, we have a problem.
381 * and not mapped via rmap - duplicate the
382 * mapping as is.
383 */ 405 */
384 page = NULL; 406 if (unlikely(!pfn_valid(pfn))) {
385 if (pfn_valid(pfn)) 407 print_bad_pte(vma, pte, addr);
386 page = pfn_to_page(pfn); 408 goto out_set_pte; /* try to do something sane */
409 }
387 410
388 if (!page || PageReserved(page)) 411 page = pfn_to_page(pfn);
389 goto out_set_pte;
390 412
391 /* 413 /*
392 * If it's a COW mapping, write protect it both 414 * If it's a COW mapping, write protect it both
@@ -418,7 +440,6 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
418 unsigned long addr, unsigned long end) 440 unsigned long addr, unsigned long end)
419{ 441{
420 pte_t *src_pte, *dst_pte; 442 pte_t *src_pte, *dst_pte;
421 unsigned long vm_flags = vma->vm_flags;
422 int progress = 0; 443 int progress = 0;
423 int rss[NO_RSS+1], anon; 444 int rss[NO_RSS+1], anon;
424 445
@@ -446,8 +467,7 @@ again:
446 progress++; 467 progress++;
447 continue; 468 continue;
448 } 469 }
449 anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, 470 anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr);
450 vm_flags, addr);
451 rss[anon]++; 471 rss[anon]++;
452 progress += 8; 472 progress += 8;
453 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 473 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -541,10 +561,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
541 return 0; 561 return 0;
542} 562}
543 563
544static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 564static void zap_pte_range(struct mmu_gather *tlb,
565 struct vm_area_struct *vma, pmd_t *pmd,
545 unsigned long addr, unsigned long end, 566 unsigned long addr, unsigned long end,
546 struct zap_details *details) 567 struct zap_details *details)
547{ 568{
569 struct mm_struct *mm = tlb->mm;
548 pte_t *pte; 570 pte_t *pte;
549 int file_rss = 0; 571 int file_rss = 0;
550 int anon_rss = 0; 572 int anon_rss = 0;
@@ -556,11 +578,12 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
556 continue; 578 continue;
557 if (pte_present(ptent)) { 579 if (pte_present(ptent)) {
558 struct page *page = NULL; 580 struct page *page = NULL;
559 unsigned long pfn = pte_pfn(ptent); 581 if (!(vma->vm_flags & VM_RESERVED)) {
560 if (pfn_valid(pfn)) { 582 unsigned long pfn = pte_pfn(ptent);
561 page = pfn_to_page(pfn); 583 if (unlikely(!pfn_valid(pfn)))
562 if (PageReserved(page)) 584 print_bad_pte(vma, ptent, addr);
563 page = NULL; 585 else
586 page = pfn_to_page(pfn);
564 } 587 }
565 if (unlikely(details) && page) { 588 if (unlikely(details) && page) {
566 /* 589 /*
@@ -580,7 +603,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
580 page->index > details->last_index)) 603 page->index > details->last_index))
581 continue; 604 continue;
582 } 605 }
583 ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, 606 ptent = ptep_get_and_clear_full(mm, addr, pte,
584 tlb->fullmm); 607 tlb->fullmm);
585 tlb_remove_tlb_entry(tlb, pte, addr); 608 tlb_remove_tlb_entry(tlb, pte, addr);
586 if (unlikely(!page)) 609 if (unlikely(!page))
@@ -588,7 +611,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
588 if (unlikely(details) && details->nonlinear_vma 611 if (unlikely(details) && details->nonlinear_vma
589 && linear_page_index(details->nonlinear_vma, 612 && linear_page_index(details->nonlinear_vma,
590 addr) != page->index) 613 addr) != page->index)
591 set_pte_at(tlb->mm, addr, pte, 614 set_pte_at(mm, addr, pte,
592 pgoff_to_pte(page->index)); 615 pgoff_to_pte(page->index));
593 if (PageAnon(page)) 616 if (PageAnon(page))
594 anon_rss++; 617 anon_rss++;
@@ -611,14 +634,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
611 continue; 634 continue;
612 if (!pte_file(ptent)) 635 if (!pte_file(ptent))
613 free_swap_and_cache(pte_to_swp_entry(ptent)); 636 free_swap_and_cache(pte_to_swp_entry(ptent));
614 pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); 637 pte_clear_full(mm, addr, pte, tlb->fullmm);
615 } while (pte++, addr += PAGE_SIZE, addr != end); 638 } while (pte++, addr += PAGE_SIZE, addr != end);
616 639
617 add_mm_rss(tlb->mm, -file_rss, -anon_rss); 640 add_mm_rss(mm, -file_rss, -anon_rss);
618 pte_unmap(pte - 1); 641 pte_unmap(pte - 1);
619} 642}
620 643
621static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, 644static inline void zap_pmd_range(struct mmu_gather *tlb,
645 struct vm_area_struct *vma, pud_t *pud,
622 unsigned long addr, unsigned long end, 646 unsigned long addr, unsigned long end,
623 struct zap_details *details) 647 struct zap_details *details)
624{ 648{
@@ -630,11 +654,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
630 next = pmd_addr_end(addr, end); 654 next = pmd_addr_end(addr, end);
631 if (pmd_none_or_clear_bad(pmd)) 655 if (pmd_none_or_clear_bad(pmd))
632 continue; 656 continue;
633 zap_pte_range(tlb, pmd, addr, next, details); 657 zap_pte_range(tlb, vma, pmd, addr, next, details);
634 } while (pmd++, addr = next, addr != end); 658 } while (pmd++, addr = next, addr != end);
635} 659}
636 660
637static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 661static inline void zap_pud_range(struct mmu_gather *tlb,
662 struct vm_area_struct *vma, pgd_t *pgd,
638 unsigned long addr, unsigned long end, 663 unsigned long addr, unsigned long end,
639 struct zap_details *details) 664 struct zap_details *details)
640{ 665{
@@ -646,7 +671,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
646 next = pud_addr_end(addr, end); 671 next = pud_addr_end(addr, end);
647 if (pud_none_or_clear_bad(pud)) 672 if (pud_none_or_clear_bad(pud))
648 continue; 673 continue;
649 zap_pmd_range(tlb, pud, addr, next, details); 674 zap_pmd_range(tlb, vma, pud, addr, next, details);
650 } while (pud++, addr = next, addr != end); 675 } while (pud++, addr = next, addr != end);
651} 676}
652 677
@@ -667,7 +692,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
667 next = pgd_addr_end(addr, end); 692 next = pgd_addr_end(addr, end);
668 if (pgd_none_or_clear_bad(pgd)) 693 if (pgd_none_or_clear_bad(pgd))
669 continue; 694 continue;
670 zap_pud_range(tlb, pgd, addr, next, details); 695 zap_pud_range(tlb, vma, pgd, addr, next, details);
671 } while (pgd++, addr = next, addr != end); 696 } while (pgd++, addr = next, addr != end);
672 tlb_end_vma(tlb, vma); 697 tlb_end_vma(tlb, vma);
673} 698}
@@ -967,7 +992,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
967 continue; 992 continue;
968 } 993 }
969 994
970 if (!vma || (vma->vm_flags & VM_IO) 995 if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
971 || !(flags & vma->vm_flags)) 996 || !(flags & vma->vm_flags))
972 return i ? : -EFAULT; 997 return i ? : -EFAULT;
973 998
@@ -1027,8 +1052,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1027 if (pages) { 1052 if (pages) {
1028 pages[i] = page; 1053 pages[i] = page;
1029 flush_dcache_page(page); 1054 flush_dcache_page(page);
1030 if (!PageReserved(page)) 1055 page_cache_get(page);
1031 page_cache_get(page);
1032 } 1056 }
1033 if (vmas) 1057 if (vmas)
1034 vmas[i] = vma; 1058 vmas[i] = vma;
@@ -1051,7 +1075,11 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1051 if (!pte) 1075 if (!pte)
1052 return -ENOMEM; 1076 return -ENOMEM;
1053 do { 1077 do {
1054 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); 1078 struct page *page = ZERO_PAGE(addr);
1079 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1080 page_cache_get(page);
1081 page_add_file_rmap(page);
1082 inc_mm_counter(mm, file_rss);
1055 BUG_ON(!pte_none(*pte)); 1083 BUG_ON(!pte_none(*pte));
1056 set_pte_at(mm, addr, pte, zero_pte); 1084 set_pte_at(mm, addr, pte, zero_pte);
1057 } while (pte++, addr += PAGE_SIZE, addr != end); 1085 } while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1132,8 +1160,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1132 return -ENOMEM; 1160 return -ENOMEM;
1133 do { 1161 do {
1134 BUG_ON(!pte_none(*pte)); 1162 BUG_ON(!pte_none(*pte));
1135 if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) 1163 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1136 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1137 pfn++; 1164 pfn++;
1138 } while (pte++, addr += PAGE_SIZE, addr != end); 1165 } while (pte++, addr += PAGE_SIZE, addr != end);
1139 pte_unmap(pte - 1); 1166 pte_unmap(pte - 1);
@@ -1195,8 +1222,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1195 * rest of the world about it: 1222 * rest of the world about it:
1196 * VM_IO tells people not to look at these pages 1223 * VM_IO tells people not to look at these pages
1197 * (accesses can have side effects). 1224 * (accesses can have side effects).
1198 * VM_RESERVED tells swapout not to try to touch 1225 * VM_RESERVED tells the core MM not to "manage" these pages
1199 * this region. 1226 * (e.g. refcount, mapcount, try to swap them out).
1200 */ 1227 */
1201 vma->vm_flags |= VM_IO | VM_RESERVED; 1228 vma->vm_flags |= VM_IO | VM_RESERVED;
1202 1229
@@ -1256,11 +1283,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1256 pte_t entry; 1283 pte_t entry;
1257 int ret = VM_FAULT_MINOR; 1284 int ret = VM_FAULT_MINOR;
1258 1285
1286 BUG_ON(vma->vm_flags & VM_RESERVED);
1287
1259 if (unlikely(!pfn_valid(pfn))) { 1288 if (unlikely(!pfn_valid(pfn))) {
1260 /* 1289 /*
1261 * Page table corrupted: show pte and kill process. 1290 * Page table corrupted: show pte and kill process.
1262 */ 1291 */
1263 pte_ERROR(orig_pte); 1292 print_bad_pte(vma, orig_pte, address);
1264 ret = VM_FAULT_OOM; 1293 ret = VM_FAULT_OOM;
1265 goto unlock; 1294 goto unlock;
1266 } 1295 }
@@ -1284,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1284 /* 1313 /*
1285 * Ok, we need to copy. Oh, well.. 1314 * Ok, we need to copy. Oh, well..
1286 */ 1315 */
1287 if (!PageReserved(old_page)) 1316 page_cache_get(old_page);
1288 page_cache_get(old_page);
1289 pte_unmap(page_table); 1317 pte_unmap(page_table);
1290 spin_unlock(&mm->page_table_lock); 1318 spin_unlock(&mm->page_table_lock);
1291 1319
@@ -1308,14 +1336,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1308 spin_lock(&mm->page_table_lock); 1336 spin_lock(&mm->page_table_lock);
1309 page_table = pte_offset_map(pmd, address); 1337 page_table = pte_offset_map(pmd, address);
1310 if (likely(pte_same(*page_table, orig_pte))) { 1338 if (likely(pte_same(*page_table, orig_pte))) {
1311 if (PageReserved(old_page)) 1339 page_remove_rmap(old_page);
1340 if (!PageAnon(old_page)) {
1312 inc_mm_counter(mm, anon_rss); 1341 inc_mm_counter(mm, anon_rss);
1313 else { 1342 dec_mm_counter(mm, file_rss);
1314 page_remove_rmap(old_page);
1315 if (!PageAnon(old_page)) {
1316 inc_mm_counter(mm, anon_rss);
1317 dec_mm_counter(mm, file_rss);
1318 }
1319 } 1343 }
1320 flush_cache_page(vma, address, pfn); 1344 flush_cache_page(vma, address, pfn);
1321 entry = mk_pte(new_page, vma->vm_page_prot); 1345 entry = mk_pte(new_page, vma->vm_page_prot);
@@ -1769,14 +1793,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1769 unsigned long address, pte_t *page_table, pmd_t *pmd, 1793 unsigned long address, pte_t *page_table, pmd_t *pmd,
1770 int write_access) 1794 int write_access)
1771{ 1795{
1796 struct page *page = ZERO_PAGE(addr);
1772 pte_t entry; 1797 pte_t entry;
1773 1798
1774 /* Mapping of ZERO_PAGE - vm_page_prot is readonly */ 1799 /* Mapping of ZERO_PAGE - vm_page_prot is readonly */
1775 entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot); 1800 entry = mk_pte(page, vma->vm_page_prot);
1776 1801
1777 if (write_access) { 1802 if (write_access) {
1778 struct page *page;
1779
1780 /* Allocate our own private page. */ 1803 /* Allocate our own private page. */
1781 pte_unmap(page_table); 1804 pte_unmap(page_table);
1782 spin_unlock(&mm->page_table_lock); 1805 spin_unlock(&mm->page_table_lock);
@@ -1800,6 +1823,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1800 lru_cache_add_active(page); 1823 lru_cache_add_active(page);
1801 SetPageReferenced(page); 1824 SetPageReferenced(page);
1802 page_add_anon_rmap(page, vma, address); 1825 page_add_anon_rmap(page, vma, address);
1826 } else {
1827 inc_mm_counter(mm, file_rss);
1828 page_add_file_rmap(page);
1829 page_cache_get(page);
1803 } 1830 }
1804 1831
1805 set_pte_at(mm, address, page_table, entry); 1832 set_pte_at(mm, address, page_table, entry);
@@ -1916,7 +1943,7 @@ retry:
1916 inc_mm_counter(mm, anon_rss); 1943 inc_mm_counter(mm, anon_rss);
1917 lru_cache_add_active(new_page); 1944 lru_cache_add_active(new_page);
1918 page_add_anon_rmap(new_page, vma, address); 1945 page_add_anon_rmap(new_page, vma, address);
1919 } else if (!PageReserved(new_page)) { 1946 } else if (!(vma->vm_flags & VM_RESERVED)) {
1920 inc_mm_counter(mm, file_rss); 1947 inc_mm_counter(mm, file_rss);
1921 page_add_file_rmap(new_page); 1948 page_add_file_rmap(new_page);
1922 } 1949 }
@@ -1957,7 +1984,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
1957 /* 1984 /*
1958 * Page table corrupted: show pte and kill process. 1985 * Page table corrupted: show pte and kill process.
1959 */ 1986 */
1960 pte_ERROR(orig_pte); 1987 print_bad_pte(vma, orig_pte, address);
1961 return VM_FAULT_OOM; 1988 return VM_FAULT_OOM;
1962 } 1989 }
1963 /* We can then assume vm->vm_ops && vma->vm_ops->populate */ 1990 /* We can then assume vm->vm_ops && vma->vm_ops->populate */
@@ -2232,7 +2259,7 @@ static int __init gate_vma_init(void)
2232 gate_vma.vm_start = FIXADDR_USER_START; 2259 gate_vma.vm_start = FIXADDR_USER_START;
2233 gate_vma.vm_end = FIXADDR_USER_END; 2260 gate_vma.vm_end = FIXADDR_USER_END;
2234 gate_vma.vm_page_prot = PAGE_READONLY; 2261 gate_vma.vm_page_prot = PAGE_READONLY;
2235 gate_vma.vm_flags = 0; 2262 gate_vma.vm_flags = VM_RESERVED;
2236 return 0; 2263 return 0;
2237} 2264}
2238__initcall(gate_vma_init); 2265__initcall(gate_vma_init);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 43b1199af591..11d824f282f1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -223,13 +223,13 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
223} 223}
224 224
225/* Ensure all existing pages follow the policy. */ 225/* Ensure all existing pages follow the policy. */
226static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, 226static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
227 unsigned long addr, unsigned long end, nodemask_t *nodes) 227 unsigned long addr, unsigned long end, nodemask_t *nodes)
228{ 228{
229 pte_t *orig_pte; 229 pte_t *orig_pte;
230 pte_t *pte; 230 pte_t *pte;
231 231
232 spin_lock(&mm->page_table_lock); 232 spin_lock(&vma->vm_mm->page_table_lock);
233 orig_pte = pte = pte_offset_map(pmd, addr); 233 orig_pte = pte = pte_offset_map(pmd, addr);
234 do { 234 do {
235 unsigned long pfn; 235 unsigned long pfn;
@@ -238,18 +238,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
238 if (!pte_present(*pte)) 238 if (!pte_present(*pte))
239 continue; 239 continue;
240 pfn = pte_pfn(*pte); 240 pfn = pte_pfn(*pte);
241 if (!pfn_valid(pfn)) 241 if (!pfn_valid(pfn)) {
242 print_bad_pte(vma, *pte, addr);
242 continue; 243 continue;
244 }
243 nid = pfn_to_nid(pfn); 245 nid = pfn_to_nid(pfn);
244 if (!node_isset(nid, *nodes)) 246 if (!node_isset(nid, *nodes))
245 break; 247 break;
246 } while (pte++, addr += PAGE_SIZE, addr != end); 248 } while (pte++, addr += PAGE_SIZE, addr != end);
247 pte_unmap(orig_pte); 249 pte_unmap(orig_pte);
248 spin_unlock(&mm->page_table_lock); 250 spin_unlock(&vma->vm_mm->page_table_lock);
249 return addr != end; 251 return addr != end;
250} 252}
251 253
252static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, 254static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
253 unsigned long addr, unsigned long end, nodemask_t *nodes) 255 unsigned long addr, unsigned long end, nodemask_t *nodes)
254{ 256{
255 pmd_t *pmd; 257 pmd_t *pmd;
@@ -260,13 +262,13 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
260 next = pmd_addr_end(addr, end); 262 next = pmd_addr_end(addr, end);
261 if (pmd_none_or_clear_bad(pmd)) 263 if (pmd_none_or_clear_bad(pmd))
262 continue; 264 continue;
263 if (check_pte_range(mm, pmd, addr, next, nodes)) 265 if (check_pte_range(vma, pmd, addr, next, nodes))
264 return -EIO; 266 return -EIO;
265 } while (pmd++, addr = next, addr != end); 267 } while (pmd++, addr = next, addr != end);
266 return 0; 268 return 0;
267} 269}
268 270
269static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, 271static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
270 unsigned long addr, unsigned long end, nodemask_t *nodes) 272 unsigned long addr, unsigned long end, nodemask_t *nodes)
271{ 273{
272 pud_t *pud; 274 pud_t *pud;
@@ -277,24 +279,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
277 next = pud_addr_end(addr, end); 279 next = pud_addr_end(addr, end);
278 if (pud_none_or_clear_bad(pud)) 280 if (pud_none_or_clear_bad(pud))
279 continue; 281 continue;
280 if (check_pmd_range(mm, pud, addr, next, nodes)) 282 if (check_pmd_range(vma, pud, addr, next, nodes))
281 return -EIO; 283 return -EIO;
282 } while (pud++, addr = next, addr != end); 284 } while (pud++, addr = next, addr != end);
283 return 0; 285 return 0;
284} 286}
285 287
286static inline int check_pgd_range(struct mm_struct *mm, 288static inline int check_pgd_range(struct vm_area_struct *vma,
287 unsigned long addr, unsigned long end, nodemask_t *nodes) 289 unsigned long addr, unsigned long end, nodemask_t *nodes)
288{ 290{
289 pgd_t *pgd; 291 pgd_t *pgd;
290 unsigned long next; 292 unsigned long next;
291 293
292 pgd = pgd_offset(mm, addr); 294 pgd = pgd_offset(vma->vm_mm, addr);
293 do { 295 do {
294 next = pgd_addr_end(addr, end); 296 next = pgd_addr_end(addr, end);
295 if (pgd_none_or_clear_bad(pgd)) 297 if (pgd_none_or_clear_bad(pgd))
296 continue; 298 continue;
297 if (check_pud_range(mm, pgd, addr, next, nodes)) 299 if (check_pud_range(vma, pgd, addr, next, nodes))
298 return -EIO; 300 return -EIO;
299 } while (pgd++, addr = next, addr != end); 301 } while (pgd++, addr = next, addr != end);
300 return 0; 302 return 0;
@@ -311,6 +313,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
311 first = find_vma(mm, start); 313 first = find_vma(mm, start);
312 if (!first) 314 if (!first)
313 return ERR_PTR(-EFAULT); 315 return ERR_PTR(-EFAULT);
316 if (first->vm_flags & VM_RESERVED)
317 return ERR_PTR(-EACCES);
314 prev = NULL; 318 prev = NULL;
315 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 319 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
316 if (!vma->vm_next && vma->vm_end < end) 320 if (!vma->vm_next && vma->vm_end < end)
@@ -323,8 +327,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
323 endvma = end; 327 endvma = end;
324 if (vma->vm_start > start) 328 if (vma->vm_start > start)
325 start = vma->vm_start; 329 start = vma->vm_start;
326 err = check_pgd_range(vma->vm_mm, 330 err = check_pgd_range(vma, start, endvma, nodes);
327 start, endvma, nodes);
328 if (err) { 331 if (err) {
329 first = ERR_PTR(err); 332 first = ERR_PTR(err);
330 break; 333 break;
diff --git a/mm/mmap.c b/mm/mmap.c
index 459b9f068ad7..8a111792b8db 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1088,6 +1088,17 @@ munmap_back:
1088 error = file->f_op->mmap(file, vma); 1088 error = file->f_op->mmap(file, vma);
1089 if (error) 1089 if (error)
1090 goto unmap_and_free_vma; 1090 goto unmap_and_free_vma;
1091 if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
1092 == (VM_WRITE | VM_RESERVED)) {
1093 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
1094 "PROT_WRITE mmap of VM_RESERVED memory, which "
1095 "is deprecated. Please report this to "
1096 "linux-kernel@vger.kernel.org\n",current->comm);
1097 if (vma->vm_ops && vma->vm_ops->close)
1098 vma->vm_ops->close(vma);
1099 error = -EACCES;
1100 goto unmap_and_free_vma;
1101 }
1091 } else if (vm_flags & VM_SHARED) { 1102 } else if (vm_flags & VM_SHARED) {
1092 error = shmem_zero_setup(vma); 1103 error = shmem_zero_setup(vma);
1093 if (error) 1104 if (error)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b426f01c5e9c..672a76fddd5e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -125,6 +125,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
125 * a MAP_NORESERVE private mapping to writable will now reserve. 125 * a MAP_NORESERVE private mapping to writable will now reserve.
126 */ 126 */
127 if (newflags & VM_WRITE) { 127 if (newflags & VM_WRITE) {
128 if (oldflags & VM_RESERVED) {
129 BUG_ON(oldflags & VM_WRITE);
130 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
131 "PROT_WRITE mprotect of VM_RESERVED memory, "
132 "which is deprecated. Please report this to "
133 "linux-kernel@vger.kernel.org\n",current->comm);
134 return -EACCES;
135 }
128 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { 136 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
129 charged = nrpages; 137 charged = nrpages;
130 if (security_vm_enough_memory(charged)) 138 if (security_vm_enough_memory(charged))
diff --git a/mm/msync.c b/mm/msync.c
index 3b5f1c521d4b..860395486060 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -25,6 +25,7 @@
25static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 25static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
26 unsigned long addr, unsigned long end) 26 unsigned long addr, unsigned long end)
27{ 27{
28 struct mm_struct *mm = vma->vm_mm;
28 pte_t *pte; 29 pte_t *pte;
29 int progress = 0; 30 int progress = 0;
30 31
@@ -37,7 +38,7 @@ again:
37 if (progress >= 64) { 38 if (progress >= 64) {
38 progress = 0; 39 progress = 0;
39 if (need_resched() || 40 if (need_resched() ||
40 need_lockbreak(&vma->vm_mm->page_table_lock)) 41 need_lockbreak(&mm->page_table_lock))
41 break; 42 break;
42 } 43 }
43 progress++; 44 progress++;
@@ -46,11 +47,11 @@ again:
46 if (!pte_maybe_dirty(*pte)) 47 if (!pte_maybe_dirty(*pte))
47 continue; 48 continue;
48 pfn = pte_pfn(*pte); 49 pfn = pte_pfn(*pte);
49 if (!pfn_valid(pfn)) 50 if (unlikely(!pfn_valid(pfn))) {
51 print_bad_pte(vma, *pte, addr);
50 continue; 52 continue;
53 }
51 page = pfn_to_page(pfn); 54 page = pfn_to_page(pfn);
52 if (PageReserved(page))
53 continue;
54 55
55 if (ptep_clear_flush_dirty(vma, addr, pte) || 56 if (ptep_clear_flush_dirty(vma, addr, pte) ||
56 page_test_and_clear_dirty(page)) 57 page_test_and_clear_dirty(page))
@@ -58,7 +59,7 @@ again:
58 progress += 3; 59 progress += 3;
59 } while (pte++, addr += PAGE_SIZE, addr != end); 60 } while (pte++, addr += PAGE_SIZE, addr != end);
60 pte_unmap(pte - 1); 61 pte_unmap(pte - 1);
61 cond_resched_lock(&vma->vm_mm->page_table_lock); 62 cond_resched_lock(&mm->page_table_lock);
62 if (addr != end) 63 if (addr != end)
63 goto again; 64 goto again;
64} 65}
@@ -102,8 +103,10 @@ static void msync_page_range(struct vm_area_struct *vma,
102 103
103 /* For hugepages we can't go walking the page table normally, 104 /* For hugepages we can't go walking the page table normally,
104 * but that's ok, hugetlbfs is memory based, so we don't need 105 * but that's ok, hugetlbfs is memory based, so we don't need
105 * to do anything more on an msync() */ 106 * to do anything more on an msync().
106 if (is_vm_hugetlb_page(vma)) 107 * Can't do anything with VM_RESERVED regions either.
108 */
109 if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
107 return; 110 return;
108 111
109 BUG_ON(addr >= end); 112 BUG_ON(addr >= end);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 60663232fbb2..0541288ebf4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,7 +114,8 @@ static void bad_page(const char *function, struct page *page)
114 1 << PG_reclaim | 114 1 << PG_reclaim |
115 1 << PG_slab | 115 1 << PG_slab |
116 1 << PG_swapcache | 116 1 << PG_swapcache |
117 1 << PG_writeback); 117 1 << PG_writeback |
118 1 << PG_reserved );
118 set_page_count(page, 0); 119 set_page_count(page, 0);
119 reset_page_mapcount(page); 120 reset_page_mapcount(page);
120 page->mapping = NULL; 121 page->mapping = NULL;
@@ -244,7 +245,6 @@ static inline int page_is_buddy(struct page *page, int order)
244{ 245{
245 if (PagePrivate(page) && 246 if (PagePrivate(page) &&
246 (page_order(page) == order) && 247 (page_order(page) == order) &&
247 !PageReserved(page) &&
248 page_count(page) == 0) 248 page_count(page) == 0)
249 return 1; 249 return 1;
250 return 0; 250 return 0;
@@ -327,7 +327,8 @@ static inline void free_pages_check(const char *function, struct page *page)
327 1 << PG_reclaim | 327 1 << PG_reclaim |
328 1 << PG_slab | 328 1 << PG_slab |
329 1 << PG_swapcache | 329 1 << PG_swapcache |
330 1 << PG_writeback ))) 330 1 << PG_writeback |
331 1 << PG_reserved )))
331 bad_page(function, page); 332 bad_page(function, page);
332 if (PageDirty(page)) 333 if (PageDirty(page))
333 __ClearPageDirty(page); 334 __ClearPageDirty(page);
@@ -455,7 +456,8 @@ static void prep_new_page(struct page *page, int order)
455 1 << PG_reclaim | 456 1 << PG_reclaim |
456 1 << PG_slab | 457 1 << PG_slab |
457 1 << PG_swapcache | 458 1 << PG_swapcache |
458 1 << PG_writeback ))) 459 1 << PG_writeback |
460 1 << PG_reserved )))
459 bad_page(__FUNCTION__, page); 461 bad_page(__FUNCTION__, page);
460 462
461 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 463 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
@@ -1016,7 +1018,7 @@ void __pagevec_free(struct pagevec *pvec)
1016 1018
1017fastcall void __free_pages(struct page *page, unsigned int order) 1019fastcall void __free_pages(struct page *page, unsigned int order)
1018{ 1020{
1019 if (!PageReserved(page) && put_page_testzero(page)) { 1021 if (put_page_testzero(page)) {
1020 if (order == 0) 1022 if (order == 0)
1021 free_hot_page(page); 1023 free_hot_page(page);
1022 else 1024 else
@@ -1674,7 +1676,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1674 continue; 1676 continue;
1675 page = pfn_to_page(pfn); 1677 page = pfn_to_page(pfn);
1676 set_page_links(page, zone, nid, pfn); 1678 set_page_links(page, zone, nid, pfn);
1677 set_page_count(page, 0); 1679 set_page_count(page, 1);
1678 reset_page_mapcount(page); 1680 reset_page_mapcount(page);
1679 SetPageReserved(page); 1681 SetPageReserved(page);
1680 INIT_LIST_HEAD(&page->lru); 1682 INIT_LIST_HEAD(&page->lru);
diff --git a/mm/rmap.c b/mm/rmap.c
index 504757624cce..f69d5342ce7f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -443,8 +443,6 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
443void page_add_anon_rmap(struct page *page, 443void page_add_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address) 444 struct vm_area_struct *vma, unsigned long address)
445{ 445{
446 BUG_ON(PageReserved(page));
447
448 if (atomic_inc_and_test(&page->_mapcount)) { 446 if (atomic_inc_and_test(&page->_mapcount)) {
449 struct anon_vma *anon_vma = vma->anon_vma; 447 struct anon_vma *anon_vma = vma->anon_vma;
450 448
@@ -468,8 +466,7 @@ void page_add_anon_rmap(struct page *page,
468void page_add_file_rmap(struct page *page) 466void page_add_file_rmap(struct page *page)
469{ 467{
470 BUG_ON(PageAnon(page)); 468 BUG_ON(PageAnon(page));
471 if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) 469 BUG_ON(!pfn_valid(page_to_pfn(page)));
472 return;
473 470
474 if (atomic_inc_and_test(&page->_mapcount)) 471 if (atomic_inc_and_test(&page->_mapcount))
475 inc_page_state(nr_mapped); 472 inc_page_state(nr_mapped);
@@ -483,8 +480,6 @@ void page_add_file_rmap(struct page *page)
483 */ 480 */
484void page_remove_rmap(struct page *page) 481void page_remove_rmap(struct page *page)
485{ 482{
486 BUG_ON(PageReserved(page));
487
488 if (atomic_add_negative(-1, &page->_mapcount)) { 483 if (atomic_add_negative(-1, &page->_mapcount)) {
489 BUG_ON(page_mapcount(page) < 0); 484 BUG_ON(page_mapcount(page) < 0);
490 /* 485 /*
@@ -640,13 +635,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
640 continue; 635 continue;
641 636
642 pfn = pte_pfn(*pte); 637 pfn = pte_pfn(*pte);
643 if (!pfn_valid(pfn)) 638 if (unlikely(!pfn_valid(pfn))) {
639 print_bad_pte(vma, *pte, address);
644 continue; 640 continue;
641 }
645 642
646 page = pfn_to_page(pfn); 643 page = pfn_to_page(pfn);
647 BUG_ON(PageAnon(page)); 644 BUG_ON(PageAnon(page));
648 if (PageReserved(page))
649 continue;
650 645
651 if (ptep_clear_flush_young(vma, address, pte)) 646 if (ptep_clear_flush_young(vma, address, pte))
652 continue; 647 continue;
@@ -808,7 +803,6 @@ int try_to_unmap(struct page *page)
808{ 803{
809 int ret; 804 int ret;
810 805
811 BUG_ON(PageReserved(page));
812 BUG_ON(!PageLocked(page)); 806 BUG_ON(!PageLocked(page));
813 807
814 if (PageAnon(page)) 808 if (PageAnon(page))
diff --git a/mm/shmem.c b/mm/shmem.c
index 6796311a23ef..37777f4c11f8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1506,8 +1506,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1506 */ 1506 */
1507 if (!offset) 1507 if (!offset)
1508 mark_page_accessed(page); 1508 mark_page_accessed(page);
1509 } else 1509 } else {
1510 page = ZERO_PAGE(0); 1510 page = ZERO_PAGE(0);
1511 page_cache_get(page);
1512 }
1511 1513
1512 /* 1514 /*
1513 * Ok, we have the page, and it's up-to-date, so 1515 * Ok, we have the page, and it's up-to-date, so
diff --git a/mm/swap.c b/mm/swap.c
index 7771d2803f62..21d15f99805c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -48,7 +48,7 @@ void put_page(struct page *page)
48 } 48 }
49 return; 49 return;
50 } 50 }
51 if (!PageReserved(page) && put_page_testzero(page)) 51 if (put_page_testzero(page))
52 __page_cache_release(page); 52 __page_cache_release(page);
53} 53}
54EXPORT_SYMBOL(put_page); 54EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
215 struct page *page = pages[i]; 215 struct page *page = pages[i];
216 struct zone *pagezone; 216 struct zone *pagezone;
217 217
218 if (PageReserved(page) || !put_page_testzero(page)) 218 if (!put_page_testzero(page))
219 continue; 219 continue;
220 220
221 pagezone = page_zone(page); 221 pagezone = page_zone(page);
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 67abebabf83e..e97b2d162cc7 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns
2949 return NOPAGE_OOM; 2949 return NOPAGE_OOM;
2950 runtime = substream->runtime; 2950 runtime = substream->runtime;
2951 page = virt_to_page(runtime->status); 2951 page = virt_to_page(runtime->status);
2952 if (!PageReserved(page)) 2952 get_page(page);
2953 get_page(page);
2954 if (type) 2953 if (type)
2955 *type = VM_FAULT_MINOR; 2954 *type = VM_FAULT_MINOR;
2956 return page; 2955 return page;
@@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un
2992 return NOPAGE_OOM; 2991 return NOPAGE_OOM;
2993 runtime = substream->runtime; 2992 runtime = substream->runtime;
2994 page = virt_to_page(runtime->control); 2993 page = virt_to_page(runtime->control);
2995 if (!PageReserved(page)) 2994 get_page(page);
2996 get_page(page);
2997 if (type) 2995 if (type)
2998 *type = VM_FAULT_MINOR; 2996 *type = VM_FAULT_MINOR;
2999 return page; 2997 return page;
@@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign
3066 vaddr = runtime->dma_area + offset; 3064 vaddr = runtime->dma_area + offset;
3067 page = virt_to_page(vaddr); 3065 page = virt_to_page(vaddr);
3068 } 3066 }
3069 if (!PageReserved(page)) 3067 get_page(page);
3070 get_page(page);
3071 if (type) 3068 if (type)
3072 *type = VM_FAULT_MINOR; 3069 *type = VM_FAULT_MINOR;
3073 return page; 3070 return page;