Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig      |   6
-rw-r--r--  mm/fremap.c     |  28
-rw-r--r--  mm/hugetlb.c    |   6
-rw-r--r--  mm/madvise.c    |   2
-rw-r--r--  mm/memory.c     | 213
-rw-r--r--  mm/mempolicy.c  |  12
-rw-r--r--  mm/mmap.c       |  11
-rw-r--r--  mm/mprotect.c   |   8
-rw-r--r--  mm/msync.c      |  12
-rw-r--r--  mm/nommu.c      |   2
-rw-r--r--  mm/page_alloc.c |  75
-rw-r--r--  mm/rmap.c       |  58
-rw-r--r--  mm/swap.c       |   3
-rw-r--r--  mm/thrash.c     |  10
-rw-r--r--  mm/truncate.c   |   6
-rw-r--r--  mm/vmscan.c     |  29
16 files changed, 273 insertions, 208 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ae9ce6b73e8a..21eb51d4da8f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -125,12 +125,10 @@ comment "Memory hotplug is currently incompatible with Software Suspend"
 # space can be handled with less contention: split it at this NR_CPUS.
 # Default to 4 for wider testing, though 8 might be more appropriate.
 # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
-# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
-# ARM26 and SPARC32 and PPC64 may use one page for multiple page tables.
+# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
 #
 config SPLIT_PTLOCK_CPUS
 	int
 	default "4096" if ARM && !CPU_CACHE_VIPT
-	default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
-	default "4096" if ARM26 || SPARC32 || PPC64
+	default "4096" if PARISC && !PA20
 	default "4"
diff --git a/mm/fremap.c b/mm/fremap.c
index d862be3bc3e3..f851775e09c2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page = NULL;
 
 	if (pte_present(pte)) {
-		unsigned long pfn = pte_pfn(pte);
-		flush_cache_page(vma, addr, pfn);
+		flush_cache_page(vma, addr, pte_pfn(pte));
 		pte = ptep_clear_flush(vma, addr, ptep);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, pte, addr);
-			goto out;
+		page = vm_normal_page(vma, addr, pte);
+		if (page) {
+			if (pte_dirty(pte))
+				set_page_dirty(page);
+			page_remove_rmap(page);
+			page_cache_release(page);
 		}
-		page = pfn_to_page(pfn);
-		if (pte_dirty(pte))
-			set_page_dirty(page);
-		page_remove_rmap(page);
-		page_cache_release(page);
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear(mm, addr, ptep);
 	}
-out:
 	return !!page;
 }
 
@@ -65,8 +61,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte_val;
 	spinlock_t *ptl;
 
-	BUG_ON(vma->vm_flags & VM_RESERVED);
-
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
@@ -122,8 +116,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte_val;
 	spinlock_t *ptl;
 
-	BUG_ON(vma->vm_flags & VM_RESERVED);
-
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
@@ -204,12 +196,10 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 	 * Make sure the vma is shared, that it supports prefaulting,
 	 * and that the remapped range is valid and fully within
 	 * the single existing vma. vm_private_data is used as a
-	 * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED
-	 * or VM_LOCKED, but VM_LOCKED could be revoked later on).
+	 * swapout cursor in a VM_NONLINEAR vma.
 	 */
 	if (vma && (vma->vm_flags & VM_SHARED) &&
-		(!vma->vm_private_data ||
-			(vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
+		(!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
 		vma->vm_ops && vma->vm_ops->populate &&
 			end > start && start >= vma->vm_start &&
 				end <= vma->vm_end) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 728e9bda12ea..3e52df7c471b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,6 +22,10 @@ unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+
+/*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
 static void enqueue_huge_page(struct page *page)
@@ -61,8 +65,10 @@ static struct page *alloc_fresh_huge_page(void)
 					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % num_online_nodes();
 	if (page) {
+		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
+		spin_unlock(&hugetlb_lock);
 	}
 	return page;
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 17aaf3e16449..2b7cf0400a21 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
-	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
+	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index 2998cfc12f5b..6c1eac92a316 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 }
 
 /*
- * This function is called to print an error when a pte in a
- * !VM_RESERVED region is found pointing to an invalid pfn (which
- * is an error.
+ * This function is called to print an error when a bad pte
+ * is found. For example, we might have a PFN-mapped pte in
+ * a region that doesn't allow it.
  *
  * The calling function must still handle the error.
  */
@@ -350,6 +350,59 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
 }
 
 /*
+ * This function gets the "struct page" associated with a pte.
+ *
+ * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
+ * will have each page table entry just pointing to a raw page frame
+ * number, and as far as the VM layer is concerned, those do not have
+ * pages associated with them - even if the PFN might point to memory
+ * that otherwise is perfectly fine and has a "struct page".
+ *
+ * The way we recognize those mappings is through the rules set up
+ * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
+ * and the vm_pgoff will point to the first PFN mapped: thus every
+ * page that is a raw mapping will always honor the rule
+ *
+ *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
+ *
+ * and if that isn't true, the page has been COW'ed (in which case it
+ * _does_ have a "struct page" associated with it even if it is in a
+ * VM_PFNMAP range).
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+	unsigned long pfn = pte_pfn(pte);
+
+	if (vma->vm_flags & VM_PFNMAP) {
+		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
+		if (pfn == vma->vm_pgoff + off)
+			return NULL;
+	}
+
+	/*
+	 * Add some anal sanity checks for now. Eventually,
+	 * we should just do "return pfn_to_page(pfn)", but
+	 * in the meantime we check that we get a valid pfn,
+	 * and that the resulting page looks ok.
+	 *
+	 * Remove this test eventually!
+	 */
+	if (unlikely(!pfn_valid(pfn))) {
+		print_bad_pte(vma, pte, addr);
+		return NULL;
+	}
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page
+	 * tables.
+	 *
+	 * The PAGE_ZERO() pages and various VDSO mappings can
+	 * cause them to exist.
+	 */
+	return pfn_to_page(pfn);
+}
+
+/*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
  * covered by this vma.
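
The comment above defines the VM_PFNMAP identity rule that vm_normal_page() relies on. As a rough stand-alone sketch (user-space C with mock types and invented values, not the kernel code) of that rule:

/* Stand-alone sketch of the VM_PFNMAP identity rule; mock_vma is invented. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define VM_PFNMAP  0x1

struct mock_vma {
	unsigned long vm_start;
	unsigned long vm_pgoff;		/* first PFN mapped by remap_pfn_range() */
	unsigned long vm_flags;
};

/* Returns 1 if the pte's pfn is a "normal" page (has a struct page),
 * 0 if it is a raw PFN mapping that the VM must leave alone. */
static int is_normal(const struct mock_vma *vma, unsigned long addr, unsigned long pfn)
{
	if (vma->vm_flags & VM_PFNMAP) {
		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
		if (pfn == vma->vm_pgoff + off)
			return 0;	/* raw mapping: still honors the linear rule */
	}
	return 1;			/* COW'ed or ordinary page */
}

int main(void)
{
	struct mock_vma vma = { .vm_start = 0x40000000, .vm_pgoff = 0x800, .vm_flags = VM_PFNMAP };

	/* Third page of the vma still maps pfn 0x802: raw, no struct page. */
	printf("%d\n", is_normal(&vma, 0x40002000, 0x802));	/* prints 0 */
	/* Same address now maps some other pfn: it has been COW'ed. */
	printf("%d\n", is_normal(&vma, 0x40002000, 0x1234));	/* prints 1 */
	return 0;
}
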
@@ -363,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
-	unsigned long pfn;
 
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
@@ -381,23 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}
 
-	/* If the region is VM_RESERVED, the mapping is not
-	 * mapped via rmap - duplicate the pte as is.
-	 */
-	if (vm_flags & VM_RESERVED)
-		goto out_set_pte;
-
-	pfn = pte_pfn(pte);
-	/* If the pte points outside of valid memory but
-	 * the region is not VM_RESERVED, we have a problem.
-	 */
-	if (unlikely(!pfn_valid(pfn))) {
-		print_bad_pte(vma, pte, addr);
-		goto out_set_pte; /* try to do something sane */
-	}
-
-	page = pfn_to_page(pfn);
-
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -414,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (vm_flags & VM_SHARED)
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
-	get_page(page);
-	page_dup_rmap(page);
-	rss[!!PageAnon(page)]++;
+
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		get_page(page);
+		page_dup_rmap(page);
+		rss[!!PageAnon(page)]++;
+	}
 
 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
@@ -528,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) {
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -568,17 +607,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			continue;
 		}
 		if (pte_present(ptent)) {
-			struct page *page = NULL;
+			struct page *page;
 
 			(*zap_work) -= PAGE_SIZE;
 
-			if (!(vma->vm_flags & VM_RESERVED)) {
-				unsigned long pfn = pte_pfn(ptent);
-				if (unlikely(!pfn_valid(pfn)))
-					print_bad_pte(vma, ptent, addr);
-				else
-					page = pfn_to_page(pfn);
-			}
+			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
@@ -834,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 /*
  * Do a quick page-table lookup for a single page.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int flags)
 {
 	pgd_t *pgd;
@@ -842,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 	pmd_t *pmd;
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
-	unsigned long pfn;
 	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
 
 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 	if (!IS_ERR(page)) {
@@ -879,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 		goto unlock;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
-	pfn = pte_pfn(pte);
-	if (!pfn_valid(pfn))
+	page = vm_normal_page(vma, address, pte);
+	if (unlikely(!page))
 		goto unlock;
 
-	page = pfn_to_page(pfn);
 	if (flags & FOLL_GET)
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
@@ -956,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				return i ? : -EFAULT;
 			}
 			if (pages) {
-				pages[i] = pte_page(*pte);
-				get_page(pages[i]);
+				struct page *page = vm_normal_page(gate_vma, start, *pte);
+				pages[i] = page;
+				if (page)
+					get_page(page);
 			}
 			pte_unmap(pte);
 			if (vmas)
@@ -968,7 +1002,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			continue;
 		}
 
-		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
+		if (!vma || (vma->vm_flags & VM_IO)
 				|| !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
@@ -992,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				foll_flags |= FOLL_WRITE;
 
 			cond_resched();
-			while (!(page = follow_page(mm, start, foll_flags))) {
+			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
 				ret = __handle_mm_fault(mm, vma, start,
 						foll_flags & FOLL_WRITE);
@@ -1191,10 +1225,17 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * rest of the world about it:
 	 *   VM_IO tells people not to look at these pages
 	 *	(accesses can have side effects).
-	 *   VM_RESERVED tells the core MM not to "manage" these pages
-	 *	(e.g. refcount, mapcount, try to swap them out).
+	 *   VM_RESERVED is specified all over the place, because
+	 *	in 2.4 it kept swapout's vma scan off this vma; but
+	 *	in 2.6 the LRU scan won't even find its pages, so this
+	 *	flag means no more than count its pages in reserved_vm,
+	 *	and omit it from core dump, even when VM_IO turned off.
+	 *   VM_PFNMAP tells the core MM that the base pages are just
+	 *	raw PFN mappings, and do not have a "struct page" associated
+	 *	with them.
 	 */
-	vma->vm_flags |= VM_IO | VM_RESERVED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+	vma->vm_pgoff = pfn;
 
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
@@ -1249,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+	/*
+	 * If the source page was a PFN mapping, we don't have
+	 * a "struct page" for it. We do a best-effort copy by
+	 * just copying from the original user address. If that
+	 * fails, we just zero-fill it. Live with it.
+	 */
+	if (unlikely(!src)) {
+		void *kaddr = kmap_atomic(dst, KM_USER0);
+		unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE);
+		if (left)
+			memset(kaddr, 0, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+
+	}
+	copy_user_highpage(dst, src, va);
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
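
cow_user_page() above falls back to copying straight from the user virtual address when there is no source struct page, and zero-fills the destination if that copy faults. A rough user-space sketch of that fallback (copy_in() is an invented stand-in for __copy_from_user_inatomic(); none of this is the kernel code):

/* Stand-alone sketch of the COW copy-or-zero-fill fallback. */
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* Pretend-copy: returns the number of bytes NOT copied (0 on success). */
static unsigned long copy_in(void *dst, const void *src, unsigned long n)
{
	if (!src)
		return n;	/* simulate a faulting user address */
	memcpy(dst, src, n);
	return 0;
}

static void cow_fill(char *dst, const char *src_page, const char *user_va)
{
	if (!src_page) {
		/* No struct page to copy from: best-effort copy, else zeros. */
		if (copy_in(dst, user_va, PAGE_SIZE))
			memset(dst, 0, PAGE_SIZE);
		return;
	}
	memcpy(dst, src_page, PAGE_SIZE);	/* the normal highpage copy */
}

int main(void)
{
	static char src[PAGE_SIZE] = "hello", dst[PAGE_SIZE];

	cow_fill(dst, src, NULL);		/* normal path */
	printf("%s\n", dst);			/* hello */
	cow_fill(dst, NULL, NULL);		/* PFN-mapped source that faults */
	printf("%d\n", dst[0]);			/* 0: page was zero-filled */
	return 0;
}
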
@@ -1271,22 +1332,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		spinlock_t *ptl, pte_t orig_pte)
 {
-	struct page *old_page, *new_page;
-	unsigned long pfn = pte_pfn(orig_pte);
+	struct page *old_page, *src_page, *new_page;
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;
 
-	BUG_ON(vma->vm_flags & VM_RESERVED);
-
-	if (unlikely(!pfn_valid(pfn))) {
-		/*
-		 * Page table corrupted: show pte and kill process.
-		 */
-		print_bad_pte(vma, orig_pte, address);
-		ret = VM_FAULT_OOM;
-		goto unlock;
-	}
-	old_page = pfn_to_page(pfn);
+	old_page = vm_normal_page(vma, address, orig_pte);
+	src_page = old_page;
+	if (!old_page)
+		goto gotten;
 
 	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
@@ -1307,11 +1360,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Ok, we need to copy. Oh, well..
 	 */
 	page_cache_get(old_page);
+gotten:
 	pte_unmap_unlock(page_table, ptl);
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	if (old_page == ZERO_PAGE(address)) {
+	if (src_page == ZERO_PAGE(address)) {
 		new_page = alloc_zeroed_user_highpage(vma, address);
 		if (!new_page)
 			goto oom;
@@ -1319,7 +1373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		copy_user_highpage(new_page, old_page, address);
+		cow_user_page(new_page, src_page, address);
 	}
 
 	/*
@@ -1327,11 +1381,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
-		page_remove_rmap(old_page);
-		if (!PageAnon(old_page)) {
+		if (old_page) {
+			page_remove_rmap(old_page);
+			if (!PageAnon(old_page)) {
+				dec_mm_counter(mm, file_rss);
+				inc_mm_counter(mm, anon_rss);
+			}
+		} else
 			inc_mm_counter(mm, anon_rss);
-			dec_mm_counter(mm, file_rss);
-		}
 		flush_cache_page(vma, address, pfn);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1345,13 +1402,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
 	}
-	page_cache_release(new_page);
-	page_cache_release(old_page);
+	if (new_page)
+		page_cache_release(new_page);
+	if (old_page)
+		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	return ret;
 oom:
-	page_cache_release(old_page);
+	if (old_page)
+		page_cache_release(old_page);
 	return VM_FAULT_OOM;
 }
 
@@ -1849,7 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int anon = 0;
 
 	pte_unmap(page_table);
-
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 		sequence = mapping->truncate_count;
@@ -1882,7 +1941,7 @@ retry:
 		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!page)
 			goto oom;
-		copy_user_highpage(page, new_page, address);
+		cow_user_page(page, new_page, address);
 		page_cache_release(new_page);
 		new_page = page;
 		anon = 1;
@@ -1924,7 +1983,7 @@ retry:
 			inc_mm_counter(mm, anon_rss);
 			lru_cache_add_active(new_page);
 			page_add_anon_rmap(new_page, vma, address);
-		} else if (!(vma->vm_flags & VM_RESERVED)) {
+		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
 		}
@@ -2101,6 +2160,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
+#else
+/* Workaround for gcc 2.96 */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+	return 0;
+}
 #endif /* __PAGETABLE_PUD_FOLDED */
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -2129,6 +2194,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
+#else
+/* Workaround for gcc 2.96 */
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+{
+	return 0;
+}
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 int make_pages_present(unsigned long addr, unsigned long end)
@@ -2203,7 +2274,7 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = VM_RESERVED;
+	gate_vma.vm_flags = 0;
 	return 0;
 }
 __initcall(gate_vma_init);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5abc57c2b8bd..bec88c81244e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		unsigned long pfn;
+		struct page *page;
 		unsigned int nid;
 
 		if (!pte_present(*pte))
 			continue;
-		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn)) {
-			print_bad_pte(vma, *pte, addr);
+		page = vm_normal_page(vma, addr, *pte);
+		if (!page)
 			continue;
-		}
-		nid = pfn_to_nid(pfn);
+		nid = page_to_nid(page);
 		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
-	if (first->vm_flags & VM_RESERVED)
-		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
diff --git a/mm/mmap.c b/mm/mmap.c
index 4f8def03428c..11ca5927d5ff 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1076,17 +1076,6 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
-		if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
-				== (VM_WRITE | VM_RESERVED)) {
-			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
-					"PROT_WRITE mmap of VM_RESERVED memory, which "
-					"is deprecated. Please report this to "
-					"linux-kernel@vger.kernel.org\n",current->comm);
-			if (vma->vm_ops && vma->vm_ops->close)
-				vma->vm_ops->close(vma);
-			error = -EACCES;
-			goto unmap_and_free_vma;
-		}
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
 		if (error)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 17a2b52b753b..653b8571c1ed 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,14 +124,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 * a MAP_NORESERVE private mapping to writable will now reserve.
 	 */
 	if (newflags & VM_WRITE) {
-		if (oldflags & VM_RESERVED) {
-			BUG_ON(oldflags & VM_WRITE);
-			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
-				"PROT_WRITE mprotect of VM_RESERVED memory, "
-				"which is deprecated. Please report this to "
-				"linux-kernel@vger.kernel.org\n",current->comm);
-			return -EACCES;
-		}
 		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
diff --git a/mm/msync.c b/mm/msync.c
index 0e040e9c39d8..1b5b6f662dcf 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 again:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		unsigned long pfn;
 		struct page *page;
 
 		if (progress >= 64) {
@@ -40,13 +39,9 @@ again:
 			continue;
 		if (!pte_maybe_dirty(*pte))
 			continue;
-		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, addr);
+		page = vm_normal_page(vma, addr, *pte);
+		if (!page)
 			continue;
-		}
-		page = pfn_to_page(pfn);
-
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
 			set_page_dirty(page);
@@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma,
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
 	 * to do anything more on an msync().
-	 * Can't do anything with VM_RESERVED regions either.
 	 */
-	if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
+	if (vma->vm_flags & VM_HUGETLB)
 		return;
 
 	BUG_ON(addr >= end);
diff --git a/mm/nommu.c b/mm/nommu.c
index 6deb6ab3d6ad..c1196812876b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 
 EXPORT_SYMBOL(find_vma);
 
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int foll_flags)
 {
 	return NULL;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd4de592dc23..b257720edfc8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -140,18 +140,13 @@ static void bad_page(const char *function, struct page *page)
 			1 << PG_reclaim |
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_reserved );
+			1 << PG_writeback );
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
 	add_taint(TAINT_BAD_PAGE);
 }
 
-#ifndef CONFIG_HUGETLB_PAGE
-#define prep_compound_page(page, order) do { } while (0)
-#define destroy_compound_page(page, order) do { } while (0)
-#else
 /*
  * Higher-order pages are called "compound pages". They are structured thusly:
  *
@@ -205,7 +200,6 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 		ClearPageCompound(p);
 	}
 }
-#endif /* CONFIG_HUGETLB_PAGE */
 
 /*
  * function for dealing with page's order in buddy system.
@@ -340,7 +334,7 @@ static inline void __free_pages_bulk (struct page *page,
 	zone->free_area[order].nr_free++;
 }
 
-static inline void free_pages_check(const char *function, struct page *page)
+static inline int free_pages_check(const char *function, struct page *page)
 {
 	if (	page_mapcount(page) ||
 		page->mapping != NULL ||
@@ -358,6 +352,12 @@ static inline void free_pages_check(const char *function, struct page *page)
 		bad_page(function, page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
+	/*
+	 * For now, we report if PG_reserved was found set, but do not
+	 * clear it, and do not free the page. But we shall soon need
+	 * to do more, for when the ZERO_PAGE count wraps negative.
+	 */
+	return PageReserved(page);
 }
 
 /*
@@ -397,11 +397,10 @@ void __free_pages_ok(struct page *page, unsigned int order)
 {
 	LIST_HEAD(list);
 	int i;
+	int reserved = 0;
 
 	arch_free_page(page, order);
 
-	mod_page_state(pgfree, 1 << order);
-
 #ifndef CONFIG_MMU
 	if (order > 0)
 		for (i = 1 ; i < (1 << order) ; ++i)
@@ -409,8 +408,12 @@ void __free_pages_ok(struct page *page, unsigned int order)
 #endif
 
 	for (i = 0 ; i < (1 << order) ; ++i)
-		free_pages_check(__FUNCTION__, page + i);
+		reserved += free_pages_check(__FUNCTION__, page + i);
+	if (reserved)
+		return;
+
 	list_add(&page->lru, &list);
+	mod_page_state(pgfree, 1 << order);
 	kernel_map_pages(page, 1<<order, 0);
 	free_pages_bulk(page_zone(page), 1, &list, order);
 }
@@ -468,7 +471,7 @@ void set_page_refs(struct page *page, int order)
 /*
  * This page is about to be returned from the page allocator
  */
-static void prep_new_page(struct page *page, int order)
+static int prep_new_page(struct page *page, int order)
 {
 	if (	page_mapcount(page) ||
 		page->mapping != NULL ||
@@ -486,12 +489,20 @@ static void prep_new_page(struct page *page, int order)
 			1 << PG_reserved )))
 		bad_page(__FUNCTION__, page);
 
+	/*
+	 * For now, we report if PG_reserved was found set, but do not
+	 * clear it, and do not allocate the page: as a safety net.
+	 */
+	if (PageReserved(page))
+		return 1;
+
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
 	set_page_refs(page, order);
 	kernel_map_pages(page, 1 << order, 1);
+	return 0;
 }
 
 /*
@@ -674,11 +685,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 
 	arch_free_page(page, 0);
 
-	kernel_map_pages(page, 1, 0);
-	inc_page_state(pgfree);
 	if (PageAnon(page))
 		page->mapping = NULL;
-	free_pages_check(__FUNCTION__, page);
+	if (free_pages_check(__FUNCTION__, page))
+		return;
+
+	inc_page_state(pgfree);
+	kernel_map_pages(page, 1, 0);
+
 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
 	list_add(&page->lru, &pcp->list);
@@ -717,12 +731,14 @@ static struct page *
 buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
-	struct page *page = NULL;
+	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
 
+again:
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
 
+		page = NULL;
 		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
 		if (pcp->count <= pcp->low)
@@ -744,7 +760,8 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 	if (page != NULL) {
 		BUG_ON(bad_range(zone, page));
 		mod_page_state_zone(zone, pgalloc, 1 << order);
-		prep_new_page(page, order);
+		if (prep_new_page(page, order))
+			goto again;
 
 		if (gfp_flags & __GFP_ZERO)
 			prep_zero_page(page, order, gfp_flags);
@@ -756,9 +773,12 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 }
 
 #define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
-#define ALLOC_HARDER		0x02 /* try to alloc harder */
-#define ALLOC_HIGH		0x04 /* __GFP_HIGH set */
-#define ALLOC_CPUSET		0x08 /* check for correct cpuset */
+#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
+#define ALLOC_HARDER		0x10 /* try to alloc harder */
+#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
 
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
@@ -813,7 +833,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 			continue;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
-			if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+			unsigned long mark;
+			if (alloc_flags & ALLOC_WMARK_MIN)
+				mark = (*z)->pages_min;
+			else if (alloc_flags & ALLOC_WMARK_LOW)
+				mark = (*z)->pages_low;
+			else
+				mark = (*z)->pages_high;
+			if (!zone_watermark_ok(*z, order, mark,
 				       classzone_idx, alloc_flags))
 				continue;
 		}
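
The hunk above replaces the single pages_low test with a watermark selected by the new ALLOC_WMARK_* bits. A rough stand-alone sketch of that selection (plain user-space C; the mock_zone type and the numbers are invented, not the kernel structures):

/* Stand-alone sketch of watermark selection by alloc_flags bits. */
#include <stdio.h>

#define ALLOC_NO_WATERMARKS	0x01
#define ALLOC_WMARK_MIN		0x02
#define ALLOC_WMARK_LOW		0x04
#define ALLOC_WMARK_HIGH	0x08

struct mock_zone { unsigned long pages_min, pages_low, pages_high, free_pages; };

static int watermark_ok(const struct mock_zone *z, int alloc_flags)
{
	unsigned long mark;

	if (alloc_flags & ALLOC_NO_WATERMARKS)
		return 1;			/* don't check at all */
	if (alloc_flags & ALLOC_WMARK_MIN)
		mark = z->pages_min;
	else if (alloc_flags & ALLOC_WMARK_LOW)
		mark = z->pages_low;
	else
		mark = z->pages_high;
	return z->free_pages > mark;
}

int main(void)
{
	struct mock_zone z = { .pages_min = 100, .pages_low = 200, .pages_high = 300, .free_pages = 250 };

	printf("%d %d %d\n",
	       watermark_ok(&z, ALLOC_WMARK_MIN),	/* 1: 250 > 100 */
	       watermark_ok(&z, ALLOC_WMARK_LOW),	/* 1: 250 > 200 */
	       watermark_ok(&z, ALLOC_WMARK_HIGH));	/* 0: 250 <= 300 */
	return 0;
}
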
@@ -854,7 +881,7 @@ restart:
 	}
 
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-				zonelist, ALLOC_CPUSET);
+				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
 	if (page)
 		goto got_pg;
 
@@ -871,7 +898,7 @@ restart:
 	 * cannot run direct reclaim, or if the caller has realtime scheduling
 	 * policy.
 	 */
-	alloc_flags = 0;
+	alloc_flags = ALLOC_WMARK_MIN;
 	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
 		alloc_flags |= ALLOC_HARDER;
 	if (gfp_mask & __GFP_HIGH)
@@ -942,7 +969,7 @@ rebalance:
 		 * under heavy pressure.
 		 */
 		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-					zonelist, ALLOC_CPUSET);
+					zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 		if (page)
 			goto got_pg;
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 914d04b98bee..491ac350048f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -225,7 +225,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 
 /*
  * At what user virtual address is page expected in vma? checking that the
- * page matches the vma: currently only used by unuse_process, on anon pages.
+ * page matches the vma: currently only used on anon pages, by unuse_vma;
  */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
@@ -234,7 +234,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 			(void *)page->mapping - PAGE_MAPPING_ANON)
 			return -EFAULT;
 	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
-		if (vma->vm_file->f_mapping != page->mapping)
+		if (!vma->vm_file ||
+		    vma->vm_file->f_mapping != page->mapping)
 			return -EFAULT;
 	} else
 		return -EFAULT;
@@ -289,7 +290,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  * repeatedly from either page_referenced_anon or page_referenced_file.
  */
 static int page_referenced_one(struct page *page,
-	struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
+	struct vm_area_struct *vma, unsigned int *mapcount)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -310,7 +311,7 @@ static int page_referenced_one(struct page *page,
 
 	/* Pretend the page is referenced if the task has the
 	   swap token and is in the middle of a page fault. */
-	if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
+	if (mm != current->mm && has_swap_token(mm) &&
 			rwsem_is_locked(&mm->mmap_sem))
 		referenced++;
 
@@ -320,7 +321,7 @@ out:
 	return referenced;
 }
 
-static int page_referenced_anon(struct page *page, int ignore_token)
+static int page_referenced_anon(struct page *page)
 {
 	unsigned int mapcount;
 	struct anon_vma *anon_vma;
@@ -333,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
 
 	mapcount = page_mapcount(page);
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-		referenced += page_referenced_one(page, vma, &mapcount,
-							ignore_token);
+		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
 			break;
 	}
@@ -353,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
  *
  * This function is only called from page_referenced for object-based pages.
  */
-static int page_referenced_file(struct page *page, int ignore_token)
+static int page_referenced_file(struct page *page)
 {
 	unsigned int mapcount;
 	struct address_space *mapping = page->mapping;
@@ -391,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token)
 				referenced++;
 			break;
 		}
-		referenced += page_referenced_one(page, vma, &mapcount,
-							ignore_token);
+		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
 			break;
 	}
@@ -409,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token)
  * Quick test_and_clear_referenced for all mappings to a page,
  * returns the number of ptes which referenced the page.
  */
-int page_referenced(struct page *page, int is_locked, int ignore_token)
+int page_referenced(struct page *page, int is_locked)
 {
 	int referenced = 0;
 
-	if (!swap_token_default_timeout)
-		ignore_token = 1;
-
 	if (page_test_and_clear_young(page))
 		referenced++;
 
@@ -424,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
 
 	if (page_mapped(page) && page->mapping) {
 		if (PageAnon(page))
-			referenced += page_referenced_anon(page, ignore_token);
+			referenced += page_referenced_anon(page);
 		else if (is_locked)
-			referenced += page_referenced_file(page, ignore_token);
+			referenced += page_referenced_file(page);
 		else if (TestSetPageLocked(page))
 			referenced++;
 		else {
 			if (page->mapping)
-				referenced += page_referenced_file(page,
-								ignore_token);
+				referenced += page_referenced_file(page);
 			unlock_page(page);
 		}
 	}
@@ -529,10 +524,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	 * If the page is mlock()d, we cannot swap it out.
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
-	 *
-	 * Pages belonging to VM_RESERVED regions should not happen here.
 	 */
-	if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
+	if ((vma->vm_flags & VM_LOCKED) ||
 			ptep_clear_flush_young(vma, address, pte)) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
@@ -613,7 +606,6 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
-	unsigned long pfn;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -642,15 +634,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	for (; address < end; pte++, address += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
-
-		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, address);
-			continue;
-		}
-
-		page = pfn_to_page(pfn);
-		BUG_ON(PageAnon(page));
+		page = vm_normal_page(vma, address, *pte);
+		BUG_ON(!page || PageAnon(page));
 
 		if (ptep_clear_flush_young(vma, address, pte))
 			continue;
@@ -727,7 +712,7 @@ static int try_to_unmap_file(struct page *page)
 
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-		if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+		if (vma->vm_flags & VM_LOCKED)
 			continue;
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
@@ -761,7 +746,7 @@ static int try_to_unmap_file(struct page *page)
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-			if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+			if (vma->vm_flags & VM_LOCKED)
 				continue;
 			cursor = (unsigned long) vma->vm_private_data;
 			while ( cursor < max_nl_cursor &&
@@ -783,11 +768,8 @@ static int try_to_unmap_file(struct page *page)
 	 * in locked vmas). Reset cursor on all unreserved nonlinear
 	 * vmas, now forgetting on which ones it had fallen behind.
 	 */
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
-						shared.vm_set.list) {
-		if (!(vma->vm_flags & VM_RESERVED))
-			vma->vm_private_data = NULL;
-	}
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		vma->vm_private_data = NULL;
 out:
 	spin_unlock(&mapping->i_mmap_lock);
 	return ret;
diff --git a/mm/swap.c b/mm/swap.c
index d09cf7f03e76..73d351439ef6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,8 +34,6 @@
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
-#ifdef CONFIG_HUGETLB_PAGE
-
 void put_page(struct page *page)
 {
 	if (unlikely(PageCompound(page))) {
@@ -52,7 +50,6 @@ void put_page(struct page *page)
 	__page_cache_release(page);
 }
 EXPORT_SYMBOL(put_page);
-#endif
 
 /*
  * Writeback is about to end against a page which has been marked for immediate
diff --git a/mm/thrash.c b/mm/thrash.c
index eff3c18c33a1..f4c560b4a2b7 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -57,14 +57,17 @@ void grab_swap_token(void)
 	/* We have the token. Let others know we still need it. */
 	if (has_swap_token(current->mm)) {
 		current->mm->recent_pagein = 1;
+		if (unlikely(!swap_token_default_timeout))
+			disable_swap_token();
 		return;
 	}
 
 	if (time_after(jiffies, swap_token_check)) {
 
-		/* Can't get swapout protection if we exceed our RSS limit. */
-		// if (current->mm->rss > current->mm->rlimit_rss)
-		//	return;
+		if (!swap_token_default_timeout) {
+			swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+			return;
+		}
 
 		/* ... or if we recently held the token. */
 		if (time_before(jiffies, current->mm->swap_token_time))
@@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm)
 {
 	spin_lock(&swap_token_lock);
 	if (likely(mm == swap_token_mm)) {
+		mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
 		swap_token_mm = &init_mm;
 		swap_token_check = jiffies;
 	}
diff --git a/mm/truncate.c b/mm/truncate.c
index 29c18f68dc35..9173ab500604 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -282,8 +282,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				 * Zap the rest of the file in one hit.
 				 */
 				unmap_mapping_range(mapping,
-					page_index << PAGE_CACHE_SHIFT,
-					(end - page_index + 1)
+					(loff_t)page_index<<PAGE_CACHE_SHIFT,
+					(loff_t)(end - page_index + 1)
 							<< PAGE_CACHE_SHIFT,
 					    0);
 				did_range_unmap = 1;
@@ -292,7 +292,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				 * Just zap this page
 				 */
 				unmap_mapping_range(mapping,
-					page_index << PAGE_CACHE_SHIFT,
+					(loff_t)page_index<<PAGE_CACHE_SHIFT,
 					PAGE_CACHE_SIZE, 0);
 			}
 		}
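
The casts above matter because page_index is a page number held in an unsigned long: on a 32-bit build, shifting it by PAGE_CACHE_SHIFT wraps for file offsets at or beyond 4GB. A small stand-alone demonstration (uint32_t standing in for a 32-bit unsigned long, int64_t for loff_t; the values are invented):

/* Demonstrates the 32-bit shift overflow the (loff_t) casts avoid. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define PAGE_CACHE_SHIFT 12

int main(void)
{
	uint32_t page_index = 0x00200000;	/* a page 8GB into the file */

	uint32_t truncated = page_index << PAGE_CACHE_SHIFT;		/* wraps to 0 */
	int64_t  correct   = (int64_t)page_index << PAGE_CACHE_SHIFT;	/* 8GB */

	printf("truncated: %" PRIu32 "\n", truncated);	/* 0 */
	printf("correct:   %" PRId64 "\n", correct);	/* 8589934592 */
	return 0;
}
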
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 28130541270f..b0cd81c32de6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -201,13 +201,25 @@ static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 	list_for_each_entry(shrinker, &shrinker_list, list) {
 		unsigned long long delta;
 		unsigned long total_scan;
+		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
 
 		delta = (4 * scanned) / shrinker->seeks;
-		delta *= (*shrinker->shrinker)(0, gfp_mask);
+		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
 		shrinker->nr += delta;
-		if (shrinker->nr < 0)
-			shrinker->nr = LONG_MAX;	/* It wrapped! */
+		if (shrinker->nr < 0) {
+			printk(KERN_ERR "%s: nr=%ld\n",
+					__FUNCTION__, shrinker->nr);
+			shrinker->nr = max_pass;
+		}
+
+		/*
+		 * Avoid risking looping forever due to too large nr value:
+		 * never try to free more than twice the estimate number of
+		 * freeable entries.
+		 */
+		if (shrinker->nr > max_pass * 2)
+			shrinker->nr = max_pass * 2;
 
 		total_scan = shrinker->nr;
 		shrinker->nr = 0;
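
The hunk above repairs a negative shrinker->nr and caps it at twice the pool size the shrinker reports. A rough stand-alone sketch of that clamping (plain C; function and values are invented for illustration):

/* Stand-alone sketch of the shrinker work-counter clamping. */
#include <stdio.h>

static long clamp_shrinker_nr(long nr, long delta, unsigned long max_pass)
{
	nr += delta;
	if (nr < 0)
		nr = max_pass;		/* recover from a wrap or accounting bug */
	if (nr > (long)(max_pass * 2))
		nr = max_pass * 2;	/* never scan more than twice the estimate */
	return nr;
}

int main(void)
{
	printf("%ld\n", clamp_shrinker_nr(100, 50, 1000));	/* 150 */
	printf("%ld\n", clamp_shrinker_nr(100, -500, 1000));	/* went negative -> 1000 */
	printf("%ld\n", clamp_shrinker_nr(1900, 500, 1000));	/* capped at 2000 */
	return 0;
}
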
@@ -407,7 +419,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		if (PageWriteback(page))
 			goto keep_locked;
 
-		referenced = page_referenced(page, 1, sc->priority <= 0);
+		referenced = page_referenced(page, 1);
 		/* In active use or really unfreeable? Activate it. */
 		if (referenced && page_mapping_inuse(page))
 			goto activate_locked;
@@ -756,7 +768,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 		if (page_mapped(page)) {
 			if (!reclaim_mapped ||
 			    (total_swap_pages == 0 && PageAnon(page)) ||
-			    page_referenced(page, 0, sc->priority <= 0)) {
+			    page_referenced(page, 0)) {
 				list_add(&page->lru, &l_active);
 				continue;
 			}
@@ -960,6 +972,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		sc.nr_reclaimed = 0;
 		sc.priority = priority;
 		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+		if (!priority)
+			disable_swap_token();
 		shrink_caches(zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
@@ -1056,6 +1070,10 @@ loop_again:
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;
 
+		/* The swap token gets in the way of swapout... */
+		if (!priority)
+			disable_swap_token();
+
 		all_zones_ok = 1;
 
 		if (nr_pages == 0) {
@@ -1360,6 +1378,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	sc.nr_reclaimed = 0;
 	/* scan at the highest priority */
 	sc.priority = 0;
+	disable_swap_token();
 
 	if (nr_pages > SWAP_CLUSTER_MAX)
 		sc.swap_cluster_max = nr_pages;