author     Anton Altaparmakov <aia21@cantab.net>   2005-12-05 10:48:41 -0500
committer  Anton Altaparmakov <aia21@cantab.net>   2005-12-05 10:48:41 -0500
commit     292d4ed32e35df4755052b5002e533348d1648fd (patch)
tree       8522e6bab962696bd25a6c02fb068c674a09b7ee /mm
parent     3c6af7fa787f21f8873a050568ed892312899eb5 (diff)
parent     e4f5c82a92c2a546a16af1614114eec19120e40a (diff)
Merge branch 'master' of /usr/src/ntfs-2.6/
Diffstat (limited to 'mm')
-rw-r--r--  mm/fremap.c      |  46
-rw-r--r--  mm/madvise.c     |   2
-rw-r--r--  mm/memory.c      | 341
-rw-r--r--  mm/mempolicy.c   |  12
-rw-r--r--  mm/msync.c       |  12
-rw-r--r--  mm/nommu.c       |   2
-rw-r--r--  mm/page_alloc.c  |  40
-rw-r--r--  mm/rmap.c        |  42
-rw-r--r--  mm/thrash.c      |  10
-rw-r--r--  mm/vmscan.c      |  29
10 files changed, 336 insertions(+), 200 deletions(-)
diff --git a/mm/fremap.c b/mm/fremap.c
index 007cbad933..9f381e58bf 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
27 struct page *page = NULL; 27 struct page *page = NULL;
28 28
29 if (pte_present(pte)) { 29 if (pte_present(pte)) {
30 unsigned long pfn = pte_pfn(pte); 30 flush_cache_page(vma, addr, pte_pfn(pte));
31 flush_cache_page(vma, addr, pfn);
32 pte = ptep_clear_flush(vma, addr, ptep); 31 pte = ptep_clear_flush(vma, addr, ptep);
33 if (unlikely(!pfn_valid(pfn))) { 32 page = vm_normal_page(vma, addr, pte);
34 print_bad_pte(vma, pte, addr); 33 if (page) {
35 goto out; 34 if (pte_dirty(pte))
35 set_page_dirty(page);
36 page_remove_rmap(page);
37 page_cache_release(page);
36 } 38 }
37 page = pfn_to_page(pfn);
38 if (pte_dirty(pte))
39 set_page_dirty(page);
40 page_remove_rmap(page);
41 page_cache_release(page);
42 } else { 39 } else {
43 if (!pte_file(pte)) 40 if (!pte_file(pte))
44 free_swap_and_cache(pte_to_swp_entry(pte)); 41 free_swap_and_cache(pte_to_swp_entry(pte));
45 pte_clear(mm, addr, ptep); 42 pte_clear(mm, addr, ptep);
46 } 43 }
47out:
48 return !!page; 44 return !!page;
49} 45}
50 46
@@ -59,22 +55,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
59 pgoff_t size; 55 pgoff_t size;
60 int err = -ENOMEM; 56 int err = -ENOMEM;
61 pte_t *pte; 57 pte_t *pte;
62 pmd_t *pmd;
63 pud_t *pud;
64 pgd_t *pgd;
65 pte_t pte_val; 58 pte_t pte_val;
66 spinlock_t *ptl; 59 spinlock_t *ptl;
67 60
68 BUG_ON(vma->vm_flags & VM_UNPAGED); 61 pte = get_locked_pte(mm, addr, &ptl);
69
70 pgd = pgd_offset(mm, addr);
71 pud = pud_alloc(mm, pgd, addr);
72 if (!pud)
73 goto out;
74 pmd = pmd_alloc(mm, pud, addr);
75 if (!pmd)
76 goto out;
77 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
78 if (!pte) 62 if (!pte)
79 goto out; 63 goto out;
80 64
@@ -116,22 +100,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
116{ 100{
117 int err = -ENOMEM; 101 int err = -ENOMEM;
118 pte_t *pte; 102 pte_t *pte;
119 pmd_t *pmd;
120 pud_t *pud;
121 pgd_t *pgd;
122 pte_t pte_val; 103 pte_t pte_val;
123 spinlock_t *ptl; 104 spinlock_t *ptl;
124 105
125 BUG_ON(vma->vm_flags & VM_UNPAGED); 106 pte = get_locked_pte(mm, addr, &ptl);
126
127 pgd = pgd_offset(mm, addr);
128 pud = pud_alloc(mm, pgd, addr);
129 if (!pud)
130 goto out;
131 pmd = pmd_alloc(mm, pud, addr);
132 if (!pmd)
133 goto out;
134 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
135 if (!pte) 107 if (!pte)
136 goto out; 108 goto out;
137 109
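
Both install_page() and install_file_pte() now delegate the four-level walk to get_locked_pte(), which this same series adds in mm/memory.c (see the hunk further down). A minimal caller-side sketch of the shared shape, where pte_install_example() is a made-up name rather than anything in the patch:

/* Sketch only: the pattern both installers now share. */
static int pte_install_example(struct mm_struct *mm, unsigned long addr,
			       struct page *page, pgprot_t prot)
{
	spinlock_t *ptl;
	pte_t *pte = get_locked_pte(mm, addr, &ptl);	/* allocates pud/pmd/pte as needed */

	if (!pte)
		return -ENOMEM;		/* a page-table level could not be allocated */

	/* ... inspect or drop the old entry, then install, all under ptl ... */
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	pte_unmap_unlock(pte, ptl);
	return 0;
}
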
diff --git a/mm/madvise.c b/mm/madvise.c
index 328a3bcce5..2b7cf0400a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
126 unsigned long start, unsigned long end) 126 unsigned long start, unsigned long end)
127{ 127{
128 *prev = vma; 128 *prev = vma;
129 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED)) 129 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
130 return -EINVAL; 130 return -EINVAL;
131 131
132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) { 132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index d1f46f4e4c..aa8af0e202 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
333} 333}
334 334
335/* 335/*
336 * This function is called to print an error when a pte in a 336 * This function is called to print an error when a bad pte
337 * !VM_UNPAGED region is found pointing to an invalid pfn (which 337 * is found. For example, we might have a PFN-mapped pte in
338 * is an error. 338 * a region that doesn't allow it.
339 * 339 *
340 * The calling function must still handle the error. 340 * The calling function must still handle the error.
341 */ 341 */
@@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
350} 350}
351 351
352/* 352/*
353 * page_is_anon applies strict checks for an anonymous page belonging to 353 * This function gets the "struct page" associated with a pte.
354 * this vma at this address. It is used on VM_UNPAGED vmas, which are 354 *
355 * usually populated with shared originals (which must not be counted), 355 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
356 * but occasionally contain private COWed copies (when !VM_SHARED, or 356 * will have each page table entry just pointing to a raw page frame
357 * perhaps via ptrace when VM_SHARED). An mmap of /dev/mem might window 357 * number, and as far as the VM layer is concerned, those do not have
358 * free pages, pages from other processes, or from other parts of this: 358 * pages associated with them - even if the PFN might point to memory
359 * it's tricky, but try not to be deceived by foreign anonymous pages. 359 * that otherwise is perfectly fine and has a "struct page".
360 *
361 * The way we recognize those mappings is through the rules set up
362 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
363 * and the vm_pgoff will point to the first PFN mapped: thus every
364 * page that is a raw mapping will always honor the rule
365 *
366 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
367 *
368 * and if that isn't true, the page has been COW'ed (in which case it
369 * _does_ have a "struct page" associated with it even if it is in a
370 * VM_PFNMAP range).
360 */ 371 */
361static inline int page_is_anon(struct page *page, 372struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
362 struct vm_area_struct *vma, unsigned long addr)
363{ 373{
364 return page && PageAnon(page) && page_mapped(page) && 374 unsigned long pfn = pte_pfn(pte);
365 page_address_in_vma(page, vma) == addr; 375
376 if (vma->vm_flags & VM_PFNMAP) {
377 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
378 if (pfn == vma->vm_pgoff + off)
379 return NULL;
380 }
381
382 /*
383 * Add some anal sanity checks for now. Eventually,
384 * we should just do "return pfn_to_page(pfn)", but
385 * in the meantime we check that we get a valid pfn,
386 * and that the resulting page looks ok.
387 *
388 * Remove this test eventually!
389 */
390 if (unlikely(!pfn_valid(pfn))) {
391 print_bad_pte(vma, pte, addr);
392 return NULL;
393 }
394
395 /*
396 * NOTE! We still have PageReserved() pages in the page
397 * tables.
398 *
399 * The PAGE_ZERO() pages and various VDSO mappings can
400 * cause them to exist.
401 */
402 return pfn_to_page(pfn);
366} 403}
367 404
368/* 405/*
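
The rule in the new comment collapses to a single comparison. Restated as a stand-alone predicate (pfnmap_is_raw() is a name invented for illustration, not part of the patch): inside a VM_PFNMAP vma set up by remap_pfn_range(), a pte is a raw frame-number mapping with no struct page exactly when its pfn lies at the linear offset from vm_pgoff; a pte that falls off that line must be a COWed page, which does have a struct page.

/* Illustrative only: true means "raw PFN mapping, no struct page". */
static inline int pfnmap_is_raw(struct vm_area_struct *vma,
				unsigned long addr, pte_t pte)
{
	unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

	return (vma->vm_flags & VM_PFNMAP) &&
	       pte_pfn(pte) == vma->vm_pgoff + off;
}
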
@@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
379 unsigned long vm_flags = vma->vm_flags; 416 unsigned long vm_flags = vma->vm_flags;
380 pte_t pte = *src_pte; 417 pte_t pte = *src_pte;
381 struct page *page; 418 struct page *page;
382 unsigned long pfn;
383 419
384 /* pte contains position in swap or file, so copy. */ 420 /* pte contains position in swap or file, so copy. */
385 if (unlikely(!pte_present(pte))) { 421 if (unlikely(!pte_present(pte))) {
@@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
397 goto out_set_pte; 433 goto out_set_pte;
398 } 434 }
399 435
400 pfn = pte_pfn(pte);
401 page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
402
403 if (unlikely(vm_flags & VM_UNPAGED))
404 if (!page_is_anon(page, vma, addr))
405 goto out_set_pte;
406
407 /*
408 * If the pte points outside of valid memory but
409 * the region is not VM_UNPAGED, we have a problem.
410 */
411 if (unlikely(!page)) {
412 print_bad_pte(vma, pte, addr);
413 goto out_set_pte; /* try to do something sane */
414 }
415
416 /* 436 /*
417 * If it's a COW mapping, write protect it both 437 * If it's a COW mapping, write protect it both
418 * in the parent and the child 438 * in the parent and the child
@@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
429 if (vm_flags & VM_SHARED) 449 if (vm_flags & VM_SHARED)
430 pte = pte_mkclean(pte); 450 pte = pte_mkclean(pte);
431 pte = pte_mkold(pte); 451 pte = pte_mkold(pte);
432 get_page(page); 452
433 page_dup_rmap(page); 453 page = vm_normal_page(vma, addr, pte);
434 rss[!!PageAnon(page)]++; 454 if (page) {
455 get_page(page);
456 page_dup_rmap(page);
457 rss[!!PageAnon(page)]++;
458 }
435 459
436out_set_pte: 460out_set_pte:
437 set_pte_at(dst_mm, addr, dst_pte, pte); 461 set_pte_at(dst_mm, addr, dst_pte, pte);
@@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
543 * readonly mappings. The tradeoff is that copy_page_range is more 567 * readonly mappings. The tradeoff is that copy_page_range is more
544 * efficient than faulting. 568 * efficient than faulting.
545 */ 569 */
546 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) { 570 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
547 if (!vma->anon_vma) 571 if (!vma->anon_vma)
548 return 0; 572 return 0;
549 } 573 }
@@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
584 } 608 }
585 if (pte_present(ptent)) { 609 if (pte_present(ptent)) {
586 struct page *page; 610 struct page *page;
587 unsigned long pfn;
588 611
589 (*zap_work) -= PAGE_SIZE; 612 (*zap_work) -= PAGE_SIZE;
590 613
591 pfn = pte_pfn(ptent); 614 page = vm_normal_page(vma, addr, ptent);
592 page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
593
594 if (unlikely(vma->vm_flags & VM_UNPAGED)) {
595 if (!page_is_anon(page, vma, addr))
596 page = NULL;
597 } else if (unlikely(!page))
598 print_bad_pte(vma, ptent, addr);
599
600 if (unlikely(details) && page) { 615 if (unlikely(details) && page) {
601 /* 616 /*
602 * unmap_shared_mapping_pages() wants to 617 * unmap_shared_mapping_pages() wants to
@@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
852/* 867/*
853 * Do a quick page-table lookup for a single page. 868 * Do a quick page-table lookup for a single page.
854 */ 869 */
855struct page *follow_page(struct mm_struct *mm, unsigned long address, 870struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
856 unsigned int flags) 871 unsigned int flags)
857{ 872{
858 pgd_t *pgd; 873 pgd_t *pgd;
@@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
860 pmd_t *pmd; 875 pmd_t *pmd;
861 pte_t *ptep, pte; 876 pte_t *ptep, pte;
862 spinlock_t *ptl; 877 spinlock_t *ptl;
863 unsigned long pfn;
864 struct page *page; 878 struct page *page;
879 struct mm_struct *mm = vma->vm_mm;
865 880
866 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 881 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
867 if (!IS_ERR(page)) { 882 if (!IS_ERR(page)) {
@@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
897 goto unlock; 912 goto unlock;
898 if ((flags & FOLL_WRITE) && !pte_write(pte)) 913 if ((flags & FOLL_WRITE) && !pte_write(pte))
899 goto unlock; 914 goto unlock;
900 pfn = pte_pfn(pte); 915 page = vm_normal_page(vma, address, pte);
901 if (!pfn_valid(pfn)) 916 if (unlikely(!page))
902 goto unlock; 917 goto unlock;
903 918
904 page = pfn_to_page(pfn);
905 if (flags & FOLL_GET) 919 if (flags & FOLL_GET)
906 get_page(page); 920 get_page(page);
907 if (flags & FOLL_TOUCH) { 921 if (flags & FOLL_TOUCH) {
@@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
974 return i ? : -EFAULT; 988 return i ? : -EFAULT;
975 } 989 }
976 if (pages) { 990 if (pages) {
977 pages[i] = pte_page(*pte); 991 struct page *page = vm_normal_page(gate_vma, start, *pte);
978 get_page(pages[i]); 992 pages[i] = page;
993 if (page)
994 get_page(page);
979 } 995 }
980 pte_unmap(pte); 996 pte_unmap(pte);
981 if (vmas) 997 if (vmas)
@@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1010 foll_flags |= FOLL_WRITE; 1026 foll_flags |= FOLL_WRITE;
1011 1027
1012 cond_resched(); 1028 cond_resched();
1013 while (!(page = follow_page(mm, start, foll_flags))) { 1029 while (!(page = follow_page(vma, start, foll_flags))) {
1014 int ret; 1030 int ret;
1015 ret = __handle_mm_fault(mm, vma, start, 1031 ret = __handle_mm_fault(mm, vma, start,
1016 foll_flags & FOLL_WRITE); 1032 foll_flags & FOLL_WRITE);
@@ -1130,6 +1146,129 @@ int zeromap_page_range(struct vm_area_struct *vma,
1130 return err; 1146 return err;
1131} 1147}
1132 1148
1149pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1150{
1151 pgd_t * pgd = pgd_offset(mm, addr);
1152 pud_t * pud = pud_alloc(mm, pgd, addr);
1153 if (pud) {
1154 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1155 if (pmd)
1156 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1157 }
1158 return NULL;
1159}
1160
1161/*
1162 * This is the old fallback for page remapping.
1163 *
1164 * For historical reasons, it only allows reserved pages. Only
1165 * old drivers should use this, and they needed to mark their
1166 * pages reserved for the old functions anyway.
1167 */
1168static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
1169{
1170 int retval;
1171 pte_t *pte;
1172 spinlock_t *ptl;
1173
1174 retval = -EINVAL;
1175 if (PageAnon(page))
1176 goto out;
1177 retval = -ENOMEM;
1178 flush_dcache_page(page);
1179 pte = get_locked_pte(mm, addr, &ptl);
1180 if (!pte)
1181 goto out;
1182 retval = -EBUSY;
1183 if (!pte_none(*pte))
1184 goto out_unlock;
1185
1186 /* Ok, finally just insert the thing.. */
1187 get_page(page);
1188 inc_mm_counter(mm, file_rss);
1189 page_add_file_rmap(page);
1190 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1191
1192 retval = 0;
1193out_unlock:
1194 pte_unmap_unlock(pte, ptl);
1195out:
1196 return retval;
1197}
1198
1199/*
1200 * This allows drivers to insert individual pages they've allocated
1201 * into a user vma.
1202 *
1203 * The page has to be a nice clean _individual_ kernel allocation.
1204 * If you allocate a compound page, you need to have marked it as
1205 * such (__GFP_COMP), or manually just split the page up yourself
1206 * (which is mainly an issue of doing "set_page_count(page, 1)" for
1207 * each sub-page, and then freeing them one by one when you free
1208 * them rather than freeing it as a compound page).
1209 *
1210 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1211 * took an arbitrary page protection parameter. This doesn't allow
1212 * that. Your vma protection will have to be set up correctly, which
1213 * means that if you want a shared writable mapping, you'd better
1214 * ask for a shared writable mapping!
1215 *
1216 * The page does not need to be reserved.
1217 */
1218int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
1219{
1220 if (addr < vma->vm_start || addr >= vma->vm_end)
1221 return -EFAULT;
1222 if (!page_count(page))
1223 return -EINVAL;
1224 return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
1225}
1226EXPORT_SYMBOL(vm_insert_page);
1227
1228/*
1229 * Somebody does a pfn remapping that doesn't actually work as a vma.
1230 *
1231 * Do it as individual pages instead, and warn about it. It's bad form,
1232 * and very inefficient.
1233 */
1234static int incomplete_pfn_remap(struct vm_area_struct *vma,
1235 unsigned long start, unsigned long end,
1236 unsigned long pfn, pgprot_t prot)
1237{
1238 static int warn = 10;
1239 struct page *page;
1240 int retval;
1241
1242 if (!(vma->vm_flags & VM_INCOMPLETE)) {
1243 if (warn) {
1244 warn--;
1245 printk("%s does an incomplete pfn remapping", current->comm);
1246 dump_stack();
1247 }
1248 }
1249 vma->vm_flags |= VM_INCOMPLETE | VM_IO | VM_RESERVED;
1250
1251 if (start < vma->vm_start || end > vma->vm_end)
1252 return -EINVAL;
1253
1254 if (!pfn_valid(pfn))
1255 return -EINVAL;
1256
1257 page = pfn_to_page(pfn);
1258 if (!PageReserved(page))
1259 return -EINVAL;
1260
1261 retval = 0;
1262 while (start < end) {
1263 retval = insert_page(vma->vm_mm, start, page, prot);
1264 if (retval < 0)
1265 break;
1266 start += PAGE_SIZE;
1267 page++;
1268 }
1269 return retval;
1270}
1271
1133/* 1272/*
1134 * maps a range of physical memory into the requested pages. the old 1273 * maps a range of physical memory into the requested pages. the old
1135 * mappings are removed. any references to nonexistent pages results 1274 * mappings are removed. any references to nonexistent pages results
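
vm_insert_page() is the exported replacement this hunk offers to drivers that previously abused remap_pfn_range() on ordinary kernel pages. A hedged sketch of how a driver mmap() might drive it, where my_dev_mmap(), struct my_dev and its pages[] array are all hypothetical:

static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_dev *dev = file->private_data;	/* hypothetical driver state */
	unsigned long addr;
	int i = 0, err;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE, i++) {
		/* each pages[i] is an ordinary, individually refcounted allocation */
		err = vm_insert_page(vma, addr, dev->pages[i]);
		if (err)
			return err;
	}
	return 0;
}

Unlike remap_pfn_range() below, pages inserted this way stay normal managed pages: they keep their refcount and rmap, and no longer need to be marked reserved.
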
@@ -1204,6 +1343,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1204 struct mm_struct *mm = vma->vm_mm; 1343 struct mm_struct *mm = vma->vm_mm;
1205 int err; 1344 int err;
1206 1345
1346 if (addr != vma->vm_start || end != vma->vm_end)
1347 return incomplete_pfn_remap(vma, addr, end, pfn, prot);
1348
1207 /* 1349 /*
1208 * Physically remapped pages are special. Tell the 1350 * Physically remapped pages are special. Tell the
1209 * rest of the world about it: 1351 * rest of the world about it:
@@ -1214,11 +1356,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1214 * in 2.6 the LRU scan won't even find its pages, so this 1356 * in 2.6 the LRU scan won't even find its pages, so this
1215 * flag means no more than count its pages in reserved_vm, 1357 * flag means no more than count its pages in reserved_vm,
1216 * and omit it from core dump, even when VM_IO turned off. 1358 * and omit it from core dump, even when VM_IO turned off.
1217 * VM_UNPAGED tells the core MM not to "manage" these pages 1359 * VM_PFNMAP tells the core MM that the base pages are just
1218 * (e.g. refcount, mapcount, try to swap them out): in 1360 * raw PFN mappings, and do not have a "struct page" associated
1219 * particular, zap_pte_range does not try to free them. 1361 * with them.
1220 */ 1362 */
1221 vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; 1363 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1364 vma->vm_pgoff = pfn;
1222 1365
1223 BUG_ON(addr >= end); 1366 BUG_ON(addr >= end);
1224 pfn -= addr >> PAGE_SHIFT; 1367 pfn -= addr >> PAGE_SHIFT;
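
Storing the base pfn in vm_pgoff is what lets vm_normal_page(), earlier in this file, tell the linear (raw) entries apart from COWed ones later on. A typical whole-vma remap in a driver mmap(), with my_mem_mmap() and the physical base address made up for illustration:

static int my_mem_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long phys = 0xfd000000UL;	/* hypothetical device aperture */

	/* Must span the whole vma: a partial remap now takes the warned,
	 * page-at-a-time incomplete_pfn_remap() fallback added above. */
	return remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
			       vma->vm_end - vma->vm_start, vma->vm_page_prot);
}
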
@@ -1273,6 +1416,33 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1273 return pte; 1416 return pte;
1274} 1417}
1275 1418
1419static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
1420{
1421 /*
1422 * If the source page was a PFN mapping, we don't have
1423 * a "struct page" for it. We do a best-effort copy by
1424 * just copying from the original user address. If that
1425 * fails, we just zero-fill it. Live with it.
1426 */
1427 if (unlikely(!src)) {
1428 void *kaddr = kmap_atomic(dst, KM_USER0);
1429 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1430
1431 /*
1432 * This really shouldn't fail, because the page is there
1433 * in the page tables. But it might just be unreadable,
1434 * in which case we just give up and fill the result with
1435 * zeroes.
1436 */
1437 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1438 memset(kaddr, 0, PAGE_SIZE);
1439 kunmap_atomic(kaddr, KM_USER0);
1440 return;
1441
1442 }
1443 copy_user_highpage(dst, src, va);
1444}
1445
1276/* 1446/*
1277 * This routine handles present pages, when users try to write 1447 * This routine handles present pages, when users try to write
1278 * to a shared page. It is done by copying the page to a new address 1448 * to a shared page. It is done by copying the page to a new address
@@ -1295,35 +1465,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1295 unsigned long address, pte_t *page_table, pmd_t *pmd, 1465 unsigned long address, pte_t *page_table, pmd_t *pmd,
1296 spinlock_t *ptl, pte_t orig_pte) 1466 spinlock_t *ptl, pte_t orig_pte)
1297{ 1467{
1298 struct page *old_page, *src_page, *new_page; 1468 struct page *old_page, *new_page;
1299 unsigned long pfn = pte_pfn(orig_pte);
1300 pte_t entry; 1469 pte_t entry;
1301 int ret = VM_FAULT_MINOR; 1470 int ret = VM_FAULT_MINOR;
1302 1471
1303 if (unlikely(!pfn_valid(pfn))) { 1472 old_page = vm_normal_page(vma, address, orig_pte);
1304 /* 1473 if (!old_page)
1305 * Page table corrupted: show pte and kill process. 1474 goto gotten;
1306 * Or it's an attempt to COW an out-of-map VM_UNPAGED
1307 * entry, which copy_user_highpage does not support.
1308 */
1309 print_bad_pte(vma, orig_pte, address);
1310 ret = VM_FAULT_OOM;
1311 goto unlock;
1312 }
1313 old_page = pfn_to_page(pfn);
1314 src_page = old_page;
1315
1316 if (unlikely(vma->vm_flags & VM_UNPAGED))
1317 if (!page_is_anon(old_page, vma, address)) {
1318 old_page = NULL;
1319 goto gotten;
1320 }
1321 1475
1322 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { 1476 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1323 int reuse = can_share_swap_page(old_page); 1477 int reuse = can_share_swap_page(old_page);
1324 unlock_page(old_page); 1478 unlock_page(old_page);
1325 if (reuse) { 1479 if (reuse) {
1326 flush_cache_page(vma, address, pfn); 1480 flush_cache_page(vma, address, pte_pfn(orig_pte));
1327 entry = pte_mkyoung(orig_pte); 1481 entry = pte_mkyoung(orig_pte);
1328 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1482 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1329 ptep_set_access_flags(vma, address, page_table, entry, 1); 1483 ptep_set_access_flags(vma, address, page_table, entry, 1);
@@ -1343,7 +1497,7 @@ gotten:
1343 1497
1344 if (unlikely(anon_vma_prepare(vma))) 1498 if (unlikely(anon_vma_prepare(vma)))
1345 goto oom; 1499 goto oom;
1346 if (src_page == ZERO_PAGE(address)) { 1500 if (old_page == ZERO_PAGE(address)) {
1347 new_page = alloc_zeroed_user_highpage(vma, address); 1501 new_page = alloc_zeroed_user_highpage(vma, address);
1348 if (!new_page) 1502 if (!new_page)
1349 goto oom; 1503 goto oom;
@@ -1351,7 +1505,7 @@ gotten:
1351 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); 1505 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1352 if (!new_page) 1506 if (!new_page)
1353 goto oom; 1507 goto oom;
1354 copy_user_highpage(new_page, src_page, address); 1508 cow_user_page(new_page, old_page, address);
1355 } 1509 }
1356 1510
1357 /* 1511 /*
@@ -1367,7 +1521,7 @@ gotten:
1367 } 1521 }
1368 } else 1522 } else
1369 inc_mm_counter(mm, anon_rss); 1523 inc_mm_counter(mm, anon_rss);
1370 flush_cache_page(vma, address, pfn); 1524 flush_cache_page(vma, address, pte_pfn(orig_pte));
1371 entry = mk_pte(new_page, vma->vm_page_prot); 1525 entry = mk_pte(new_page, vma->vm_page_prot);
1372 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1526 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1373 ptep_establish(vma, address, page_table, entry); 1527 ptep_establish(vma, address, page_table, entry);
@@ -1812,16 +1966,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1812 spinlock_t *ptl; 1966 spinlock_t *ptl;
1813 pte_t entry; 1967 pte_t entry;
1814 1968
1815 /* 1969 if (write_access) {
1816 * A VM_UNPAGED vma will normally be filled with present ptes
1817 * by remap_pfn_range, and never arrive here; but it might have
1818 * holes, or if !VM_DONTEXPAND, mremap might have expanded it.
1819 * It's weird enough handling anon pages in unpaged vmas, we do
1820 * not want to worry about ZERO_PAGEs too (it may or may not
1821 * matter if their counts wrap): just give them anon pages.
1822 */
1823
1824 if (write_access || (vma->vm_flags & VM_UNPAGED)) {
1825 /* Allocate our own private page. */ 1970 /* Allocate our own private page. */
1826 pte_unmap(page_table); 1971 pte_unmap(page_table);
1827 1972
@@ -1896,7 +2041,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1896 int anon = 0; 2041 int anon = 0;
1897 2042
1898 pte_unmap(page_table); 2043 pte_unmap(page_table);
1899 BUG_ON(vma->vm_flags & VM_UNPAGED); 2044 BUG_ON(vma->vm_flags & VM_PFNMAP);
1900 2045
1901 if (vma->vm_file) { 2046 if (vma->vm_file) {
1902 mapping = vma->vm_file->f_mapping; 2047 mapping = vma->vm_file->f_mapping;
@@ -2149,6 +2294,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2149 spin_unlock(&mm->page_table_lock); 2294 spin_unlock(&mm->page_table_lock);
2150 return 0; 2295 return 0;
2151} 2296}
2297#else
2298/* Workaround for gcc 2.96 */
2299int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2300{
2301 return 0;
2302}
2152#endif /* __PAGETABLE_PUD_FOLDED */ 2303#endif /* __PAGETABLE_PUD_FOLDED */
2153 2304
2154#ifndef __PAGETABLE_PMD_FOLDED 2305#ifndef __PAGETABLE_PMD_FOLDED
@@ -2177,6 +2328,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2177 spin_unlock(&mm->page_table_lock); 2328 spin_unlock(&mm->page_table_lock);
2178 return 0; 2329 return 0;
2179} 2330}
2331#else
2332/* Workaround for gcc 2.96 */
2333int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2334{
2335 return 0;
2336}
2180#endif /* __PAGETABLE_PMD_FOLDED */ 2337#endif /* __PAGETABLE_PMD_FOLDED */
2181 2338
2182int make_pages_present(unsigned long addr, unsigned long end) 2339int make_pages_present(unsigned long addr, unsigned long end)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5609a31bdf..bec88c8124 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
189 189
190 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 190 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
191 do { 191 do {
192 unsigned long pfn; 192 struct page *page;
193 unsigned int nid; 193 unsigned int nid;
194 194
195 if (!pte_present(*pte)) 195 if (!pte_present(*pte))
196 continue; 196 continue;
197 pfn = pte_pfn(*pte); 197 page = vm_normal_page(vma, addr, *pte);
198 if (!pfn_valid(pfn)) { 198 if (!page)
199 print_bad_pte(vma, *pte, addr);
200 continue; 199 continue;
201 } 200 nid = page_to_nid(page);
202 nid = pfn_to_nid(pfn);
203 if (!node_isset(nid, *nodes)) 201 if (!node_isset(nid, *nodes))
204 break; 202 break;
205 } while (pte++, addr += PAGE_SIZE, addr != end); 203 } while (pte++, addr += PAGE_SIZE, addr != end);
@@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
269 first = find_vma(mm, start); 267 first = find_vma(mm, start);
270 if (!first) 268 if (!first)
271 return ERR_PTR(-EFAULT); 269 return ERR_PTR(-EFAULT);
272 if (first->vm_flags & VM_UNPAGED)
273 return ERR_PTR(-EACCES);
274 prev = NULL; 270 prev = NULL;
275 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 271 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
276 if (!vma->vm_next && vma->vm_end < end) 272 if (!vma->vm_next && vma->vm_end < end)
diff --git a/mm/msync.c b/mm/msync.c
index b3f4caf301..1b5b6f662d 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
27again: 27again:
28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
29 do { 29 do {
30 unsigned long pfn;
31 struct page *page; 30 struct page *page;
32 31
33 if (progress >= 64) { 32 if (progress >= 64) {
@@ -40,13 +39,9 @@ again:
40 continue; 39 continue;
41 if (!pte_maybe_dirty(*pte)) 40 if (!pte_maybe_dirty(*pte))
42 continue; 41 continue;
43 pfn = pte_pfn(*pte); 42 page = vm_normal_page(vma, addr, *pte);
44 if (unlikely(!pfn_valid(pfn))) { 43 if (!page)
45 print_bad_pte(vma, *pte, addr);
46 continue; 44 continue;
47 }
48 page = pfn_to_page(pfn);
49
50 if (ptep_clear_flush_dirty(vma, addr, pte) || 45 if (ptep_clear_flush_dirty(vma, addr, pte) ||
51 page_test_and_clear_dirty(page)) 46 page_test_and_clear_dirty(page))
52 set_page_dirty(page); 47 set_page_dirty(page);
@@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma,
97 /* For hugepages we can't go walking the page table normally, 92 /* For hugepages we can't go walking the page table normally,
98 * but that's ok, hugetlbfs is memory based, so we don't need 93 * but that's ok, hugetlbfs is memory based, so we don't need
99 * to do anything more on an msync(). 94 * to do anything more on an msync().
100 * Can't do anything with VM_UNPAGED regions either.
101 */ 95 */
102 if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED)) 96 if (vma->vm_flags & VM_HUGETLB)
103 return; 97 return;
104 98
105 BUG_ON(addr >= end); 99 BUG_ON(addr >= end);
diff --git a/mm/nommu.c b/mm/nommu.c
index 6deb6ab3d6..c119681287 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1045 1045
1046EXPORT_SYMBOL(find_vma); 1046EXPORT_SYMBOL(find_vma);
1047 1047
1048struct page *follow_page(struct mm_struct *mm, unsigned long address, 1048struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1049 unsigned int foll_flags) 1049 unsigned int foll_flags)
1050{ 1050{
1051 return NULL; 1051 return NULL;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1731236dec..3b21a13d84 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -773,9 +773,12 @@ again:
773} 773}
774 774
775#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 775#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
776#define ALLOC_HARDER 0x02 /* try to alloc harder */ 776#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
777#define ALLOC_HIGH 0x04 /* __GFP_HIGH set */ 777#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
778#define ALLOC_CPUSET 0x08 /* check for correct cpuset */ 778#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
779#define ALLOC_HARDER 0x10 /* try to alloc harder */
780#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
781#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
779 782
780/* 783/*
781 * Return 1 if free pages are above 'mark'. This takes into account the order 784 * Return 1 if free pages are above 'mark'. This takes into account the order
@@ -830,7 +833,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
830 continue; 833 continue;
831 834
832 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 835 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
833 if (!zone_watermark_ok(*z, order, (*z)->pages_low, 836 unsigned long mark;
837 if (alloc_flags & ALLOC_WMARK_MIN)
838 mark = (*z)->pages_min;
839 else if (alloc_flags & ALLOC_WMARK_LOW)
840 mark = (*z)->pages_low;
841 else
842 mark = (*z)->pages_high;
843 if (!zone_watermark_ok(*z, order, mark,
834 classzone_idx, alloc_flags)) 844 classzone_idx, alloc_flags))
835 continue; 845 continue;
836 } 846 }
@@ -871,7 +881,7 @@ restart:
871 } 881 }
872 882
873 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 883 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
874 zonelist, ALLOC_CPUSET); 884 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
875 if (page) 885 if (page)
876 goto got_pg; 886 goto got_pg;
877 887
@@ -888,7 +898,7 @@ restart:
888 * cannot run direct reclaim, or if the caller has realtime scheduling 898 * cannot run direct reclaim, or if the caller has realtime scheduling
889 * policy. 899 * policy.
890 */ 900 */
891 alloc_flags = 0; 901 alloc_flags = ALLOC_WMARK_MIN;
892 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 902 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
893 alloc_flags |= ALLOC_HARDER; 903 alloc_flags |= ALLOC_HARDER;
894 if (gfp_mask & __GFP_HIGH) 904 if (gfp_mask & __GFP_HIGH)
@@ -959,7 +969,7 @@ rebalance:
959 * under heavy pressure. 969 * under heavy pressure.
960 */ 970 */
961 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 971 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
962 zonelist, ALLOC_CPUSET); 972 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
963 if (page) 973 if (page)
964 goto got_pg; 974 goto got_pg;
965 975
@@ -1762,16 +1772,16 @@ static int __devinit zone_batchsize(struct zone *zone)
1762 batch = 1; 1772 batch = 1;
1763 1773
1764 /* 1774 /*
1765 * We will be trying to allcoate bigger chunks of contiguous 1775 * Clamp the batch to a 2^n - 1 value. Having a power
1766 * memory of the order of fls(batch). This should result in 1776 * of 2 value was found to be more likely to have
1767 * better cache coloring. 1777 * suboptimal cache aliasing properties in some cases.
1768 * 1778 *
1769 * A sanity check also to ensure that batch is still in limits. 1779 * For example if 2 tasks are alternately allocating
1780 * batches of pages, one task can end up with a lot
1781 * of pages of one half of the possible page colors
1782 * and the other with pages of the other colors.
1770 */ 1783 */
1771 batch = (1 << fls(batch + batch/2)); 1784 batch = (1 << (fls(batch + batch/2)-1)) - 1;
1772
1773 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
1774 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
1775 1785
1776 return batch; 1786 return batch;
1777} 1787}
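
Worked through for a typical case, assuming the earlier sizing loop has left batch = 32:

	batch + batch/2          = 48
	fls(48)                  = 6
	old:  1 << 6             = 64   (a power of two)
	new: (1 << (6 - 1)) - 1  = 31   (a 2^n - 1 value, as the new comment intends)
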
diff --git a/mm/rmap.c b/mm/rmap.c
index 2e034a0b89..f853c6def1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -226,8 +226,6 @@ vma_address(struct page *page, struct vm_area_struct *vma)
226/* 226/*
227 * At what user virtual address is page expected in vma? checking that the 227 * At what user virtual address is page expected in vma? checking that the
228 * page matches the vma: currently only used on anon pages, by unuse_vma; 228 * page matches the vma: currently only used on anon pages, by unuse_vma;
229 * and by extraordinary checks on anon pages in VM_UNPAGED vmas, taking
230 * care that an mmap of /dev/mem might window free and foreign pages.
231 */ 229 */
232unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 230unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
233{ 231{
@@ -292,7 +290,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
292 * repeatedly from either page_referenced_anon or page_referenced_file. 290 * repeatedly from either page_referenced_anon or page_referenced_file.
293 */ 291 */
294static int page_referenced_one(struct page *page, 292static int page_referenced_one(struct page *page,
295 struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token) 293 struct vm_area_struct *vma, unsigned int *mapcount)
296{ 294{
297 struct mm_struct *mm = vma->vm_mm; 295 struct mm_struct *mm = vma->vm_mm;
298 unsigned long address; 296 unsigned long address;
@@ -313,7 +311,7 @@ static int page_referenced_one(struct page *page,
313 311
314 /* Pretend the page is referenced if the task has the 312 /* Pretend the page is referenced if the task has the
315 swap token and is in the middle of a page fault. */ 313 swap token and is in the middle of a page fault. */
316 if (mm != current->mm && !ignore_token && has_swap_token(mm) && 314 if (mm != current->mm && has_swap_token(mm) &&
317 rwsem_is_locked(&mm->mmap_sem)) 315 rwsem_is_locked(&mm->mmap_sem))
318 referenced++; 316 referenced++;
319 317
@@ -323,7 +321,7 @@ out:
323 return referenced; 321 return referenced;
324} 322}
325 323
326static int page_referenced_anon(struct page *page, int ignore_token) 324static int page_referenced_anon(struct page *page)
327{ 325{
328 unsigned int mapcount; 326 unsigned int mapcount;
329 struct anon_vma *anon_vma; 327 struct anon_vma *anon_vma;
@@ -336,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
336 334
337 mapcount = page_mapcount(page); 335 mapcount = page_mapcount(page);
338 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 336 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
339 referenced += page_referenced_one(page, vma, &mapcount, 337 referenced += page_referenced_one(page, vma, &mapcount);
340 ignore_token);
341 if (!mapcount) 338 if (!mapcount)
342 break; 339 break;
343 } 340 }
@@ -356,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
356 * 353 *
357 * This function is only called from page_referenced for object-based pages. 354 * This function is only called from page_referenced for object-based pages.
358 */ 355 */
359static int page_referenced_file(struct page *page, int ignore_token) 356static int page_referenced_file(struct page *page)
360{ 357{
361 unsigned int mapcount; 358 unsigned int mapcount;
362 struct address_space *mapping = page->mapping; 359 struct address_space *mapping = page->mapping;
@@ -394,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token)
394 referenced++; 391 referenced++;
395 break; 392 break;
396 } 393 }
397 referenced += page_referenced_one(page, vma, &mapcount, 394 referenced += page_referenced_one(page, vma, &mapcount);
398 ignore_token);
399 if (!mapcount) 395 if (!mapcount)
400 break; 396 break;
401 } 397 }
@@ -412,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token)
412 * Quick test_and_clear_referenced for all mappings to a page, 408 * Quick test_and_clear_referenced for all mappings to a page,
413 * returns the number of ptes which referenced the page. 409 * returns the number of ptes which referenced the page.
414 */ 410 */
415int page_referenced(struct page *page, int is_locked, int ignore_token) 411int page_referenced(struct page *page, int is_locked)
416{ 412{
417 int referenced = 0; 413 int referenced = 0;
418 414
419 if (!swap_token_default_timeout)
420 ignore_token = 1;
421
422 if (page_test_and_clear_young(page)) 415 if (page_test_and_clear_young(page))
423 referenced++; 416 referenced++;
424 417
@@ -427,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
427 420
428 if (page_mapped(page) && page->mapping) { 421 if (page_mapped(page) && page->mapping) {
429 if (PageAnon(page)) 422 if (PageAnon(page))
430 referenced += page_referenced_anon(page, ignore_token); 423 referenced += page_referenced_anon(page);
431 else if (is_locked) 424 else if (is_locked)
432 referenced += page_referenced_file(page, ignore_token); 425 referenced += page_referenced_file(page);
433 else if (TestSetPageLocked(page)) 426 else if (TestSetPageLocked(page))
434 referenced++; 427 referenced++;
435 else { 428 else {
436 if (page->mapping) 429 if (page->mapping)
437 referenced += page_referenced_file(page, 430 referenced += page_referenced_file(page);
438 ignore_token);
439 unlock_page(page); 431 unlock_page(page);
440 } 432 }
441 } 433 }
@@ -614,7 +606,6 @@ static void try_to_unmap_cluster(unsigned long cursor,
614 struct page *page; 606 struct page *page;
615 unsigned long address; 607 unsigned long address;
616 unsigned long end; 608 unsigned long end;
617 unsigned long pfn;
618 609
619 address = (vma->vm_start + cursor) & CLUSTER_MASK; 610 address = (vma->vm_start + cursor) & CLUSTER_MASK;
620 end = address + CLUSTER_SIZE; 611 end = address + CLUSTER_SIZE;
@@ -643,21 +634,14 @@ static void try_to_unmap_cluster(unsigned long cursor,
643 for (; address < end; pte++, address += PAGE_SIZE) { 634 for (; address < end; pte++, address += PAGE_SIZE) {
644 if (!pte_present(*pte)) 635 if (!pte_present(*pte))
645 continue; 636 continue;
646 637 page = vm_normal_page(vma, address, *pte);
647 pfn = pte_pfn(*pte); 638 BUG_ON(!page || PageAnon(page));
648 if (unlikely(!pfn_valid(pfn))) {
649 print_bad_pte(vma, *pte, address);
650 continue;
651 }
652
653 page = pfn_to_page(pfn);
654 BUG_ON(PageAnon(page));
655 639
656 if (ptep_clear_flush_young(vma, address, pte)) 640 if (ptep_clear_flush_young(vma, address, pte))
657 continue; 641 continue;
658 642
659 /* Nuke the page table entry. */ 643 /* Nuke the page table entry. */
660 flush_cache_page(vma, address, pfn); 644 flush_cache_page(vma, address, pte_pfn(*pte));
661 pteval = ptep_clear_flush(vma, address, pte); 645 pteval = ptep_clear_flush(vma, address, pte);
662 646
663 /* If nonlinear, store the file page offset in the pte. */ 647 /* If nonlinear, store the file page offset in the pte. */
diff --git a/mm/thrash.c b/mm/thrash.c
index eff3c18c33..f4c560b4a2 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -57,14 +57,17 @@ void grab_swap_token(void)
57 /* We have the token. Let others know we still need it. */ 57 /* We have the token. Let others know we still need it. */
58 if (has_swap_token(current->mm)) { 58 if (has_swap_token(current->mm)) {
59 current->mm->recent_pagein = 1; 59 current->mm->recent_pagein = 1;
60 if (unlikely(!swap_token_default_timeout))
61 disable_swap_token();
60 return; 62 return;
61 } 63 }
62 64
63 if (time_after(jiffies, swap_token_check)) { 65 if (time_after(jiffies, swap_token_check)) {
64 66
65 /* Can't get swapout protection if we exceed our RSS limit. */ 67 if (!swap_token_default_timeout) {
66 // if (current->mm->rss > current->mm->rlimit_rss) 68 swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
67 // return; 69 return;
70 }
68 71
69 /* ... or if we recently held the token. */ 72 /* ... or if we recently held the token. */
70 if (time_before(jiffies, current->mm->swap_token_time)) 73 if (time_before(jiffies, current->mm->swap_token_time))
@@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm)
95{ 98{
96 spin_lock(&swap_token_lock); 99 spin_lock(&swap_token_lock);
97 if (likely(mm == swap_token_mm)) { 100 if (likely(mm == swap_token_mm)) {
101 mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
98 swap_token_mm = &init_mm; 102 swap_token_mm = &init_mm;
99 swap_token_check = jiffies; 103 swap_token_check = jiffies;
100 } 104 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2813054127..b0cd81c32d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -201,13 +201,25 @@ static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
201 list_for_each_entry(shrinker, &shrinker_list, list) { 201 list_for_each_entry(shrinker, &shrinker_list, list) {
202 unsigned long long delta; 202 unsigned long long delta;
203 unsigned long total_scan; 203 unsigned long total_scan;
204 unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
204 205
205 delta = (4 * scanned) / shrinker->seeks; 206 delta = (4 * scanned) / shrinker->seeks;
206 delta *= (*shrinker->shrinker)(0, gfp_mask); 207 delta *= max_pass;
207 do_div(delta, lru_pages + 1); 208 do_div(delta, lru_pages + 1);
208 shrinker->nr += delta; 209 shrinker->nr += delta;
209 if (shrinker->nr < 0) 210 if (shrinker->nr < 0) {
210 shrinker->nr = LONG_MAX; /* It wrapped! */ 211 printk(KERN_ERR "%s: nr=%ld\n",
212 __FUNCTION__, shrinker->nr);
213 shrinker->nr = max_pass;
214 }
215
216 /*
217 * Avoid risking looping forever due to too large nr value:
218 * never try to free more than twice the estimate number of
219 * freeable entries.
220 */
221 if (shrinker->nr > max_pass * 2)
222 shrinker->nr = max_pass * 2;
211 223
212 total_scan = shrinker->nr; 224 total_scan = shrinker->nr;
213 shrinker->nr = 0; 225 shrinker->nr = 0;
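
With illustrative numbers (scanned = 1024, shrinker->seeks = 2, max_pass = 500, lru_pages = 10000, all assumed for the example): delta = (4 * 1024 / 2) * 500 / 10001, roughly 102, so shrinker->nr grows by about a hundred objects per pass. The new clamp keeps nr from ever exceeding 2 * max_pass = 1000, and a negative wrap now resets it to max_pass instead of to LONG_MAX.
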
@@ -407,7 +419,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
407 if (PageWriteback(page)) 419 if (PageWriteback(page))
408 goto keep_locked; 420 goto keep_locked;
409 421
410 referenced = page_referenced(page, 1, sc->priority <= 0); 422 referenced = page_referenced(page, 1);
411 /* In active use or really unfreeable? Activate it. */ 423 /* In active use or really unfreeable? Activate it. */
412 if (referenced && page_mapping_inuse(page)) 424 if (referenced && page_mapping_inuse(page))
413 goto activate_locked; 425 goto activate_locked;
@@ -756,7 +768,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
756 if (page_mapped(page)) { 768 if (page_mapped(page)) {
757 if (!reclaim_mapped || 769 if (!reclaim_mapped ||
758 (total_swap_pages == 0 && PageAnon(page)) || 770 (total_swap_pages == 0 && PageAnon(page)) ||
759 page_referenced(page, 0, sc->priority <= 0)) { 771 page_referenced(page, 0)) {
760 list_add(&page->lru, &l_active); 772 list_add(&page->lru, &l_active);
761 continue; 773 continue;
762 } 774 }
@@ -960,6 +972,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
960 sc.nr_reclaimed = 0; 972 sc.nr_reclaimed = 0;
961 sc.priority = priority; 973 sc.priority = priority;
962 sc.swap_cluster_max = SWAP_CLUSTER_MAX; 974 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
975 if (!priority)
976 disable_swap_token();
963 shrink_caches(zones, &sc); 977 shrink_caches(zones, &sc);
964 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 978 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
965 if (reclaim_state) { 979 if (reclaim_state) {
@@ -1056,6 +1070,10 @@ loop_again:
1056 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1070 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1057 unsigned long lru_pages = 0; 1071 unsigned long lru_pages = 0;
1058 1072
1073 /* The swap token gets in the way of swapout... */
1074 if (!priority)
1075 disable_swap_token();
1076
1059 all_zones_ok = 1; 1077 all_zones_ok = 1;
1060 1078
1061 if (nr_pages == 0) { 1079 if (nr_pages == 0) {
@@ -1360,6 +1378,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1360 sc.nr_reclaimed = 0; 1378 sc.nr_reclaimed = 0;
1361 /* scan at the highest priority */ 1379 /* scan at the highest priority */
1362 sc.priority = 0; 1380 sc.priority = 0;
1381 disable_swap_token();
1363 1382
1364 if (nr_pages > SWAP_CLUSTER_MAX) 1383 if (nr_pages > SWAP_CLUSTER_MAX)
1365 sc.swap_cluster_max = nr_pages; 1384 sc.swap_cluster_max = nr_pages;