| author | Anton Altaparmakov <aia21@cantab.net> | 2005-12-05 10:48:41 -0500 |
|---|---|---|
| committer | Anton Altaparmakov <aia21@cantab.net> | 2005-12-05 10:48:41 -0500 |
| commit | 292d4ed32e35df4755052b5002e533348d1648fd | |
| tree | 8522e6bab962696bd25a6c02fb068c674a09b7ee /mm | |
| parent | 3c6af7fa787f21f8873a050568ed892312899eb5 | |
| parent | e4f5c82a92c2a546a16af1614114eec19120e40a | |
Merge branch 'master' of /usr/src/ntfs-2.6/
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/fremap.c | 46 |
| -rw-r--r-- | mm/madvise.c | 2 |
| -rw-r--r-- | mm/memory.c | 341 |
| -rw-r--r-- | mm/mempolicy.c | 12 |
| -rw-r--r-- | mm/msync.c | 12 |
| -rw-r--r-- | mm/nommu.c | 2 |
| -rw-r--r-- | mm/page_alloc.c | 40 |
| -rw-r--r-- | mm/rmap.c | 42 |
| -rw-r--r-- | mm/thrash.c | 10 |
| -rw-r--r-- | mm/vmscan.c | 29 |
10 files changed, 336 insertions, 200 deletions
diff --git a/mm/fremap.c b/mm/fremap.c
index 007cbad9331e..9f381e58bf44 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
| @@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 27 | struct page *page = NULL; | 27 | struct page *page = NULL; |
| 28 | 28 | ||
| 29 | if (pte_present(pte)) { | 29 | if (pte_present(pte)) { |
| 30 | unsigned long pfn = pte_pfn(pte); | 30 | flush_cache_page(vma, addr, pte_pfn(pte)); |
| 31 | flush_cache_page(vma, addr, pfn); | ||
| 32 | pte = ptep_clear_flush(vma, addr, ptep); | 31 | pte = ptep_clear_flush(vma, addr, ptep); |
| 33 | if (unlikely(!pfn_valid(pfn))) { | 32 | page = vm_normal_page(vma, addr, pte); |
| 34 | print_bad_pte(vma, pte, addr); | 33 | if (page) { |
| 35 | goto out; | 34 | if (pte_dirty(pte)) |
| 35 | set_page_dirty(page); | ||
| 36 | page_remove_rmap(page); | ||
| 37 | page_cache_release(page); | ||
| 36 | } | 38 | } |
| 37 | page = pfn_to_page(pfn); | ||
| 38 | if (pte_dirty(pte)) | ||
| 39 | set_page_dirty(page); | ||
| 40 | page_remove_rmap(page); | ||
| 41 | page_cache_release(page); | ||
| 42 | } else { | 39 | } else { |
| 43 | if (!pte_file(pte)) | 40 | if (!pte_file(pte)) |
| 44 | free_swap_and_cache(pte_to_swp_entry(pte)); | 41 | free_swap_and_cache(pte_to_swp_entry(pte)); |
| 45 | pte_clear(mm, addr, ptep); | 42 | pte_clear(mm, addr, ptep); |
| 46 | } | 43 | } |
| 47 | out: | ||
| 48 | return !!page; | 44 | return !!page; |
| 49 | } | 45 | } |
| 50 | 46 | ||
| @@ -59,22 +55,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 59 | pgoff_t size; | 55 | pgoff_t size; |
| 60 | int err = -ENOMEM; | 56 | int err = -ENOMEM; |
| 61 | pte_t *pte; | 57 | pte_t *pte; |
| 62 | pmd_t *pmd; | ||
| 63 | pud_t *pud; | ||
| 64 | pgd_t *pgd; | ||
| 65 | pte_t pte_val; | 58 | pte_t pte_val; |
| 66 | spinlock_t *ptl; | 59 | spinlock_t *ptl; |
| 67 | 60 | ||
| 68 | BUG_ON(vma->vm_flags & VM_UNPAGED); | 61 | pte = get_locked_pte(mm, addr, &ptl); |
| 69 | |||
| 70 | pgd = pgd_offset(mm, addr); | ||
| 71 | pud = pud_alloc(mm, pgd, addr); | ||
| 72 | if (!pud) | ||
| 73 | goto out; | ||
| 74 | pmd = pmd_alloc(mm, pud, addr); | ||
| 75 | if (!pmd) | ||
| 76 | goto out; | ||
| 77 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | ||
| 78 | if (!pte) | 62 | if (!pte) |
| 79 | goto out; | 63 | goto out; |
| 80 | 64 | ||
| @@ -116,22 +100,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 116 | { | 100 | { |
| 117 | int err = -ENOMEM; | 101 | int err = -ENOMEM; |
| 118 | pte_t *pte; | 102 | pte_t *pte; |
| 119 | pmd_t *pmd; | ||
| 120 | pud_t *pud; | ||
| 121 | pgd_t *pgd; | ||
| 122 | pte_t pte_val; | 103 | pte_t pte_val; |
| 123 | spinlock_t *ptl; | 104 | spinlock_t *ptl; |
| 124 | 105 | ||
| 125 | BUG_ON(vma->vm_flags & VM_UNPAGED); | 106 | pte = get_locked_pte(mm, addr, &ptl); |
| 126 | |||
| 127 | pgd = pgd_offset(mm, addr); | ||
| 128 | pud = pud_alloc(mm, pgd, addr); | ||
| 129 | if (!pud) | ||
| 130 | goto out; | ||
| 131 | pmd = pmd_alloc(mm, pud, addr); | ||
| 132 | if (!pmd) | ||
| 133 | goto out; | ||
| 134 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | ||
| 135 | if (!pte) | 107 | if (!pte) |
| 136 | goto out; | 108 | goto out; |
| 137 | 109 | ||
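Both install_page() and install_file_pte() in the hunks above now leave the pgd/pud/pmd/pte walk to the new get_locked_pte() helper this merge adds to mm/memory.c (shown further down). A condensed sketch of the lookup-lock-populate-unlock pattern the callers now share; this is an illustration, not code from the patch, and the rss/rmap bookkeeping the real functions do is omitted:

```c
/*
 * Sketch only: the shape that install_page(), install_file_pte() and the
 * new insert_page() in mm/memory.c now follow.  get_locked_pte() does the
 * pgd/pud/pmd/pte walk and takes the pte lock; the entry is filled in
 * under that lock, then the lock is dropped.
 */
static int set_one_pte(struct mm_struct *mm, struct vm_area_struct *vma,
                       unsigned long addr, pte_t newval)
{
        spinlock_t *ptl;
        pte_t *pte = get_locked_pte(mm, addr, &ptl);

        if (!pte)
                return -ENOMEM;         /* pud/pmd/pte allocation failed */
        if (!pte_none(*pte)) {
                pte_unmap_unlock(pte, ptl);
                return -EBUSY;          /* something is already mapped here */
        }
        set_pte_at(mm, addr, pte, newval);
        update_mmu_cache(vma, addr, newval);
        pte_unmap_unlock(pte, ptl);
        return 0;
}
```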
diff --git a/mm/madvise.c b/mm/madvise.c
index 328a3bcce527..2b7cf0400a21 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
| @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, | |||
| 126 | unsigned long start, unsigned long end) | 126 | unsigned long start, unsigned long end) |
| 127 | { | 127 | { |
| 128 | *prev = vma; | 128 | *prev = vma; |
| 129 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED)) | 129 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) |
| 130 | return -EINVAL; | 130 | return -EINVAL; |
| 131 | 131 | ||
| 132 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | 132 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { |
diff --git a/mm/memory.c b/mm/memory.c
index d1f46f4e4c8a..aa8af0e20269 100644
--- a/mm/memory.c
+++ b/mm/memory.c
| @@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | |||
| 333 | } | 333 | } |
| 334 | 334 | ||
| 335 | /* | 335 | /* |
| 336 | * This function is called to print an error when a pte in a | 336 | * This function is called to print an error when a bad pte |
| 337 | * !VM_UNPAGED region is found pointing to an invalid pfn (which | 337 | * is found. For example, we might have a PFN-mapped pte in |
| 338 | * is an error. | 338 | * a region that doesn't allow it. |
| 339 | * | 339 | * |
| 340 | * The calling function must still handle the error. | 340 | * The calling function must still handle the error. |
| 341 | */ | 341 | */ |
| @@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) | |||
| 350 | } | 350 | } |
| 351 | 351 | ||
| 352 | /* | 352 | /* |
| 353 | * page_is_anon applies strict checks for an anonymous page belonging to | 353 | * This function gets the "struct page" associated with a pte. |
| 354 | * this vma at this address. It is used on VM_UNPAGED vmas, which are | 354 | * |
| 355 | * usually populated with shared originals (which must not be counted), | 355 | * NOTE! Some mappings do not have "struct pages". A raw PFN mapping |
| 356 | * but occasionally contain private COWed copies (when !VM_SHARED, or | 356 | * will have each page table entry just pointing to a raw page frame |
| 357 | * perhaps via ptrace when VM_SHARED). An mmap of /dev/mem might window | 357 | * number, and as far as the VM layer is concerned, those do not have |
| 358 | * free pages, pages from other processes, or from other parts of this: | 358 | * pages associated with them - even if the PFN might point to memory |
| 359 | * it's tricky, but try not to be deceived by foreign anonymous pages. | 359 | * that otherwise is perfectly fine and has a "struct page". |
| 360 | * | ||
| 361 | * The way we recognize those mappings is through the rules set up | ||
| 362 | * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, | ||
| 363 | * and the vm_pgoff will point to the first PFN mapped: thus every | ||
| 364 | * page that is a raw mapping will always honor the rule | ||
| 365 | * | ||
| 366 | * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) | ||
| 367 | * | ||
| 368 | * and if that isn't true, the page has been COW'ed (in which case it | ||
| 369 | * _does_ have a "struct page" associated with it even if it is in a | ||
| 370 | * VM_PFNMAP range). | ||
| 360 | */ | 371 | */ |
| 361 | static inline int page_is_anon(struct page *page, | 372 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) |
| 362 | struct vm_area_struct *vma, unsigned long addr) | ||
| 363 | { | 373 | { |
| 364 | return page && PageAnon(page) && page_mapped(page) && | 374 | unsigned long pfn = pte_pfn(pte); |
| 365 | page_address_in_vma(page, vma) == addr; | 375 | |
| 376 | if (vma->vm_flags & VM_PFNMAP) { | ||
| 377 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; | ||
| 378 | if (pfn == vma->vm_pgoff + off) | ||
| 379 | return NULL; | ||
| 380 | } | ||
| 381 | |||
| 382 | /* | ||
| 383 | * Add some anal sanity checks for now. Eventually, | ||
| 384 | * we should just do "return pfn_to_page(pfn)", but | ||
| 385 | * in the meantime we check that we get a valid pfn, | ||
| 386 | * and that the resulting page looks ok. | ||
| 387 | * | ||
| 388 | * Remove this test eventually! | ||
| 389 | */ | ||
| 390 | if (unlikely(!pfn_valid(pfn))) { | ||
| 391 | print_bad_pte(vma, pte, addr); | ||
| 392 | return NULL; | ||
| 393 | } | ||
| 394 | |||
| 395 | /* | ||
| 396 | * NOTE! We still have PageReserved() pages in the page | ||
| 397 | * tables. | ||
| 398 | * | ||
| 399 | * The PAGE_ZERO() pages and various VDSO mappings can | ||
| 400 | * cause them to exist. | ||
| 401 | */ | ||
| 402 | return pfn_to_page(pfn); | ||
| 366 | } | 403 | } |
| 367 | 404 | ||
| 368 | /* | 405 | /* |
| @@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 379 | unsigned long vm_flags = vma->vm_flags; | 416 | unsigned long vm_flags = vma->vm_flags; |
| 380 | pte_t pte = *src_pte; | 417 | pte_t pte = *src_pte; |
| 381 | struct page *page; | 418 | struct page *page; |
| 382 | unsigned long pfn; | ||
| 383 | 419 | ||
| 384 | /* pte contains position in swap or file, so copy. */ | 420 | /* pte contains position in swap or file, so copy. */ |
| 385 | if (unlikely(!pte_present(pte))) { | 421 | if (unlikely(!pte_present(pte))) { |
| @@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 397 | goto out_set_pte; | 433 | goto out_set_pte; |
| 398 | } | 434 | } |
| 399 | 435 | ||
| 400 | pfn = pte_pfn(pte); | ||
| 401 | page = pfn_valid(pfn)? pfn_to_page(pfn): NULL; | ||
| 402 | |||
| 403 | if (unlikely(vm_flags & VM_UNPAGED)) | ||
| 404 | if (!page_is_anon(page, vma, addr)) | ||
| 405 | goto out_set_pte; | ||
| 406 | |||
| 407 | /* | ||
| 408 | * If the pte points outside of valid memory but | ||
| 409 | * the region is not VM_UNPAGED, we have a problem. | ||
| 410 | */ | ||
| 411 | if (unlikely(!page)) { | ||
| 412 | print_bad_pte(vma, pte, addr); | ||
| 413 | goto out_set_pte; /* try to do something sane */ | ||
| 414 | } | ||
| 415 | |||
| 416 | /* | 436 | /* |
| 417 | * If it's a COW mapping, write protect it both | 437 | * If it's a COW mapping, write protect it both |
| 418 | * in the parent and the child | 438 | * in the parent and the child |
| @@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 429 | if (vm_flags & VM_SHARED) | 449 | if (vm_flags & VM_SHARED) |
| 430 | pte = pte_mkclean(pte); | 450 | pte = pte_mkclean(pte); |
| 431 | pte = pte_mkold(pte); | 451 | pte = pte_mkold(pte); |
| 432 | get_page(page); | 452 | |
| 433 | page_dup_rmap(page); | 453 | page = vm_normal_page(vma, addr, pte); |
| 434 | rss[!!PageAnon(page)]++; | 454 | if (page) { |
| 455 | get_page(page); | ||
| 456 | page_dup_rmap(page); | ||
| 457 | rss[!!PageAnon(page)]++; | ||
| 458 | } | ||
| 435 | 459 | ||
| 436 | out_set_pte: | 460 | out_set_pte: |
| 437 | set_pte_at(dst_mm, addr, dst_pte, pte); | 461 | set_pte_at(dst_mm, addr, dst_pte, pte); |
| @@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 543 | * readonly mappings. The tradeoff is that copy_page_range is more | 567 | * readonly mappings. The tradeoff is that copy_page_range is more |
| 544 | * efficient than faulting. | 568 | * efficient than faulting. |
| 545 | */ | 569 | */ |
| 546 | if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) { | 570 | if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) { |
| 547 | if (!vma->anon_vma) | 571 | if (!vma->anon_vma) |
| 548 | return 0; | 572 | return 0; |
| 549 | } | 573 | } |
| @@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
| 584 | } | 608 | } |
| 585 | if (pte_present(ptent)) { | 609 | if (pte_present(ptent)) { |
| 586 | struct page *page; | 610 | struct page *page; |
| 587 | unsigned long pfn; | ||
| 588 | 611 | ||
| 589 | (*zap_work) -= PAGE_SIZE; | 612 | (*zap_work) -= PAGE_SIZE; |
| 590 | 613 | ||
| 591 | pfn = pte_pfn(ptent); | 614 | page = vm_normal_page(vma, addr, ptent); |
| 592 | page = pfn_valid(pfn)? pfn_to_page(pfn): NULL; | ||
| 593 | |||
| 594 | if (unlikely(vma->vm_flags & VM_UNPAGED)) { | ||
| 595 | if (!page_is_anon(page, vma, addr)) | ||
| 596 | page = NULL; | ||
| 597 | } else if (unlikely(!page)) | ||
| 598 | print_bad_pte(vma, ptent, addr); | ||
| 599 | |||
| 600 | if (unlikely(details) && page) { | 615 | if (unlikely(details) && page) { |
| 601 | /* | 616 | /* |
| 602 | * unmap_shared_mapping_pages() wants to | 617 | * unmap_shared_mapping_pages() wants to |
| @@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
| 852 | /* | 867 | /* |
| 853 | * Do a quick page-table lookup for a single page. | 868 | * Do a quick page-table lookup for a single page. |
| 854 | */ | 869 | */ |
| 855 | struct page *follow_page(struct mm_struct *mm, unsigned long address, | 870 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
| 856 | unsigned int flags) | 871 | unsigned int flags) |
| 857 | { | 872 | { |
| 858 | pgd_t *pgd; | 873 | pgd_t *pgd; |
| @@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, | |||
| 860 | pmd_t *pmd; | 875 | pmd_t *pmd; |
| 861 | pte_t *ptep, pte; | 876 | pte_t *ptep, pte; |
| 862 | spinlock_t *ptl; | 877 | spinlock_t *ptl; |
| 863 | unsigned long pfn; | ||
| 864 | struct page *page; | 878 | struct page *page; |
| 879 | struct mm_struct *mm = vma->vm_mm; | ||
| 865 | 880 | ||
| 866 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | 881 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); |
| 867 | if (!IS_ERR(page)) { | 882 | if (!IS_ERR(page)) { |
| @@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, | |||
| 897 | goto unlock; | 912 | goto unlock; |
| 898 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 913 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
| 899 | goto unlock; | 914 | goto unlock; |
| 900 | pfn = pte_pfn(pte); | 915 | page = vm_normal_page(vma, address, pte); |
| 901 | if (!pfn_valid(pfn)) | 916 | if (unlikely(!page)) |
| 902 | goto unlock; | 917 | goto unlock; |
| 903 | 918 | ||
| 904 | page = pfn_to_page(pfn); | ||
| 905 | if (flags & FOLL_GET) | 919 | if (flags & FOLL_GET) |
| 906 | get_page(page); | 920 | get_page(page); |
| 907 | if (flags & FOLL_TOUCH) { | 921 | if (flags & FOLL_TOUCH) { |
| @@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 974 | return i ? : -EFAULT; | 988 | return i ? : -EFAULT; |
| 975 | } | 989 | } |
| 976 | if (pages) { | 990 | if (pages) { |
| 977 | pages[i] = pte_page(*pte); | 991 | struct page *page = vm_normal_page(gate_vma, start, *pte); |
| 978 | get_page(pages[i]); | 992 | pages[i] = page; |
| 993 | if (page) | ||
| 994 | get_page(page); | ||
| 979 | } | 995 | } |
| 980 | pte_unmap(pte); | 996 | pte_unmap(pte); |
| 981 | if (vmas) | 997 | if (vmas) |
| @@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1010 | foll_flags |= FOLL_WRITE; | 1026 | foll_flags |= FOLL_WRITE; |
| 1011 | 1027 | ||
| 1012 | cond_resched(); | 1028 | cond_resched(); |
| 1013 | while (!(page = follow_page(mm, start, foll_flags))) { | 1029 | while (!(page = follow_page(vma, start, foll_flags))) { |
| 1014 | int ret; | 1030 | int ret; |
| 1015 | ret = __handle_mm_fault(mm, vma, start, | 1031 | ret = __handle_mm_fault(mm, vma, start, |
| 1016 | foll_flags & FOLL_WRITE); | 1032 | foll_flags & FOLL_WRITE); |
| @@ -1130,6 +1146,129 @@ int zeromap_page_range(struct vm_area_struct *vma, | |||
| 1130 | return err; | 1146 | return err; |
| 1131 | } | 1147 | } |
| 1132 | 1148 | ||
| 1149 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) | ||
| 1150 | { | ||
| 1151 | pgd_t * pgd = pgd_offset(mm, addr); | ||
| 1152 | pud_t * pud = pud_alloc(mm, pgd, addr); | ||
| 1153 | if (pud) { | ||
| 1154 | pmd_t * pmd = pmd_alloc(mm, pud, addr); | ||
| 1155 | if (pmd) | ||
| 1156 | return pte_alloc_map_lock(mm, pmd, addr, ptl); | ||
| 1157 | } | ||
| 1158 | return NULL; | ||
| 1159 | } | ||
| 1160 | |||
| 1161 | /* | ||
| 1162 | * This is the old fallback for page remapping. | ||
| 1163 | * | ||
| 1164 | * For historical reasons, it only allows reserved pages. Only | ||
| 1165 | * old drivers should use this, and they needed to mark their | ||
| 1166 | * pages reserved for the old functions anyway. | ||
| 1167 | */ | ||
| 1168 | static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) | ||
| 1169 | { | ||
| 1170 | int retval; | ||
| 1171 | pte_t *pte; | ||
| 1172 | spinlock_t *ptl; | ||
| 1173 | |||
| 1174 | retval = -EINVAL; | ||
| 1175 | if (PageAnon(page)) | ||
| 1176 | goto out; | ||
| 1177 | retval = -ENOMEM; | ||
| 1178 | flush_dcache_page(page); | ||
| 1179 | pte = get_locked_pte(mm, addr, &ptl); | ||
| 1180 | if (!pte) | ||
| 1181 | goto out; | ||
| 1182 | retval = -EBUSY; | ||
| 1183 | if (!pte_none(*pte)) | ||
| 1184 | goto out_unlock; | ||
| 1185 | |||
| 1186 | /* Ok, finally just insert the thing.. */ | ||
| 1187 | get_page(page); | ||
| 1188 | inc_mm_counter(mm, file_rss); | ||
| 1189 | page_add_file_rmap(page); | ||
| 1190 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | ||
| 1191 | |||
| 1192 | retval = 0; | ||
| 1193 | out_unlock: | ||
| 1194 | pte_unmap_unlock(pte, ptl); | ||
| 1195 | out: | ||
| 1196 | return retval; | ||
| 1197 | } | ||
| 1198 | |||
| 1199 | /* | ||
| 1200 | * This allows drivers to insert individual pages they've allocated | ||
| 1201 | * into a user vma. | ||
| 1202 | * | ||
| 1203 | * The page has to be a nice clean _individual_ kernel allocation. | ||
| 1204 | * If you allocate a compound page, you need to have marked it as | ||
| 1205 | * such (__GFP_COMP), or manually just split the page up yourself | ||
| 1206 | * (which is mainly an issue of doing "set_page_count(page, 1)" for | ||
| 1207 | * each sub-page, and then freeing them one by one when you free | ||
| 1208 | * them rather than freeing it as a compound page). | ||
| 1209 | * | ||
| 1210 | * NOTE! Traditionally this was done with "remap_pfn_range()" which | ||
| 1211 | * took an arbitrary page protection parameter. This doesn't allow | ||
| 1212 | * that. Your vma protection will have to be set up correctly, which | ||
| 1213 | * means that if you want a shared writable mapping, you'd better | ||
| 1214 | * ask for a shared writable mapping! | ||
| 1215 | * | ||
| 1216 | * The page does not need to be reserved. | ||
| 1217 | */ | ||
| 1218 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) | ||
| 1219 | { | ||
| 1220 | if (addr < vma->vm_start || addr >= vma->vm_end) | ||
| 1221 | return -EFAULT; | ||
| 1222 | if (!page_count(page)) | ||
| 1223 | return -EINVAL; | ||
| 1224 | return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); | ||
| 1225 | } | ||
| 1226 | EXPORT_SYMBOL(vm_insert_page); | ||
| 1227 | |||
| 1228 | /* | ||
| 1229 | * Somebody does a pfn remapping that doesn't actually work as a vma. | ||
| 1230 | * | ||
| 1231 | * Do it as individual pages instead, and warn about it. It's bad form, | ||
| 1232 | * and very inefficient. | ||
| 1233 | */ | ||
| 1234 | static int incomplete_pfn_remap(struct vm_area_struct *vma, | ||
| 1235 | unsigned long start, unsigned long end, | ||
| 1236 | unsigned long pfn, pgprot_t prot) | ||
| 1237 | { | ||
| 1238 | static int warn = 10; | ||
| 1239 | struct page *page; | ||
| 1240 | int retval; | ||
| 1241 | |||
| 1242 | if (!(vma->vm_flags & VM_INCOMPLETE)) { | ||
| 1243 | if (warn) { | ||
| 1244 | warn--; | ||
| 1245 | printk("%s does an incomplete pfn remapping", current->comm); | ||
| 1246 | dump_stack(); | ||
| 1247 | } | ||
| 1248 | } | ||
| 1249 | vma->vm_flags |= VM_INCOMPLETE | VM_IO | VM_RESERVED; | ||
| 1250 | |||
| 1251 | if (start < vma->vm_start || end > vma->vm_end) | ||
| 1252 | return -EINVAL; | ||
| 1253 | |||
| 1254 | if (!pfn_valid(pfn)) | ||
| 1255 | return -EINVAL; | ||
| 1256 | |||
| 1257 | page = pfn_to_page(pfn); | ||
| 1258 | if (!PageReserved(page)) | ||
| 1259 | return -EINVAL; | ||
| 1260 | |||
| 1261 | retval = 0; | ||
| 1262 | while (start < end) { | ||
| 1263 | retval = insert_page(vma->vm_mm, start, page, prot); | ||
| 1264 | if (retval < 0) | ||
| 1265 | break; | ||
| 1266 | start += PAGE_SIZE; | ||
| 1267 | page++; | ||
| 1268 | } | ||
| 1269 | return retval; | ||
| 1270 | } | ||
| 1271 | |||
| 1133 | /* | 1272 | /* |
| 1134 | * maps a range of physical memory into the requested pages. the old | 1273 | * maps a range of physical memory into the requested pages. the old |
| 1135 | * mappings are removed. any references to nonexistent pages results | 1274 | * mappings are removed. any references to nonexistent pages results |
| @@ -1204,6 +1343,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
| 1204 | struct mm_struct *mm = vma->vm_mm; | 1343 | struct mm_struct *mm = vma->vm_mm; |
| 1205 | int err; | 1344 | int err; |
| 1206 | 1345 | ||
| 1346 | if (addr != vma->vm_start || end != vma->vm_end) | ||
| 1347 | return incomplete_pfn_remap(vma, addr, end, pfn, prot); | ||
| 1348 | |||
| 1207 | /* | 1349 | /* |
| 1208 | * Physically remapped pages are special. Tell the | 1350 | * Physically remapped pages are special. Tell the |
| 1209 | * rest of the world about it: | 1351 | * rest of the world about it: |
| @@ -1214,11 +1356,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
| 1214 | * in 2.6 the LRU scan won't even find its pages, so this | 1356 | * in 2.6 the LRU scan won't even find its pages, so this |
| 1215 | * flag means no more than count its pages in reserved_vm, | 1357 | * flag means no more than count its pages in reserved_vm, |
| 1216 | * and omit it from core dump, even when VM_IO turned off. | 1358 | * and omit it from core dump, even when VM_IO turned off. |
| 1217 | * VM_UNPAGED tells the core MM not to "manage" these pages | 1359 | * VM_PFNMAP tells the core MM that the base pages are just |
| 1218 | * (e.g. refcount, mapcount, try to swap them out): in | 1360 | * raw PFN mappings, and do not have a "struct page" associated |
| 1219 | * particular, zap_pte_range does not try to free them. | 1361 | * with them. |
| 1220 | */ | 1362 | */ |
| 1221 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; | 1363 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; |
| 1364 | vma->vm_pgoff = pfn; | ||
| 1222 | 1365 | ||
| 1223 | BUG_ON(addr >= end); | 1366 | BUG_ON(addr >= end); |
| 1224 | pfn -= addr >> PAGE_SHIFT; | 1367 | pfn -= addr >> PAGE_SHIFT; |
| @@ -1273,6 +1416,33 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
| 1273 | return pte; | 1416 | return pte; |
| 1274 | } | 1417 | } |
| 1275 | 1418 | ||
| 1419 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) | ||
| 1420 | { | ||
| 1421 | /* | ||
| 1422 | * If the source page was a PFN mapping, we don't have | ||
| 1423 | * a "struct page" for it. We do a best-effort copy by | ||
| 1424 | * just copying from the original user address. If that | ||
| 1425 | * fails, we just zero-fill it. Live with it. | ||
| 1426 | */ | ||
| 1427 | if (unlikely(!src)) { | ||
| 1428 | void *kaddr = kmap_atomic(dst, KM_USER0); | ||
| 1429 | void __user *uaddr = (void __user *)(va & PAGE_MASK); | ||
| 1430 | |||
| 1431 | /* | ||
| 1432 | * This really shouldn't fail, because the page is there | ||
| 1433 | * in the page tables. But it might just be unreadable, | ||
| 1434 | * in which case we just give up and fill the result with | ||
| 1435 | * zeroes. | ||
| 1436 | */ | ||
| 1437 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) | ||
| 1438 | memset(kaddr, 0, PAGE_SIZE); | ||
| 1439 | kunmap_atomic(kaddr, KM_USER0); | ||
| 1440 | return; | ||
| 1441 | |||
| 1442 | } | ||
| 1443 | copy_user_highpage(dst, src, va); | ||
| 1444 | } | ||
| 1445 | |||
| 1276 | /* | 1446 | /* |
| 1277 | * This routine handles present pages, when users try to write | 1447 | * This routine handles present pages, when users try to write |
| 1278 | * to a shared page. It is done by copying the page to a new address | 1448 | * to a shared page. It is done by copying the page to a new address |
| @@ -1295,35 +1465,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1295 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 1465 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 1296 | spinlock_t *ptl, pte_t orig_pte) | 1466 | spinlock_t *ptl, pte_t orig_pte) |
| 1297 | { | 1467 | { |
| 1298 | struct page *old_page, *src_page, *new_page; | 1468 | struct page *old_page, *new_page; |
| 1299 | unsigned long pfn = pte_pfn(orig_pte); | ||
| 1300 | pte_t entry; | 1469 | pte_t entry; |
| 1301 | int ret = VM_FAULT_MINOR; | 1470 | int ret = VM_FAULT_MINOR; |
| 1302 | 1471 | ||
| 1303 | if (unlikely(!pfn_valid(pfn))) { | 1472 | old_page = vm_normal_page(vma, address, orig_pte); |
| 1304 | /* | 1473 | if (!old_page) |
| 1305 | * Page table corrupted: show pte and kill process. | 1474 | goto gotten; |
| 1306 | * Or it's an attempt to COW an out-of-map VM_UNPAGED | ||
| 1307 | * entry, which copy_user_highpage does not support. | ||
| 1308 | */ | ||
| 1309 | print_bad_pte(vma, orig_pte, address); | ||
| 1310 | ret = VM_FAULT_OOM; | ||
| 1311 | goto unlock; | ||
| 1312 | } | ||
| 1313 | old_page = pfn_to_page(pfn); | ||
| 1314 | src_page = old_page; | ||
| 1315 | |||
| 1316 | if (unlikely(vma->vm_flags & VM_UNPAGED)) | ||
| 1317 | if (!page_is_anon(old_page, vma, address)) { | ||
| 1318 | old_page = NULL; | ||
| 1319 | goto gotten; | ||
| 1320 | } | ||
| 1321 | 1475 | ||
| 1322 | if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { | 1476 | if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { |
| 1323 | int reuse = can_share_swap_page(old_page); | 1477 | int reuse = can_share_swap_page(old_page); |
| 1324 | unlock_page(old_page); | 1478 | unlock_page(old_page); |
| 1325 | if (reuse) { | 1479 | if (reuse) { |
| 1326 | flush_cache_page(vma, address, pfn); | 1480 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 1327 | entry = pte_mkyoung(orig_pte); | 1481 | entry = pte_mkyoung(orig_pte); |
| 1328 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1482 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 1329 | ptep_set_access_flags(vma, address, page_table, entry, 1); | 1483 | ptep_set_access_flags(vma, address, page_table, entry, 1); |
| @@ -1343,7 +1497,7 @@ gotten: | |||
| 1343 | 1497 | ||
| 1344 | if (unlikely(anon_vma_prepare(vma))) | 1498 | if (unlikely(anon_vma_prepare(vma))) |
| 1345 | goto oom; | 1499 | goto oom; |
| 1346 | if (src_page == ZERO_PAGE(address)) { | 1500 | if (old_page == ZERO_PAGE(address)) { |
| 1347 | new_page = alloc_zeroed_user_highpage(vma, address); | 1501 | new_page = alloc_zeroed_user_highpage(vma, address); |
| 1348 | if (!new_page) | 1502 | if (!new_page) |
| 1349 | goto oom; | 1503 | goto oom; |
| @@ -1351,7 +1505,7 @@ gotten: | |||
| 1351 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 1505 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
| 1352 | if (!new_page) | 1506 | if (!new_page) |
| 1353 | goto oom; | 1507 | goto oom; |
| 1354 | copy_user_highpage(new_page, src_page, address); | 1508 | cow_user_page(new_page, old_page, address); |
| 1355 | } | 1509 | } |
| 1356 | 1510 | ||
| 1357 | /* | 1511 | /* |
| @@ -1367,7 +1521,7 @@ gotten: | |||
| 1367 | } | 1521 | } |
| 1368 | } else | 1522 | } else |
| 1369 | inc_mm_counter(mm, anon_rss); | 1523 | inc_mm_counter(mm, anon_rss); |
| 1370 | flush_cache_page(vma, address, pfn); | 1524 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 1371 | entry = mk_pte(new_page, vma->vm_page_prot); | 1525 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 1372 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1526 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 1373 | ptep_establish(vma, address, page_table, entry); | 1527 | ptep_establish(vma, address, page_table, entry); |
| @@ -1812,16 +1966,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1812 | spinlock_t *ptl; | 1966 | spinlock_t *ptl; |
| 1813 | pte_t entry; | 1967 | pte_t entry; |
| 1814 | 1968 | ||
| 1815 | /* | 1969 | if (write_access) { |
| 1816 | * A VM_UNPAGED vma will normally be filled with present ptes | ||
| 1817 | * by remap_pfn_range, and never arrive here; but it might have | ||
| 1818 | * holes, or if !VM_DONTEXPAND, mremap might have expanded it. | ||
| 1819 | * It's weird enough handling anon pages in unpaged vmas, we do | ||
| 1820 | * not want to worry about ZERO_PAGEs too (it may or may not | ||
| 1821 | * matter if their counts wrap): just give them anon pages. | ||
| 1822 | */ | ||
| 1823 | |||
| 1824 | if (write_access || (vma->vm_flags & VM_UNPAGED)) { | ||
| 1825 | /* Allocate our own private page. */ | 1970 | /* Allocate our own private page. */ |
| 1826 | pte_unmap(page_table); | 1971 | pte_unmap(page_table); |
| 1827 | 1972 | ||
| @@ -1896,7 +2041,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1896 | int anon = 0; | 2041 | int anon = 0; |
| 1897 | 2042 | ||
| 1898 | pte_unmap(page_table); | 2043 | pte_unmap(page_table); |
| 1899 | BUG_ON(vma->vm_flags & VM_UNPAGED); | 2044 | BUG_ON(vma->vm_flags & VM_PFNMAP); |
| 1900 | 2045 | ||
| 1901 | if (vma->vm_file) { | 2046 | if (vma->vm_file) { |
| 1902 | mapping = vma->vm_file->f_mapping; | 2047 | mapping = vma->vm_file->f_mapping; |
| @@ -2149,6 +2294,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | |||
| 2149 | spin_unlock(&mm->page_table_lock); | 2294 | spin_unlock(&mm->page_table_lock); |
| 2150 | return 0; | 2295 | return 0; |
| 2151 | } | 2296 | } |
| 2297 | #else | ||
| 2298 | /* Workaround for gcc 2.96 */ | ||
| 2299 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | ||
| 2300 | { | ||
| 2301 | return 0; | ||
| 2302 | } | ||
| 2152 | #endif /* __PAGETABLE_PUD_FOLDED */ | 2303 | #endif /* __PAGETABLE_PUD_FOLDED */ |
| 2153 | 2304 | ||
| 2154 | #ifndef __PAGETABLE_PMD_FOLDED | 2305 | #ifndef __PAGETABLE_PMD_FOLDED |
| @@ -2177,6 +2328,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
| 2177 | spin_unlock(&mm->page_table_lock); | 2328 | spin_unlock(&mm->page_table_lock); |
| 2178 | return 0; | 2329 | return 0; |
| 2179 | } | 2330 | } |
| 2331 | #else | ||
| 2332 | /* Workaround for gcc 2.96 */ | ||
| 2333 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | ||
| 2334 | { | ||
| 2335 | return 0; | ||
| 2336 | } | ||
| 2180 | #endif /* __PAGETABLE_PMD_FOLDED */ | 2337 | #endif /* __PAGETABLE_PMD_FOLDED */ |
| 2181 | 2338 | ||
| 2182 | int make_pages_present(unsigned long addr, unsigned long end) | 2339 | int make_pages_present(unsigned long addr, unsigned long end) |
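The core of the mm/memory.c changes above is vm_normal_page() and the VM_PFNMAP rule quoted in its comment: within a remap_pfn_range() area, a pte refers to a raw frame (no struct page as far as the VM is concerned) exactly when its pfn equals vma->vm_pgoff plus the page offset into the vma, and a pte that breaks that identity is a COWed page with a real struct page. A small userspace illustration of that arithmetic; the struct, helper name and numbers below are made up for the example:

```c
#include <stdio.h>

/*
 * Illustration only: mirrors the VM_PFNMAP rule from vm_normal_page(),
 *   pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 * The struct and values are stand-ins, not kernel code.
 */
#define PAGE_SHIFT 12

struct fake_vma {
        unsigned long vm_start;
        unsigned long vm_pgoff;         /* first pfn mapped by remap_pfn_range() */
};

static int is_raw_pfn_mapping(const struct fake_vma *vma,
                              unsigned long addr, unsigned long pfn)
{
        unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

        return pfn == vma->vm_pgoff + off;      /* true: no struct page expected */
}

int main(void)
{
        struct fake_vma vma = { .vm_start = 0x40000000, .vm_pgoff = 0x8000 };

        /* Third page of the mapping still points at pfn 0x8002: raw mapping. */
        printf("%d\n", is_raw_pfn_mapping(&vma, 0x40002000, 0x8002));   /* 1 */
        /* Same address now points elsewhere (e.g. after a private COW):
         * the rule fails, so vm_normal_page() would return the struct page. */
        printf("%d\n", is_raw_pfn_mapping(&vma, 0x40002000, 0x1234));   /* 0 */
        return 0;
}
```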
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5609a31bdf22..bec88c81244e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
| @@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 189 | 189 | ||
| 190 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 190 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| 191 | do { | 191 | do { |
| 192 | unsigned long pfn; | 192 | struct page *page; |
| 193 | unsigned int nid; | 193 | unsigned int nid; |
| 194 | 194 | ||
| 195 | if (!pte_present(*pte)) | 195 | if (!pte_present(*pte)) |
| 196 | continue; | 196 | continue; |
| 197 | pfn = pte_pfn(*pte); | 197 | page = vm_normal_page(vma, addr, *pte); |
| 198 | if (!pfn_valid(pfn)) { | 198 | if (!page) |
| 199 | print_bad_pte(vma, *pte, addr); | ||
| 200 | continue; | 199 | continue; |
| 201 | } | 200 | nid = page_to_nid(page); |
| 202 | nid = pfn_to_nid(pfn); | ||
| 203 | if (!node_isset(nid, *nodes)) | 201 | if (!node_isset(nid, *nodes)) |
| 204 | break; | 202 | break; |
| 205 | } while (pte++, addr += PAGE_SIZE, addr != end); | 203 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| @@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
| 269 | first = find_vma(mm, start); | 267 | first = find_vma(mm, start); |
| 270 | if (!first) | 268 | if (!first) |
| 271 | return ERR_PTR(-EFAULT); | 269 | return ERR_PTR(-EFAULT); |
| 272 | if (first->vm_flags & VM_UNPAGED) | ||
| 273 | return ERR_PTR(-EACCES); | ||
| 274 | prev = NULL; | 270 | prev = NULL; |
| 275 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 271 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
| 276 | if (!vma->vm_next && vma->vm_end < end) | 272 | if (!vma->vm_next && vma->vm_end < end) |
diff --git a/mm/msync.c b/mm/msync.c
index b3f4caf3010b..1b5b6f662dcf 100644
--- a/mm/msync.c
+++ b/mm/msync.c
| @@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 27 | again: | 27 | again: |
| 28 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 28 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| 29 | do { | 29 | do { |
| 30 | unsigned long pfn; | ||
| 31 | struct page *page; | 30 | struct page *page; |
| 32 | 31 | ||
| 33 | if (progress >= 64) { | 32 | if (progress >= 64) { |
| @@ -40,13 +39,9 @@ again: | |||
| 40 | continue; | 39 | continue; |
| 41 | if (!pte_maybe_dirty(*pte)) | 40 | if (!pte_maybe_dirty(*pte)) |
| 42 | continue; | 41 | continue; |
| 43 | pfn = pte_pfn(*pte); | 42 | page = vm_normal_page(vma, addr, *pte); |
| 44 | if (unlikely(!pfn_valid(pfn))) { | 43 | if (!page) |
| 45 | print_bad_pte(vma, *pte, addr); | ||
| 46 | continue; | 44 | continue; |
| 47 | } | ||
| 48 | page = pfn_to_page(pfn); | ||
| 49 | |||
| 50 | if (ptep_clear_flush_dirty(vma, addr, pte) || | 45 | if (ptep_clear_flush_dirty(vma, addr, pte) || |
| 51 | page_test_and_clear_dirty(page)) | 46 | page_test_and_clear_dirty(page)) |
| 52 | set_page_dirty(page); | 47 | set_page_dirty(page); |
| @@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma, | |||
| 97 | /* For hugepages we can't go walking the page table normally, | 92 | /* For hugepages we can't go walking the page table normally, |
| 98 | * but that's ok, hugetlbfs is memory based, so we don't need | 93 | * but that's ok, hugetlbfs is memory based, so we don't need |
| 99 | * to do anything more on an msync(). | 94 | * to do anything more on an msync(). |
| 100 | * Can't do anything with VM_UNPAGED regions either. | ||
| 101 | */ | 95 | */ |
| 102 | if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED)) | 96 | if (vma->vm_flags & VM_HUGETLB) |
| 103 | return; | 97 | return; |
| 104 | 98 | ||
| 105 | BUG_ON(addr >= end); | 99 | BUG_ON(addr >= end); |
diff --git a/mm/nommu.c b/mm/nommu.c
index 6deb6ab3d6ad..c1196812876b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
| @@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
| 1045 | 1045 | ||
| 1046 | EXPORT_SYMBOL(find_vma); | 1046 | EXPORT_SYMBOL(find_vma); |
| 1047 | 1047 | ||
| 1048 | struct page *follow_page(struct mm_struct *mm, unsigned long address, | 1048 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
| 1049 | unsigned int foll_flags) | 1049 | unsigned int foll_flags) |
| 1050 | { | 1050 | { |
| 1051 | return NULL; | 1051 | return NULL; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1731236dec35..3b21a13d841c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
| @@ -773,9 +773,12 @@ again: | |||
| 773 | } | 773 | } |
| 774 | 774 | ||
| 775 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 775 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ |
| 776 | #define ALLOC_HARDER 0x02 /* try to alloc harder */ | 776 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ |
| 777 | #define ALLOC_HIGH 0x04 /* __GFP_HIGH set */ | 777 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ |
| 778 | #define ALLOC_CPUSET 0x08 /* check for correct cpuset */ | 778 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ |
| 779 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
| 780 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
| 781 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
| 779 | 782 | ||
| 780 | /* | 783 | /* |
| 781 | * Return 1 if free pages are above 'mark'. This takes into account the order | 784 | * Return 1 if free pages are above 'mark'. This takes into account the order |
| @@ -830,7 +833,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
| 830 | continue; | 833 | continue; |
| 831 | 834 | ||
| 832 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 835 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
| 833 | if (!zone_watermark_ok(*z, order, (*z)->pages_low, | 836 | unsigned long mark; |
| 837 | if (alloc_flags & ALLOC_WMARK_MIN) | ||
| 838 | mark = (*z)->pages_min; | ||
| 839 | else if (alloc_flags & ALLOC_WMARK_LOW) | ||
| 840 | mark = (*z)->pages_low; | ||
| 841 | else | ||
| 842 | mark = (*z)->pages_high; | ||
| 843 | if (!zone_watermark_ok(*z, order, mark, | ||
| 834 | classzone_idx, alloc_flags)) | 844 | classzone_idx, alloc_flags)) |
| 835 | continue; | 845 | continue; |
| 836 | } | 846 | } |
| @@ -871,7 +881,7 @@ restart: | |||
| 871 | } | 881 | } |
| 872 | 882 | ||
| 873 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 883 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
| 874 | zonelist, ALLOC_CPUSET); | 884 | zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); |
| 875 | if (page) | 885 | if (page) |
| 876 | goto got_pg; | 886 | goto got_pg; |
| 877 | 887 | ||
| @@ -888,7 +898,7 @@ restart: | |||
| 888 | * cannot run direct reclaim, or if the caller has realtime scheduling | 898 | * cannot run direct reclaim, or if the caller has realtime scheduling |
| 889 | * policy. | 899 | * policy. |
| 890 | */ | 900 | */ |
| 891 | alloc_flags = 0; | 901 | alloc_flags = ALLOC_WMARK_MIN; |
| 892 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | 902 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) |
| 893 | alloc_flags |= ALLOC_HARDER; | 903 | alloc_flags |= ALLOC_HARDER; |
| 894 | if (gfp_mask & __GFP_HIGH) | 904 | if (gfp_mask & __GFP_HIGH) |
| @@ -959,7 +969,7 @@ rebalance: | |||
| 959 | * under heavy pressure. | 969 | * under heavy pressure. |
| 960 | */ | 970 | */ |
| 961 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 971 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
| 962 | zonelist, ALLOC_CPUSET); | 972 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); |
| 963 | if (page) | 973 | if (page) |
| 964 | goto got_pg; | 974 | goto got_pg; |
| 965 | 975 | ||
| @@ -1762,16 +1772,16 @@ static int __devinit zone_batchsize(struct zone *zone) | |||
| 1762 | batch = 1; | 1772 | batch = 1; |
| 1763 | 1773 | ||
| 1764 | /* | 1774 | /* |
| 1765 | * We will be trying to allcoate bigger chunks of contiguous | 1775 | * Clamp the batch to a 2^n - 1 value. Having a power |
| 1766 | * memory of the order of fls(batch). This should result in | 1776 | * of 2 value was found to be more likely to have |
| 1767 | * better cache coloring. | 1777 | * suboptimal cache aliasing properties in some cases. |
| 1768 | * | 1778 | * |
| 1769 | * A sanity check also to ensure that batch is still in limits. | 1779 | * For example if 2 tasks are alternately allocating |
| 1780 | * batches of pages, one task can end up with a lot | ||
| 1781 | * of pages of one half of the possible page colors | ||
| 1782 | * and the other with pages of the other colors. | ||
| 1770 | */ | 1783 | */ |
| 1771 | batch = (1 << fls(batch + batch/2)); | 1784 | batch = (1 << (fls(batch + batch/2)-1)) - 1; |
| 1772 | |||
| 1773 | if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) | ||
| 1774 | batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); | ||
| 1775 | 1785 | ||
| 1776 | return batch; | 1786 | return batch; |
| 1777 | } | 1787 | } |
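The zone_batchsize() hunk directly above replaces the old round-up-to-a-power-of-two sizing with a clamp to a 2^n - 1 value, for the cache-aliasing reason spelled out in the new comment. A small userspace check of that expression; the inputs are arbitrary, and fls() is reimplemented only so the example builds outside the kernel:

```c
#include <stdio.h>

/*
 * Illustration of the new zone_batchsize() clamp:
 *     batch = (1 << (fls(batch + batch/2) - 1)) - 1;
 */
static int fls(unsigned int x)          /* find last set bit, 1-based; 0 for x == 0 */
{
        int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

int main(void)
{
        unsigned int inputs[] = { 15, 16, 32, 63, 64 };
        unsigned int i;

        for (i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
                unsigned int batch = inputs[i];
                unsigned int clamped = (1u << (fls(batch + batch / 2) - 1)) - 1;

                /* Always lands on a 2^n - 1 value: 15, 15, 31, 63, 63 */
                printf("batch %2u -> %2u\n", batch, clamped);
        }
        return 0;
}
```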
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
| @@ -226,8 +226,6 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
| 226 | /* | 226 | /* |
| 227 | * At what user virtual address is page expected in vma? checking that the | 227 | * At what user virtual address is page expected in vma? checking that the |
| 228 | * page matches the vma: currently only used on anon pages, by unuse_vma; | 228 | * page matches the vma: currently only used on anon pages, by unuse_vma; |
| 229 | * and by extraordinary checks on anon pages in VM_UNPAGED vmas, taking | ||
| 230 | * care that an mmap of /dev/mem might window free and foreign pages. | ||
| 231 | */ | 229 | */ |
| 232 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 230 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
| 233 | { | 231 | { |
| @@ -292,7 +290,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
| 292 | * repeatedly from either page_referenced_anon or page_referenced_file. | 290 | * repeatedly from either page_referenced_anon or page_referenced_file. |
| 293 | */ | 291 | */ |
| 294 | static int page_referenced_one(struct page *page, | 292 | static int page_referenced_one(struct page *page, |
| 295 | struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token) | 293 | struct vm_area_struct *vma, unsigned int *mapcount) |
| 296 | { | 294 | { |
| 297 | struct mm_struct *mm = vma->vm_mm; | 295 | struct mm_struct *mm = vma->vm_mm; |
| 298 | unsigned long address; | 296 | unsigned long address; |
| @@ -313,7 +311,7 @@ static int page_referenced_one(struct page *page, | |||
| 313 | 311 | ||
| 314 | /* Pretend the page is referenced if the task has the | 312 | /* Pretend the page is referenced if the task has the |
| 315 | swap token and is in the middle of a page fault. */ | 313 | swap token and is in the middle of a page fault. */ |
| 316 | if (mm != current->mm && !ignore_token && has_swap_token(mm) && | 314 | if (mm != current->mm && has_swap_token(mm) && |
| 317 | rwsem_is_locked(&mm->mmap_sem)) | 315 | rwsem_is_locked(&mm->mmap_sem)) |
| 318 | referenced++; | 316 | referenced++; |
| 319 | 317 | ||
| @@ -323,7 +321,7 @@ out: | |||
| 323 | return referenced; | 321 | return referenced; |
| 324 | } | 322 | } |
| 325 | 323 | ||
| 326 | static int page_referenced_anon(struct page *page, int ignore_token) | 324 | static int page_referenced_anon(struct page *page) |
| 327 | { | 325 | { |
| 328 | unsigned int mapcount; | 326 | unsigned int mapcount; |
| 329 | struct anon_vma *anon_vma; | 327 | struct anon_vma *anon_vma; |
| @@ -336,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token) | |||
| 336 | 334 | ||
| 337 | mapcount = page_mapcount(page); | 335 | mapcount = page_mapcount(page); |
| 338 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 336 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
| 339 | referenced += page_referenced_one(page, vma, &mapcount, | 337 | referenced += page_referenced_one(page, vma, &mapcount); |
| 340 | ignore_token); | ||
| 341 | if (!mapcount) | 338 | if (!mapcount) |
| 342 | break; | 339 | break; |
| 343 | } | 340 | } |
| @@ -356,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token) | |||
| 356 | * | 353 | * |
| 357 | * This function is only called from page_referenced for object-based pages. | 354 | * This function is only called from page_referenced for object-based pages. |
| 358 | */ | 355 | */ |
| 359 | static int page_referenced_file(struct page *page, int ignore_token) | 356 | static int page_referenced_file(struct page *page) |
| 360 | { | 357 | { |
| 361 | unsigned int mapcount; | 358 | unsigned int mapcount; |
| 362 | struct address_space *mapping = page->mapping; | 359 | struct address_space *mapping = page->mapping; |
| @@ -394,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token) | |||
| 394 | referenced++; | 391 | referenced++; |
| 395 | break; | 392 | break; |
| 396 | } | 393 | } |
| 397 | referenced += page_referenced_one(page, vma, &mapcount, | 394 | referenced += page_referenced_one(page, vma, &mapcount); |
| 398 | ignore_token); | ||
| 399 | if (!mapcount) | 395 | if (!mapcount) |
| 400 | break; | 396 | break; |
| 401 | } | 397 | } |
| @@ -412,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token) | |||
| 412 | * Quick test_and_clear_referenced for all mappings to a page, | 408 | * Quick test_and_clear_referenced for all mappings to a page, |
| 413 | * returns the number of ptes which referenced the page. | 409 | * returns the number of ptes which referenced the page. |
| 414 | */ | 410 | */ |
| 415 | int page_referenced(struct page *page, int is_locked, int ignore_token) | 411 | int page_referenced(struct page *page, int is_locked) |
| 416 | { | 412 | { |
| 417 | int referenced = 0; | 413 | int referenced = 0; |
| 418 | 414 | ||
| 419 | if (!swap_token_default_timeout) | ||
| 420 | ignore_token = 1; | ||
| 421 | |||
| 422 | if (page_test_and_clear_young(page)) | 415 | if (page_test_and_clear_young(page)) |
| 423 | referenced++; | 416 | referenced++; |
| 424 | 417 | ||
| @@ -427,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) | |||
| 427 | 420 | ||
| 428 | if (page_mapped(page) && page->mapping) { | 421 | if (page_mapped(page) && page->mapping) { |
| 429 | if (PageAnon(page)) | 422 | if (PageAnon(page)) |
| 430 | referenced += page_referenced_anon(page, ignore_token); | 423 | referenced += page_referenced_anon(page); |
| 431 | else if (is_locked) | 424 | else if (is_locked) |
| 432 | referenced += page_referenced_file(page, ignore_token); | 425 | referenced += page_referenced_file(page); |
| 433 | else if (TestSetPageLocked(page)) | 426 | else if (TestSetPageLocked(page)) |
| 434 | referenced++; | 427 | referenced++; |
| 435 | else { | 428 | else { |
| 436 | if (page->mapping) | 429 | if (page->mapping) |
| 437 | referenced += page_referenced_file(page, | 430 | referenced += page_referenced_file(page); |
| 438 | ignore_token); | ||
| 439 | unlock_page(page); | 431 | unlock_page(page); |
| 440 | } | 432 | } |
| 441 | } | 433 | } |
| @@ -614,7 +606,6 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 614 | struct page *page; | 606 | struct page *page; |
| 615 | unsigned long address; | 607 | unsigned long address; |
| 616 | unsigned long end; | 608 | unsigned long end; |
| 617 | unsigned long pfn; | ||
| 618 | 609 | ||
| 619 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | 610 | address = (vma->vm_start + cursor) & CLUSTER_MASK; |
| 620 | end = address + CLUSTER_SIZE; | 611 | end = address + CLUSTER_SIZE; |
| @@ -643,21 +634,14 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 643 | for (; address < end; pte++, address += PAGE_SIZE) { | 634 | for (; address < end; pte++, address += PAGE_SIZE) { |
| 644 | if (!pte_present(*pte)) | 635 | if (!pte_present(*pte)) |
| 645 | continue; | 636 | continue; |
| 646 | 637 | page = vm_normal_page(vma, address, *pte); | |
| 647 | pfn = pte_pfn(*pte); | 638 | BUG_ON(!page || PageAnon(page)); |
| 648 | if (unlikely(!pfn_valid(pfn))) { | ||
| 649 | print_bad_pte(vma, *pte, address); | ||
| 650 | continue; | ||
| 651 | } | ||
| 652 | |||
| 653 | page = pfn_to_page(pfn); | ||
| 654 | BUG_ON(PageAnon(page)); | ||
| 655 | 639 | ||
| 656 | if (ptep_clear_flush_young(vma, address, pte)) | 640 | if (ptep_clear_flush_young(vma, address, pte)) |
| 657 | continue; | 641 | continue; |
| 658 | 642 | ||
| 659 | /* Nuke the page table entry. */ | 643 | /* Nuke the page table entry. */ |
| 660 | flush_cache_page(vma, address, pfn); | 644 | flush_cache_page(vma, address, pte_pfn(*pte)); |
| 661 | pteval = ptep_clear_flush(vma, address, pte); | 645 | pteval = ptep_clear_flush(vma, address, pte); |
| 662 | 646 | ||
| 663 | /* If nonlinear, store the file page offset in the pte. */ | 647 | /* If nonlinear, store the file page offset in the pte. */ |
diff --git a/mm/thrash.c b/mm/thrash.c
index eff3c18c33a1..f4c560b4a2b7 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
| @@ -57,14 +57,17 @@ void grab_swap_token(void) | |||
| 57 | /* We have the token. Let others know we still need it. */ | 57 | /* We have the token. Let others know we still need it. */ |
| 58 | if (has_swap_token(current->mm)) { | 58 | if (has_swap_token(current->mm)) { |
| 59 | current->mm->recent_pagein = 1; | 59 | current->mm->recent_pagein = 1; |
| 60 | if (unlikely(!swap_token_default_timeout)) | ||
| 61 | disable_swap_token(); | ||
| 60 | return; | 62 | return; |
| 61 | } | 63 | } |
| 62 | 64 | ||
| 63 | if (time_after(jiffies, swap_token_check)) { | 65 | if (time_after(jiffies, swap_token_check)) { |
| 64 | 66 | ||
| 65 | /* Can't get swapout protection if we exceed our RSS limit. */ | 67 | if (!swap_token_default_timeout) { |
| 66 | // if (current->mm->rss > current->mm->rlimit_rss) | 68 | swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; |
| 67 | // return; | 69 | return; |
| 70 | } | ||
| 68 | 71 | ||
| 69 | /* ... or if we recently held the token. */ | 72 | /* ... or if we recently held the token. */ |
| 70 | if (time_before(jiffies, current->mm->swap_token_time)) | 73 | if (time_before(jiffies, current->mm->swap_token_time)) |
| @@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm) | |||
| 95 | { | 98 | { |
| 96 | spin_lock(&swap_token_lock); | 99 | spin_lock(&swap_token_lock); |
| 97 | if (likely(mm == swap_token_mm)) { | 100 | if (likely(mm == swap_token_mm)) { |
| 101 | mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | ||
| 98 | swap_token_mm = &init_mm; | 102 | swap_token_mm = &init_mm; |
| 99 | swap_token_check = jiffies; | 103 | swap_token_check = jiffies; |
| 100 | } | 104 | } |
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 28130541270f..b0cd81c32de6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
| @@ -201,13 +201,25 @@ static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
| 201 | list_for_each_entry(shrinker, &shrinker_list, list) { | 201 | list_for_each_entry(shrinker, &shrinker_list, list) { |
| 202 | unsigned long long delta; | 202 | unsigned long long delta; |
| 203 | unsigned long total_scan; | 203 | unsigned long total_scan; |
| 204 | unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); | ||
| 204 | 205 | ||
| 205 | delta = (4 * scanned) / shrinker->seeks; | 206 | delta = (4 * scanned) / shrinker->seeks; |
| 206 | delta *= (*shrinker->shrinker)(0, gfp_mask); | 207 | delta *= max_pass; |
| 207 | do_div(delta, lru_pages + 1); | 208 | do_div(delta, lru_pages + 1); |
| 208 | shrinker->nr += delta; | 209 | shrinker->nr += delta; |
| 209 | if (shrinker->nr < 0) | 210 | if (shrinker->nr < 0) { |
| 210 | shrinker->nr = LONG_MAX; /* It wrapped! */ | 211 | printk(KERN_ERR "%s: nr=%ld\n", |
| 212 | __FUNCTION__, shrinker->nr); | ||
| 213 | shrinker->nr = max_pass; | ||
| 214 | } | ||
| 215 | |||
| 216 | /* | ||
| 217 | * Avoid risking looping forever due to too large nr value: | ||
| 218 | * never try to free more than twice the estimate number of | ||
| 219 | * freeable entries. | ||
| 220 | */ | ||
| 221 | if (shrinker->nr > max_pass * 2) | ||
| 222 | shrinker->nr = max_pass * 2; | ||
| 211 | 223 | ||
| 212 | total_scan = shrinker->nr; | 224 | total_scan = shrinker->nr; |
| 213 | shrinker->nr = 0; | 225 | shrinker->nr = 0; |
| @@ -407,7 +419,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
| 407 | if (PageWriteback(page)) | 419 | if (PageWriteback(page)) |
| 408 | goto keep_locked; | 420 | goto keep_locked; |
| 409 | 421 | ||
| 410 | referenced = page_referenced(page, 1, sc->priority <= 0); | 422 | referenced = page_referenced(page, 1); |
| 411 | /* In active use or really unfreeable? Activate it. */ | 423 | /* In active use or really unfreeable? Activate it. */ |
| 412 | if (referenced && page_mapping_inuse(page)) | 424 | if (referenced && page_mapping_inuse(page)) |
| 413 | goto activate_locked; | 425 | goto activate_locked; |
| @@ -756,7 +768,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
| 756 | if (page_mapped(page)) { | 768 | if (page_mapped(page)) { |
| 757 | if (!reclaim_mapped || | 769 | if (!reclaim_mapped || |
| 758 | (total_swap_pages == 0 && PageAnon(page)) || | 770 | (total_swap_pages == 0 && PageAnon(page)) || |
| 759 | page_referenced(page, 0, sc->priority <= 0)) { | 771 | page_referenced(page, 0)) { |
| 760 | list_add(&page->lru, &l_active); | 772 | list_add(&page->lru, &l_active); |
| 761 | continue; | 773 | continue; |
| 762 | } | 774 | } |
| @@ -960,6 +972,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
| 960 | sc.nr_reclaimed = 0; | 972 | sc.nr_reclaimed = 0; |
| 961 | sc.priority = priority; | 973 | sc.priority = priority; |
| 962 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | 974 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; |
| 975 | if (!priority) | ||
| 976 | disable_swap_token(); | ||
| 963 | shrink_caches(zones, &sc); | 977 | shrink_caches(zones, &sc); |
| 964 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); | 978 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); |
| 965 | if (reclaim_state) { | 979 | if (reclaim_state) { |
| @@ -1056,6 +1070,10 @@ loop_again: | |||
| 1056 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 1070 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
| 1057 | unsigned long lru_pages = 0; | 1071 | unsigned long lru_pages = 0; |
| 1058 | 1072 | ||
| 1073 | /* The swap token gets in the way of swapout... */ | ||
| 1074 | if (!priority) | ||
| 1075 | disable_swap_token(); | ||
| 1076 | |||
| 1059 | all_zones_ok = 1; | 1077 | all_zones_ok = 1; |
| 1060 | 1078 | ||
| 1061 | if (nr_pages == 0) { | 1079 | if (nr_pages == 0) { |
| @@ -1360,6 +1378,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1360 | sc.nr_reclaimed = 0; | 1378 | sc.nr_reclaimed = 0; |
| 1361 | /* scan at the highest priority */ | 1379 | /* scan at the highest priority */ |
| 1362 | sc.priority = 0; | 1380 | sc.priority = 0; |
| 1381 | disable_swap_token(); | ||
| 1363 | 1382 | ||
| 1364 | if (nr_pages > SWAP_CLUSTER_MAX) | 1383 | if (nr_pages > SWAP_CLUSTER_MAX) |
| 1365 | sc.swap_cluster_max = nr_pages; | 1384 | sc.swap_cluster_max = nr_pages; |
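In the shrink_slab() hunk above, the pressure calculation now uses max_pass (the shrinker's own estimate of freeable objects, obtained by calling it with a scan count of 0), and shrinker->nr is clamped to at most twice that estimate, so a wrapped or stale count can no longer drive an effectively unbounded scan. A standalone illustration of the bookkeeping with made-up numbers (seeks = 2 mirrors DEFAULT_SEEKS, and plain division stands in for do_div()):

```c
#include <stdio.h>

/*
 * Made-up numbers walking through the new shrink_slab() arithmetic:
 *     delta = (4 * scanned / seeks) * max_pass / (lru_pages + 1)
 * followed by the clamp of shrinker->nr to 2 * max_pass.
 */
int main(void)
{
        unsigned long scanned = 2048;           /* pages scanned this pass */
        unsigned long seeks = 2;                /* shrinker->seeks, like DEFAULT_SEEKS */
        unsigned long max_pass = 10000;         /* objects the shrinker says it can free */
        unsigned long lru_pages = 4095;
        unsigned long nr = 50000;               /* stale count carried over from earlier passes */

        unsigned long long delta = (4ULL * scanned) / seeks;

        delta = delta * max_pass / (lru_pages + 1);
        nr += delta;

        if (nr > max_pass * 2)                  /* the new clamp */
                nr = max_pass * 2;

        printf("delta=%llu nr after clamp=%lu\n", delta, nr);  /* delta=10000, nr=20000 */
        return 0;
}
```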
