about | summary | refs | log | tree | commit | diff | stats
path: root/mm/memory.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-28 17:34:23 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-28 17:34:23 -0500
commit: 6aab341e0a28aff100a09831c5300a2994b8b986 (patch)
tree: 1af3908275aa5e1b16e80efee554a9a7504c56d4 /mm/memory.c
parent: 458af5439fe7ae7d95ca14106844e61f0795166c (diff)
mm: re-architect the VM_UNPAGED logic
This replaces the (in my opinion horrible) VM_UNPAGED logic with very explicit support for a "remapped page range" aka VM_PFNMAP. It allows a VM area to contain an arbitrary range of page table entries that the VM never touches, and never considers to be normal pages.

Any user of "remap_pfn_range()" automatically gets this new functionality, and doesn't even have to mark the pages reserved or indeed mark them any other way. It just works. As a side effect, doing mmap() on /dev/mem works for arbitrary ranges.

Sparc update from David in the next commit.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- mm/memory.c | 189
1 files changed, 100 insertions, 89 deletions
diff --git a/mm/memory.c b/mm/memory.c
index d1f46f4e4c8a..b57fbc636058 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
333} 333}
334 334
335/* 335/*
336 * This function is called to print an error when a pte in a 336 * This function is called to print an error when a bad pte
337 * !VM_UNPAGED region is found pointing to an invalid pfn (which 337 * is found. For example, we might have a PFN-mapped pte in
338 * is an error. 338 * a region that doesn't allow it.
339 * 339 *
340 * The calling function must still handle the error. 340 * The calling function must still handle the error.
341 */ 341 */
@@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
350} 350}
351 351
352/* 352/*
353 * page_is_anon applies strict checks for an anonymous page belonging to 353 * This function gets the "struct page" associated with a pte.
354 * this vma at this address. It is used on VM_UNPAGED vmas, which are 354 *
355 * usually populated with shared originals (which must not be counted), 355 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
356 * but occasionally contain private COWed copies (when !VM_SHARED, or 356 * will have each page table entry just pointing to a raw page frame
357 * perhaps via ptrace when VM_SHARED). An mmap of /dev/mem might window 357 * number, and as far as the VM layer is concerned, those do not have
358 * free pages, pages from other processes, or from other parts of this: 358 * pages associated with them - even if the PFN might point to memory
359 * it's tricky, but try not to be deceived by foreign anonymous pages. 359 * that otherwise is perfectly fine and has a "struct page".
360 *
361 * The way we recognize those mappings is through the rules set up
362 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
363 * and the vm_pgoff will point to the first PFN mapped: thus every
364 * page that is a raw mapping will always honor the rule
365 *
366 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
367 *
368 * and if that isn't true, the page has been COW'ed (in which case it
369 * _does_ have a "struct page" associated with it even if it is in a
370 * VM_PFNMAP range).
360 */ 371 */
361static inline int page_is_anon(struct page *page, 372struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
362 struct vm_area_struct *vma, unsigned long addr)
363{ 373{
364 return page && PageAnon(page) && page_mapped(page) && 374 unsigned long pfn = pte_pfn(pte);
365 page_address_in_vma(page, vma) == addr; 375
376 if (vma->vm_flags & VM_PFNMAP) {
377 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
378 if (pfn == vma->vm_pgoff + off)
379 return NULL;
380 }
381
382 /*
383 * Add some anal sanity checks for now. Eventually,
384 * we should just do "return pfn_to_page(pfn)", but
385 * in the meantime we check that we get a valid pfn,
386 * and that the resulting page looks ok.
387 *
388 * Remove this test eventually!
389 */
390 if (unlikely(!pfn_valid(pfn))) {
391 print_bad_pte(vma, pte, addr);
392 return NULL;
393 }
394
395 /*
396 * NOTE! We still have PageReserved() pages in the page
397 * tables.
398 *
399 * The PAGE_ZERO() pages and various VDSO mappings can
400 * cause them to exist.
401 */
402 return pfn_to_page(pfn);
366} 403}
367 404
368/* 405/*
@@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
379 unsigned long vm_flags = vma->vm_flags; 416 unsigned long vm_flags = vma->vm_flags;
380 pte_t pte = *src_pte; 417 pte_t pte = *src_pte;
381 struct page *page; 418 struct page *page;
382 unsigned long pfn;
383 419
384 /* pte contains position in swap or file, so copy. */ 420 /* pte contains position in swap or file, so copy. */
385 if (unlikely(!pte_present(pte))) { 421 if (unlikely(!pte_present(pte))) {
@@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
397 goto out_set_pte; 433 goto out_set_pte;
398 } 434 }
399 435
400 pfn = pte_pfn(pte);
401 page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
402
403 if (unlikely(vm_flags & VM_UNPAGED))
404 if (!page_is_anon(page, vma, addr))
405 goto out_set_pte;
406
407 /*
408 * If the pte points outside of valid memory but
409 * the region is not VM_UNPAGED, we have a problem.
410 */
411 if (unlikely(!page)) {
412 print_bad_pte(vma, pte, addr);
413 goto out_set_pte; /* try to do something sane */
414 }
415
416 /* 436 /*
417 * If it's a COW mapping, write protect it both 437 * If it's a COW mapping, write protect it both
418 * in the parent and the child 438 * in the parent and the child
@@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
429 if (vm_flags & VM_SHARED) 449 if (vm_flags & VM_SHARED)
430 pte = pte_mkclean(pte); 450 pte = pte_mkclean(pte);
431 pte = pte_mkold(pte); 451 pte = pte_mkold(pte);
432 get_page(page); 452
433 page_dup_rmap(page); 453 page = vm_normal_page(vma, addr, pte);
434 rss[!!PageAnon(page)]++; 454 if (page) {
455 get_page(page);
456 page_dup_rmap(page);
457 rss[!!PageAnon(page)]++;
458 }
435 459
436out_set_pte: 460out_set_pte:
437 set_pte_at(dst_mm, addr, dst_pte, pte); 461 set_pte_at(dst_mm, addr, dst_pte, pte);
@@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
543 * readonly mappings. The tradeoff is that copy_page_range is more 567 * readonly mappings. The tradeoff is that copy_page_range is more
544 * efficient than faulting. 568 * efficient than faulting.
545 */ 569 */
546 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) { 570 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
547 if (!vma->anon_vma) 571 if (!vma->anon_vma)
548 return 0; 572 return 0;
549 } 573 }
@@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
584 } 608 }
585 if (pte_present(ptent)) { 609 if (pte_present(ptent)) {
586 struct page *page; 610 struct page *page;
587 unsigned long pfn;
588 611
589 (*zap_work) -= PAGE_SIZE; 612 (*zap_work) -= PAGE_SIZE;
590 613
591 pfn = pte_pfn(ptent); 614 page = vm_normal_page(vma, addr, ptent);
592 page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
593
594 if (unlikely(vma->vm_flags & VM_UNPAGED)) {
595 if (!page_is_anon(page, vma, addr))
596 page = NULL;
597 } else if (unlikely(!page))
598 print_bad_pte(vma, ptent, addr);
599
600 if (unlikely(details) && page) { 615 if (unlikely(details) && page) {
601 /* 616 /*
602 * unmap_shared_mapping_pages() wants to 617 * unmap_shared_mapping_pages() wants to
@@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
852/* 867/*
853 * Do a quick page-table lookup for a single page. 868 * Do a quick page-table lookup for a single page.
854 */ 869 */
855struct page *follow_page(struct mm_struct *mm, unsigned long address, 870struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
856 unsigned int flags) 871 unsigned int flags)
857{ 872{
858 pgd_t *pgd; 873 pgd_t *pgd;
@@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
860 pmd_t *pmd; 875 pmd_t *pmd;
861 pte_t *ptep, pte; 876 pte_t *ptep, pte;
862 spinlock_t *ptl; 877 spinlock_t *ptl;
863 unsigned long pfn;
864 struct page *page; 878 struct page *page;
879 struct mm_struct *mm = vma->vm_mm;
865 880
866 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 881 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
867 if (!IS_ERR(page)) { 882 if (!IS_ERR(page)) {
@@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
897 goto unlock; 912 goto unlock;
898 if ((flags & FOLL_WRITE) && !pte_write(pte)) 913 if ((flags & FOLL_WRITE) && !pte_write(pte))
899 goto unlock; 914 goto unlock;
900 pfn = pte_pfn(pte); 915 page = vm_normal_page(vma, address, pte);
901 if (!pfn_valid(pfn)) 916 if (unlikely(!page))
902 goto unlock; 917 goto unlock;
903 918
904 page = pfn_to_page(pfn);
905 if (flags & FOLL_GET) 919 if (flags & FOLL_GET)
906 get_page(page); 920 get_page(page);
907 if (flags & FOLL_TOUCH) { 921 if (flags & FOLL_TOUCH) {
@@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
974 return i ? : -EFAULT; 988 return i ? : -EFAULT;
975 } 989 }
976 if (pages) { 990 if (pages) {
977 pages[i] = pte_page(*pte); 991 struct page *page = vm_normal_page(vma, start, *pte);
978 get_page(pages[i]); 992 pages[i] = page;
993 if (page)
994 get_page(page);
979 } 995 }
980 pte_unmap(pte); 996 pte_unmap(pte);
981 if (vmas) 997 if (vmas)
@@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1010 foll_flags |= FOLL_WRITE; 1026 foll_flags |= FOLL_WRITE;
1011 1027
1012 cond_resched(); 1028 cond_resched();
1013 while (!(page = follow_page(mm, start, foll_flags))) { 1029 while (!(page = follow_page(vma, start, foll_flags))) {
1014 int ret; 1030 int ret;
1015 ret = __handle_mm_fault(mm, vma, start, 1031 ret = __handle_mm_fault(mm, vma, start,
1016 foll_flags & FOLL_WRITE); 1032 foll_flags & FOLL_WRITE);
@@ -1214,11 +1230,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1214 * in 2.6 the LRU scan won't even find its pages, so this 1230 * in 2.6 the LRU scan won't even find its pages, so this
1215 * flag means no more than count its pages in reserved_vm, 1231 * flag means no more than count its pages in reserved_vm,
1216 * and omit it from core dump, even when VM_IO turned off. 1232 * and omit it from core dump, even when VM_IO turned off.
1217 * VM_UNPAGED tells the core MM not to "manage" these pages 1233 * VM_PFNMAP tells the core MM that the base pages are just
1218 * (e.g. refcount, mapcount, try to swap them out): in 1234 * raw PFN mappings, and do not have a "struct page" associated
1219 * particular, zap_pte_range does not try to free them. 1235 * with them.
1220 */ 1236 */
1221 vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; 1237 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1238 vma->vm_pgoff = pfn;
1222 1239
1223 BUG_ON(addr >= end); 1240 BUG_ON(addr >= end);
1224 pfn -= addr >> PAGE_SHIFT; 1241 pfn -= addr >> PAGE_SHIFT;
@@ -1273,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1273 return pte; 1290 return pte;
1274} 1291}
1275 1292
1293static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
1294{
1295 /*
1296 * If the source page was a PFN mapping, we don't have
1297 * a "struct page" for it. We do a best-effort copy by
1298 * just copying from the original user address. If that
1299 * fails, we just zero-fill it. Live with it.
1300 */
1301 if (unlikely(!src)) {
1302 void *kaddr = kmap_atomic(dst, KM_USER0);
1303 unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE);
1304 if (left)
1305 memset(kaddr, 0, PAGE_SIZE);
1306 kunmap_atomic(kaddr, KM_USER0);
1307 return;
1308
1309 }
1310 copy_user_highpage(dst, src, va);
1311}
1312
1276/* 1313/*
1277 * This routine handles present pages, when users try to write 1314 * This routine handles present pages, when users try to write
1278 * to a shared page. It is done by copying the page to a new address 1315 * to a shared page. It is done by copying the page to a new address
@@ -1296,28 +1333,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1296 spinlock_t *ptl, pte_t orig_pte) 1333 spinlock_t *ptl, pte_t orig_pte)
1297{ 1334{
1298 struct page *old_page, *src_page, *new_page; 1335 struct page *old_page, *src_page, *new_page;
1299 unsigned long pfn = pte_pfn(orig_pte);
1300 pte_t entry; 1336 pte_t entry;
1301 int ret = VM_FAULT_MINOR; 1337 int ret = VM_FAULT_MINOR;
1302 1338
1303 if (unlikely(!pfn_valid(pfn))) { 1339 old_page = vm_normal_page(vma, address, orig_pte);
1304 /*
1305 * Page table corrupted: show pte and kill process.
1306 * Or it's an attempt to COW an out-of-map VM_UNPAGED
1307 * entry, which copy_user_highpage does not support.
1308 */
1309 print_bad_pte(vma, orig_pte, address);
1310 ret = VM_FAULT_OOM;
1311 goto unlock;
1312 }
1313 old_page = pfn_to_page(pfn);
1314 src_page = old_page; 1340 src_page = old_page;
1315 1341 if (!old_page)
1316 if (unlikely(vma->vm_flags & VM_UNPAGED)) 1342 goto gotten;
1317 if (!page_is_anon(old_page, vma, address)) {
1318 old_page = NULL;
1319 goto gotten;
1320 }
1321 1343
1322 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { 1344 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1323 int reuse = can_share_swap_page(old_page); 1345 int reuse = can_share_swap_page(old_page);
@@ -1351,7 +1373,7 @@ gotten:
1351 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); 1373 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1352 if (!new_page) 1374 if (!new_page)
1353 goto oom; 1375 goto oom;
1354 copy_user_highpage(new_page, src_page, address); 1376 cow_user_page(new_page, src_page, address);
1355 } 1377 }
1356 1378
1357 /* 1379 /*
@@ -1812,16 +1834,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1812 spinlock_t *ptl; 1834 spinlock_t *ptl;
1813 pte_t entry; 1835 pte_t entry;
1814 1836
1815 /* 1837 if (write_access) {
1816 * A VM_UNPAGED vma will normally be filled with present ptes
1817 * by remap_pfn_range, and never arrive here; but it might have
1818 * holes, or if !VM_DONTEXPAND, mremap might have expanded it.
1819 * It's weird enough handling anon pages in unpaged vmas, we do
1820 * not want to worry about ZERO_PAGEs too (it may or may not
1821 * matter if their counts wrap): just give them anon pages.
1822 */
1823
1824 if (write_access || (vma->vm_flags & VM_UNPAGED)) {
1825 /* Allocate our own private page. */ 1838 /* Allocate our own private page. */
1826 pte_unmap(page_table); 1839 pte_unmap(page_table);
1827 1840
@@ -1896,8 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1896 int anon = 0; 1909 int anon = 0;
1897 1910
1898 pte_unmap(page_table); 1911 pte_unmap(page_table);
1899 BUG_ON(vma->vm_flags & VM_UNPAGED);
1900
1901 if (vma->vm_file) { 1912 if (vma->vm_file) {
1902 mapping = vma->vm_file->f_mapping; 1913 mapping = vma->vm_file->f_mapping;
1903 sequence = mapping->truncate_count; 1914 sequence = mapping->truncate_count;
@@ -1930,7 +1941,7 @@ retry:
1930 page = alloc_page_vma(GFP_HIGHUSER, vma, address); 1941 page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1931 if (!page) 1942 if (!page)
1932 goto oom; 1943 goto oom;
1933 copy_user_highpage(page, new_page, address); 1944 cow_user_page(page, new_page, address);
1934 page_cache_release(new_page); 1945 page_cache_release(new_page);
1935 new_page = page; 1946 new_page = page;
1936 anon = 1; 1947 anon = 1;