path: root/mm/memory.c
author    Nick Piggin <nickpiggin@yahoo.com.au>    2005-10-29 21:16:12 -0400
committer Linus Torvalds <torvalds@g5.osdl.org>    2005-10-30 00:40:39 -0400
commit    b5810039a54e5babf428e9a1e89fc1940fabff11
tree      835836cb527ec9bd525f93eb7e016f3dfb8c8ae2  /mm/memory.c
parent    f9c98d0287de42221c624482fd4f8d485c98ab22
[PATCH] core remove PageReserved
Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality.

PageReserved special casing is removed from get_page and put_page.

All setting and clearing of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages.

MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and it can be reintroduced in future if required (Hugh has a proof of concept).

Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed.

Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed (a generic page_is_ram() should work).

A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and thus mapcounted and counted towards shared rss). These writes to the struct page could cause excessive cacheline bouncing on big systems. There are a number of ways this could be addressed if it is an issue.

Signed-off-by: Nick Piggin <npiggin@suse.de>

Refcount bug fix for filemap_xip.c

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
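For driver writers, the practical consequence of this change is that any mapping whose pages the core VM must never refcount, rmap or swap out should be marked VM_RESERVED; remap_pfn_range() now sets the flag itself, as the corresponding hunk below shows. The following is a minimal sketch of a character-device mmap handler written against that convention (roughly 2.6.14-era APIs); the device name and physical base address are hypothetical, and this illustrates the convention rather than code from this commit:

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical physical base of a device aperture exported to userspace. */
#define MYDEV_PHYS_BASE 0xfd000000UL

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;

        /*
         * remap_pfn_range() sets VM_IO | VM_RESERVED on the vma, so with
         * this patch applied the core VM neither refcounts nor rmaps these
         * pages, and get_user_pages() refuses to touch the region.
         */
        return remap_pfn_range(vma, vma->vm_start,
                               MYDEV_PHYS_BASE >> PAGE_SHIFT,
                               size, vma->vm_page_prot);
}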
Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  131
1 files changed, 79 insertions, 52 deletions
diff --git a/mm/memory.c b/mm/memory.c
index da642b5528fa..e83f9440bb66 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -343,6 +343,23 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 #define NO_RSS 2	/* Increment neither file_rss nor anon_rss */
 
 /*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error.
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+			"vm_flags = %lx, vaddr = %lx\n",
+		(long long)pte_val(pte),
+		(vma->vm_mm == current->mm ? current->comm : "???"),
+		vma->vm_flags, vaddr);
+	dump_stack();
+}
+
+/*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
  * covered by this vma.
@@ -353,9 +370,10 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 
 static inline int
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
+		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr)
 {
+	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 	unsigned long pfn;
@@ -375,18 +393,22 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}
 
+	/* If the region is VM_RESERVED, the mapping is not
+	 * mapped via rmap - duplicate the pte as is.
+	 */
+	if (vm_flags & VM_RESERVED)
+		goto out_set_pte;
+
 	pfn = pte_pfn(pte);
-	/* the pte points outside of valid memory, the
-	 * mapping is assumed to be good, meaningful
-	 * and not mapped via rmap - duplicate the
-	 * mapping as is.
+	/* If the pte points outside of valid memory but
+	 * the region is not VM_RESERVED, we have a problem.
 	 */
-	page = NULL;
-	if (pfn_valid(pfn))
-		page = pfn_to_page(pfn);
+	if (unlikely(!pfn_valid(pfn))) {
+		print_bad_pte(vma, pte, addr);
+		goto out_set_pte; /* try to do something sane */
+	}
 
-	if (!page || PageReserved(page))
-		goto out_set_pte;
+	page = pfn_to_page(pfn);
 
 	/*
 	 * If it's a COW mapping, write protect it both
@@ -418,7 +440,6 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		unsigned long addr, unsigned long end)
 {
 	pte_t *src_pte, *dst_pte;
-	unsigned long vm_flags = vma->vm_flags;
 	int progress = 0;
 	int rss[NO_RSS+1], anon;
 
@@ -446,8 +467,7 @@ again:
 			progress++;
 			continue;
 		}
-		anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
-						vm_flags, addr);
+		anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr);
 		rss[anon]++;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -541,10 +561,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return 0;
 }
 
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
+	struct mm_struct *mm = tlb->mm;
 	pte_t *pte;
 	int file_rss = 0;
 	int anon_rss = 0;
@@ -556,11 +578,12 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			continue;
 		if (pte_present(ptent)) {
 			struct page *page = NULL;
-			unsigned long pfn = pte_pfn(ptent);
-			if (pfn_valid(pfn)) {
-				page = pfn_to_page(pfn);
-				if (PageReserved(page))
-					page = NULL;
+			if (!(vma->vm_flags & VM_RESERVED)) {
+				unsigned long pfn = pte_pfn(ptent);
+				if (unlikely(!pfn_valid(pfn)))
+					print_bad_pte(vma, ptent, addr);
+				else
+					page = pfn_to_page(pfn);
 			}
 			if (unlikely(details) && page) {
 				/*
@@ -580,7 +603,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 				    page->index > details->last_index))
 					continue;
 			}
-			ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
@@ -588,7 +611,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			if (unlikely(details) && details->nonlinear_vma
 			    && linear_page_index(details->nonlinear_vma,
 						addr) != page->index)
-				set_pte_at(tlb->mm, addr, pte,
+				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
 			if (PageAnon(page))
 				anon_rss++;
@@ -611,14 +634,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			continue;
 		if (!pte_file(ptent))
 			free_swap_and_cache(pte_to_swp_entry(ptent));
-		pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
+		pte_clear_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	add_mm_rss(tlb->mm, -file_rss, -anon_rss);
+	add_mm_rss(mm, -file_rss, -anon_rss);
 	pte_unmap(pte - 1);
 }
 
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pud_t *pud,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -630,11 +654,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		zap_pte_range(tlb, pmd, addr, next, details);
+		zap_pte_range(tlb, vma, pmd, addr, next, details);
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pgd_t *pgd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -646,7 +671,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		zap_pmd_range(tlb, pud, addr, next, details);
+		zap_pmd_range(tlb, vma, pud, addr, next, details);
 	} while (pud++, addr = next, addr != end);
 }
 
@@ -667,7 +692,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		zap_pud_range(tlb, pgd, addr, next, details);
+		zap_pud_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
 }
@@ -967,7 +992,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			continue;
 		}
 
-		if (!vma || (vma->vm_flags & VM_IO)
+		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
 				|| !(flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
@@ -1027,8 +1052,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				if (!PageReserved(page))
-					page_cache_get(page);
+				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1051,7 +1075,11 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	if (!pte)
 		return -ENOMEM;
 	do {
-		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
+		struct page *page = ZERO_PAGE(addr);
+		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+		page_cache_get(page);
+		page_add_file_rmap(page);
+		inc_mm_counter(mm, file_rss);
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1132,8 +1160,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		return -ENOMEM;
 	do {
 		BUG_ON(!pte_none(*pte));
-		if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
-			set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(pte - 1);
@@ -1195,8 +1222,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * rest of the world about it:
 	 *   VM_IO tells people not to look at these pages
 	 *	(accesses can have side effects).
-	 *   VM_RESERVED tells swapout not to try to touch
-	 *	this region.
+	 *   VM_RESERVED tells the core MM not to "manage" these pages
+	 *	(e.g. refcount, mapcount, try to swap them out).
 	 */
 	vma->vm_flags |= VM_IO | VM_RESERVED;
 
@@ -1256,11 +1283,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		pte_ERROR(orig_pte);
+		print_bad_pte(vma, orig_pte, address);
 		ret = VM_FAULT_OOM;
 		goto unlock;
 	}
@@ -1284,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
-	if (!PageReserved(old_page))
-		page_cache_get(old_page);
+	page_cache_get(old_page);
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
@@ -1308,14 +1336,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, orig_pte))) {
-		if (PageReserved(old_page))
+		page_remove_rmap(old_page);
+		if (!PageAnon(old_page)) {
 			inc_mm_counter(mm, anon_rss);
-		else {
-			page_remove_rmap(old_page);
-			if (!PageAnon(old_page)) {
-				inc_mm_counter(mm, anon_rss);
-				dec_mm_counter(mm, file_rss);
-			}
+			dec_mm_counter(mm, file_rss);
 		}
 		flush_cache_page(vma, address, pfn);
 		entry = mk_pte(new_page, vma->vm_page_prot);
@@ -1769,14 +1793,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access)
 {
+	struct page *page = ZERO_PAGE(addr);
 	pte_t entry;
 
 	/* Mapping of ZERO_PAGE - vm_page_prot is readonly */
-	entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot);
+	entry = mk_pte(page, vma->vm_page_prot);
 
 	if (write_access) {
-		struct page *page;
-
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
@@ -1800,6 +1823,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		lru_cache_add_active(page);
 		SetPageReferenced(page);
 		page_add_anon_rmap(page, vma, address);
+	} else {
+		inc_mm_counter(mm, file_rss);
+		page_add_file_rmap(page);
+		page_cache_get(page);
 	}
 
 	set_pte_at(mm, address, page_table, entry);
@@ -1916,7 +1943,7 @@ retry:
 		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(new_page);
 		page_add_anon_rmap(new_page, vma, address);
-	} else if (!PageReserved(new_page)) {
+	} else if (!(vma->vm_flags & VM_RESERVED)) {
 		inc_mm_counter(mm, file_rss);
 		page_add_file_rmap(new_page);
 	}
@@ -1957,7 +1984,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		pte_ERROR(orig_pte);
+		print_bad_pte(vma, orig_pte, address);
 		return VM_FAULT_OOM;
 	}
 	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
@@ -2232,7 +2259,7 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = 0;
+	gate_vma.vm_flags = VM_RESERVED;
 	return 0;
 }
 __initcall(gate_vma_init);