Diffstat (limited to 'mm/memory.c')

 mm/memory.c | 234
 1 file changed, 168 insertions(+), 66 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 0a2010a9518c..baa999e87cd2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,9 @@
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/swapops.h>
+#include <linux/elf.h>

 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -59,9 +62,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>

-#include <linux/swapops.h>
-#include <linux/elf.h>
-
 #include "internal.h"

 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
-                          unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+                          pte_t pte, struct page *page)
 {
-        printk(KERN_ERR "Bad pte = %08llx, process = %s, "
-                        "vm_flags = %lx, vaddr = %lx\n",
-                (long long)pte_val(pte),
-                (vma->vm_mm == current->mm ? current->comm : "???"),
-                vma->vm_flags, vaddr);
+        pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+        pud_t *pud = pud_offset(pgd, addr);
+        pmd_t *pmd = pmd_offset(pud, addr);
+        struct address_space *mapping;
+        pgoff_t index;
+        static unsigned long resume;
+        static unsigned long nr_shown;
+        static unsigned long nr_unshown;
+
+        /*
+         * Allow a burst of 60 reports, then keep quiet for that minute;
+         * or allow a steady drip of one report per second.
+         */
+        if (nr_shown == 60) {
+                if (time_before(jiffies, resume)) {
+                        nr_unshown++;
+                        return;
+                }
+                if (nr_unshown) {
+                        printk(KERN_ALERT
+                                "BUG: Bad page map: %lu messages suppressed\n",
+                                nr_unshown);
+                        nr_unshown = 0;
+                }
+                nr_shown = 0;
+        }
+        if (nr_shown++ == 0)
+                resume = jiffies + 60 * HZ;
+
+        mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+        index = linear_page_index(vma, addr);
+
+        printk(KERN_ALERT
+                "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+                current->comm,
+                (long long)pte_val(pte), (long long)pmd_val(*pmd));
+        if (page) {
+                printk(KERN_ALERT
+                "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
+                page, (void *)page->flags, page_count(page),
+                page_mapcount(page), page->mapping, page->index);
+        }
+        printk(KERN_ALERT
+                "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+                (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+        /*
+         * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
+         */
+        if (vma->vm_ops)
+                print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
+                                (unsigned long)vma->vm_ops->fault);
+        if (vma->vm_file && vma->vm_file->f_op)
+                print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
+                                (unsigned long)vma->vm_file->f_op->mmap);
         dump_stack();
+        add_taint(TAINT_BAD_PAGE);
 }

 static inline int is_cow_mapping(unsigned int flags)
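The throttling introduced in print_bad_pte() above is a reusable pattern: allow a burst of reports, stay quiet until a minute after the burst began, then say how many were dropped. A minimal userspace C sketch of the same idea follows; report_bad() and the 60-report/60-second constants are illustrative stand-ins for the jiffies/HZ arithmetic in the kernel function, not kernel API.

/*
 * Userspace sketch of the burst-then-quiet ratelimiting pattern:
 * allow a burst of 60 reports, suppress until a minute after the
 * burst began, then note how many reports were dropped.
 */
#include <stdio.h>
#include <time.h>

static void report_bad(const char *what)
{
        static time_t resume;
        static unsigned long nr_shown;
        static unsigned long nr_unshown;

        if (nr_shown == 60) {
                if (time(NULL) < resume) {      /* still inside the quiet minute */
                        nr_unshown++;
                        return;
                }
                if (nr_unshown) {
                        printf("bad map: %lu messages suppressed\n", nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;                   /* start a new burst */
        }
        if (nr_shown++ == 0)                    /* first report of a burst */
                resume = time(NULL) + 60;

        printf("bad map: %s\n", what);
}

int main(void)
{
        for (int i = 0; i < 1000; i++)
                report_bad("corrupt entry");    /* only the first 60 print */
        return 0;
}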
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags)
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte)
 {
-        unsigned long pfn;
+        unsigned long pfn = pte_pfn(pte);

         if (HAVE_PTE_SPECIAL) {
-                if (likely(!pte_special(pte))) {
-                        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-                        return pte_page(pte);
-                }
-                VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+                if (likely(!pte_special(pte)))
+                        goto check_pfn;
+                if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+                        print_bad_pte(vma, addr, pte, NULL);
                 return NULL;
         }

         /* !HAVE_PTE_SPECIAL case follows: */

-        pfn = pte_pfn(pte);
-
         if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                 if (vma->vm_flags & VM_MIXEDMAP) {
                         if (!pfn_valid(pfn))
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                 }
         }

-        VM_BUG_ON(!pfn_valid(pfn));
+check_pfn:
+        if (unlikely(pfn > highest_memmap_pfn)) {
+                print_bad_pte(vma, addr, pte, NULL);
+                return NULL;
+        }

         /*
          * NOTE! We still have PageReserved() pages in the page tables.
-         *
          * eg. VDSO mappings can cause them to exist.
          */
 out:
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                         else {
                                 if (pte_dirty(ptent))
                                         set_page_dirty(page);
-                                if (pte_young(ptent))
-                                        SetPageReferenced(page);
+                                if (pte_young(ptent) &&
+                                    likely(!VM_SequentialReadHint(vma)))
+                                        mark_page_accessed(page);
                                 file_rss--;
                         }
-                        page_remove_rmap(page, vma);
+                        page_remove_rmap(page);
+                        if (unlikely(page_mapcount(page) < 0))
+                                print_bad_pte(vma, addr, ptent, page);
                         tlb_remove_page(tlb, page);
                         continue;
                 }
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                  */
                 if (unlikely(details))
                         continue;
-                if (!pte_file(ptent))
-                        free_swap_and_cache(pte_to_swp_entry(ptent));
+                if (pte_file(ptent)) {
+                        if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
+                                print_bad_pte(vma, addr, ptent, NULL);
+                } else if
+                  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
+                        print_bad_pte(vma, addr, ptent, NULL);
                 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
         } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
         int write = !!(flags & GUP_FLAGS_WRITE);
         int force = !!(flags & GUP_FLAGS_FORCE);
         int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+        int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);

         if (len <= 0)
                 return 0;
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         struct page *page;

                         /*
-                         * If tsk is ooming, cut off its access to large memory
-                         * allocations. It has a pending SIGKILL, but it can't
-                         * be processed until returning to user space.
+                         * If we have a pending SIGKILL, don't keep faulting
+                         * pages and potentially allocating memory, unless
+                         * current is handling munlock--e.g., on exit. In
+                         * that case, we are not allocating memory. Rather,
+                         * we're only unlocking already resident/mapped pages.
                          */
-                        if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-                                return i ? i : -ENOMEM;
+                        if (unlikely(!ignore_sigkill &&
+                                        fatal_signal_pending(current)))
+                                return i ? i : -ERESTARTSYS;

                         if (write)
                                 foll_flags |= FOLL_WRITE;
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                          * do_wp_page has broken COW when necessary,
                          * even if maybe_mkwrite decided not to set
                          * pte_write. We can thus safely do subsequent
-                         * page lookups as if they were reads.
+                         * page lookups as if they were reads. But only
+                         * do so when looping for pte_write is futile:
+                         * in some cases userspace may also be wanting
+                         * to write to the gotten user page, which a
+                         * read fault here might prevent (a readonly
+                         * page might get reCOWed by userspace write).
                          */
-                        if (ret & VM_FAULT_WRITE)
+                        if ((ret & VM_FAULT_WRITE) &&
+                            !(vma->vm_flags & VM_WRITE))
                                 foll_flags &= ~FOLL_WRITE;

                         cond_resched();
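The new fatal_signal_pending() check means a task that has been killed stops faulting in further pages and returns whatever progress it already made (or -ERESTARTSYS if none). The same shape works in ordinary long-running loops; the userspace sketch below is an analogue only, with the SIGINT flag, touch_pages() and the -EINTR convention invented for illustration.

/*
 * Userspace analogue of bailing out of a long per-page loop when a
 * (fatal) signal is pending: return the partial count rather than
 * keep doing work the caller no longer wants.
 */
#include <errno.h>
#include <signal.h>
#include <stddef.h>

static volatile sig_atomic_t got_fatal_signal;

static void on_signal(int sig)
{
        (void)sig;
        got_fatal_signal = 1;
}

/* Touch one page at a time, loosely like __get_user_pages(). */
static long touch_pages(char *buf, size_t pages, size_t page_size)
{
        long i;

        for (i = 0; i < (long)pages; i++) {
                if (got_fatal_signal)
                        return i ? i : -EINTR;  /* partial work, or an error */
                buf[i * page_size] = 0;         /* the potentially slow step */
        }
        return i;
}

int main(void)
{
        static char buf[64 * 4096];
        struct sigaction sa = {0};

        sa.sa_handler = on_signal;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGINT, &sa, NULL);
        return touch_pages(buf, 64, 4096) > 0 ? 0 : 1;
}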
@@ -1444,6 +1511,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                         unsigned long pfn)
 {
         int ret;
+        pgprot_t pgprot = vma->vm_page_prot;
         /*
          * Technically, architectures with pte_special can avoid all these
          * restrictions (same for remap_pfn_range). However we would like
@@ -1458,10 +1526,10 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,

         if (addr < vma->vm_start || addr >= vma->vm_end)
                 return -EFAULT;
-        if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE))
+        if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
                 return -EINVAL;

-        ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+        ret = insert_pfn(vma, addr, pfn, pgprot);

         if (ret)
                 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
@@ -1604,9 +1672,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,

         vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;

-        err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size));
-        if (err)
+        err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
+        if (err) {
+                /*
+                 * To indicate that track_pfn related cleanup is not
+                 * needed from higher level routine calling unmap_vmas
+                 */
+                vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
                 return -EINVAL;
+        }

         BUG_ON(addr >= end);
         pfn -= addr >> PAGE_SHIFT;
@@ -1644,6 +1718,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,

         BUG_ON(pmd_huge(*pmd));

+        arch_enter_lazy_mmu_mode();
+
         token = pmd_pgtable(*pmd);

         do {
@@ -1652,6 +1728,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                         break;
         } while (pte++, addr += PAGE_SIZE, addr != end);

+        arch_leave_lazy_mmu_mode();
+
         if (mm != &init_mm)
                 pte_unmap_unlock(pte-1, ptl);
         return err;
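Bracketing the pte loop with arch_enter/leave_lazy_mmu_mode() lets architectures that batch page-table updates (paravirtualized ones, for example) defer the expensive synchronization until the bracket is left. The sketch below is a loose userspace analogue of that batching bracket; the queue, BATCH size and flush_updates() are invented purely for illustration and stand in for whatever the architecture batches.

/*
 * Userspace analogue of an enter/leave "lazy" bracket: inside the
 * bracket, per-entry updates are only queued; the costly flush runs
 * once when the bracket is left (or when the queue fills up).
 */
#include <stdio.h>

#define BATCH   16

struct update { int slot; int val; };

static struct update queue[BATCH];
static int queued;
static int lazy;
static int table[64];

static void flush_updates(void)
{
        for (int i = 0; i < queued; i++)
                table[queue[i].slot] = queue[i].val;
        printf("flushed %d updates\n", queued); /* the costly step, done once */
        queued = 0;
}

static void enter_lazy_mode(void) { lazy = 1; }
static void leave_lazy_mode(void) { flush_updates(); lazy = 0; }

static void set_entry(int slot, int val)
{
        if (!lazy) {                            /* immediate: one flush per entry */
                table[slot] = val;
                printf("flushed 1 update\n");
                return;
        }
        if (queued == BATCH)
                flush_updates();
        queue[queued].slot = slot;
        queue[queued].val = val;
        queued++;
}

int main(void)
{
        enter_lazy_mode();
        for (int i = 0; i < 40; i++)            /* like the pte loop above */
                set_entry(i, i + 1);
        leave_lazy_mode();
        return 0;
}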
@@ -1837,10 +1915,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * not dirty accountable.
          */
         if (PageAnon(old_page)) {
-                if (trylock_page(old_page)) {
-                        reuse = can_share_swap_page(old_page);
-                        unlock_page(old_page);
+                if (!trylock_page(old_page)) {
+                        page_cache_get(old_page);
+                        pte_unmap_unlock(page_table, ptl);
+                        lock_page(old_page);
+                        page_table = pte_offset_map_lock(mm, pmd, address,
+                                                         &ptl);
+                        if (!pte_same(*page_table, orig_pte)) {
+                                unlock_page(old_page);
+                                page_cache_release(old_page);
+                                goto unlock;
+                        }
+                        page_cache_release(old_page);
                 }
+                reuse = reuse_swap_page(old_page);
+                unlock_page(old_page);
         } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                         (VM_WRITE|VM_SHARED))) {
                 /*
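The hunk above replaces an opportunistic trylock with the classic drop-and-revalidate dance: the pte lock cannot be held across a sleeping lock_page(), so it is dropped, the page lock is taken, the pte lock is re-taken, and the pte is re-checked against orig_pte before it is trusted again. Below is a userspace sketch of the same pattern using two pthread mutexes; struct entry, entry_lock and page_lock are illustrative names, and the reference counting (page_cache_get/release) is deliberately omitted.

/*
 * Drop-and-revalidate: we hold the lock guarding the entry but need a
 * sleeping lock, so drop the first, take the second, re-take the first,
 * then verify the entry did not change while both locks were dropped.
 */
#include <pthread.h>
#include <stdbool.h>

struct entry {
        pthread_mutex_t entry_lock;     /* plays the role of the pte lock */
        pthread_mutex_t page_lock;      /* plays the role of lock_page()  */
        long value;                     /* plays the role of the pte      */
};

/* Returns true if we reused the entry, false if it changed underneath us. */
static bool reuse_entry(struct entry *e)
{
        long orig;

        pthread_mutex_lock(&e->entry_lock);
        orig = e->value;

        if (pthread_mutex_trylock(&e->page_lock) != 0) {
                /* Can't sleep with entry_lock held: drop, sleep, re-take. */
                pthread_mutex_unlock(&e->entry_lock);
                pthread_mutex_lock(&e->page_lock);
                pthread_mutex_lock(&e->entry_lock);
                if (e->value != orig) {         /* raced: someone changed it */
                        pthread_mutex_unlock(&e->page_lock);
                        pthread_mutex_unlock(&e->entry_lock);
                        return false;
                }
        }
        e->value = orig | 1;                    /* the "reuse" step */
        pthread_mutex_unlock(&e->page_lock);
        pthread_mutex_unlock(&e->entry_lock);
        return true;
}

int main(void)
{
        struct entry e = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 42
        };
        return reuse_entry(&e) ? 0 : 1;
}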
@@ -1910,7 +1999,7 @@ gotten:
          * Don't let another task, with possibly unlocked vma,
          * keep the mlocked page.
          */
-        if (vma->vm_flags & VM_LOCKED) {
+        if ((vma->vm_flags & VM_LOCKED) && old_page) {
                 lock_page(old_page);    /* for LRU manipulation */
                 clear_page_mlock(old_page);
                 unlock_page(old_page);
@@ -1918,7 +2007,7 @@ gotten:
         cow_user_page(new_page, old_page, address, vma);
         __SetPageUptodate(new_page);

-        if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+        if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                 goto oom_free_new;

         /*
@@ -1943,11 +2032,7 @@ gotten:
                  * thread doing COW.
                  */
                 ptep_clear_flush_notify(vma, address, page_table);
-                SetPageSwapBacked(new_page);
-                lru_cache_add_active_or_unevictable(new_page, vma);
                 page_add_new_anon_rmap(new_page, vma, address);
-
-                //TODO: is this safe? do_anonymous_page() does it this way.
                 set_pte_at(mm, address, page_table, entry);
                 update_mmu_cache(vma, address, entry);
                 if (old_page) {
@@ -1973,7 +2058,7 @@ gotten:
                          * mapcount is visible. So transitively, TLBs to
                          * old page will be flushed before it can be reused.
                          */
-                        page_remove_rmap(old_page, vma);
+                        page_remove_rmap(old_page);
                 }

                 /* Free the old page.. */
@@ -2266,7 +2351,7 @@ int vmtruncate(struct inode * inode, loff_t offset)
                 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
         }

-        if (inode->i_op && inode->i_op->truncate)
+        if (inode->i_op->truncate)
                 inode->i_op->truncate(inode);
         return 0;

@@ -2286,7 +2371,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
          * a way to truncate a range of blocks (punch a hole) -
          * we should return failure right now.
          */
-        if (!inode->i_op || !inode->i_op->truncate_range)
+        if (!inode->i_op->truncate_range)
                 return -ENOSYS;

         mutex_lock(&inode->i_mutex);
@@ -2314,6 +2399,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         struct page *page;
         swp_entry_t entry;
         pte_t pte;
+        struct mem_cgroup *ptr = NULL;
         int ret = 0;

         if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2352,7 +2438,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         lock_page(page);
         delayacct_clear_flag(DELAYACCT_PF_SWAPIN);

-        if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
                 ret = VM_FAULT_OOM;
                 unlock_page(page);
                 goto out;
@@ -2370,22 +2456,35 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto out_nomap;
         }

-        /* The page isn't present yet, go ahead with the fault. */
+        /*
+         * The page isn't present yet, go ahead with the fault.
+         *
+         * Be careful about the sequence of operations here.
+         * To get its accounting right, reuse_swap_page() must be called
+         * while the page is counted on swap but not yet in mapcount i.e.
+         * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+         * must be called after the swap_free(), or it will never succeed.
+         * Because delete_from_swap_page() may be called by reuse_swap_page(),
+         * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
+         * in page->private. In this case, a record in swap_cgroup is silently
+         * discarded at swap_free().
+         */

         inc_mm_counter(mm, anon_rss);
         pte = mk_pte(page, vma->vm_page_prot);
-        if (write_access && can_share_swap_page(page)) {
+        if (write_access && reuse_swap_page(page)) {
                 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                 write_access = 0;
         }
-
         flush_icache_page(vma, page);
         set_pte_at(mm, address, page_table, pte);
         page_add_anon_rmap(page, vma, address);
+        /* It's better to call commit-charge after rmap is established */
+        mem_cgroup_commit_charge_swapin(page, ptr);

         swap_free(entry);
         if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
-                remove_exclusive_swap_page(page);
+                try_to_free_swap(page);
         unlock_page(page);

         if (write_access) {
@@ -2402,7 +2501,7 @@ unlock:
 out:
         return ret;
 out_nomap:
-        mem_cgroup_uncharge_page(page);
+        mem_cgroup_cancel_charge_swapin(ptr);
         pte_unmap_unlock(page_table, ptl);
         unlock_page(page);
         page_cache_release(page);
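The memcg conversion above splits charging into three steps: mem_cgroup_try_charge_swapin() reserves before the pte is touched, mem_cgroup_commit_charge_swapin() finalizes once the rmap exists, and mem_cgroup_cancel_charge_swapin() undoes the reserve on the out_nomap path. The userspace sketch below shows that try/commit/cancel shape in isolation; the quota type and the map_page() step are invented for illustration.

/*
 * Two-phase accounting: reserve first, do the work that can still
 * fail, then either commit the reserve or cancel it on the error path.
 */
#include <stdbool.h>
#include <stdio.h>

struct quota { long used, limit; };

static bool try_charge(struct quota *q)         /* phase 1: reserve */
{
        if (q->used + 1 > q->limit)
                return false;
        q->used++;
        return true;
}

static void commit_charge(struct quota *q)      /* phase 2a: keep it */
{
        (void)q;                                /* nothing more to do here */
}

static void cancel_charge(struct quota *q)      /* phase 2b: undo it */
{
        q->used--;
}

static bool map_page(struct quota *q, bool pte_still_valid)
{
        if (!try_charge(q))
                return false;                   /* like returning VM_FAULT_OOM */
        if (!pte_still_valid) {                 /* the step that may fail */
                cancel_charge(q);               /* like the out_nomap path */
                return false;
        }
        commit_charge(q);                       /* after the mapping is made */
        return true;
}

int main(void)
{
        struct quota q = { 0, 8 };
        printf("mapped: %d, used: %ld\n", map_page(&q, true), q.used);
        return 0;
}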
@@ -2432,7 +2531,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto oom;
         __SetPageUptodate(page);

-        if (mem_cgroup_charge(page, mm, GFP_KERNEL))
+        if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
                 goto oom_free_page;

         entry = mk_pte(page, vma->vm_page_prot);
@@ -2442,8 +2541,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         if (!pte_none(*page_table))
                 goto release;
         inc_mm_counter(mm, anon_rss);
-        SetPageSwapBacked(page);
-        lru_cache_add_active_or_unevictable(page, vma);
         page_add_new_anon_rmap(page, vma, address);
         set_pte_at(mm, address, page_table, entry);

@@ -2525,7 +2622,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         ret = VM_FAULT_OOM;
                         goto out;
                 }
-                if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+                if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
                         ret = VM_FAULT_OOM;
                         page_cache_release(page);
                         goto out;
@@ -2591,8 +2688,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                 if (anon) {
                         inc_mm_counter(mm, anon_rss);
-                        SetPageSwapBacked(page);
-                        lru_cache_add_active_or_unevictable(page, vma);
                         page_add_new_anon_rmap(page, vma, address);
                 } else {
                         inc_mm_counter(mm, file_rss);
@@ -2602,7 +2697,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                 get_page(dirty_page);
                         }
                 }
-                //TODO: is this safe? do_anonymous_page() does it this way.
                 set_pte_at(mm, address, page_table, entry);

                 /* no need to invalidate: a not-present page won't be cached */
@@ -2666,12 +2760,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                 return 0;

-        if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
-                        !(vma->vm_flags & VM_CAN_NONLINEAR))) {
+        if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
                 /*
                  * Page table corrupted: show pte and kill process.
                  */
-                print_bad_pte(vma, orig_pte, address);
+                print_bad_pte(vma, address, orig_pte, NULL);
                 return VM_FAULT_OOM;
         }

@@ -2953,7 +3046,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 {
         resource_size_t phys_addr;
         unsigned long prot = 0;
-        void *maddr;
+        void __iomem *maddr;
         int offset = addr & (PAGE_SIZE-1);

         if (follow_phys(vma, addr, write, &prot, &phys_addr))
@@ -3079,6 +3172,15 @@ void print_vma_addr(char *prefix, unsigned long ip)
 #ifdef CONFIG_PROVE_LOCKING
 void might_fault(void)
 {
+        /*
+         * Some code (nfs/sunrpc) uses socket ops on kernel memory while
+         * holding the mmap_sem, this is safe because kernel memory doesn't
+         * get paged out, therefore we'll never actually fault, and the
+         * below annotations will generate false positives.
+         */
+        if (segment_eq(get_fs(), KERNEL_DS))
+                return;
+
         might_sleep();
         /*
          * it would be nicer only to annotate paths which are not under