Diffstat (limited to 'mm/memory.c')
 -rw-r--r--  mm/memory.c | 85
 1 file changed, 74 insertions(+), 11 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index fb5608a120ed..2302d228fe04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -999,17 +999,15 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page_table;
 
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (!ptep)
-		goto out;
 
 	pte = *ptep;
 	if (!pte_present(pte))
-		goto unlock;
+		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
 	page = vm_normal_page(vma, address, pte);
 	if (unlikely(!page))
-		goto unlock;
+		goto bad_page;
 
 	if (flags & FOLL_GET)
 		get_page(page);
@@ -1024,6 +1022,15 @@ unlock:
 out:
 	return page;
 
+bad_page:
+	pte_unmap_unlock(ptep, ptl);
+	return ERR_PTR(-EFAULT);
+
+no_page:
+	pte_unmap_unlock(ptep, ptl);
+	if (!pte_none(pte))
+		return page;
+	/* Fall through to ZERO_PAGE handling */
 no_page_table:
 	/*
 	 * When core dumping an enormous anonymous area that nobody
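With these labels, follow_page() distinguishes three outcomes instead of folding them all into NULL: an ERR_PTR(-EFAULT) when a pte is present but backed by no struct page, NULL (possibly after the ZERO_PAGE fall-through) when nothing usable is mapped, and a normal page pointer otherwise. A hedged caller-side sketch of that contract; probe_one_page() and its -EAGAIN convention are ours, not the commit's, and kernel context (<linux/err.h>, <linux/mm.h>) is assumed:

/*
 * Illustrative only: consuming the new follow_page() return contract.
 */
static int probe_one_page(struct vm_area_struct *vma, unsigned long address,
			  unsigned int foll_flags, struct page **pagep)
{
	struct page *page = follow_page(vma, address, foll_flags);

	if (IS_ERR(page))	/* bad_page: pte present, no struct page */
		return PTR_ERR(page);
	if (!page)		/* no_page/no_page_table: nothing mapped */
		return -EAGAIN;	/* caller retries via handle_mm_fault() */
	*pagep = page;		/* valid page; reference held iff FOLL_GET */
	return 0;
}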
@@ -1038,6 +1045,26 @@ no_page_table:
 	return page;
 }
 
+/* Can we do the FOLL_ANON optimization? */
+static inline int use_zero_page(struct vm_area_struct *vma)
+{
+	/*
+	 * We don't want to optimize FOLL_ANON for make_pages_present()
+	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
+	 * we want to get the page from the page tables to make sure
+	 * that we serialize and update with any other user of that
+	 * mapping.
+	 */
+	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
+		return 0;
+	/*
+	 * And if we have a fault or a nopfn routine, it's not an
+	 * anonymous region.
+	 */
+	return !vma->vm_ops ||
+		(!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+}
+
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
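To make the predicate concrete, here is an illustrative (not from the commit) mapping of use_zero_page() over common vma types, assuming the flags and vm_ops behave as in mainline of that era:

/*
 * use_zero_page() over common mappings (illustrative):
 *
 *   MAP_PRIVATE|MAP_ANONYMOUS, not mlocked    -> 1 (no vm_ops at all)
 *   the same region after mlock()             -> 0 (VM_LOCKED)
 *   MAP_SHARED|MAP_ANONYMOUS                  -> 0 (VM_SHARED)
 *   MAP_PRIVATE file mapping (filemap_fault)  -> 0 (vm_ops->fault set)
 *   legacy driver mapping with only ->nopfn   -> 0 (vm_ops->nopfn set)
 */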
@@ -1112,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		foll_flags = FOLL_TOUCH;
 		if (pages)
 			foll_flags |= FOLL_GET;
-		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-		    (!vma->vm_ops || !vma->vm_ops->fault))
+		if (!write && use_zero_page(vma))
 			foll_flags |= FOLL_ANON;
 
 		do {
@@ -1125,7 +1151,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			 * be processed until returning to user space.
 			 */
 			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-				return -ENOMEM;
+				return i ? i : -ENOMEM;
 
 			if (write)
 				foll_flags |= FOLL_WRITE;
@@ -1159,6 +1185,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 				cond_resched();
 			}
+			if (IS_ERR(page))
+				return i ? i : PTR_ERR(page);
 			if (pages) {
 				pages[i] = page;
 
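Because both the TIF_MEMDIE hunk and this IS_ERR() check return i when some pages were already pinned, get_user_pages() can now legitimately return a short count. A hedged caller sketch using the 2.6-era eight-argument signature shown above; pin_range() and its short-count policy are hypothetical, not part of the commit:

/*
 * Illustrative only: handling partial success from get_user_pages().
 */
static int pin_range(struct mm_struct *mm, unsigned long start,
		     int npages, struct page **pages)
{
	int i, got;

	down_read(&mm->mmap_sem);
	got = get_user_pages(current, mm, start, npages, 0, 0, pages, NULL);
	up_read(&mm->mmap_sem);

	if (got < 0)		/* nothing pinned: -ENOMEM, -EFAULT, ... */
		return got;
	if (got < npages) {	/* short count: drop what we got */
		for (i = 0; i < got; i++)
			page_cache_release(pages[i]);
		return -EFAULT;	/* caller-chosen policy */
	}
	return 0;
}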
@@ -1669,8 +1697,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
-	if (!old_page)
+	if (!old_page) {
+		/*
+		 * VM_MIXEDMAP !pfn_valid() case
+		 *
+		 * We should not cow pages in a shared writeable mapping.
+		 * Just mark the pages writable as we can't do any dirty
+		 * accounting on raw pfn maps.
+		 */
+		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+				     (VM_WRITE|VM_SHARED))
+			goto reuse;
 		goto gotten;
+	}
 
 	/*
 	 * Take out anonymous pages first, anonymous shared vmas are
@@ -1723,6 +1762,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	if (reuse) {
+reuse:
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
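The goto reuse added in do_wp_page() above targets this label, so a write fault on a shared, writable raw-pfn mapping skips COW entirely and just makes the existing pte writable. The test it relies on, pulled out as a hypothetical helper for clarity (not in the commit):

/*
 * Hypothetical helper, equivalent to the inline test in do_wp_page():
 * a pte with no struct page behind it may simply be made writable iff
 * the mapping is both shared and writable, since no dirty accounting
 * is possible on raw pfn maps.
 */
static inline int wp_pfn_can_reuse(struct vm_area_struct *vma)
{
	return (vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
	       (VM_WRITE|VM_SHARED);
}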
@@ -1757,7 +1797,6 @@ gotten:
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
-			page_remove_rmap(old_page, vma);
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
@@ -1779,6 +1818,32 @@ gotten:
 		lru_cache_add_active(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);
 
+		if (old_page) {
+			/*
+			 * Only after switching the pte to the new page may
+			 * we remove the mapcount here. Otherwise another
+			 * process may come and find the rmap count decremented
+			 * before the pte is switched to the new page, and
+			 * "reuse" the old page writing into it while our pte
+			 * here still points into it and can be read by other
+			 * threads.
+			 *
+			 * The critical issue is to order this
+			 * page_remove_rmap with the ptep_clear_flush above.
+			 * Those stores are ordered by (if nothing else,)
+			 * the barrier present in the atomic_add_negative
+			 * in page_remove_rmap.
+			 *
+			 * Then the TLB flush in ptep_clear_flush ensures that
+			 * no process can access the old page before the
+			 * decremented mapcount is visible. And the old page
+			 * cannot be reused until after the decremented
+			 * mapcount is visible. So transitively, TLBs to
+			 * old page will be flushed before it can be reused.
+			 */
+			page_remove_rmap(old_page, vma);
+		}
+
 		/* Free the old page.. */
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
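The comment's ordering argument is easiest to see as an interleaving. A hedged reconstruction (ours, not the commit's) of the race the old placement of page_remove_rmap() allowed:

/*
 * Racy interleaving with the old ordering (rmap dropped before the
 * pte switch), reconstructed for illustration:
 *
 *   CPU A: do_wp_page()             CPU B: other user of old_page
 *   ---------------------          ------------------------------
 *   page_remove_rmap(old_page)
 *                                  observes the dropped mapcount,
 *                                  treats old_page as exclusive,
 *                                  writes new data into it
 *   ptep_clear_flush(...)          ...but until this point CPU A's
 *   set_pte_at(new_page pte)       pte (and TLB entries) still map
 *                                  old_page, so A's threads could
 *                                  read B's writes.
 *
 * Moving page_remove_rmap() after the pte switch closes the window.
 */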
@@ -2295,8 +2360,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	vmf.flags = flags;
 	vmf.page = NULL;
 
-	BUG_ON(vma->vm_flags & VM_PFNMAP);
-
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
 		return ret;
