Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	158
1 file changed, 97 insertions, 61 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e01..a56e3ba816b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 	if (batch->nr == batch->max) {
 		if (!tlb_next_batch(tlb))
 			return 0;
+		batch = tlb->active;
 	}
 	VM_BUG_ON(batch->nr > batch->max);
 
@@ -1112,11 +1113,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	int force_flush = 0;
 	int rss[NR_MM_COUNTERS];
 	spinlock_t *ptl;
+	pte_t *start_pte;
 	pte_t *pte;
 
 again:
 	init_rss_vec(rss);
-	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = start_pte;
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
@@ -1196,7 +1199,7 @@ again:
 
 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap_unlock(start_pte, ptl);
 
 	/*
 	 * mmu_gather ran out of room to batch pages, we break out of
@@ -1287,16 +1290,9 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 	return addr;
 }
 
-#ifdef CONFIG_PREEMPT
-# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
-#else
-/* No preempt: go for improved straight-line efficiency */
-# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
-#endif
-
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp:	address of the caller's struct mmu_gather
+ * @tlb:	address of the caller's struct mmu_gather
  * @vma:	the starting vma
  * @start_addr:	virtual address at which to start unmapping
  * @end_addr:	virtual address at which to end unmapping
@@ -1307,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
  *
  * Unmap all pages in the vma list.
  *
- * We aim to not hold locks for too long (for scheduling latency reasons).
- * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
- * return the ending mmu_gather to the caller.
- *
  * Only addresses between `start' and `end' will be unmapped.
  *
  * The VMA list must be sorted in ascending virtual address order.
@@ -1813,7 +1805,63 @@ next_page:
 }
 EXPORT_SYMBOL(__get_user_pages);
 
-/**
+/*
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk:	the task_struct to use for page fault accounting, or
+ *		NULL if faults are not to be recorded.
+ * @mm:		mm_struct of target mm
+ * @address:	user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for locking reasons
+ * we try to access user memory in atomic context (within a pagefault_disable()
+ * section), this returns -EFAULT, and we want to resolve the user fault before
+ * trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * handle_mm_fault() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software.  On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This should be called with the mm_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long address, unsigned int fault_flags)
+{
+	struct vm_area_struct *vma;
+	int ret;
+
+	vma = find_extend_vma(mm, address);
+	if (!vma || address < vma->vm_start)
+		return -EFAULT;
+
+	ret = handle_mm_fault(mm, vma, address, fault_flags);
+	if (ret & VM_FAULT_ERROR) {
+		if (ret & VM_FAULT_OOM)
+			return -ENOMEM;
+		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+			return -EHWPOISON;
+		if (ret & VM_FAULT_SIGBUS)
+			return -EFAULT;
+		BUG();
+	}
+	if (tsk) {
+		if (ret & VM_FAULT_MAJOR)
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
+	}
+	return 0;
+}
+
+/*
  * get_user_pages() - pin user pages in memory
  * @tsk:	the task_struct to use for page fault accounting, or
  *		NULL if faults are not to be recorded.
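[Editor's note] The kernel-doc added above spells out the intended call pattern: an access made under pagefault_disable() fails with -EFAULT, the caller drops back to a context where it may sleep, takes mmap_sem for read, and calls fixup_user_fault() to resolve the fault before retrying. A minimal caller sketch of that pattern follows. It is illustrative only, not code from this patch: the helper name is made up, and the choice of FAULT_FLAG_WRITE assumes a futex-style "fault the word in for write" caller as hinted at in the comment.

	/*
	 * Illustrative sketch (not part of this patch): resolve a user
	 * fault for write after an atomic-context access has failed,
	 * then let the caller retry its pagefault_disable()d access.
	 */
	static int fault_in_user_writeable_sketch(u32 __user *uaddr)
	{
		struct mm_struct *mm = current->mm;
		int ret;

		/* fixup_user_fault() expects mmap_sem held for read */
		down_read(&mm->mmap_sem);
		ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
				       FAULT_FLAG_WRITE);
		up_read(&mm->mmap_sem);

		return ret < 0 ? ret : 0;
	}

A read-only caller would pass 0 for fault_flags instead of FAULT_FLAG_WRITE.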
@@ -2796,30 +2844,6 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
-{
-	struct address_space *mapping = inode->i_mapping;
-
-	/*
-	 * If the underlying filesystem is not going to provide
-	 * a way to truncate a range of blocks (punch a hole) -
-	 * we should return failure right now.
-	 */
-	if (!inode->i_op->truncate_range)
-		return -ENOSYS;
-
-	mutex_lock(&inode->i_mutex);
-	down_write(&inode->i_alloc_sem);
-	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	truncate_inode_pages_range(mapping, offset, end);
-	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	inode->i_op->truncate_range(inode, offset, end);
-	up_write(&inode->i_alloc_sem);
-	mutex_unlock(&inode->i_mutex);
-
-	return 0;
-}
-
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3125,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *page_table;
 	spinlock_t *ptl;
 	struct page *page;
+	struct page *cow_page;
 	pte_t entry;
 	int anon = 0;
-	int charged = 0;
 	struct page *dirty_page = NULL;
 	struct vm_fault vmf;
 	int ret;
 	int page_mkwrite = 0;
 
+	/*
+	 * If we do COW later, allocate page before taking lock_page()
+	 * on the file cache page. This will reduce lock holding time.
+	 */
+	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+
+		if (unlikely(anon_vma_prepare(vma)))
+			return VM_FAULT_OOM;
+
+		cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+		if (!cow_page)
+			return VM_FAULT_OOM;
+
+		if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
+			page_cache_release(cow_page);
+			return VM_FAULT_OOM;
+		}
+	} else
+		cow_page = NULL;
+
 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
@@ -3141,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
 			    VM_FAULT_RETRY)))
-		return ret;
+		goto uncharge_out;
 
 	if (unlikely(PageHWPoison(vmf.page))) {
 		if (ret & VM_FAULT_LOCKED)
 			unlock_page(vmf.page);
-		return VM_FAULT_HWPOISON;
+		ret = VM_FAULT_HWPOISON;
+		goto uncharge_out;
 	}
 
 	/*
@@ -3164,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = vmf.page;
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
+			page = cow_page;
 			anon = 1;
-			if (unlikely(anon_vma_prepare(vma))) {
-				ret = VM_FAULT_OOM;
-				goto out;
-			}
-			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-						vma, address);
-			if (!page) {
-				ret = VM_FAULT_OOM;
-				goto out;
-			}
-			if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
-				ret = VM_FAULT_OOM;
-				page_cache_release(page);
-				goto out;
-			}
-			charged = 1;
 			copy_user_highpage(page, vmf.page, address, vma);
 			__SetPageUptodate(page);
 		} else {
@@ -3249,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, page_table);
 	} else {
-		if (charged)
-			mem_cgroup_uncharge_page(page);
+		if (cow_page)
+			mem_cgroup_uncharge_page(cow_page);
 		if (anon)
 			page_cache_release(page);
 		else
@@ -3259,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	pte_unmap_unlock(page_table, ptl);
 
-out:
 	if (dirty_page) {
 		struct address_space *mapping = page->mapping;
 
@@ -3289,6 +3318,13 @@ out:
 unwritable_page:
 	page_cache_release(page);
 	return ret;
+uncharge_out:
+	/* fs's fault handler got an error */
+	if (cow_page) {
+		mem_cgroup_uncharge_page(cow_page);
+		page_cache_release(cow_page);
+	}
+	return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,