Diffstat (limited to 'mm/memory.c')
 mm/memory.c | 158 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 97 insertions(+), 61 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e01..a56e3ba816b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 	if (batch->nr == batch->max) {
 		if (!tlb_next_batch(tlb))
 			return 0;
+		batch = tlb->active;
 	}
 	VM_BUG_ON(batch->nr > batch->max);
 
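For illustration, a minimal sketch of the batching logic around this hunk. The wrapper name tlb_batch_page() is made up, and the store and return lines are reconstructed from context rather than quoted from the patch: tlb_next_batch() installs a fresh, empty batch as tlb->active, so the local cursor has to be re-read or the bookkeeping that follows keeps looking at the old, full batch.

/* Illustrative only: the surrounding code is assumed, not shown in the hunk above. */
static int tlb_batch_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch = tlb->active;

	batch->pages[batch->nr++] = page;	/* queue the page for deferred freeing */
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return 0;		/* no memory for another batch: caller must flush */
		batch = tlb->active;		/* the fix: follow the newly installed batch */
	}
	VM_BUG_ON(batch->nr > batch->max);

	return batch->max - batch->nr;		/* free slots left in the active batch */
}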
@@ -1112,11 +1113,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	int force_flush = 0;
 	int rss[NR_MM_COUNTERS];
 	spinlock_t *ptl;
+	pte_t *start_pte;
 	pte_t *pte;
 
 again:
 	init_rss_vec(rss);
-	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = start_pte;
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
@@ -1196,7 +1199,7 @@ again:
 
 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap_unlock(start_pte, ptl);
 
 	/*
 	 * mmu_gather ran out of room to batch pages, we break out of
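A sketch of the loop shape this hunk belongs to, reconstructed from context (the force_flush tail is assumed, not shown in the hunk): because a full mmu_gather can now break out of the walk early, 'pte' no longer necessarily points one past the last slot of the mapped PTE page, so the unlock must use the saved start_pte rather than 'pte - 1'.

	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		/* ... zap one entry; a full batch sets force_flush ... */
		if (force_flush)
			break;			/* may stop well before 'end' */
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(start_pte, ptl);	/* not 'pte - 1': the loop can end early */

	if (force_flush) {			/* drop the lock, flush, then resume */
		force_flush = 0;
		tlb_flush_mmu(tlb);
		if (addr != end)
			goto again;
	}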
@@ -1287,16 +1290,9 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 	return addr;
 }
 
-#ifdef CONFIG_PREEMPT
-# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
-#else
-/* No preempt: go for improved straight-line efficiency */
-# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
-#endif
-
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp:	address of the caller's struct mmu_gather
+ * @tlb:	address of the caller's struct mmu_gather
  * @vma:	the starting vma
  * @start_addr:	virtual address at which to start unmapping
  * @end_addr:	virtual address at which to end unmapping
@@ -1307,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
  *
  * Unmap all pages in the vma list.
  *
- * We aim to not hold locks for too long (for scheduling latency reasons).
- * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
- * return the ending mmu_gather to the caller.
- *
  * Only addresses between `start' and `end' will be unmapped.
  *
  * The VMA list must be sorted in ascending virtual address order.
@@ -1813,7 +1805,63 @@ next_page:
 }
 EXPORT_SYMBOL(__get_user_pages);
 
-/**
+/*
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk:	the task_struct to use for page fault accounting, or
+ *		NULL if faults are not to be recorded.
+ * @mm:		mm_struct of target mm
+ * @address:	user address
+ * @fault_flags: flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where, for locking
+ * reasons, we try to access user memory in atomic context (within a
+ * pagefault_disable() section); that access returns -EFAULT, and we want
+ * to resolve the user fault before trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * get_user_pages() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software.  On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This should be called with the mmap_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long address, unsigned int fault_flags)
+{
+	struct vm_area_struct *vma;
+	int ret;
+
+	vma = find_extend_vma(mm, address);
+	if (!vma || address < vma->vm_start)
+		return -EFAULT;
+
+	ret = handle_mm_fault(mm, vma, address, fault_flags);
+	if (ret & VM_FAULT_ERROR) {
+		if (ret & VM_FAULT_OOM)
+			return -ENOMEM;
+		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+			return -EHWPOISON;
+		if (ret & VM_FAULT_SIGBUS)
+			return -EFAULT;
+		BUG();
+	}
+	if (tsk) {
+		if (ret & VM_FAULT_MAJOR)
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
+	}
+	return 0;
+}
+
+/*
  * get_user_pages() - pin user pages in memory
  * @tsk:	the task_struct to use for page fault accounting, or
  *		NULL if faults are not to be recorded.
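For context, a usage sketch of the retry pattern the new comment describes. This is not from the patch: the wrapper name futex_read_word() is illustrative; only fixup_user_fault() itself, the pagefault_disable()/pagefault_enable() pair, __copy_from_user_inatomic() and mmap_sem are existing kernel interfaces.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

/* Illustrative caller: read a user word from atomic context, faulting it in on demand. */
static int futex_read_word(struct mm_struct *mm, u32 __user *uaddr, u32 *val)
{
	int ret;

	for (;;) {
		pagefault_disable();		/* no sleeping fault handling here */
		ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
		pagefault_enable();
		if (!ret)
			return 0;		/* copy succeeded */

		down_read(&mm->mmap_sem);	/* as documented: mmap_sem held for read */
		ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 0);
		up_read(&mm->mmap_sem);
		if (ret)
			return ret;		/* -EFAULT, -ENOMEM or -EHWPOISON */
	}
}

The real futex code is structured differently, but the shape (atomic attempt, fixup under mmap_sem, retry) is the scenario the comment above targets.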
@@ -2796,30 +2844,6 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
-{
-	struct address_space *mapping = inode->i_mapping;
-
-	/*
-	 * If the underlying filesystem is not going to provide
-	 * a way to truncate a range of blocks (punch a hole) -
-	 * we should return failure right now.
-	 */
-	if (!inode->i_op->truncate_range)
-		return -ENOSYS;
-
-	mutex_lock(&inode->i_mutex);
-	down_write(&inode->i_alloc_sem);
-	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	truncate_inode_pages_range(mapping, offset, end);
-	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	inode->i_op->truncate_range(inode, offset, end);
-	up_write(&inode->i_alloc_sem);
-	mutex_unlock(&inode->i_mutex);
-
-	return 0;
-}
-
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3125,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *page_table;
 	spinlock_t *ptl;
 	struct page *page;
+	struct page *cow_page;
 	pte_t entry;
 	int anon = 0;
-	int charged = 0;
 	struct page *dirty_page = NULL;
 	struct vm_fault vmf;
 	int ret;
 	int page_mkwrite = 0;
 
+	/*
+	 * If we do COW later, allocate the page before taking lock_page()
+	 * on the file cache page. This will reduce lock holding time.
+	 */
+	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+
+		if (unlikely(anon_vma_prepare(vma)))
+			return VM_FAULT_OOM;
+
+		cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+		if (!cow_page)
+			return VM_FAULT_OOM;
+
+		if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
+			page_cache_release(cow_page);
+			return VM_FAULT_OOM;
+		}
+	} else
+		cow_page = NULL;
+
 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
@@ -3141,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
 			    VM_FAULT_RETRY)))
-		return ret;
+		goto uncharge_out;
 
 	if (unlikely(PageHWPoison(vmf.page))) {
 		if (ret & VM_FAULT_LOCKED)
 			unlock_page(vmf.page);
-		return VM_FAULT_HWPOISON;
+		ret = VM_FAULT_HWPOISON;
+		goto uncharge_out;
 	}
 
 	/*
@@ -3164,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = vmf.page;
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
+			page = cow_page;
 			anon = 1;
-			if (unlikely(anon_vma_prepare(vma))) {
-				ret = VM_FAULT_OOM;
-				goto out;
-			}
-			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-						vma, address);
-			if (!page) {
-				ret = VM_FAULT_OOM;
-				goto out;
-			}
-			if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
-				ret = VM_FAULT_OOM;
-				page_cache_release(page);
-				goto out;
-			}
-			charged = 1;
 			copy_user_highpage(page, vmf.page, address, vma);
 			__SetPageUptodate(page);
 		} else {
@@ -3249,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, page_table);
 	} else {
-		if (charged)
-			mem_cgroup_uncharge_page(page);
+		if (cow_page)
+			mem_cgroup_uncharge_page(cow_page);
 		if (anon)
 			page_cache_release(page);
 		else
@@ -3259,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	pte_unmap_unlock(page_table, ptl);
 
-out:
 	if (dirty_page) {
 		struct address_space *mapping = page->mapping;
 
@@ -3289,6 +3318,13 @@ out:
 unwritable_page:
 	page_cache_release(page);
 	return ret;
+uncharge_out:
+	/* the fs's fault handler returned an error */
+	if (cow_page) {
+		mem_cgroup_uncharge_page(cow_page);
+		page_cache_release(cow_page);
+	}
+	return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,