author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/memory.c
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c | 1030
1 file changed, 684 insertions, 346 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 0e18b4d649ec..9b8a01d941cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -182,7 +182,7 @@ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | |||
182 | { | 182 | { |
183 | __sync_task_rss_stat(task, mm); | 183 | __sync_task_rss_stat(task, mm); |
184 | } | 184 | } |
185 | #else | 185 | #else /* SPLIT_RSS_COUNTING */ |
186 | 186 | ||
187 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) | 187 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) |
188 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) | 188 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) |
@@ -191,8 +191,206 @@ static void check_sync_rss_stat(struct task_struct *task) | |||
191 | { | 191 | { |
192 | } | 192 | } |
193 | 193 | ||
194 | #endif /* SPLIT_RSS_COUNTING */ | ||
195 | |||
196 | #ifdef HAVE_GENERIC_MMU_GATHER | ||
197 | |||
198 | static int tlb_next_batch(struct mmu_gather *tlb) | ||
199 | { | ||
200 | struct mmu_gather_batch *batch; | ||
201 | |||
202 | batch = tlb->active; | ||
203 | if (batch->next) { | ||
204 | tlb->active = batch->next; | ||
205 | return 1; | ||
206 | } | ||
207 | |||
208 | batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); | ||
209 | if (!batch) | ||
210 | return 0; | ||
211 | |||
212 | batch->next = NULL; | ||
213 | batch->nr = 0; | ||
214 | batch->max = MAX_GATHER_BATCH; | ||
215 | |||
216 | tlb->active->next = batch; | ||
217 | tlb->active = batch; | ||
218 | |||
219 | return 1; | ||
220 | } | ||
221 | |||
222 | /* tlb_gather_mmu | ||
223 | * Called to initialize an (on-stack) mmu_gather structure for page-table | ||
224 | * tear-down from @mm. The @fullmm argument is used when @mm is without | ||
225 | * users and we're going to destroy the full address space (exit/execve). | ||
226 | */ | ||
227 | void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) | ||
228 | { | ||
229 | tlb->mm = mm; | ||
230 | |||
231 | tlb->fullmm = fullmm; | ||
232 | tlb->need_flush = 0; | ||
233 | tlb->fast_mode = (num_possible_cpus() == 1); | ||
234 | tlb->local.next = NULL; | ||
235 | tlb->local.nr = 0; | ||
236 | tlb->local.max = ARRAY_SIZE(tlb->__pages); | ||
237 | tlb->active = &tlb->local; | ||
238 | |||
239 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | ||
240 | tlb->batch = NULL; | ||
241 | #endif | ||
242 | } | ||
243 | |||
244 | void tlb_flush_mmu(struct mmu_gather *tlb) | ||
245 | { | ||
246 | struct mmu_gather_batch *batch; | ||
247 | |||
248 | if (!tlb->need_flush) | ||
249 | return; | ||
250 | tlb->need_flush = 0; | ||
251 | tlb_flush(tlb); | ||
252 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | ||
253 | tlb_table_flush(tlb); | ||
194 | #endif | 254 | #endif |
195 | 255 | ||
256 | if (tlb_fast_mode(tlb)) | ||
257 | return; | ||
258 | |||
259 | for (batch = &tlb->local; batch; batch = batch->next) { | ||
260 | free_pages_and_swap_cache(batch->pages, batch->nr); | ||
261 | batch->nr = 0; | ||
262 | } | ||
263 | tlb->active = &tlb->local; | ||
264 | } | ||
265 | |||
266 | /* tlb_finish_mmu | ||
267 | * Called at the end of the shootdown operation to free up any resources | ||
268 | * that were required. | ||
269 | */ | ||
270 | void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) | ||
271 | { | ||
272 | struct mmu_gather_batch *batch, *next; | ||
273 | |||
274 | tlb_flush_mmu(tlb); | ||
275 | |||
276 | /* keep the page table cache within bounds */ | ||
277 | check_pgt_cache(); | ||
278 | |||
279 | for (batch = tlb->local.next; batch; batch = next) { | ||
280 | next = batch->next; | ||
281 | free_pages((unsigned long)batch, 0); | ||
282 | } | ||
283 | tlb->local.next = NULL; | ||
284 | } | ||
285 | |||
286 | /* __tlb_remove_page | ||
287 | * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while | ||
288 | * handling the additional races in SMP caused by other CPUs caching valid | ||
289 | * mappings in their TLBs. Returns the number of free page slots left. | ||
290 | * When out of page slots we must call tlb_flush_mmu(). | ||
291 | */ | ||
292 | int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | ||
293 | { | ||
294 | struct mmu_gather_batch *batch; | ||
295 | |||
296 | tlb->need_flush = 1; | ||
297 | |||
298 | if (tlb_fast_mode(tlb)) { | ||
299 | free_page_and_swap_cache(page); | ||
300 | return 1; /* avoid calling tlb_flush_mmu() */ | ||
301 | } | ||
302 | |||
303 | batch = tlb->active; | ||
304 | batch->pages[batch->nr++] = page; | ||
305 | if (batch->nr == batch->max) { | ||
306 | if (!tlb_next_batch(tlb)) | ||
307 | return 0; | ||
308 | batch = tlb->active; | ||
309 | } | ||
310 | VM_BUG_ON(batch->nr > batch->max); | ||
311 | |||
312 | return batch->max - batch->nr; | ||
313 | } | ||
314 | |||
315 | #endif /* HAVE_GENERIC_MMU_GATHER */ | ||
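[Editor's note: the hunk above introduces the generic mmu_gather, which is now an on-stack structure that callers initialize themselves and which batches pages in a chain of mmu_gather_batch allocations. A minimal sketch of the caller-side lifecycle, mirroring how zap_page_range() is converted later in this same diff (names come from the patch; error handling omitted):

	struct mmu_gather tlb;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);		/* 0: not a full-mm teardown */
	update_hiwater_rss(mm);
	/* pages accumulate via unmap_vmas() -> zap_*_range() -> __tlb_remove_page() */
	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
	tlb_finish_mmu(&tlb, address, end);	/* final TLB flush, free the batch pages */

tlb_flush_mmu() may also run in the middle of the walk whenever a batch fills up, as the zap_pte_range() changes further down show.]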
316 | |||
317 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | ||
318 | |||
319 | /* | ||
320 | * See the comment near struct mmu_table_batch. | ||
321 | */ | ||
322 | |||
323 | static void tlb_remove_table_smp_sync(void *arg) | ||
324 | { | ||
325 | /* Simply deliver the interrupt */ | ||
326 | } | ||
327 | |||
328 | static void tlb_remove_table_one(void *table) | ||
329 | { | ||
330 | /* | ||
331 | * This isn't an RCU grace period and hence the page-tables cannot be | ||
332 | * assumed to be actually RCU-freed. | ||
333 | * | ||
334 | * It is however sufficient for software page-table walkers that rely on | ||
335 | * IRQ disabling. See the comment near struct mmu_table_batch. | ||
336 | */ | ||
337 | smp_call_function(tlb_remove_table_smp_sync, NULL, 1); | ||
338 | __tlb_remove_table(table); | ||
339 | } | ||
340 | |||
341 | static void tlb_remove_table_rcu(struct rcu_head *head) | ||
342 | { | ||
343 | struct mmu_table_batch *batch; | ||
344 | int i; | ||
345 | |||
346 | batch = container_of(head, struct mmu_table_batch, rcu); | ||
347 | |||
348 | for (i = 0; i < batch->nr; i++) | ||
349 | __tlb_remove_table(batch->tables[i]); | ||
350 | |||
351 | free_page((unsigned long)batch); | ||
352 | } | ||
353 | |||
354 | void tlb_table_flush(struct mmu_gather *tlb) | ||
355 | { | ||
356 | struct mmu_table_batch **batch = &tlb->batch; | ||
357 | |||
358 | if (*batch) { | ||
359 | call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); | ||
360 | *batch = NULL; | ||
361 | } | ||
362 | } | ||
363 | |||
364 | void tlb_remove_table(struct mmu_gather *tlb, void *table) | ||
365 | { | ||
366 | struct mmu_table_batch **batch = &tlb->batch; | ||
367 | |||
368 | tlb->need_flush = 1; | ||
369 | |||
370 | /* | ||
371 | * When there's less then two users of this mm there cannot be a | ||
372 | * concurrent page-table walk. | ||
373 | */ | ||
374 | if (atomic_read(&tlb->mm->mm_users) < 2) { | ||
375 | __tlb_remove_table(table); | ||
376 | return; | ||
377 | } | ||
378 | |||
379 | if (*batch == NULL) { | ||
380 | *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); | ||
381 | if (*batch == NULL) { | ||
382 | tlb_remove_table_one(table); | ||
383 | return; | ||
384 | } | ||
385 | (*batch)->nr = 0; | ||
386 | } | ||
387 | (*batch)->tables[(*batch)->nr++] = table; | ||
388 | if ((*batch)->nr == MAX_TABLE_BATCH) | ||
389 | tlb_table_flush(tlb); | ||
390 | } | ||
391 | |||
392 | #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ | ||
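[Editor's note: the CONFIG_HAVE_RCU_TABLE_FREE block above batches pointers to page-table pages and hands the batch to call_rcu_sched(), so that lockless walkers running with IRQs disabled are guaranteed to be gone before the tables are freed; if the batch page cannot be allocated, tlb_remove_table_one() falls back to an IPI broadcast as the barrier. A rough, self-contained user-space model of the same "batch, then free after a quiescent period" idea follows. It is purely illustrative: wait_for_quiescent_readers() is a stand-in for the RCU/IPI machinery, and unlike the kernel code the flush here is synchronous.

#include <stdlib.h>

#define MAX_TABLE_BATCH 8

struct table_batch {
	int nr;
	void *tables[MAX_TABLE_BATCH];
};

/* Stand-in for call_rcu_sched()/smp_call_function(): block until no
 * concurrent reader can still hold a pointer into the tables. */
static void wait_for_quiescent_readers(void) { /* ... */ }

static void table_batch_flush(struct table_batch **batch)
{
	if (!*batch)
		return;
	wait_for_quiescent_readers();
	for (int i = 0; i < (*batch)->nr; i++)
		free((*batch)->tables[i]);
	free(*batch);
	*batch = NULL;
}

static void table_remove(struct table_batch **batch, void *table)
{
	if (!*batch) {
		*batch = calloc(1, sizeof(**batch));
		if (!*batch) {
			/* no batch page: free this one table synchronously,
			 * like tlb_remove_table_one() */
			wait_for_quiescent_readers();
			free(table);
			return;
		}
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		table_batch_flush(batch);
}
]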
393 | |||
196 | /* | 394 | /* |
197 | * If a p?d_bad entry is found while walking page tables, report | 395 | * If a p?d_bad entry is found while walking page tables, report |
198 | * the error, before resetting entry to p?d_none. Usually (but | 396 | * the error, before resetting entry to p?d_none. Usually (but |
@@ -394,9 +592,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
394 | } | 592 | } |
395 | } | 593 | } |
396 | 594 | ||
397 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | 595 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, |
596 | pmd_t *pmd, unsigned long address) | ||
398 | { | 597 | { |
399 | pgtable_t new = pte_alloc_one(mm, address); | 598 | pgtable_t new = pte_alloc_one(mm, address); |
599 | int wait_split_huge_page; | ||
400 | if (!new) | 600 | if (!new) |
401 | return -ENOMEM; | 601 | return -ENOMEM; |
402 | 602 | ||
@@ -416,14 +616,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | |||
416 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ | 616 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ |
417 | 617 | ||
418 | spin_lock(&mm->page_table_lock); | 618 | spin_lock(&mm->page_table_lock); |
419 | if (!pmd_present(*pmd)) { /* Has another populated it ? */ | 619 | wait_split_huge_page = 0; |
620 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ | ||
420 | mm->nr_ptes++; | 621 | mm->nr_ptes++; |
421 | pmd_populate(mm, pmd, new); | 622 | pmd_populate(mm, pmd, new); |
422 | new = NULL; | 623 | new = NULL; |
423 | } | 624 | } else if (unlikely(pmd_trans_splitting(*pmd))) |
625 | wait_split_huge_page = 1; | ||
424 | spin_unlock(&mm->page_table_lock); | 626 | spin_unlock(&mm->page_table_lock); |
425 | if (new) | 627 | if (new) |
426 | pte_free(mm, new); | 628 | pte_free(mm, new); |
629 | if (wait_split_huge_page) | ||
630 | wait_split_huge_page(vma->anon_vma, pmd); | ||
427 | return 0; | 631 | return 0; |
428 | } | 632 | } |
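[Editor's note: the reworked __pte_alloc() above follows the usual "allocate outside the lock, recheck under the lock" pattern, now extended for transparent hugepages: pmd_none() replaces !pmd_present() because a huge pmd is present yet must not be overwritten, and a pmd caught mid-split has to be waited on outside the lock. A condensed sketch of that control flow (not a drop-in replacement; the smp_wmb() and error checks are elided):

	pgtable_t new = pte_alloc_one(mm, address);	/* may sleep, no locks held */
	int wait_split = 0;

	spin_lock(&mm->page_table_lock);
	if (pmd_none(*pmd)) {			/* nobody raced us: install it */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (pmd_trans_splitting(*pmd))	/* a THP split is in flight */
		wait_split = 1;
	spin_unlock(&mm->page_table_lock);

	if (new)				/* lost the race: discard our table */
		pte_free(mm, new);
	if (wait_split)				/* sleep outside the lock */
		wait_split_huge_page(vma->anon_vma, pmd);
]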
429 | 633 | ||
@@ -436,10 +640,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
436 | smp_wmb(); /* See comment in __pte_alloc */ | 640 | smp_wmb(); /* See comment in __pte_alloc */ |
437 | 641 | ||
438 | spin_lock(&init_mm.page_table_lock); | 642 | spin_lock(&init_mm.page_table_lock); |
439 | if (!pmd_present(*pmd)) { /* Has another populated it ? */ | 643 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ |
440 | pmd_populate_kernel(&init_mm, pmd, new); | 644 | pmd_populate_kernel(&init_mm, pmd, new); |
441 | new = NULL; | 645 | new = NULL; |
442 | } | 646 | } else |
647 | VM_BUG_ON(pmd_trans_splitting(*pmd)); | ||
443 | spin_unlock(&init_mm.page_table_lock); | 648 | spin_unlock(&init_mm.page_table_lock); |
444 | if (new) | 649 | if (new) |
445 | pte_free_kernel(&init_mm, new); | 650 | pte_free_kernel(&init_mm, new); |
@@ -526,7 +731,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
526 | add_taint(TAINT_BAD_PAGE); | 731 | add_taint(TAINT_BAD_PAGE); |
527 | } | 732 | } |
528 | 733 | ||
529 | static inline int is_cow_mapping(unsigned int flags) | 734 | static inline int is_cow_mapping(vm_flags_t flags) |
530 | { | 735 | { |
531 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 736 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
532 | } | 737 | } |
@@ -719,9 +924,9 @@ out_set_pte: | |||
719 | return 0; | 924 | return 0; |
720 | } | 925 | } |
721 | 926 | ||
722 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 927 | int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
723 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 928 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
724 | unsigned long addr, unsigned long end) | 929 | unsigned long addr, unsigned long end) |
725 | { | 930 | { |
726 | pte_t *orig_src_pte, *orig_dst_pte; | 931 | pte_t *orig_src_pte, *orig_dst_pte; |
727 | pte_t *src_pte, *dst_pte; | 932 | pte_t *src_pte, *dst_pte; |
@@ -736,7 +941,7 @@ again: | |||
736 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | 941 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); |
737 | if (!dst_pte) | 942 | if (!dst_pte) |
738 | return -ENOMEM; | 943 | return -ENOMEM; |
739 | src_pte = pte_offset_map_nested(src_pmd, addr); | 944 | src_pte = pte_offset_map(src_pmd, addr); |
740 | src_ptl = pte_lockptr(src_mm, src_pmd); | 945 | src_ptl = pte_lockptr(src_mm, src_pmd); |
741 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | 946 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); |
742 | orig_src_pte = src_pte; | 947 | orig_src_pte = src_pte; |
@@ -767,7 +972,7 @@ again: | |||
767 | 972 | ||
768 | arch_leave_lazy_mmu_mode(); | 973 | arch_leave_lazy_mmu_mode(); |
769 | spin_unlock(src_ptl); | 974 | spin_unlock(src_ptl); |
770 | pte_unmap_nested(orig_src_pte); | 975 | pte_unmap(orig_src_pte); |
771 | add_mm_rss_vec(dst_mm, rss); | 976 | add_mm_rss_vec(dst_mm, rss); |
772 | pte_unmap_unlock(orig_dst_pte, dst_ptl); | 977 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
773 | cond_resched(); | 978 | cond_resched(); |
@@ -795,6 +1000,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src | |||
795 | src_pmd = pmd_offset(src_pud, addr); | 1000 | src_pmd = pmd_offset(src_pud, addr); |
796 | do { | 1001 | do { |
797 | next = pmd_addr_end(addr, end); | 1002 | next = pmd_addr_end(addr, end); |
1003 | if (pmd_trans_huge(*src_pmd)) { | ||
1004 | int err; | ||
1005 | VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); | ||
1006 | err = copy_huge_pmd(dst_mm, src_mm, | ||
1007 | dst_pmd, src_pmd, addr, vma); | ||
1008 | if (err == -ENOMEM) | ||
1009 | return -ENOMEM; | ||
1010 | if (!err) | ||
1011 | continue; | ||
1012 | /* fall through */ | ||
1013 | } | ||
798 | if (pmd_none_or_clear_bad(src_pmd)) | 1014 | if (pmd_none_or_clear_bad(src_pmd)) |
799 | continue; | 1015 | continue; |
800 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, | 1016 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, |
@@ -891,26 +1107,26 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
891 | static unsigned long zap_pte_range(struct mmu_gather *tlb, | 1107 | static unsigned long zap_pte_range(struct mmu_gather *tlb, |
892 | struct vm_area_struct *vma, pmd_t *pmd, | 1108 | struct vm_area_struct *vma, pmd_t *pmd, |
893 | unsigned long addr, unsigned long end, | 1109 | unsigned long addr, unsigned long end, |
894 | long *zap_work, struct zap_details *details) | 1110 | struct zap_details *details) |
895 | { | 1111 | { |
896 | struct mm_struct *mm = tlb->mm; | 1112 | struct mm_struct *mm = tlb->mm; |
897 | pte_t *pte; | 1113 | int force_flush = 0; |
898 | spinlock_t *ptl; | ||
899 | int rss[NR_MM_COUNTERS]; | 1114 | int rss[NR_MM_COUNTERS]; |
1115 | spinlock_t *ptl; | ||
1116 | pte_t *start_pte; | ||
1117 | pte_t *pte; | ||
900 | 1118 | ||
1119 | again: | ||
901 | init_rss_vec(rss); | 1120 | init_rss_vec(rss); |
902 | 1121 | start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | |
903 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 1122 | pte = start_pte; |
904 | arch_enter_lazy_mmu_mode(); | 1123 | arch_enter_lazy_mmu_mode(); |
905 | do { | 1124 | do { |
906 | pte_t ptent = *pte; | 1125 | pte_t ptent = *pte; |
907 | if (pte_none(ptent)) { | 1126 | if (pte_none(ptent)) { |
908 | (*zap_work)--; | ||
909 | continue; | 1127 | continue; |
910 | } | 1128 | } |
911 | 1129 | ||
912 | (*zap_work) -= PAGE_SIZE; | ||
913 | |||
914 | if (pte_present(ptent)) { | 1130 | if (pte_present(ptent)) { |
915 | struct page *page; | 1131 | struct page *page; |
916 | 1132 | ||
@@ -956,7 +1172,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
956 | page_remove_rmap(page); | 1172 | page_remove_rmap(page); |
957 | if (unlikely(page_mapcount(page) < 0)) | 1173 | if (unlikely(page_mapcount(page) < 0)) |
958 | print_bad_pte(vma, addr, ptent, page); | 1174 | print_bad_pte(vma, addr, ptent, page); |
959 | tlb_remove_page(tlb, page); | 1175 | force_flush = !__tlb_remove_page(tlb, page); |
1176 | if (force_flush) | ||
1177 | break; | ||
960 | continue; | 1178 | continue; |
961 | } | 1179 | } |
962 | /* | 1180 | /* |
@@ -977,11 +1195,23 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
977 | print_bad_pte(vma, addr, ptent, NULL); | 1195 | print_bad_pte(vma, addr, ptent, NULL); |
978 | } | 1196 | } |
979 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 1197 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
980 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 1198 | } while (pte++, addr += PAGE_SIZE, addr != end); |
981 | 1199 | ||
982 | add_mm_rss_vec(mm, rss); | 1200 | add_mm_rss_vec(mm, rss); |
983 | arch_leave_lazy_mmu_mode(); | 1201 | arch_leave_lazy_mmu_mode(); |
984 | pte_unmap_unlock(pte - 1, ptl); | 1202 | pte_unmap_unlock(start_pte, ptl); |
1203 | |||
1204 | /* | ||
1205 | * mmu_gather ran out of room to batch pages, we break out of | ||
1206 | * the PTE lock to avoid doing the potential expensive TLB invalidate | ||
1207 | * and page-free while holding it. | ||
1208 | */ | ||
1209 | if (force_flush) { | ||
1210 | force_flush = 0; | ||
1211 | tlb_flush_mmu(tlb); | ||
1212 | if (addr != end) | ||
1213 | goto again; | ||
1214 | } | ||
985 | 1215 | ||
986 | return addr; | 1216 | return addr; |
987 | } | 1217 | } |
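[Editor's note: zap_pte_range() now cooperates with the batching __tlb_remove_page(): a zero return means the gather ran out of room, so the loop breaks out, drops the PTE lock, flushes, and restarts at the same address, exactly as the in-line comment above explains. The skeleton of that retry loop, condensed from the hunk:

again:
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	do {
		/* ... zap one pte ... */
		force_flush = !__tlb_remove_page(tlb, page);
		if (force_flush)
			break;			/* batch full: stop touching ptes */
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(start_pte, ptl);

	if (force_flush) {			/* expensive work outside the PTE lock */
		force_flush = 0;
		tlb_flush_mmu(tlb);
		if (addr != end)
			goto again;
	}
]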
@@ -989,7 +1219,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
989 | static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | 1219 | static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, |
990 | struct vm_area_struct *vma, pud_t *pud, | 1220 | struct vm_area_struct *vma, pud_t *pud, |
991 | unsigned long addr, unsigned long end, | 1221 | unsigned long addr, unsigned long end, |
992 | long *zap_work, struct zap_details *details) | 1222 | struct zap_details *details) |
993 | { | 1223 | { |
994 | pmd_t *pmd; | 1224 | pmd_t *pmd; |
995 | unsigned long next; | 1225 | unsigned long next; |
@@ -997,13 +1227,19 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
997 | pmd = pmd_offset(pud, addr); | 1227 | pmd = pmd_offset(pud, addr); |
998 | do { | 1228 | do { |
999 | next = pmd_addr_end(addr, end); | 1229 | next = pmd_addr_end(addr, end); |
1000 | if (pmd_none_or_clear_bad(pmd)) { | 1230 | if (pmd_trans_huge(*pmd)) { |
1001 | (*zap_work)--; | 1231 | if (next-addr != HPAGE_PMD_SIZE) { |
1002 | continue; | 1232 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); |
1233 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
1234 | } else if (zap_huge_pmd(tlb, vma, pmd)) | ||
1235 | continue; | ||
1236 | /* fall through */ | ||
1003 | } | 1237 | } |
1004 | next = zap_pte_range(tlb, vma, pmd, addr, next, | 1238 | if (pmd_none_or_clear_bad(pmd)) |
1005 | zap_work, details); | 1239 | continue; |
1006 | } while (pmd++, addr = next, (addr != end && *zap_work > 0)); | 1240 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); |
1241 | cond_resched(); | ||
1242 | } while (pmd++, addr = next, addr != end); | ||
1007 | 1243 | ||
1008 | return addr; | 1244 | return addr; |
1009 | } | 1245 | } |
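[Editor's note: at the pmd level the zap path now distinguishes three cases for a transparent huge pmd: a range that does not cover the whole huge page forces a split first (which is why the code asserts mmap_sem is held), a range that does cover it lets zap_huge_pmd() dispose of it in one go, and anything else falls through to the per-pte loop. In outline, condensed from the hunk above:

	if (pmd_trans_huge(*pmd)) {
		if (next - addr != HPAGE_PMD_SIZE) {
			/* partial range: demote the huge pmd to ptes first */
			split_huge_page_pmd(vma->vm_mm, pmd);
		} else if (zap_huge_pmd(tlb, vma, pmd)) {
			/* the whole huge page was unmapped in one go */
			continue;
		}
		/* else it is no longer a huge pmd (e.g. a racing split
		 * completed); fall through to zap_pte_range() */
	}
]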
@@ -1011,7 +1247,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1011 | static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | 1247 | static inline unsigned long zap_pud_range(struct mmu_gather *tlb, |
1012 | struct vm_area_struct *vma, pgd_t *pgd, | 1248 | struct vm_area_struct *vma, pgd_t *pgd, |
1013 | unsigned long addr, unsigned long end, | 1249 | unsigned long addr, unsigned long end, |
1014 | long *zap_work, struct zap_details *details) | 1250 | struct zap_details *details) |
1015 | { | 1251 | { |
1016 | pud_t *pud; | 1252 | pud_t *pud; |
1017 | unsigned long next; | 1253 | unsigned long next; |
@@ -1019,13 +1255,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | |||
1019 | pud = pud_offset(pgd, addr); | 1255 | pud = pud_offset(pgd, addr); |
1020 | do { | 1256 | do { |
1021 | next = pud_addr_end(addr, end); | 1257 | next = pud_addr_end(addr, end); |
1022 | if (pud_none_or_clear_bad(pud)) { | 1258 | if (pud_none_or_clear_bad(pud)) |
1023 | (*zap_work)--; | ||
1024 | continue; | 1259 | continue; |
1025 | } | 1260 | next = zap_pmd_range(tlb, vma, pud, addr, next, details); |
1026 | next = zap_pmd_range(tlb, vma, pud, addr, next, | 1261 | } while (pud++, addr = next, addr != end); |
1027 | zap_work, details); | ||
1028 | } while (pud++, addr = next, (addr != end && *zap_work > 0)); | ||
1029 | 1262 | ||
1030 | return addr; | 1263 | return addr; |
1031 | } | 1264 | } |
@@ -1033,7 +1266,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | |||
1033 | static unsigned long unmap_page_range(struct mmu_gather *tlb, | 1266 | static unsigned long unmap_page_range(struct mmu_gather *tlb, |
1034 | struct vm_area_struct *vma, | 1267 | struct vm_area_struct *vma, |
1035 | unsigned long addr, unsigned long end, | 1268 | unsigned long addr, unsigned long end, |
1036 | long *zap_work, struct zap_details *details) | 1269 | struct zap_details *details) |
1037 | { | 1270 | { |
1038 | pgd_t *pgd; | 1271 | pgd_t *pgd; |
1039 | unsigned long next; | 1272 | unsigned long next; |
@@ -1047,13 +1280,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1047 | pgd = pgd_offset(vma->vm_mm, addr); | 1280 | pgd = pgd_offset(vma->vm_mm, addr); |
1048 | do { | 1281 | do { |
1049 | next = pgd_addr_end(addr, end); | 1282 | next = pgd_addr_end(addr, end); |
1050 | if (pgd_none_or_clear_bad(pgd)) { | 1283 | if (pgd_none_or_clear_bad(pgd)) |
1051 | (*zap_work)--; | ||
1052 | continue; | 1284 | continue; |
1053 | } | 1285 | next = zap_pud_range(tlb, vma, pgd, addr, next, details); |
1054 | next = zap_pud_range(tlb, vma, pgd, addr, next, | 1286 | } while (pgd++, addr = next, addr != end); |
1055 | zap_work, details); | ||
1056 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); | ||
1057 | tlb_end_vma(tlb, vma); | 1287 | tlb_end_vma(tlb, vma); |
1058 | mem_cgroup_uncharge_end(); | 1288 | mem_cgroup_uncharge_end(); |
1059 | 1289 | ||
@@ -1069,7 +1299,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1069 | 1299 | ||
1070 | /** | 1300 | /** |
1071 | * unmap_vmas - unmap a range of memory covered by a list of vma's | 1301 | * unmap_vmas - unmap a range of memory covered by a list of vma's |
1072 | * @tlbp: address of the caller's struct mmu_gather | 1302 | * @tlb: address of the caller's struct mmu_gather |
1073 | * @vma: the starting vma | 1303 | * @vma: the starting vma |
1074 | * @start_addr: virtual address at which to start unmapping | 1304 | * @start_addr: virtual address at which to start unmapping |
1075 | * @end_addr: virtual address at which to end unmapping | 1305 | * @end_addr: virtual address at which to end unmapping |
@@ -1093,17 +1323,12 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1093 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() | 1323 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() |
1094 | * drops the lock and schedules. | 1324 | * drops the lock and schedules. |
1095 | */ | 1325 | */ |
1096 | unsigned long unmap_vmas(struct mmu_gather **tlbp, | 1326 | unsigned long unmap_vmas(struct mmu_gather *tlb, |
1097 | struct vm_area_struct *vma, unsigned long start_addr, | 1327 | struct vm_area_struct *vma, unsigned long start_addr, |
1098 | unsigned long end_addr, unsigned long *nr_accounted, | 1328 | unsigned long end_addr, unsigned long *nr_accounted, |
1099 | struct zap_details *details) | 1329 | struct zap_details *details) |
1100 | { | 1330 | { |
1101 | long zap_work = ZAP_BLOCK_SIZE; | ||
1102 | unsigned long tlb_start = 0; /* For tlb_finish_mmu */ | ||
1103 | int tlb_start_valid = 0; | ||
1104 | unsigned long start = start_addr; | 1331 | unsigned long start = start_addr; |
1105 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | ||
1106 | int fullmm = (*tlbp)->fullmm; | ||
1107 | struct mm_struct *mm = vma->vm_mm; | 1332 | struct mm_struct *mm = vma->vm_mm; |
1108 | 1333 | ||
1109 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | 1334 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); |
@@ -1124,11 +1349,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
1124 | untrack_pfn_vma(vma, 0, 0); | 1349 | untrack_pfn_vma(vma, 0, 0); |
1125 | 1350 | ||
1126 | while (start != end) { | 1351 | while (start != end) { |
1127 | if (!tlb_start_valid) { | ||
1128 | tlb_start = start; | ||
1129 | tlb_start_valid = 1; | ||
1130 | } | ||
1131 | |||
1132 | if (unlikely(is_vm_hugetlb_page(vma))) { | 1352 | if (unlikely(is_vm_hugetlb_page(vma))) { |
1133 | /* | 1353 | /* |
1134 | * It is undesirable to test vma->vm_file as it | 1354 | * It is undesirable to test vma->vm_file as it |
@@ -1141,39 +1361,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
1141 | * Since no pte has actually been setup, it is | 1361 | * Since no pte has actually been setup, it is |
1142 | * safe to do nothing in this case. | 1362 | * safe to do nothing in this case. |
1143 | */ | 1363 | */ |
1144 | if (vma->vm_file) { | 1364 | if (vma->vm_file) |
1145 | unmap_hugepage_range(vma, start, end, NULL); | 1365 | unmap_hugepage_range(vma, start, end, NULL); |
1146 | zap_work -= (end - start) / | ||
1147 | pages_per_huge_page(hstate_vma(vma)); | ||
1148 | } | ||
1149 | 1366 | ||
1150 | start = end; | 1367 | start = end; |
1151 | } else | 1368 | } else |
1152 | start = unmap_page_range(*tlbp, vma, | 1369 | start = unmap_page_range(tlb, vma, start, end, details); |
1153 | start, end, &zap_work, details); | ||
1154 | |||
1155 | if (zap_work > 0) { | ||
1156 | BUG_ON(start != end); | ||
1157 | break; | ||
1158 | } | ||
1159 | |||
1160 | tlb_finish_mmu(*tlbp, tlb_start, start); | ||
1161 | |||
1162 | if (need_resched() || | ||
1163 | (i_mmap_lock && spin_needbreak(i_mmap_lock))) { | ||
1164 | if (i_mmap_lock) { | ||
1165 | *tlbp = NULL; | ||
1166 | goto out; | ||
1167 | } | ||
1168 | cond_resched(); | ||
1169 | } | ||
1170 | |||
1171 | *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); | ||
1172 | tlb_start_valid = 0; | ||
1173 | zap_work = ZAP_BLOCK_SIZE; | ||
1174 | } | 1370 | } |
1175 | } | 1371 | } |
1176 | out: | 1372 | |
1177 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | 1373 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); |
1178 | return start; /* which is now the end (or restart) address */ | 1374 | return start; /* which is now the end (or restart) address */ |
1179 | } | 1375 | } |
@@ -1189,16 +1385,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
1189 | unsigned long size, struct zap_details *details) | 1385 | unsigned long size, struct zap_details *details) |
1190 | { | 1386 | { |
1191 | struct mm_struct *mm = vma->vm_mm; | 1387 | struct mm_struct *mm = vma->vm_mm; |
1192 | struct mmu_gather *tlb; | 1388 | struct mmu_gather tlb; |
1193 | unsigned long end = address + size; | 1389 | unsigned long end = address + size; |
1194 | unsigned long nr_accounted = 0; | 1390 | unsigned long nr_accounted = 0; |
1195 | 1391 | ||
1196 | lru_add_drain(); | 1392 | lru_add_drain(); |
1197 | tlb = tlb_gather_mmu(mm, 0); | 1393 | tlb_gather_mmu(&tlb, mm, 0); |
1198 | update_hiwater_rss(mm); | 1394 | update_hiwater_rss(mm); |
1199 | end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); | 1395 | end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); |
1200 | if (tlb) | 1396 | tlb_finish_mmu(&tlb, address, end); |
1201 | tlb_finish_mmu(tlb, address, end); | ||
1202 | return end; | 1397 | return end; |
1203 | } | 1398 | } |
1204 | 1399 | ||
@@ -1262,7 +1457,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1262 | pud = pud_offset(pgd, address); | 1457 | pud = pud_offset(pgd, address); |
1263 | if (pud_none(*pud)) | 1458 | if (pud_none(*pud)) |
1264 | goto no_page_table; | 1459 | goto no_page_table; |
1265 | if (pud_huge(*pud)) { | 1460 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
1266 | BUG_ON(flags & FOLL_GET); | 1461 | BUG_ON(flags & FOLL_GET); |
1267 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 1462 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); |
1268 | goto out; | 1463 | goto out; |
@@ -1273,11 +1468,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1273 | pmd = pmd_offset(pud, address); | 1468 | pmd = pmd_offset(pud, address); |
1274 | if (pmd_none(*pmd)) | 1469 | if (pmd_none(*pmd)) |
1275 | goto no_page_table; | 1470 | goto no_page_table; |
1276 | if (pmd_huge(*pmd)) { | 1471 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
1277 | BUG_ON(flags & FOLL_GET); | 1472 | BUG_ON(flags & FOLL_GET); |
1278 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1473 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1279 | goto out; | 1474 | goto out; |
1280 | } | 1475 | } |
1476 | if (pmd_trans_huge(*pmd)) { | ||
1477 | if (flags & FOLL_SPLIT) { | ||
1478 | split_huge_page_pmd(mm, pmd); | ||
1479 | goto split_fallthrough; | ||
1480 | } | ||
1481 | spin_lock(&mm->page_table_lock); | ||
1482 | if (likely(pmd_trans_huge(*pmd))) { | ||
1483 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
1484 | spin_unlock(&mm->page_table_lock); | ||
1485 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1486 | } else { | ||
1487 | page = follow_trans_huge_pmd(mm, address, | ||
1488 | pmd, flags); | ||
1489 | spin_unlock(&mm->page_table_lock); | ||
1490 | goto out; | ||
1491 | } | ||
1492 | } else | ||
1493 | spin_unlock(&mm->page_table_lock); | ||
1494 | /* fall through */ | ||
1495 | } | ||
1496 | split_fallthrough: | ||
1281 | if (unlikely(pmd_bad(*pmd))) | 1497 | if (unlikely(pmd_bad(*pmd))) |
1282 | goto no_page_table; | 1498 | goto no_page_table; |
1283 | 1499 | ||
@@ -1310,6 +1526,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1310 | */ | 1526 | */ |
1311 | mark_page_accessed(page); | 1527 | mark_page_accessed(page); |
1312 | } | 1528 | } |
1529 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1530 | /* | ||
1531 | * The preliminary mapping check is mainly to avoid the | ||
1532 | * pointless overhead of lock_page on the ZERO_PAGE | ||
1533 | * which might bounce very badly if there is contention. | ||
1534 | * | ||
1535 | * If the page is already locked, we don't need to | ||
1536 | * handle it now - vmscan will handle it later if and | ||
1537 | * when it attempts to reclaim the page. | ||
1538 | */ | ||
1539 | if (page->mapping && trylock_page(page)) { | ||
1540 | lru_add_drain(); /* push cached pages to LRU */ | ||
1541 | /* | ||
1542 | * Because we lock page here and migration is | ||
1543 | * blocked by the pte's page reference, we need | ||
1544 | * only check for file-cache page truncation. | ||
1545 | */ | ||
1546 | if (page->mapping) | ||
1547 | mlock_vma_page(page); | ||
1548 | unlock_page(page); | ||
1549 | } | ||
1550 | } | ||
1313 | unlock: | 1551 | unlock: |
1314 | pte_unmap_unlock(ptep, ptl); | 1552 | pte_unmap_unlock(ptep, ptl); |
1315 | out: | 1553 | out: |
@@ -1339,9 +1577,65 @@ no_page_table: | |||
1339 | return page; | 1577 | return page; |
1340 | } | 1578 | } |
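[Editor's note: the new pmd_trans_huge branch in follow_page() above handles three outcomes: FOLL_SPLIT callers force a split and retry as a normal pte walk, a pmd caught mid-split is waited on, and a stable huge pmd is resolved under page_table_lock by follow_trans_huge_pmd(). Reduced to its decision flow (likely/unlikely annotations dropped):

	if (pmd_trans_huge(*pmd)) {
		if (flags & FOLL_SPLIT) {		/* caller wants 4K pages */
			split_huge_page_pmd(mm, pmd);
			goto split_fallthrough;
		}
		spin_lock(&mm->page_table_lock);
		if (!pmd_trans_huge(*pmd)) {		/* a split raced with us */
			spin_unlock(&mm->page_table_lock);
		} else if (pmd_trans_splitting(*pmd)) {	/* split in progress */
			spin_unlock(&mm->page_table_lock);
			wait_split_huge_page(vma->anon_vma, pmd);
		} else {				/* stable huge pmd */
			page = follow_trans_huge_pmd(mm, address, pmd, flags);
			spin_unlock(&mm->page_table_lock);
			goto out;
		}
		/* fall through: retry as a regular pte walk */
	}
]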
1341 | 1579 | ||
1580 | static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) | ||
1581 | { | ||
1582 | return stack_guard_page_start(vma, addr) || | ||
1583 | stack_guard_page_end(vma, addr+PAGE_SIZE); | ||
1584 | } | ||
1585 | |||
1586 | /** | ||
1587 | * __get_user_pages() - pin user pages in memory | ||
1588 | * @tsk: task_struct of target task | ||
1589 | * @mm: mm_struct of target mm | ||
1590 | * @start: starting user address | ||
1591 | * @nr_pages: number of pages from start to pin | ||
1592 | * @gup_flags: flags modifying pin behaviour | ||
1593 | * @pages: array that receives pointers to the pages pinned. | ||
1594 | * Should be at least nr_pages long. Or NULL, if caller | ||
1595 | * only intends to ensure the pages are faulted in. | ||
1596 | * @vmas: array of pointers to vmas corresponding to each page. | ||
1597 | * Or NULL if the caller does not require them. | ||
1598 | * @nonblocking: whether waiting for disk IO or mmap_sem contention | ||
1599 | * | ||
1600 | * Returns number of pages pinned. This may be fewer than the number | ||
1601 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
1602 | * were pinned, returns -errno. Each page returned must be released | ||
1603 | * with a put_page() call when it is finished with. vmas will only | ||
1604 | * remain valid while mmap_sem is held. | ||
1605 | * | ||
1606 | * Must be called with mmap_sem held for read or write. | ||
1607 | * | ||
1608 | * __get_user_pages walks a process's page tables and takes a reference to | ||
1609 | * each struct page that each user address corresponds to at a given | ||
1610 | * instant. That is, it takes the page that would be accessed if a user | ||
1611 | * thread accesses the given user virtual address at that instant. | ||
1612 | * | ||
1613 | * This does not guarantee that the page exists in the user mappings when | ||
1614 | * __get_user_pages returns, and there may even be a completely different | ||
1615 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
1616 | * and subsequently re faulted). However it does guarantee that the page | ||
1617 | * won't be freed completely. And mostly callers simply care that the page | ||
1618 | * contains data that was valid *at some point in time*. Typically, an IO | ||
1619 | * or similar operation cannot guarantee anything stronger anyway because | ||
1620 | * locks can't be held over the syscall boundary. | ||
1621 | * | ||
1622 | * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If | ||
1623 | * the page is written to, set_page_dirty (or set_page_dirty_lock, as | ||
1624 | * appropriate) must be called after the page is finished with, and | ||
1625 | * before put_page is called. | ||
1626 | * | ||
1627 | * If @nonblocking != NULL, __get_user_pages will not wait for disk IO | ||
1628 | * or mmap_sem contention, and if waiting is needed to pin all pages, | ||
1629 | * *@nonblocking will be set to 0. | ||
1630 | * | ||
1631 | * In most cases, get_user_pages or get_user_pages_fast should be used | ||
1632 | * instead of __get_user_pages. __get_user_pages should be used only if | ||
1633 | * you need some special @gup_flags. | ||
1634 | */ | ||
1342 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1635 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1343 | unsigned long start, int nr_pages, unsigned int gup_flags, | 1636 | unsigned long start, int nr_pages, unsigned int gup_flags, |
1344 | struct page **pages, struct vm_area_struct **vmas) | 1637 | struct page **pages, struct vm_area_struct **vmas, |
1638 | int *nonblocking) | ||
1345 | { | 1639 | { |
1346 | int i; | 1640 | int i; |
1347 | unsigned long vm_flags; | 1641 | unsigned long vm_flags; |
@@ -1365,9 +1659,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1365 | struct vm_area_struct *vma; | 1659 | struct vm_area_struct *vma; |
1366 | 1660 | ||
1367 | vma = find_extend_vma(mm, start); | 1661 | vma = find_extend_vma(mm, start); |
1368 | if (!vma && in_gate_area(tsk, start)) { | 1662 | if (!vma && in_gate_area(mm, start)) { |
1369 | unsigned long pg = start & PAGE_MASK; | 1663 | unsigned long pg = start & PAGE_MASK; |
1370 | struct vm_area_struct *gate_vma = get_gate_vma(tsk); | ||
1371 | pgd_t *pgd; | 1664 | pgd_t *pgd; |
1372 | pud_t *pud; | 1665 | pud_t *pud; |
1373 | pmd_t *pmd; | 1666 | pmd_t *pmd; |
@@ -1386,15 +1679,17 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1386 | pmd = pmd_offset(pud, pg); | 1679 | pmd = pmd_offset(pud, pg); |
1387 | if (pmd_none(*pmd)) | 1680 | if (pmd_none(*pmd)) |
1388 | return i ? : -EFAULT; | 1681 | return i ? : -EFAULT; |
1682 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1389 | pte = pte_offset_map(pmd, pg); | 1683 | pte = pte_offset_map(pmd, pg); |
1390 | if (pte_none(*pte)) { | 1684 | if (pte_none(*pte)) { |
1391 | pte_unmap(pte); | 1685 | pte_unmap(pte); |
1392 | return i ? : -EFAULT; | 1686 | return i ? : -EFAULT; |
1393 | } | 1687 | } |
1688 | vma = get_gate_vma(mm); | ||
1394 | if (pages) { | 1689 | if (pages) { |
1395 | struct page *page; | 1690 | struct page *page; |
1396 | 1691 | ||
1397 | page = vm_normal_page(gate_vma, start, *pte); | 1692 | page = vm_normal_page(vma, start, *pte); |
1398 | if (!page) { | 1693 | if (!page) { |
1399 | if (!(gup_flags & FOLL_DUMP) && | 1694 | if (!(gup_flags & FOLL_DUMP) && |
1400 | is_zero_pfn(pte_pfn(*pte))) | 1695 | is_zero_pfn(pte_pfn(*pte))) |
@@ -1408,12 +1703,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1408 | get_page(page); | 1703 | get_page(page); |
1409 | } | 1704 | } |
1410 | pte_unmap(pte); | 1705 | pte_unmap(pte); |
1411 | if (vmas) | 1706 | goto next_page; |
1412 | vmas[i] = gate_vma; | ||
1413 | i++; | ||
1414 | start += PAGE_SIZE; | ||
1415 | nr_pages--; | ||
1416 | continue; | ||
1417 | } | 1707 | } |
1418 | 1708 | ||
1419 | if (!vma || | 1709 | if (!vma || |
@@ -1441,23 +1731,52 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1441 | cond_resched(); | 1731 | cond_resched(); |
1442 | while (!(page = follow_page(vma, start, foll_flags))) { | 1732 | while (!(page = follow_page(vma, start, foll_flags))) { |
1443 | int ret; | 1733 | int ret; |
1734 | unsigned int fault_flags = 0; | ||
1735 | |||
1736 | /* For mlock, just skip the stack guard page. */ | ||
1737 | if (foll_flags & FOLL_MLOCK) { | ||
1738 | if (stack_guard_page(vma, start)) | ||
1739 | goto next_page; | ||
1740 | } | ||
1741 | if (foll_flags & FOLL_WRITE) | ||
1742 | fault_flags |= FAULT_FLAG_WRITE; | ||
1743 | if (nonblocking) | ||
1744 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
1745 | if (foll_flags & FOLL_NOWAIT) | ||
1746 | fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); | ||
1444 | 1747 | ||
1445 | ret = handle_mm_fault(mm, vma, start, | 1748 | ret = handle_mm_fault(mm, vma, start, |
1446 | (foll_flags & FOLL_WRITE) ? | 1749 | fault_flags); |
1447 | FAULT_FLAG_WRITE : 0); | ||
1448 | 1750 | ||
1449 | if (ret & VM_FAULT_ERROR) { | 1751 | if (ret & VM_FAULT_ERROR) { |
1450 | if (ret & VM_FAULT_OOM) | 1752 | if (ret & VM_FAULT_OOM) |
1451 | return i ? i : -ENOMEM; | 1753 | return i ? i : -ENOMEM; |
1452 | if (ret & | 1754 | if (ret & (VM_FAULT_HWPOISON | |
1453 | (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) | 1755 | VM_FAULT_HWPOISON_LARGE)) { |
1756 | if (i) | ||
1757 | return i; | ||
1758 | else if (gup_flags & FOLL_HWPOISON) | ||
1759 | return -EHWPOISON; | ||
1760 | else | ||
1761 | return -EFAULT; | ||
1762 | } | ||
1763 | if (ret & VM_FAULT_SIGBUS) | ||
1454 | return i ? i : -EFAULT; | 1764 | return i ? i : -EFAULT; |
1455 | BUG(); | 1765 | BUG(); |
1456 | } | 1766 | } |
1457 | if (ret & VM_FAULT_MAJOR) | 1767 | |
1458 | tsk->maj_flt++; | 1768 | if (tsk) { |
1459 | else | 1769 | if (ret & VM_FAULT_MAJOR) |
1460 | tsk->min_flt++; | 1770 | tsk->maj_flt++; |
1771 | else | ||
1772 | tsk->min_flt++; | ||
1773 | } | ||
1774 | |||
1775 | if (ret & VM_FAULT_RETRY) { | ||
1776 | if (nonblocking) | ||
1777 | *nonblocking = 0; | ||
1778 | return i; | ||
1779 | } | ||
1461 | 1780 | ||
1462 | /* | 1781 | /* |
1463 | * The VM_FAULT_WRITE bit tells us that | 1782 | * The VM_FAULT_WRITE bit tells us that |
@@ -1485,6 +1804,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1485 | flush_anon_page(vma, page, start); | 1804 | flush_anon_page(vma, page, start); |
1486 | flush_dcache_page(page); | 1805 | flush_dcache_page(page); |
1487 | } | 1806 | } |
1807 | next_page: | ||
1488 | if (vmas) | 1808 | if (vmas) |
1489 | vmas[i] = vma; | 1809 | vmas[i] = vma; |
1490 | i++; | 1810 | i++; |
@@ -1494,10 +1814,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1494 | } while (nr_pages); | 1814 | } while (nr_pages); |
1495 | return i; | 1815 | return i; |
1496 | } | 1816 | } |
1817 | EXPORT_SYMBOL(__get_user_pages); | ||
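[Editor's note: the new @nonblocking argument documented in the kernel-doc above lets the fault path drop mmap_sem instead of blocking on disk I/O: with FAULT_FLAG_ALLOW_RETRY set, a VM_FAULT_RETRY clears *nonblocking and __get_user_pages() returns with mmap_sem already released. A hypothetical caller sketch; pin_one_page() and its flag choices are illustrative and not part of this patch, only the __get_user_pages() signature is taken from the diff:

static int pin_one_page(struct task_struct *tsk, struct mm_struct *mm,
			unsigned long addr, int write, struct page **page)
{
	unsigned int flags = FOLL_TOUCH | FOLL_GET | (write ? FOLL_WRITE : 0);
	int locked = 1;
	int ret;

	down_read(&mm->mmap_sem);
	ret = __get_user_pages(tsk, mm, addr, 1, flags, page, NULL, &locked);
	if (locked)
		up_read(&mm->mmap_sem);
	/*
	 * locked == 0 means the fault handler hit VM_FAULT_RETRY and has
	 * already dropped mmap_sem; ret is the number of pages pinned so
	 * far (0 here) and the caller may re-take the lock and retry.
	 */
	return ret;
}
]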
1497 | 1818 | ||
1498 | /** | 1819 | /** |
1499 | * get_user_pages() - pin user pages in memory | 1820 | * get_user_pages() - pin user pages in memory |
1500 | * @tsk: task_struct of target task | 1821 | * @tsk: the task_struct to use for page fault accounting, or |
1822 | * NULL if faults are not to be recorded. | ||
1501 | * @mm: mm_struct of target mm | 1823 | * @mm: mm_struct of target mm |
1502 | * @start: starting user address | 1824 | * @start: starting user address |
1503 | * @nr_pages: number of pages from start to pin | 1825 | * @nr_pages: number of pages from start to pin |
@@ -1558,7 +1880,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1558 | if (force) | 1880 | if (force) |
1559 | flags |= FOLL_FORCE; | 1881 | flags |= FOLL_FORCE; |
1560 | 1882 | ||
1561 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); | 1883 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, |
1884 | NULL); | ||
1562 | } | 1885 | } |
1563 | EXPORT_SYMBOL(get_user_pages); | 1886 | EXPORT_SYMBOL(get_user_pages); |
1564 | 1887 | ||
@@ -1583,22 +1906,25 @@ struct page *get_dump_page(unsigned long addr) | |||
1583 | struct page *page; | 1906 | struct page *page; |
1584 | 1907 | ||
1585 | if (__get_user_pages(current, current->mm, addr, 1, | 1908 | if (__get_user_pages(current, current->mm, addr, 1, |
1586 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) | 1909 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, |
1910 | NULL) < 1) | ||
1587 | return NULL; | 1911 | return NULL; |
1588 | flush_cache_page(vma, addr, page_to_pfn(page)); | 1912 | flush_cache_page(vma, addr, page_to_pfn(page)); |
1589 | return page; | 1913 | return page; |
1590 | } | 1914 | } |
1591 | #endif /* CONFIG_ELF_CORE */ | 1915 | #endif /* CONFIG_ELF_CORE */ |
1592 | 1916 | ||
1593 | pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, | 1917 | pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, |
1594 | spinlock_t **ptl) | 1918 | spinlock_t **ptl) |
1595 | { | 1919 | { |
1596 | pgd_t * pgd = pgd_offset(mm, addr); | 1920 | pgd_t * pgd = pgd_offset(mm, addr); |
1597 | pud_t * pud = pud_alloc(mm, pgd, addr); | 1921 | pud_t * pud = pud_alloc(mm, pgd, addr); |
1598 | if (pud) { | 1922 | if (pud) { |
1599 | pmd_t * pmd = pmd_alloc(mm, pud, addr); | 1923 | pmd_t * pmd = pmd_alloc(mm, pud, addr); |
1600 | if (pmd) | 1924 | if (pmd) { |
1925 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1601 | return pte_alloc_map_lock(mm, pmd, addr, ptl); | 1926 | return pte_alloc_map_lock(mm, pmd, addr, ptl); |
1927 | } | ||
1602 | } | 1928 | } |
1603 | return NULL; | 1929 | return NULL; |
1604 | } | 1930 | } |
@@ -1817,6 +2143,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
1817 | pmd = pmd_alloc(mm, pud, addr); | 2143 | pmd = pmd_alloc(mm, pud, addr); |
1818 | if (!pmd) | 2144 | if (!pmd) |
1819 | return -ENOMEM; | 2145 | return -ENOMEM; |
2146 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1820 | do { | 2147 | do { |
1821 | next = pmd_addr_end(addr, end); | 2148 | next = pmd_addr_end(addr, end); |
1822 | if (remap_pte_range(mm, pmd, addr, next, | 2149 | if (remap_pte_range(mm, pmd, addr, next, |
@@ -2026,10 +2353,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range); | |||
2026 | * handle_pte_fault chooses page fault handler according to an entry | 2353 | * handle_pte_fault chooses page fault handler according to an entry |
2027 | * which was read non-atomically. Before making any commitment, on | 2354 | * which was read non-atomically. Before making any commitment, on |
2028 | * those architectures or configurations (e.g. i386 with PAE) which | 2355 | * those architectures or configurations (e.g. i386 with PAE) which |
2029 | * might give a mix of unmatched parts, do_swap_page and do_file_page | 2356 | * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault |
2030 | * must check under lock before unmapping the pte and proceeding | 2357 | * must check under lock before unmapping the pte and proceeding |
2031 | * (but do_wp_page is only called after already making such a check; | 2358 | * (but do_wp_page is only called after already making such a check; |
2032 | * and do_anonymous_page and do_no_page can safely check later on). | 2359 | * and do_anonymous_page can safely check later on). |
2033 | */ | 2360 | */ |
2034 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | 2361 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, |
2035 | pte_t *page_table, pte_t orig_pte) | 2362 | pte_t *page_table, pte_t orig_pte) |
@@ -2047,19 +2374,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | |||
2047 | return same; | 2374 | return same; |
2048 | } | 2375 | } |
2049 | 2376 | ||
2050 | /* | ||
2051 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when | ||
2052 | * servicing faults for write access. In the normal case, do always want | ||
2053 | * pte_mkwrite. But get_user_pages can cause write faults for mappings | ||
2054 | * that do not have writing enabled, when used by access_process_vm. | ||
2055 | */ | ||
2056 | static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | ||
2057 | { | ||
2058 | if (likely(vma->vm_flags & VM_WRITE)) | ||
2059 | pte = pte_mkwrite(pte); | ||
2060 | return pte; | ||
2061 | } | ||
2062 | |||
2063 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 2377 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
2064 | { | 2378 | { |
2065 | /* | 2379 | /* |
@@ -2079,7 +2393,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |||
2079 | * zeroes. | 2393 | * zeroes. |
2080 | */ | 2394 | */ |
2081 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) | 2395 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) |
2082 | memset(kaddr, 0, PAGE_SIZE); | 2396 | clear_page(kaddr); |
2083 | kunmap_atomic(kaddr, KM_USER0); | 2397 | kunmap_atomic(kaddr, KM_USER0); |
2084 | flush_dcache_page(dst); | 2398 | flush_dcache_page(dst); |
2085 | } else | 2399 | } else |
@@ -2107,10 +2421,11 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |||
2107 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2421 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2108 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2422 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2109 | spinlock_t *ptl, pte_t orig_pte) | 2423 | spinlock_t *ptl, pte_t orig_pte) |
2424 | __releases(ptl) | ||
2110 | { | 2425 | { |
2111 | struct page *old_page, *new_page; | 2426 | struct page *old_page, *new_page; |
2112 | pte_t entry; | 2427 | pte_t entry; |
2113 | int reuse = 0, ret = 0; | 2428 | int ret = 0; |
2114 | int page_mkwrite = 0; | 2429 | int page_mkwrite = 0; |
2115 | struct page *dirty_page = NULL; | 2430 | struct page *dirty_page = NULL; |
2116 | 2431 | ||
@@ -2142,19 +2457,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2142 | &ptl); | 2457 | &ptl); |
2143 | if (!pte_same(*page_table, orig_pte)) { | 2458 | if (!pte_same(*page_table, orig_pte)) { |
2144 | unlock_page(old_page); | 2459 | unlock_page(old_page); |
2145 | page_cache_release(old_page); | ||
2146 | goto unlock; | 2460 | goto unlock; |
2147 | } | 2461 | } |
2148 | page_cache_release(old_page); | 2462 | page_cache_release(old_page); |
2149 | } | 2463 | } |
2150 | reuse = reuse_swap_page(old_page); | 2464 | if (reuse_swap_page(old_page)) { |
2151 | if (reuse) | ||
2152 | /* | 2465 | /* |
2153 | * The page is all ours. Move it to our anon_vma so | 2466 | * The page is all ours. Move it to our anon_vma so |
2154 | * the rmap code will not search our parent or siblings. | 2467 | * the rmap code will not search our parent or siblings. |
2155 | * Protected against the rmap code by the page lock. | 2468 | * Protected against the rmap code by the page lock. |
2156 | */ | 2469 | */ |
2157 | page_move_anon_rmap(old_page, vma, address); | 2470 | page_move_anon_rmap(old_page, vma, address); |
2471 | unlock_page(old_page); | ||
2472 | goto reuse; | ||
2473 | } | ||
2158 | unlock_page(old_page); | 2474 | unlock_page(old_page); |
2159 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2475 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2160 | (VM_WRITE|VM_SHARED))) { | 2476 | (VM_WRITE|VM_SHARED))) { |
@@ -2210,7 +2526,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2210 | &ptl); | 2526 | &ptl); |
2211 | if (!pte_same(*page_table, orig_pte)) { | 2527 | if (!pte_same(*page_table, orig_pte)) { |
2212 | unlock_page(old_page); | 2528 | unlock_page(old_page); |
2213 | page_cache_release(old_page); | ||
2214 | goto unlock; | 2529 | goto unlock; |
2215 | } | 2530 | } |
2216 | 2531 | ||
@@ -2218,18 +2533,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2218 | } | 2533 | } |
2219 | dirty_page = old_page; | 2534 | dirty_page = old_page; |
2220 | get_page(dirty_page); | 2535 | get_page(dirty_page); |
2221 | reuse = 1; | ||
2222 | } | ||
2223 | 2536 | ||
2224 | if (reuse) { | ||
2225 | reuse: | 2537 | reuse: |
2226 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2538 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2227 | entry = pte_mkyoung(orig_pte); | 2539 | entry = pte_mkyoung(orig_pte); |
2228 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2540 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2229 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2541 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
2230 | update_mmu_cache(vma, address, page_table); | 2542 | update_mmu_cache(vma, address, page_table); |
2543 | pte_unmap_unlock(page_table, ptl); | ||
2231 | ret |= VM_FAULT_WRITE; | 2544 | ret |= VM_FAULT_WRITE; |
2232 | goto unlock; | 2545 | |
2546 | if (!dirty_page) | ||
2547 | return ret; | ||
2548 | |||
2549 | /* | ||
2550 | * Yes, Virginia, this is actually required to prevent a race | ||
2551 | * with clear_page_dirty_for_io() from clearing the page dirty | ||
2552 | * bit after it clear all dirty ptes, but before a racing | ||
2553 | * do_wp_page installs a dirty pte. | ||
2554 | * | ||
2555 | * __do_fault is protected similarly. | ||
2556 | */ | ||
2557 | if (!page_mkwrite) { | ||
2558 | wait_on_page_locked(dirty_page); | ||
2559 | set_page_dirty_balance(dirty_page, page_mkwrite); | ||
2560 | } | ||
2561 | put_page(dirty_page); | ||
2562 | if (page_mkwrite) { | ||
2563 | struct address_space *mapping = dirty_page->mapping; | ||
2564 | |||
2565 | set_page_dirty(dirty_page); | ||
2566 | unlock_page(dirty_page); | ||
2567 | page_cache_release(dirty_page); | ||
2568 | if (mapping) { | ||
2569 | /* | ||
2570 | * Some device drivers do not set page.mapping | ||
2571 | * but still dirty their pages | ||
2572 | */ | ||
2573 | balance_dirty_pages_ratelimited(mapping); | ||
2574 | } | ||
2575 | } | ||
2576 | |||
2577 | /* file_update_time outside page_lock */ | ||
2578 | if (vma->vm_file) | ||
2579 | file_update_time(vma->vm_file); | ||
2580 | |||
2581 | return ret; | ||
2233 | } | 2582 | } |
2234 | 2583 | ||
2235 | /* | 2584 | /* |
@@ -2254,16 +2603,6 @@ gotten: | |||
2254 | } | 2603 | } |
2255 | __SetPageUptodate(new_page); | 2604 | __SetPageUptodate(new_page); |
2256 | 2605 | ||
2257 | /* | ||
2258 | * Don't let another task, with possibly unlocked vma, | ||
2259 | * keep the mlocked page. | ||
2260 | */ | ||
2261 | if ((vma->vm_flags & VM_LOCKED) && old_page) { | ||
2262 | lock_page(old_page); /* for LRU manipulation */ | ||
2263 | clear_page_mlock(old_page); | ||
2264 | unlock_page(old_page); | ||
2265 | } | ||
2266 | |||
2267 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) | 2606 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
2268 | goto oom_free_new; | 2607 | goto oom_free_new; |
2269 | 2608 | ||
@@ -2331,42 +2670,19 @@ gotten: | |||
2331 | 2670 | ||
2332 | if (new_page) | 2671 | if (new_page) |
2333 | page_cache_release(new_page); | 2672 | page_cache_release(new_page); |
2334 | if (old_page) | ||
2335 | page_cache_release(old_page); | ||
2336 | unlock: | 2673 | unlock: |
2337 | pte_unmap_unlock(page_table, ptl); | 2674 | pte_unmap_unlock(page_table, ptl); |
2338 | if (dirty_page) { | 2675 | if (old_page) { |
2339 | /* | 2676 | /* |
2340 | * Yes, Virginia, this is actually required to prevent a race | 2677 | * Don't let another task, with possibly unlocked vma, |
2341 | * with clear_page_dirty_for_io() from clearing the page dirty | 2678 | * keep the mlocked page. |
2342 | * bit after it clear all dirty ptes, but before a racing | ||
2343 | * do_wp_page installs a dirty pte. | ||
2344 | * | ||
2345 | * do_no_page is protected similarly. | ||
2346 | */ | 2679 | */ |
2347 | if (!page_mkwrite) { | 2680 | if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { |
2348 | wait_on_page_locked(dirty_page); | 2681 | lock_page(old_page); /* LRU manipulation */ |
2349 | set_page_dirty_balance(dirty_page, page_mkwrite); | 2682 | munlock_vma_page(old_page); |
2350 | } | 2683 | unlock_page(old_page); |
2351 | put_page(dirty_page); | ||
2352 | if (page_mkwrite) { | ||
2353 | struct address_space *mapping = dirty_page->mapping; | ||
2354 | |||
2355 | set_page_dirty(dirty_page); | ||
2356 | unlock_page(dirty_page); | ||
2357 | page_cache_release(dirty_page); | ||
2358 | if (mapping) { | ||
2359 | /* | ||
2360 | * Some device drivers do not set page.mapping | ||
2361 | * but still dirty their pages | ||
2362 | */ | ||
2363 | balance_dirty_pages_ratelimited(mapping); | ||
2364 | } | ||
2365 | } | 2684 | } |
2366 | 2685 | page_cache_release(old_page); | |
2367 | /* file_update_time outside page_lock */ | ||
2368 | if (vma->vm_file) | ||
2369 | file_update_time(vma->vm_file); | ||
2370 | } | 2686 | } |
2371 | return ret; | 2687 | return ret; |
2372 | oom_free_new: | 2688 | oom_free_new: |
@@ -2386,96 +2702,11 @@ unwritable_page: | |||
2386 | return ret; | 2702 | return ret; |
2387 | } | 2703 | } |
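[Editor's note: the restructured do_wp_page() above funnels both the "reuse an exclusively owned anonymous page" case and the shared-writable page_mkwrite case through the single reuse: label, and the dirty-page throttling that used to run at the end of the function now happens right there, after the PTE has been made writable and the lock dropped (the "Yes, Virginia" comment explains the race it guards against). In outline, condensed from the hunks above:

reuse:
	entry = maybe_mkwrite(pte_mkdirty(pte_mkyoung(orig_pte)), vma);
	if (ptep_set_access_flags(vma, address, page_table, entry, 1))
		update_mmu_cache(vma, address, page_table);
	pte_unmap_unlock(page_table, ptl);	/* drop ptl before any throttling */
	ret |= VM_FAULT_WRITE;

	if (dirty_page) {			/* shared, writable file mapping */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
		}
		put_page(dirty_page);
		/* ... set_page_dirty()/balance_dirty_pages_ratelimited() and
		 *     file_update_time() exactly as in the hunk above ... */
	}
	return ret;
]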
2388 | 2704 | ||
2389 | /* | 2705 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
2390 | * Helper functions for unmap_mapping_range(). | ||
2391 | * | ||
2392 | * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ | ||
2393 | * | ||
2394 | * We have to restart searching the prio_tree whenever we drop the lock, | ||
2395 | * since the iterator is only valid while the lock is held, and anyway | ||
2396 | * a later vma might be split and reinserted earlier while lock dropped. | ||
2397 | * | ||
2398 | * The list of nonlinear vmas could be handled more efficiently, using | ||
2399 | * a placeholder, but handle it in the same way until a need is shown. | ||
2400 | * It is important to search the prio_tree before nonlinear list: a vma | ||
2401 | * may become nonlinear and be shifted from prio_tree to nonlinear list | ||
2402 | * while the lock is dropped; but never shifted from list to prio_tree. | ||
2403 | * | ||
2404 | * In order to make forward progress despite restarting the search, | ||
2405 | * vm_truncate_count is used to mark a vma as now dealt with, so we can | ||
2406 | * quickly skip it next time around. Since the prio_tree search only | ||
2407 | * shows us those vmas affected by unmapping the range in question, we | ||
2408 | * can't efficiently keep all vmas in step with mapping->truncate_count: | ||
2409 | * so instead reset them all whenever it wraps back to 0 (then go to 1). | ||
2410 | * mapping->truncate_count and vma->vm_truncate_count are protected by | ||
2411 | * i_mmap_lock. | ||
2412 | * | ||
2413 | * In order to make forward progress despite repeatedly restarting some | ||
2414 | * large vma, note the restart_addr from unmap_vmas when it breaks out: | ||
2415 | * and restart from that address when we reach that vma again. It might | ||
2416 | * have been split or merged, shrunk or extended, but never shifted: so | ||
2417 | * restart_addr remains valid so long as it remains in the vma's range. | ||
2418 | * unmap_mapping_range forces truncate_count to leap over page-aligned | ||
2419 | * values so we can save vma's restart_addr in its truncate_count field. | ||
2420 | */ | ||
2421 | #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) | ||
2422 | |||
2423 | static void reset_vma_truncate_counts(struct address_space *mapping) | ||
2424 | { | ||
2425 | struct vm_area_struct *vma; | ||
2426 | struct prio_tree_iter iter; | ||
2427 | |||
2428 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | ||
2429 | vma->vm_truncate_count = 0; | ||
2430 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | ||
2431 | vma->vm_truncate_count = 0; | ||
2432 | } | ||
2433 | |||
2434 | static int unmap_mapping_range_vma(struct vm_area_struct *vma, | ||
2435 | unsigned long start_addr, unsigned long end_addr, | 2706 | unsigned long start_addr, unsigned long end_addr, |
2436 | struct zap_details *details) | 2707 | struct zap_details *details) |
2437 | { | 2708 | { |
2438 | unsigned long restart_addr; | 2709 | zap_page_range(vma, start_addr, end_addr - start_addr, details); |
2439 | int need_break; | ||
2440 | |||
2441 | /* | ||
2442 | * files that support invalidating or truncating portions of the | ||
2443 | * file from under mmaped areas must have their ->fault function | ||
2444 | * return a locked page (and set VM_FAULT_LOCKED in the return). | ||
2445 | * This provides synchronisation against concurrent unmapping here. | ||
2446 | */ | ||
2447 | |||
2448 | again: | ||
2449 | restart_addr = vma->vm_truncate_count; | ||
2450 | if (is_restart_addr(restart_addr) && start_addr < restart_addr) { | ||
2451 | start_addr = restart_addr; | ||
2452 | if (start_addr >= end_addr) { | ||
2453 | /* Top of vma has been split off since last time */ | ||
2454 | vma->vm_truncate_count = details->truncate_count; | ||
2455 | return 0; | ||
2456 | } | ||
2457 | } | ||
2458 | |||
2459 | restart_addr = zap_page_range(vma, start_addr, | ||
2460 | end_addr - start_addr, details); | ||
2461 | need_break = need_resched() || spin_needbreak(details->i_mmap_lock); | ||
2462 | |||
2463 | if (restart_addr >= end_addr) { | ||
2464 | /* We have now completed this vma: mark it so */ | ||
2465 | vma->vm_truncate_count = details->truncate_count; | ||
2466 | if (!need_break) | ||
2467 | return 0; | ||
2468 | } else { | ||
2469 | /* Note restart_addr in vma's truncate_count field */ | ||
2470 | vma->vm_truncate_count = restart_addr; | ||
2471 | if (!need_break) | ||
2472 | goto again; | ||
2473 | } | ||
2474 | |||
2475 | spin_unlock(details->i_mmap_lock); | ||
2476 | cond_resched(); | ||
2477 | spin_lock(details->i_mmap_lock); | ||
2478 | return -EINTR; | ||
2479 | } | 2710 | } |
2480 | 2711 | ||
2481 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | 2712 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, |
@@ -2485,12 +2716,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | |||
2485 | struct prio_tree_iter iter; | 2716 | struct prio_tree_iter iter; |
2486 | pgoff_t vba, vea, zba, zea; | 2717 | pgoff_t vba, vea, zba, zea; |
2487 | 2718 | ||
2488 | restart: | ||
2489 | vma_prio_tree_foreach(vma, &iter, root, | 2719 | vma_prio_tree_foreach(vma, &iter, root, |
2490 | details->first_index, details->last_index) { | 2720 | details->first_index, details->last_index) { |
2491 | /* Skip quickly over those we have already dealt with */ | ||
2492 | if (vma->vm_truncate_count == details->truncate_count) | ||
2493 | continue; | ||
2494 | 2721 | ||
2495 | vba = vma->vm_pgoff; | 2722 | vba = vma->vm_pgoff; |
2496 | vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; | 2723 | vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; |
@@ -2502,11 +2729,10 @@ restart: | |||
2502 | if (zea > vea) | 2729 | if (zea > vea) |
2503 | zea = vea; | 2730 | zea = vea; |
2504 | 2731 | ||
2505 | if (unmap_mapping_range_vma(vma, | 2732 | unmap_mapping_range_vma(vma, |
2506 | ((zba - vba) << PAGE_SHIFT) + vma->vm_start, | 2733 | ((zba - vba) << PAGE_SHIFT) + vma->vm_start, |
2507 | ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, | 2734 | ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, |
2508 | details) < 0) | 2735 | details); |
2509 | goto restart; | ||
2510 | } | 2736 | } |
2511 | } | 2737 | } |
2512 | 2738 | ||
@@ -2521,15 +2747,9 @@ static inline void unmap_mapping_range_list(struct list_head *head, | |||
2521 | * across *all* the pages in each nonlinear VMA, not just the pages | 2747 | * across *all* the pages in each nonlinear VMA, not just the pages |
2522 | * whose virtual address lies outside the file truncation point. | 2748 | * whose virtual address lies outside the file truncation point. |
2523 | */ | 2749 | */ |
2524 | restart: | ||
2525 | list_for_each_entry(vma, head, shared.vm_set.list) { | 2750 | list_for_each_entry(vma, head, shared.vm_set.list) { |
2526 | /* Skip quickly over those we have already dealt with */ | ||
2527 | if (vma->vm_truncate_count == details->truncate_count) | ||
2528 | continue; | ||
2529 | details->nonlinear_vma = vma; | 2751 | details->nonlinear_vma = vma; |
2530 | if (unmap_mapping_range_vma(vma, vma->vm_start, | 2752 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); |
2531 | vma->vm_end, details) < 0) | ||
2532 | goto restart; | ||
2533 | } | 2753 | } |
2534 | } | 2754 | } |
2535 | 2755 | ||
@@ -2568,51 +2788,17 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2568 | details.last_index = hba + hlen - 1; | 2788 | details.last_index = hba + hlen - 1; |
2569 | if (details.last_index < details.first_index) | 2789 | if (details.last_index < details.first_index) |
2570 | details.last_index = ULONG_MAX; | 2790 | details.last_index = ULONG_MAX; |
2571 | details.i_mmap_lock = &mapping->i_mmap_lock; | ||
2572 | |||
2573 | spin_lock(&mapping->i_mmap_lock); | ||
2574 | 2791 | ||
2575 | /* Protect against endless unmapping loops */ | ||
2576 | mapping->truncate_count++; | ||
2577 | if (unlikely(is_restart_addr(mapping->truncate_count))) { | ||
2578 | if (mapping->truncate_count == 0) | ||
2579 | reset_vma_truncate_counts(mapping); | ||
2580 | mapping->truncate_count++; | ||
2581 | } | ||
2582 | details.truncate_count = mapping->truncate_count; | ||
2583 | 2792 | ||
2793 | mutex_lock(&mapping->i_mmap_mutex); | ||
2584 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) | 2794 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) |
2585 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2795 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2586 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 2796 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
2587 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 2797 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
2588 | spin_unlock(&mapping->i_mmap_lock); | 2798 | mutex_unlock(&mapping->i_mmap_mutex); |
2589 | } | 2799 | } |
2590 | EXPORT_SYMBOL(unmap_mapping_range); | 2800 | EXPORT_SYMBOL(unmap_mapping_range); |
2591 | 2801 | ||
2592 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | ||
2593 | { | ||
2594 | struct address_space *mapping = inode->i_mapping; | ||
2595 | |||
2596 | /* | ||
2597 | * If the underlying filesystem is not going to provide | ||
2598 | * a way to truncate a range of blocks (punch a hole) - | ||
2599 | * we should return failure right now. | ||
2600 | */ | ||
2601 | if (!inode->i_op->truncate_range) | ||
2602 | return -ENOSYS; | ||
2603 | |||
2604 | mutex_lock(&inode->i_mutex); | ||
2605 | down_write(&inode->i_alloc_sem); | ||
2606 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
2607 | truncate_inode_pages_range(mapping, offset, end); | ||
2608 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
2609 | inode->i_op->truncate_range(inode, offset, end); | ||
2610 | up_write(&inode->i_alloc_sem); | ||
2611 | mutex_unlock(&inode->i_mutex); | ||
2612 | |||
2613 | return 0; | ||
2614 | } | ||
2615 | |||
2616 | /* | 2802 | /* |
2617 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2803 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2618 | * but allow concurrent faults), and pte mapped but not yet locked. | 2804 | * but allow concurrent faults), and pte mapped but not yet locked. |
@@ -2626,7 +2812,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2626 | struct page *page, *swapcache = NULL; | 2812 | struct page *page, *swapcache = NULL; |
2627 | swp_entry_t entry; | 2813 | swp_entry_t entry; |
2628 | pte_t pte; | 2814 | pte_t pte; |
2629 | struct mem_cgroup *ptr = NULL; | 2815 | int locked; |
2816 | struct mem_cgroup *ptr; | ||
2630 | int exclusive = 0; | 2817 | int exclusive = 0; |
2631 | int ret = 0; | 2818 | int ret = 0; |
2632 | 2819 | ||
@@ -2666,6 +2853,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2666 | /* Had to read the page from swap area: Major fault */ | 2853 | /* Had to read the page from swap area: Major fault */ |
2667 | ret = VM_FAULT_MAJOR; | 2854 | ret = VM_FAULT_MAJOR; |
2668 | count_vm_event(PGMAJFAULT); | 2855 | count_vm_event(PGMAJFAULT); |
2856 | mem_cgroup_count_vm_event(mm, PGMAJFAULT); | ||
2669 | } else if (PageHWPoison(page)) { | 2857 | } else if (PageHWPoison(page)) { |
2670 | /* | 2858 | /* |
2671 | * hwpoisoned dirty swapcache pages are kept for killing | 2859 | * hwpoisoned dirty swapcache pages are kept for killing |
@@ -2676,8 +2864,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2676 | goto out_release; | 2864 | goto out_release; |
2677 | } | 2865 | } |
2678 | 2866 | ||
2679 | lock_page(page); | 2867 | locked = lock_page_or_retry(page, mm, flags); |
2680 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2868 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2869 | if (!locked) { | ||
2870 | ret |= VM_FAULT_RETRY; | ||
2871 | goto out_release; | ||
2872 | } | ||
2681 | 2873 | ||
2682 | /* | 2874 | /* |
2683 | * Make sure try_to_free_swap or reuse_swap_page or swapoff did not | 2875 | * Make sure try_to_free_swap or reuse_swap_page or swapoff did not |
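Note: the VM_FAULT_RETRY value propagated above only has an effect when the architecture's fault handler opts in with FAULT_FLAG_ALLOW_RETRY and retries after the fault path has dropped mmap_sem on its behalf (lock_page_or_retry() releases the semaphore before sleeping). A rough caller-side sketch, modelled on the x86 do_page_fault() of this era, with the vma/access checks and error handling elided; variable names are illustrative only:

	/* Sketch only: simplified arch fault-handler retry loop. */
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
			     (write ? FAULT_FLAG_WRITE : 0);
	int fault;

retry:
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	/* ... vma and access checks elided ... */
	fault = handle_mm_fault(mm, vma, address, flags);
	if (unlikely(fault & VM_FAULT_RETRY)) {
		/* mmap_sem was already released by the fault path */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;	/* retry at most once */
		goto retry;
	}
	up_read(&mm->mmap_sem);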
@@ -2810,7 +3002,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
2810 | if (prev && prev->vm_end == address) | 3002 | if (prev && prev->vm_end == address) |
2811 | return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; | 3003 | return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; |
2812 | 3004 | ||
2813 | expand_stack(vma, address - PAGE_SIZE); | 3005 | expand_downwards(vma, address - PAGE_SIZE); |
2814 | } | 3006 | } |
2815 | if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { | 3007 | if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { |
2816 | struct vm_area_struct *next = vma->vm_next; | 3008 | struct vm_area_struct *next = vma->vm_next; |
@@ -2926,7 +3118,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2926 | vmf.page = NULL; | 3118 | vmf.page = NULL; |
2927 | 3119 | ||
2928 | ret = vma->vm_ops->fault(vma, &vmf); | 3120 | ret = vma->vm_ops->fault(vma, &vmf); |
2929 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 3121 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | |
3122 | VM_FAULT_RETRY))) | ||
2930 | return ret; | 3123 | return ret; |
2931 | 3124 | ||
2932 | if (unlikely(PageHWPoison(vmf.page))) { | 3125 | if (unlikely(PageHWPoison(vmf.page))) { |
@@ -2967,12 +3160,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2967 | goto out; | 3160 | goto out; |
2968 | } | 3161 | } |
2969 | charged = 1; | 3162 | charged = 1; |
2970 | /* | ||
2971 | * Don't let another task, with possibly unlocked vma, | ||
2972 | * keep the mlocked page. | ||
2973 | */ | ||
2974 | if (vma->vm_flags & VM_LOCKED) | ||
2975 | clear_page_mlock(vmf.page); | ||
2976 | copy_user_highpage(page, vmf.page, address, vma); | 3163 | copy_user_highpage(page, vmf.page, address, vma); |
2977 | __SetPageUptodate(page); | 3164 | __SetPageUptodate(page); |
2978 | } else { | 3165 | } else { |
@@ -3139,9 +3326,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3139 | * but allow concurrent faults), and pte mapped but not yet locked. | 3326 | * but allow concurrent faults), and pte mapped but not yet locked. |
3140 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 3327 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
3141 | */ | 3328 | */ |
3142 | static inline int handle_pte_fault(struct mm_struct *mm, | 3329 | int handle_pte_fault(struct mm_struct *mm, |
3143 | struct vm_area_struct *vma, unsigned long address, | 3330 | struct vm_area_struct *vma, unsigned long address, |
3144 | pte_t *pte, pmd_t *pmd, unsigned int flags) | 3331 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
3145 | { | 3332 | { |
3146 | pte_t entry; | 3333 | pte_t entry; |
3147 | spinlock_t *ptl; | 3334 | spinlock_t *ptl; |
@@ -3185,7 +3372,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
3185 | * with threads. | 3372 | * with threads. |
3186 | */ | 3373 | */ |
3187 | if (flags & FAULT_FLAG_WRITE) | 3374 | if (flags & FAULT_FLAG_WRITE) |
3188 | flush_tlb_page(vma, address); | 3375 | flush_tlb_fix_spurious_fault(vma, address); |
3189 | } | 3376 | } |
3190 | unlock: | 3377 | unlock: |
3191 | pte_unmap_unlock(pte, ptl); | 3378 | pte_unmap_unlock(pte, ptl); |
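For reference, flush_tlb_fix_spurious_fault() falls back to the previous behaviour unless an architecture overrides it; the asm-generic default is approximately the following, so the change above is a no-op by default but lets architectures such as x86 turn the spurious-fault flush into nothing:

	/* include/asm-generic/pgtable.h (approximate) */
	#ifndef flush_tlb_fix_spurious_fault
	#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
	#endif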
@@ -3206,6 +3393,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3206 | __set_current_state(TASK_RUNNING); | 3393 | __set_current_state(TASK_RUNNING); |
3207 | 3394 | ||
3208 | count_vm_event(PGFAULT); | 3395 | count_vm_event(PGFAULT); |
3396 | mem_cgroup_count_vm_event(mm, PGFAULT); | ||
3209 | 3397 | ||
3210 | /* do counter updates before entering really critical section. */ | 3398 | /* do counter updates before entering really critical section. */ |
3211 | check_sync_rss_stat(current); | 3399 | check_sync_rss_stat(current); |
@@ -3220,9 +3408,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3220 | pmd = pmd_alloc(mm, pud, address); | 3408 | pmd = pmd_alloc(mm, pud, address); |
3221 | if (!pmd) | 3409 | if (!pmd) |
3222 | return VM_FAULT_OOM; | 3410 | return VM_FAULT_OOM; |
3223 | pte = pte_alloc_map(mm, pmd, address); | 3411 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { |
3224 | if (!pte) | 3412 | if (!vma->vm_ops) |
3413 | return do_huge_pmd_anonymous_page(mm, vma, address, | ||
3414 | pmd, flags); | ||
3415 | } else { | ||
3416 | pmd_t orig_pmd = *pmd; | ||
3417 | barrier(); | ||
3418 | if (pmd_trans_huge(orig_pmd)) { | ||
3419 | if (flags & FAULT_FLAG_WRITE && | ||
3420 | !pmd_write(orig_pmd) && | ||
3421 | !pmd_trans_splitting(orig_pmd)) | ||
3422 | return do_huge_pmd_wp_page(mm, vma, address, | ||
3423 | pmd, orig_pmd); | ||
3424 | return 0; | ||
3425 | } | ||
3426 | } | ||
3427 | |||
3428 | /* | ||
3429 | * Use __pte_alloc instead of pte_alloc_map, because we can't | ||
3430 | * run pte_offset_map on the pmd if a huge pmd could | ||
3431 | * materialize from under us from a different thread. | ||
3432 | */ | ||
3433 | if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) | ||
3225 | return VM_FAULT_OOM; | 3434 | return VM_FAULT_OOM; |
3435 | /* if a huge pmd materialized from under us, just retry later */ | ||
3436 | if (unlikely(pmd_trans_huge(*pmd))) | ||
3437 | return 0; | ||
3438 | /* | ||
3439 | * A regular pmd is established and it can't morph into a huge pmd | ||
3440 | * from under us anymore at this point because we hold the mmap_sem | ||
3441 | * in read mode and khugepaged takes it in write mode. So now it's | ||
3442 | * safe to run pte_offset_map(). | ||
3443 | */ | ||
3444 | pte = pte_offset_map(pmd, address); | ||
3226 | 3445 | ||
3227 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3446 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
3228 | } | 3447 | } |
@@ -3288,7 +3507,12 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
3288 | vma = find_vma(current->mm, addr); | 3507 | vma = find_vma(current->mm, addr); |
3289 | if (!vma) | 3508 | if (!vma) |
3290 | return -ENOMEM; | 3509 | return -ENOMEM; |
3291 | write = (vma->vm_flags & VM_WRITE) != 0; | 3510 | /* |
3511 | * We want to touch writable mappings with a write fault in order | ||
3512 | * to break COW, except for shared mappings because these don't COW | ||
3513 | * and we would not want to dirty them for nothing. | ||
3514 | */ | ||
3515 | write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; | ||
3292 | BUG_ON(addr >= end); | 3516 | BUG_ON(addr >= end); |
3293 | BUG_ON(end > vma->vm_end); | 3517 | BUG_ON(end > vma->vm_end); |
3294 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | 3518 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; |
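The new write test can be read as a small decision table (illustration only, derived from the expression and comment above):

	vm_flags                 (vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE   access used to fault pages in
	VM_WRITE (private)       true                                            write, so COW is broken up front
	VM_WRITE | VM_SHARED     false                                           read; shared pages never COW
	no VM_WRITE              false                                           read; write access is not allowed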
@@ -3323,7 +3547,7 @@ static int __init gate_vma_init(void) | |||
3323 | __initcall(gate_vma_init); | 3547 | __initcall(gate_vma_init); |
3324 | #endif | 3548 | #endif |
3325 | 3549 | ||
3326 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | 3550 | struct vm_area_struct *get_gate_vma(struct mm_struct *mm) |
3327 | { | 3551 | { |
3328 | #ifdef AT_SYSINFO_EHDR | 3552 | #ifdef AT_SYSINFO_EHDR |
3329 | return &gate_vma; | 3553 | return &gate_vma; |
@@ -3332,7 +3556,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | |||
3332 | #endif | 3556 | #endif |
3333 | } | 3557 | } |
3334 | 3558 | ||
3335 | int in_gate_area_no_task(unsigned long addr) | 3559 | int in_gate_area_no_mm(unsigned long addr) |
3336 | { | 3560 | { |
3337 | #ifdef AT_SYSINFO_EHDR | 3561 | #ifdef AT_SYSINFO_EHDR |
3338 | if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) | 3562 | if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) |
@@ -3343,7 +3567,7 @@ int in_gate_area_no_task(unsigned long addr) | |||
3343 | 3567 | ||
3344 | #endif /* __HAVE_ARCH_GATE_AREA */ | 3568 | #endif /* __HAVE_ARCH_GATE_AREA */ |
3345 | 3569 | ||
3346 | static int follow_pte(struct mm_struct *mm, unsigned long address, | 3570 | static int __follow_pte(struct mm_struct *mm, unsigned long address, |
3347 | pte_t **ptepp, spinlock_t **ptlp) | 3571 | pte_t **ptepp, spinlock_t **ptlp) |
3348 | { | 3572 | { |
3349 | pgd_t *pgd; | 3573 | pgd_t *pgd; |
@@ -3360,6 +3584,7 @@ static int follow_pte(struct mm_struct *mm, unsigned long address, | |||
3360 | goto out; | 3584 | goto out; |
3361 | 3585 | ||
3362 | pmd = pmd_offset(pud, address); | 3586 | pmd = pmd_offset(pud, address); |
3587 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
3363 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | 3588 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
3364 | goto out; | 3589 | goto out; |
3365 | 3590 | ||
@@ -3380,6 +3605,17 @@ out: | |||
3380 | return -EINVAL; | 3605 | return -EINVAL; |
3381 | } | 3606 | } |
3382 | 3607 | ||
3608 | static inline int follow_pte(struct mm_struct *mm, unsigned long address, | ||
3609 | pte_t **ptepp, spinlock_t **ptlp) | ||
3610 | { | ||
3611 | int res; | ||
3612 | |||
3613 | /* (void) is needed to make gcc happy */ | ||
3614 | (void) __cond_lock(*ptlp, | ||
3615 | !(res = __follow_pte(mm, address, ptepp, ptlp))); | ||
3616 | return res; | ||
3617 | } | ||
3618 | |||
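The __cond_lock() wrapper above exists purely for sparse: it tells the checker that *ptlp is acquired exactly when __follow_pte() returns 0. Its definition in include/linux/compiler.h is approximately:

	#ifdef __CHECKER__
	# define __cond_lock(x, c)	((c) ? ({ __acquire(x); 1; }) : 0)
	#else
	# define __cond_lock(x, c)	(c)
	#endif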
3383 | /** | 3619 | /** |
3384 | * follow_pfn - look up PFN at a user virtual address | 3620 | * follow_pfn - look up PFN at a user virtual address |
3385 | * @vma: memory mapping | 3621 | * @vma: memory mapping |
@@ -3461,20 +3697,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | |||
3461 | #endif | 3697 | #endif |
3462 | 3698 | ||
3463 | /* | 3699 | /* |
3464 | * Access another process' address space. | 3700 | * Access another process' address space as given in mm. If tsk is non-NULL, |
3465 | * Source/target buffer must be kernel space, | 3701 | * use that task for page fault accounting. |
3466 | * Do not walk the page table directly, use get_user_pages | ||
3467 | */ | 3702 | */ |
3468 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | 3703 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
3704 | unsigned long addr, void *buf, int len, int write) | ||
3469 | { | 3705 | { |
3470 | struct mm_struct *mm; | ||
3471 | struct vm_area_struct *vma; | 3706 | struct vm_area_struct *vma; |
3472 | void *old_buf = buf; | 3707 | void *old_buf = buf; |
3473 | 3708 | ||
3474 | mm = get_task_mm(tsk); | ||
3475 | if (!mm) | ||
3476 | return 0; | ||
3477 | |||
3478 | down_read(&mm->mmap_sem); | 3709 | down_read(&mm->mmap_sem); |
3479 | /* ignore errors, just check how much was successfully transferred */ | 3710 | /* ignore errors, just check how much was successfully transferred */ |
3480 | while (len) { | 3711 | while (len) { |
@@ -3491,7 +3722,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
3491 | */ | 3722 | */ |
3492 | #ifdef CONFIG_HAVE_IOREMAP_PROT | 3723 | #ifdef CONFIG_HAVE_IOREMAP_PROT |
3493 | vma = find_vma(mm, addr); | 3724 | vma = find_vma(mm, addr); |
3494 | if (!vma) | 3725 | if (!vma || vma->vm_start > addr) |
3495 | break; | 3726 | break; |
3496 | if (vma->vm_ops && vma->vm_ops->access) | 3727 | if (vma->vm_ops && vma->vm_ops->access) |
3497 | ret = vma->vm_ops->access(vma, addr, buf, | 3728 | ret = vma->vm_ops->access(vma, addr, buf, |
@@ -3523,11 +3754,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
3523 | addr += bytes; | 3754 | addr += bytes; |
3524 | } | 3755 | } |
3525 | up_read(&mm->mmap_sem); | 3756 | up_read(&mm->mmap_sem); |
3526 | mmput(mm); | ||
3527 | 3757 | ||
3528 | return buf - old_buf; | 3758 | return buf - old_buf; |
3529 | } | 3759 | } |
3530 | 3760 | ||
3761 | /** | ||
3762 | * access_remote_vm - access another process' address space | ||
3763 | * @mm: the mm_struct of the target address space | ||
3764 | * @addr: start address to access | ||
3765 | * @buf: source or destination buffer | ||
3766 | * @len: number of bytes to transfer | ||
3767 | * @write: whether the access is a write | ||
3768 | * | ||
3769 | * The caller must hold a reference on @mm. | ||
3770 | */ | ||
3771 | int access_remote_vm(struct mm_struct *mm, unsigned long addr, | ||
3772 | void *buf, int len, int write) | ||
3773 | { | ||
3774 | return __access_remote_vm(NULL, mm, addr, buf, len, write); | ||
3775 | } | ||
3776 | |||
3777 | /* | ||
3778 | * Access another process' address space. | ||
3779 | * Source/target buffer must be in kernel space. | ||
3780 | * Do not walk the page table directly; use get_user_pages. | ||
3781 | */ | ||
3782 | int access_process_vm(struct task_struct *tsk, unsigned long addr, | ||
3783 | void *buf, int len, int write) | ||
3784 | { | ||
3785 | struct mm_struct *mm; | ||
3786 | int ret; | ||
3787 | |||
3788 | mm = get_task_mm(tsk); | ||
3789 | if (!mm) | ||
3790 | return 0; | ||
3791 | |||
3792 | ret = __access_remote_vm(tsk, mm, addr, buf, len, write); | ||
3793 | mmput(mm); | ||
3794 | |||
3795 | return ret; | ||
3796 | } | ||
3797 | |||
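A hypothetical caller of the new access_remote_vm() interface (the helper name below is invented for illustration): it obtains and drops the mm reference that the kerneldoc requires, exactly as access_process_vm() does above, just without passing a task for fault accounting:

	/* Hypothetical example: read @len bytes at @addr in @tsk's address space. */
	static int example_read_task_mem(struct task_struct *tsk, unsigned long addr,
					 void *buf, int len)
	{
		struct mm_struct *mm = get_task_mm(tsk);	/* take mm reference */
		int copied;

		if (!mm)
			return 0;
		copied = access_remote_vm(mm, addr, buf, len, 0);	/* 0 == read */
		mmput(mm);					/* drop mm reference */
		return copied;
	}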
3531 | /* | 3798 | /* |
3532 | * Print the name of a VMA. | 3799 | * Print the name of a VMA. |
3533 | */ | 3800 | */ |
@@ -3589,3 +3856,74 @@ void might_fault(void) | |||
3589 | } | 3856 | } |
3590 | EXPORT_SYMBOL(might_fault); | 3857 | EXPORT_SYMBOL(might_fault); |
3591 | #endif | 3858 | #endif |
3859 | |||
3860 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) | ||
3861 | static void clear_gigantic_page(struct page *page, | ||
3862 | unsigned long addr, | ||
3863 | unsigned int pages_per_huge_page) | ||
3864 | { | ||
3865 | int i; | ||
3866 | struct page *p = page; | ||
3867 | |||
3868 | might_sleep(); | ||
3869 | for (i = 0; i < pages_per_huge_page; | ||
3870 | i++, p = mem_map_next(p, page, i)) { | ||
3871 | cond_resched(); | ||
3872 | clear_user_highpage(p, addr + i * PAGE_SIZE); | ||
3873 | } | ||
3874 | } | ||
3875 | void clear_huge_page(struct page *page, | ||
3876 | unsigned long addr, unsigned int pages_per_huge_page) | ||
3877 | { | ||
3878 | int i; | ||
3879 | |||
3880 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | ||
3881 | clear_gigantic_page(page, addr, pages_per_huge_page); | ||
3882 | return; | ||
3883 | } | ||
3884 | |||
3885 | might_sleep(); | ||
3886 | for (i = 0; i < pages_per_huge_page; i++) { | ||
3887 | cond_resched(); | ||
3888 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | ||
3889 | } | ||
3890 | } | ||
3891 | |||
3892 | static void copy_user_gigantic_page(struct page *dst, struct page *src, | ||
3893 | unsigned long addr, | ||
3894 | struct vm_area_struct *vma, | ||
3895 | unsigned int pages_per_huge_page) | ||
3896 | { | ||
3897 | int i; | ||
3898 | struct page *dst_base = dst; | ||
3899 | struct page *src_base = src; | ||
3900 | |||
3901 | for (i = 0; i < pages_per_huge_page; ) { | ||
3902 | cond_resched(); | ||
3903 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | ||
3904 | |||
3905 | i++; | ||
3906 | dst = mem_map_next(dst, dst_base, i); | ||
3907 | src = mem_map_next(src, src_base, i); | ||
3908 | } | ||
3909 | } | ||
3910 | |||
3911 | void copy_user_huge_page(struct page *dst, struct page *src, | ||
3912 | unsigned long addr, struct vm_area_struct *vma, | ||
3913 | unsigned int pages_per_huge_page) | ||
3914 | { | ||
3915 | int i; | ||
3916 | |||
3917 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | ||
3918 | copy_user_gigantic_page(dst, src, addr, vma, | ||
3919 | pages_per_huge_page); | ||
3920 | return; | ||
3921 | } | ||
3922 | |||
3923 | might_sleep(); | ||
3924 | for (i = 0; i < pages_per_huge_page; i++) { | ||
3925 | cond_resched(); | ||
3926 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | ||
3927 | } | ||
3928 | } | ||
3929 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | ||
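The gigantic-page variants iterate with mem_map_next() rather than plain pointer arithmetic because the struct pages of a compound page larger than MAX_ORDER_NR_PAGES need not be contiguous in the mem_map (sections can live in separate allocations). Its definition in mm/internal.h is approximately:

	static inline struct page *mem_map_next(struct page *iter,
						struct page *base, int offset)
	{
		if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
			unsigned long pfn = page_to_pfn(base) + offset;

			if (!pfn_valid(pfn))
				return NULL;
			return pfn_to_page(pfn);
		}
		return iter + 1;
	}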