author     Michel Lespinasse <walken@google.com>            2011-01-13 18:46:14 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2011-01-13 20:32:36 -0500
commit     53a7706d5ed8f1a53ba062b318773160cc476dde
tree       a1990d90d5af3686b7a83b2bbc2ae6463971efc5
parent     5fdb2002131cd4e210b9638a4fc932ec7be491d1
mlock: do not hold mmap_sem for extended periods of time

__get_user_pages gets a new 'nonblocking' parameter to signal that the
caller is prepared to re-acquire mmap_sem and retry the operation if
needed. This is used to split off long operations if they are going to
block on a disk transfer, or when we detect contention on the mmap_sem.
[akpm@linux-foundation.org: remove ref to rwsem_is_contended()]
Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
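The caller-side pattern this enables looks roughly like the sketch below. This is illustrative only and not part of the patch: fault_in_range() is a hypothetical helper, the FOLL_TOUCH gup_flags value is arbitrary, and start/end are assumed page-aligned. The idea is to pass a non-NULL 'nonblocking' pointer and, whenever __get_user_pages() clears it, re-take mmap_sem before continuing, since the callee has already dropped it.

/* Hypothetical caller sketch, mirroring the do_mlock_pages() loop changed below. */
static long fault_in_range(struct mm_struct *mm, unsigned long start,
                           unsigned long end)
{
        int locked = 0;
        int ret = 0;

        while (start < end) {
                if (!locked) {
                        /* (re)take the semaphore the callee may have dropped */
                        down_read(&mm->mmap_sem);
                        locked = 1;
                }
                /*
                 * Returns how many pages were processed; clears 'locked'
                 * (after mmap_sem has already been dropped) if a fault
                 * had to block.
                 */
                ret = __get_user_pages(current, mm, start,
                                       (end - start) / PAGE_SIZE,
                                       FOLL_TOUCH, NULL, NULL, &locked);
                if (ret < 0)
                        break;
                start += ret * PAGE_SIZE;
        }
        if (locked)
                up_read(&mm->mmap_sem);
        return ret < 0 ? ret : 0;
}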
 mm/internal.h |  3
 mm/memory.c   | 23
 mm/mlock.c    | 40
 mm/nommu.c    |  6
 4 files changed, 47 insertions(+), 25 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index dedb0aff673f..bd4f581f624a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -243,7 +243,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                      unsigned long start, int len, unsigned int foll_flags,
-                     struct page **pages, struct vm_area_struct **vmas);
+                     struct page **pages, struct vm_area_struct **vmas,
+                     int *nonblocking);
 
 #define ZONE_RECLAIM_NOSCAN  -2
 #define ZONE_RECLAIM_FULL    -1
diff --git a/mm/memory.c b/mm/memory.c
index 15e1f19a3b10..1bbe9a22429c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1363,7 +1363,8 @@ no_page_table:
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                      unsigned long start, int nr_pages, unsigned int gup_flags,
-                     struct page **pages, struct vm_area_struct **vmas)
+                     struct page **pages, struct vm_area_struct **vmas,
+                     int *nonblocking)
 {
         int i;
         unsigned long vm_flags;
@@ -1463,10 +1464,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 cond_resched();
                 while (!(page = follow_page(vma, start, foll_flags))) {
                         int ret;
+                        unsigned int fault_flags = 0;
+
+                        if (foll_flags & FOLL_WRITE)
+                                fault_flags |= FAULT_FLAG_WRITE;
+                        if (nonblocking)
+                                fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 
                         ret = handle_mm_fault(mm, vma, start,
-                                (foll_flags & FOLL_WRITE) ?
-                                FAULT_FLAG_WRITE : 0);
+                                        fault_flags);
 
                         if (ret & VM_FAULT_ERROR) {
                                 if (ret & VM_FAULT_OOM)
@@ -1482,6 +1488,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         else
                                 tsk->min_flt++;
 
+                        if (ret & VM_FAULT_RETRY) {
+                                *nonblocking = 0;
+                                return i;
+                        }
+
                         /*
                          * The VM_FAULT_WRITE bit tells us that
                          * do_wp_page has broken COW when necessary,
@@ -1581,7 +1592,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
         if (force)
                 flags |= FOLL_FORCE;
 
-        return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+        return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+                                NULL);
 }
 EXPORT_SYMBOL(get_user_pages);
 
@@ -1606,7 +1618,8 @@ struct page *get_dump_page(unsigned long addr)
         struct page *page;
 
         if (__get_user_pages(current, current->mm, addr, 1,
-                        FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+                        FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+                        NULL) < 1)
                 return NULL;
         flush_cache_page(vma, addr, page_to_pfn(page));
         return page;
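In words, the contract of the new argument is roughly the following. This is my summary, written as a kernel-doc style note; the wording is not from the patch itself.

/*
 * @nonblocking: if non-NULL, the caller holds mm->mmap_sem for read and can
 *      tolerate it being dropped.  Faults are then raised with
 *      FAULT_FLAG_ALLOW_RETRY; when a fault would block (VM_FAULT_RETRY),
 *      mmap_sem has already been released, so __get_user_pages() clears
 *      *nonblocking and returns the number of pages handled so far.
 *      Passing NULL keeps the previous always-blocking behaviour.
 */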
diff --git a/mm/mlock.c b/mm/mlock.c
index 84da66b7bbf0..13e81ee8be9d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,13 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
  * vma->vm_mm->mmap_sem must be held for at least read.
  */
 static long __mlock_vma_pages_range(struct vm_area_struct *vma,
-                                    unsigned long start, unsigned long end)
+                                    unsigned long start, unsigned long end,
+                                    int *nonblocking)
 {
         struct mm_struct *mm = vma->vm_mm;
         unsigned long addr = start;
         int nr_pages = (end - start) / PAGE_SIZE;
         int gup_flags;
-        int ret;
 
         VM_BUG_ON(start & ~PAGE_MASK);
         VM_BUG_ON(end & ~PAGE_MASK);
@@ -187,9 +187,8 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
                 nr_pages--;
         }
 
-        ret = __get_user_pages(current, mm, addr, nr_pages, gup_flags,
-                               NULL, NULL);
-        return max(ret, 0);     /* 0 or negative error code */
+        return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
+                                NULL, NULL, nonblocking);
 }
 
 /*
@@ -233,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
                         is_vm_hugetlb_page(vma) ||
                         vma == get_gate_vma(current))) {
 
-                __mlock_vma_pages_range(vma, start, end);
+                __mlock_vma_pages_range(vma, start, end, NULL);
 
                 /* Hide errors from mmap() and other callers */
                 return 0;
@@ -429,21 +428,23 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
         struct mm_struct *mm = current->mm;
         unsigned long end, nstart, nend;
         struct vm_area_struct *vma = NULL;
+        int locked = 0;
         int ret = 0;
 
         VM_BUG_ON(start & ~PAGE_MASK);
         VM_BUG_ON(len != PAGE_ALIGN(len));
         end = start + len;
 
-        down_read(&mm->mmap_sem);
         for (nstart = start; nstart < end; nstart = nend) {
                 /*
                  * We want to fault in pages for [nstart; end) address range.
                  * Find first corresponding VMA.
                  */
-                if (!vma)
+                if (!locked) {
+                        locked = 1;
+                        down_read(&mm->mmap_sem);
                         vma = find_vma(mm, nstart);
-                else
+                } else if (nstart >= vma->vm_end)
                         vma = vma->vm_next;
                 if (!vma || vma->vm_start >= end)
                         break;
@@ -457,19 +458,24 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
                 if (nstart < vma->vm_start)
                         nstart = vma->vm_start;
                 /*
-                 * Now fault in a range of pages within the first VMA.
+                 * Now fault in a range of pages. __mlock_vma_pages_range()
+                 * double checks the vma flags, so that it won't mlock pages
+                 * if the vma was already munlocked.
                  */
-                ret = __mlock_vma_pages_range(vma, nstart, nend);
-                if (ret < 0 && ignore_errors) {
-                        ret = 0;
-                        continue;       /* continue at next VMA */
-                }
-                if (ret) {
+                ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
+                if (ret < 0) {
+                        if (ignore_errors) {
+                                ret = 0;
+                                continue;       /* continue at next VMA */
+                        }
                         ret = __mlock_posix_error_return(ret);
                         break;
                 }
+                nend = nstart + ret * PAGE_SIZE;
+                ret = 0;
         }
-        up_read(&mm->mmap_sem);
+        if (locked)
+                up_read(&mm->mmap_sem);
         return ret;     /* 0 or negative error code */
 }
 
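For the mlock side, the effect of the new 'locked' flag can be summarised as follows. This is my annotation, not text from the patch.

/*
 * Illustrative timeline for do_mlock_pages() after this change:
 *
 *   iteration 1: locked == 0 -> down_read(mmap_sem), find_vma(), then
 *                __mlock_vma_pages_range() faults pages; if a fault blocks,
 *                mmap_sem is dropped, 'locked' is cleared and the call
 *                returns the N pages completed so far
 *   advance:     nend = nstart + N * PAGE_SIZE
 *   iteration 2: locked == 0 again -> re-take mmap_sem and redo find_vma(),
 *                since the old vma pointer may be stale after the drop
 *   ...
 *   on exit:     up_read(mmap_sem) only if it is still held
 */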
diff --git a/mm/nommu.c b/mm/nommu.c
index ef4045d010d5..f59e1424d3db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                      unsigned long start, int nr_pages, unsigned int foll_flags,
-                     struct page **pages, struct vm_area_struct **vmas)
+                     struct page **pages, struct vm_area_struct **vmas,
+                     int *retry)
 {
         struct vm_area_struct *vma;
         unsigned long vm_flags;
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
         if (force)
                 flags |= FOLL_FORCE;
 
-        return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+        return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+                                NULL);
 }
 EXPORT_SYMBOL(get_user_pages);
 