author     Ying Han <yinghan@google.com>                    2009-01-06 17:40:18 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-01-06 18:59:08 -0500
commit     4779280d1ea4d361af13ae77ba55217fbcd16d4c (patch)
tree       1abb35d85f2280aebb9cd565cc223d14b8731203
parent     91bf189c3a766927694ce9de7d545e96b23f20fc (diff)
mm: make get_user_pages() interruptible
The initial implementation of checking TIF_MEMDIE covers the OOM-kill case: if the
process has been OOM killed, TIF_MEMDIE is set and get_user_pages() returns
immediately. This patch makes two changes:
1. Handle the case where the SIGKILL is sent by a user process. Previously, a
   process inside get_user_pages() could keep faulting in an unlimited amount of
   memory even after a user process had sent it SIGKILL (for example, a monitor
   that noticed the process exceeding its memory limit and tried to kill it);
   the SIGKILL was not handled until get_user_pages() returned.
2. Change the return value to -ERESTARTSYS. It makes no sense to return -ENOMEM
   when get_user_pages() bails out because of a pending SIGKILL; the general
   convention for a system call interrupted by a signal is to return one of the
   ERESTART* codes, so the new return value is consistent with that. A
   simplified sketch of the resulting check is shown below the list.
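
Read as a pattern, the change is the usual "check for a fatal signal inside a
long page-faulting loop" idiom. The sketch below is only a simplified,
kernel-style illustration of that pattern; fault_in_one_page() is a made-up
placeholder for the real per-page work that __get_user_pages() does, which
appears in the mm/memory.c hunk further down.

#include <linux/sched.h>	/* fatal_signal_pending(), current */
#include <linux/errno.h>

/*
 * Simplified sketch of the pattern this patch introduces: stop a
 * long-running loop as soon as a fatal signal (SIGKILL) is pending
 * and report -ERESTARTSYS, unless the caller asked to ignore SIGKILL
 * (the munlock-on-exit case).  fault_in_one_page() stands in for the
 * real follow_page()/handle_mm_fault() work in __get_user_pages().
 */
static int fault_in_pages(int len, int ignore_sigkill)
{
	int i;

	for (i = 0; i < len; i++) {
		if (unlikely(!ignore_sigkill && fatal_signal_pending(current)))
			return i ? i : -ERESTARTSYS;	/* report partial progress if any */

		if (fault_in_one_page(i) < 0)
			return i ? i : -EFAULT;
	}
	return i;
}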
Lee:
An unfortunate side effect of "make-get_user_pages-interruptible" is that it
prevents a SIGKILL'd task from munlocking pages that it had mlocked, resulting
in the freeing of mlocked pages. Freeing of mlocked pages is, in itself, not so
bad; we just count them now, although I had hoped to remove this stat and add
PG_MLOCKED to the free-pages flags check.
However, consider pages in shared libraries that are mapped by more than one
task and that one task has mlocked, e.g. via mlockall(). If the task that
mlocked the pages exits via SIGKILL, those pages would be left mlocked and
unevictable.
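
As a concrete userspace illustration of that scenario (not part of the patch),
assume a long-running process that mlocks everything it maps, shared libraries
included, and is later killed by an external monitor with SIGKILL; on exit, the
kernel munlocks its pages through the get_user_pages() path that this patch
made interruptible.

/* Minimal sketch of the problem scenario: mlockall() pins every
 * current and future mapping (including shared libraries such as
 * libc), then the process waits until something SIGKILLs it.  The
 * kernel-side munlock on exit goes through get_user_pages(); if that
 * path bailed out on the pending SIGKILL, pages still mapped by
 * other tasks could remain mlocked and unevictable. */
#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
		perror("mlockall");
		return 1;
	}
	for (;;)		/* do "work" until a monitor sends SIGKILL */
		pause();
}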
Proposed fix:
Add another GUP flag to ignore SIGKILL when calling get_user_pages() from
munlock(), similar to KOSAKI Motohiro's GUP_FLAGS_IGNORE_VMA_PERMISSIONS flag,
which was added for the same purpose. We are not actually allocating memory in
this case, which is what "make-get_user_pages-interruptible" is meant to guard
against; we are only munlocking pages that are already resident and mapped, and
we are reusing get_user_pages() to access those pages.
?? Maybe we should combine IGNORE_VMA_PERMISSIONS and IGNORE_SIGKILL into a
single flag: GUP_FLAGS_MUNLOCK ???
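
If the two flags were ever folded together as that question suggests, the
combined flag might simply OR the existing bits. This is purely hypothetical;
GUP_FLAGS_MUNLOCK is not part of this patch.

/* Hypothetical only: one name for the two behaviours munlock wants. */
#define GUP_FLAGS_MUNLOCK	(GUP_FLAGS_IGNORE_VMA_PERMISSIONS | \
				 GUP_FLAGS_IGNORE_SIGKILL)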
[Lee.Schermerhorn@hp.com: ignore sigkill in get_user_pages during munlock]
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  mm/internal.h |  1
-rw-r--r--  mm/memory.c   | 14
-rw-r--r--  mm/mlock.c    |  9
3 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 1981bc9454f3..478223b73a2a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -276,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 #define GUP_FLAGS_WRITE 0x1
 #define GUP_FLAGS_FORCE 0x2
 #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL 0x8
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int flags,
diff --git a/mm/memory.c b/mm/memory.c
index db68af8e0bc4..3f8fa06b963b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1210,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	int write = !!(flags & GUP_FLAGS_WRITE);
 	int force = !!(flags & GUP_FLAGS_FORCE);
 	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
 
 	if (len <= 0)
 		return 0;
@@ -1288,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page *page;
 
 		/*
-		 * If tsk is ooming, cut off its access to large memory
-		 * allocations. It has a pending SIGKILL, but it can't
-		 * be processed until returning to user space.
+		 * If we have a pending SIGKILL, don't keep faulting
+		 * pages and potentially allocating memory, unless
+		 * current is handling munlock--e.g., on exit. In
+		 * that case, we are not allocating memory.  Rather,
+		 * we're only unlocking already resident/mapped pages.
 		 */
-		if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-			return i ? i : -ENOMEM;
+		if (unlikely(!ignore_sigkill &&
+				fatal_signal_pending(current)))
+			return i ? i : -ERESTARTSYS;
 
 		if (write)
 			foll_flags |= FOLL_WRITE;
diff --git a/mm/mlock.c b/mm/mlock.c
index 3035a56e7616..e125156c664e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 		    (atomic_read(&mm->mm_users) != 0));
 
 	/*
-	 * mlock:   don't page populate if page has PROT_NONE permission.
-	 * munlock: the pages always do munlock althrough
-	 *          its has PROT_NONE permission.
+	 * mlock:   don't page populate if vma has PROT_NONE permission.
+	 * munlock: always do munlock although the vma has PROT_NONE
+	 *          permission, or SIGKILL is pending.
 	 */
 	if (!mlock)
-		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
+		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
+			     GUP_FLAGS_IGNORE_SIGKILL;
 
 	if (vma->vm_flags & VM_WRITE)
 		gup_flags |= GUP_FLAGS_WRITE;
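
For in-kernel callers, the visible effect of the change is that get_user_pages()
can now come back with -ERESTARTSYS (or a short count) when the calling task has
a pending SIGKILL, instead of looping until -ENOMEM. The sketch below assumes
the 2.6.28-era get_user_pages() signature and uses a made-up wrapper name,
my_pin_one_page(); it illustrates caller-side handling and is not code from this
patch.

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Pin one user page for I/O.  After this change, a pending SIGKILL in
 * the calling task surfaces here as -ERESTARTSYS; the caller simply
 * propagates the error and lets the dying task unwind.
 */
static int my_pin_one_page(unsigned long addr, struct page **page)
{
	struct mm_struct *mm = current->mm;
	int ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages(current, mm, addr, 1 /* len */,
			     1 /* write */, 0 /* force */, page, NULL);
	up_read(&mm->mmap_sem);

	if (ret == 1)
		return 0;			/* page pinned */
	return ret < 0 ? ret : -EFAULT;		/* -ERESTARTSYS, -ENOMEM, ... */
}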