author     Hugh Dickins <hugh.dickins@tiscali.co.uk>        2009-09-21 20:02:20 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-09-22 10:17:32 -0400
commit     9ba6929480088a85c1ff60a4b1f1c9fc80dbd2b7 (patch)
tree       39aab8cdffae598b55e35c578f70820712286ab4 /mm/ksm.c
parent     cd551f97519d35855be5a8720a47cc802ee4fd06 (diff)
ksm: fix oom deadlock
There's a now-obvious deadlock in KSM's out-of-memory handling: imagine ksmd or KSM_RUN_UNMERGE handling, holding ksm_thread_mutex, trying to allocate a page to break KSM in an mm which becomes the OOM victim (quite likely in the unmerge case): it's killed and goes to exit, and hangs there waiting to acquire ksm_thread_mutex.

Clearly we must not require ksm_thread_mutex in __ksm_exit, simple though that made everything else: perhaps use mmap_sem somehow? Part of the answer lies in the comments on unmerge_ksm_pages: __ksm_exit should also leave all the rmap_item removal to ksmd.

But there's a fundamental problem: KSM relies upon mmap_sem to guarantee the consistency of the mm it's dealing with, yet exit_mmap tears down an mm without taking mmap_sem. And bumping mm_users won't help at all; that just ensures that the pages the OOM killer assumes are on their way to being freed will not be freed.

The best answer seems to be to move the ksm_exit callout from just before exit_mmap to the middle of exit_mmap: after the mm's pages have been freed (if the mmu_gather is flushed), but before its page tables and vma structures have been freed; and to down_write,up_write mmap_sem there to serialize with KSM's own reliance on mmap_sem.

But KSM then needs to be careful, whenever it downs mmap_sem, to check that the mm is not already exiting: there's a danger of using find_vma on a layout that's being torn apart, or writing into page tables which have been freed for reuse; and even do_anonymous_page and __do_fault need to check they're not being called by break_ksm to reinstate a pte after zap_pte_range has zapped that page table.

Though it might be clearer to add an exiting flag, set while holding mmap_sem in __ksm_exit, that wouldn't cover the issue of reinstating a zapped pte. All we need is to check whether mm_users is 0 - but we must remember that ksmd may detect that before __ksm_exit is reached. So ksm_test_exit(mm) is added to comment such checks on mm->mm_users.

__ksm_exit now has to leave clearing up the rmap_items to ksmd, which needs ksm_thread_mutex; but it shifts the exiting mm just after the ksm_scan cursor so that it will soon be dealt with. __ksm_enter raises mm_count to hold the mm_struct, and ksmd's exit processing (exactly like its processing when it finds all VM_MERGEABLEs unmapped) mmdrops it; a similar procedure applies for KSM_RUN_UNMERGE (which has stopped ksmd).

But also give __ksm_exit a fast path: when there's no complication (no rmap_items attached to the mm and it's not at the ksm_scan cursor), it can safely do all the exiting work itself. This is not just an optimization: when ksmd is not running, the raised mm_count would otherwise leak mm_structs.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
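[Editor's note] The ksm_test_exit() check described above boils down to testing whether mm_users has already dropped to zero. A minimal sketch of such a helper, inferred from this description (the helper itself lives outside mm/ksm.c, so it does not appear in the diff below):

static inline int ksm_test_exit(struct mm_struct *mm)
{
	/*
	 * An mm whose mm_users count has reached 0 is being torn down by
	 * exit_mmap(): KSM must not trust find_vma() on it or reinstate
	 * ptes in it, relying only on the mm_count reference taken by
	 * __ksm_enter() to keep the mm_struct itself alive.
	 */
	return atomic_read(&mm->mm_users) == 0;
}

Callers test it immediately after taking mmap_sem (as the hunks below do in break_cow, get_mergeable_page and the merge paths), bailing out if the mm is already exiting.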
Diffstat (limited to 'mm/ksm.c')
-rw-r--r--  mm/ksm.c | 144
1 file changed, 98 insertions(+), 46 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 7e4d255dadc0..722e3f2a8dc5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -32,6 +32,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/ksm.h>
 
+#include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
 /*
@@ -347,6 +348,8 @@ static void break_cow(struct mm_struct *mm, unsigned long addr)
 	struct vm_area_struct *vma;
 
 	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		goto out;
 	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
@@ -365,6 +368,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 	struct page *page;
 
 	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		goto out;
 	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
@@ -439,11 +444,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 	} else if (rmap_item->address & NODE_FLAG) {
 		unsigned char age;
 		/*
-		 * ksm_thread can and must skip the rb_erase, because
+		 * Usually ksmd can and must skip the rb_erase, because
 		 * root_unstable_tree was already reset to RB_ROOT.
-		 * But __ksm_exit has to be careful: do the rb_erase
-		 * if it's interrupting a scan, and this rmap_item was
-		 * inserted by this scan rather than left from before.
+		 * But be careful when an mm is exiting: do the rb_erase
+		 * if this rmap_item was inserted by this scan, rather
+		 * than left over from before.
 		 */
 		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
 		BUG_ON(age > 1);
@@ -491,6 +496,8 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
 	int err = 0;
 
 	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+		if (ksm_test_exit(vma->vm_mm))
+			break;
 		if (signal_pending(current))
 			err = -ERESTARTSYS;
 		else
@@ -507,34 +514,50 @@ static int unmerge_and_remove_all_rmap_items(void)
 	int err = 0;
 
 	spin_lock(&ksm_mmlist_lock);
-	mm_slot = list_entry(ksm_mm_head.mm_list.next,
+	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
 						struct mm_slot, mm_list);
 	spin_unlock(&ksm_mmlist_lock);
 
-	while (mm_slot != &ksm_mm_head) {
+	for (mm_slot = ksm_scan.mm_slot;
+			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
 		mm = mm_slot->mm;
 		down_read(&mm->mmap_sem);
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (ksm_test_exit(mm))
+				break;
 			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
 				continue;
 			err = unmerge_ksm_pages(vma,
 						vma->vm_start, vma->vm_end);
-			if (err) {
-				up_read(&mm->mmap_sem);
-				goto out;
-			}
+			if (err)
+				goto error;
 		}
+
 		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
-		up_read(&mm->mmap_sem);
 
 		spin_lock(&ksm_mmlist_lock);
-		mm_slot = list_entry(mm_slot->mm_list.next,
+		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
 						struct mm_slot, mm_list);
-		spin_unlock(&ksm_mmlist_lock);
+		if (ksm_test_exit(mm)) {
+			hlist_del(&mm_slot->link);
+			list_del(&mm_slot->mm_list);
+			spin_unlock(&ksm_mmlist_lock);
+
+			free_mm_slot(mm_slot);
+			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+			up_read(&mm->mmap_sem);
+			mmdrop(mm);
+		} else {
+			spin_unlock(&ksm_mmlist_lock);
+			up_read(&mm->mmap_sem);
+		}
 	}
 
 	ksm_scan.seqnr = 0;
-out:
+	return 0;
+
+error:
+	up_read(&mm->mmap_sem);
 	spin_lock(&ksm_mmlist_lock);
 	ksm_scan.mm_slot = &ksm_mm_head;
 	spin_unlock(&ksm_mmlist_lock);
@@ -755,6 +778,9 @@ static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
 	int err = -EFAULT;
 
 	down_read(&mm1->mmap_sem);
+	if (ksm_test_exit(mm1))
+		goto out;
+
 	vma = find_vma(mm1, addr1);
 	if (!vma || vma->vm_start > addr1)
 		goto out;
@@ -796,6 +822,10 @@ static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
 		return err;
 
 	down_read(&mm1->mmap_sem);
+	if (ksm_test_exit(mm1)) {
+		up_read(&mm1->mmap_sem);
+		goto out;
+	}
 	vma = find_vma(mm1, addr1);
 	if (!vma || vma->vm_start > addr1) {
 		up_read(&mm1->mmap_sem);
@@ -1174,7 +1204,12 @@ next_mm:
 
 	mm = slot->mm;
 	down_read(&mm->mmap_sem);
-	for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) {
+	if (ksm_test_exit(mm))
+		vma = NULL;
+	else
+		vma = find_vma(mm, ksm_scan.address);
+
+	for (; vma; vma = vma->vm_next) {
 		if (!(vma->vm_flags & VM_MERGEABLE))
 			continue;
 		if (ksm_scan.address < vma->vm_start)
@@ -1183,6 +1218,8 @@ next_mm:
 			ksm_scan.address = vma->vm_end;
 
 		while (ksm_scan.address < vma->vm_end) {
+			if (ksm_test_exit(mm))
+				break;
 			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
 			if (*page && PageAnon(*page)) {
 				flush_anon_page(vma, *page, ksm_scan.address);
@@ -1205,6 +1242,11 @@ next_mm:
 		}
 	}
 
+	if (ksm_test_exit(mm)) {
+		ksm_scan.address = 0;
+		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
+						struct rmap_item, link);
+	}
 	/*
 	 * Nuke all the rmap_items that are above this current rmap:
 	 * because there were no VM_MERGEABLE vmas with such addresses.
@@ -1219,24 +1261,29 @@ next_mm:
 		 * We've completed a full scan of all vmas, holding mmap_sem
 		 * throughout, and found no VM_MERGEABLE: so do the same as
 		 * __ksm_exit does to remove this mm from all our lists now.
+		 * This applies either when cleaning up after __ksm_exit
+		 * (but beware: we can reach here even before __ksm_exit),
+		 * or when all VM_MERGEABLE areas have been unmapped (and
+		 * mmap_sem then protects against race with MADV_MERGEABLE).
 		 */
 		hlist_del(&slot->link);
 		list_del(&slot->mm_list);
+		spin_unlock(&ksm_mmlist_lock);
+
 		free_mm_slot(slot);
 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		up_read(&mm->mmap_sem);
+		mmdrop(mm);
+	} else {
+		spin_unlock(&ksm_mmlist_lock);
+		up_read(&mm->mmap_sem);
 	}
-	spin_unlock(&ksm_mmlist_lock);
-	up_read(&mm->mmap_sem);
 
 	/* Repeat until we've completed scanning the whole list */
 	slot = ksm_scan.mm_slot;
 	if (slot != &ksm_mm_head)
 		goto next_mm;
 
-	/*
-	 * Bump seqnr here rather than at top, so that __ksm_exit
-	 * can skip rb_erase on unstable tree until we run again.
-	 */
 	ksm_scan.seqnr++;
 	return NULL;
 }
@@ -1361,6 +1408,7 @@ int __ksm_enter(struct mm_struct *mm)
 	spin_unlock(&ksm_mmlist_lock);
 
 	set_bit(MMF_VM_MERGEABLE, &mm->flags);
+	atomic_inc(&mm->mm_count);
 
 	if (needs_wakeup)
 		wake_up_interruptible(&ksm_thread_wait);
@@ -1368,41 +1416,45 @@ int __ksm_enter(struct mm_struct *mm)
 	return 0;
 }
 
-void __ksm_exit(struct mm_struct *mm)
+void __ksm_exit(struct mm_struct *mm,
+		struct mmu_gather **tlbp, unsigned long end)
 {
 	struct mm_slot *mm_slot;
+	int easy_to_free = 0;
 
 	/*
-	 * This process is exiting: doesn't hold and doesn't need mmap_sem;
-	 * but we do need to exclude ksmd and other exiters while we modify
-	 * the various lists and trees.
+	 * This process is exiting: if it's straightforward (as is the
+	 * case when ksmd was never running), free mm_slot immediately.
+	 * But if it's at the cursor or has rmap_items linked to it, use
+	 * mmap_sem to synchronize with any break_cows before pagetables
+	 * are freed, and leave the mm_slot on the list for ksmd to free.
+	 * Beware: ksm may already have noticed it exiting and freed the slot.
 	 */
-	mutex_lock(&ksm_thread_mutex);
+
 	spin_lock(&ksm_mmlist_lock);
 	mm_slot = get_mm_slot(mm);
-	if (!list_empty(&mm_slot->rmap_list)) {
-		spin_unlock(&ksm_mmlist_lock);
-		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
-		spin_lock(&ksm_mmlist_lock);
-	}
-
-	if (ksm_scan.mm_slot == mm_slot) {
-		ksm_scan.mm_slot = list_entry(
-			mm_slot->mm_list.next, struct mm_slot, mm_list);
-		ksm_scan.address = 0;
-		ksm_scan.rmap_item = list_entry(
-			&ksm_scan.mm_slot->rmap_list, struct rmap_item, link);
-		if (ksm_scan.mm_slot == &ksm_mm_head)
-			ksm_scan.seqnr++;
+	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+		if (list_empty(&mm_slot->rmap_list)) {
+			hlist_del(&mm_slot->link);
+			list_del(&mm_slot->mm_list);
+			easy_to_free = 1;
+		} else {
+			list_move(&mm_slot->mm_list,
+				  &ksm_scan.mm_slot->mm_list);
+		}
 	}
-
-	hlist_del(&mm_slot->link);
-	list_del(&mm_slot->mm_list);
 	spin_unlock(&ksm_mmlist_lock);
 
-	free_mm_slot(mm_slot);
-	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
-	mutex_unlock(&ksm_thread_mutex);
+	if (easy_to_free) {
+		free_mm_slot(mm_slot);
+		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		mmdrop(mm);
+	} else if (mm_slot) {
+		tlb_finish_mmu(*tlbp, 0, end);
+		down_write(&mm->mmap_sem);
+		up_write(&mm->mmap_sem);
+		*tlbp = tlb_gather_mmu(mm, 1);
+	}
 }
 
 #define KSM_ATTR_RO(_name) \