aboutsummaryrefslogtreecommitdiffstats
path: root/mm/filemap.c
diff options
context:
space:
mode:
authorJosef Bacik <josef@toxicpanda.com>2019-03-13 14:44:22 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-03-15 14:21:25 -0400
commit6b4c9f4469819a0c1a38a0a4541337e0f9bf6c11 (patch)
tree00616309fe21cf48e11295e8e19b2d75889cbf12 /mm/filemap.c
parenta75d4c33377277b6034dd1e2663bce444f952c14 (diff)
filemap: drop the mmap_sem for all blocking operations
Currently we only drop the mmap_sem if there is contention on the page lock. The idea is that we issue readahead and then go to lock the page while it is under IO and we want to not hold the mmap_sem during the IO. The problem with this is the assumption that the readahead does anything. In the case that the box is under extreme memory or IO pressure we may end up not reading anything at all for readahead, which means we will end up reading in the page under the mmap_sem. Even if the readahead does something, it could get throttled because of io pressure on the system and the process is in a lower priority cgroup. Holding the mmap_sem while doing IO is problematic because it can cause system-wide priority inversions. Consider some large company that does a lot of web traffic. This large company has load balancing logic in it's core web server, cause some engineer thought this was a brilliant plan. This load balancing logic gets statistics from /proc about the system, which trip over processes mmap_sem for various reasons. Now the web server application is in a protected cgroup, but these other processes may not be, and if they are being throttled while their mmap_sem is held we'll stall, and cause this nice death spiral. Instead rework filemap fault path to drop the mmap sem at any point that we may do IO or block for an extended period of time. This includes while issuing readahead, locking the page, or needing to call ->readpage because readahead did not occur. Then once we have a fully uptodate page we can return with VM_FAULT_RETRY and come back again to find our nicely in-cache page that was gotten outside of the mmap_sem. This patch also adds a new helper for locking the page with the mmap_sem dropped. This doesn't make sense currently as generally speaking if the page is already locked it'll have been read in (unless there was an error) before it was unlocked. 
However a forthcoming patchset will change this with the ability to abort read-ahead bio's if necessary, making it more likely that we could contend for a page lock and still have a not uptodate page. This allows us to deal with this case by grabbing the lock and issuing the IO without the mmap_sem held, and then returning VM_FAULT_RETRY to come back around. [josef@toxicpanda.com: v6] Link: http://lkml.kernel.org/r/20181212152757.10017-1-josef@toxicpanda.com [kirill@shutemov.name: fix race in filemap_fault()] Link: http://lkml.kernel.org/r/20181228235106.okk3oastsnpxusxs@kshutemo-mobl1 [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20181211173801.29535-4-josef@toxicpanda.com Signed-off-by: Josef Bacik <josef@toxicpanda.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Jan Kara <jack@suse.cz> Tested-by: syzbot+b437b5a429d680cf2217@syzkaller.appspotmail.com Cc: Dave Chinner <david@fromorbit.com> Cc: Rik van Riel <riel@redhat.com> Cc: Tejun Heo <tj@kernel.org> Cc: "Kirill A. Shutemov" <kirill@shutemov.name> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c136
1 file changed, 117 insertions, 19 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 64d014f940e9..2815cb79a246 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2390,28 +2390,92 @@ EXPORT_SYMBOL(generic_file_read_iter);
2390 2390
2391#ifdef CONFIG_MMU 2391#ifdef CONFIG_MMU
2392#define MMAP_LOTSAMISS (100) 2392#define MMAP_LOTSAMISS (100)
2393static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
2394 struct file *fpin)
2395{
2396 int flags = vmf->flags;
2397
2398 if (fpin)
2399 return fpin;
2400
2401 /*
2402 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
2403 * anything, so we only pin the file and drop the mmap_sem if only
2404 * FAULT_FLAG_ALLOW_RETRY is set.
2405 */
2406 if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
2407 FAULT_FLAG_ALLOW_RETRY) {
2408 fpin = get_file(vmf->vma->vm_file);
2409 up_read(&vmf->vma->vm_mm->mmap_sem);
2410 }
2411 return fpin;
2412}
2413
2414/*
2415 * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
2416 * @vmf - the vm_fault for this fault.
2417 * @page - the page to lock.
2418 * @fpin - the pointer to the file we may pin (or is already pinned).
2419 *
2420 * This works similar to lock_page_or_retry in that it can drop the mmap_sem.
2421 * It differs in that it actually returns the page locked if it returns 1 and 0
2422 * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin
2423 * will point to the pinned file and needs to be fput()'ed at a later point.
2424 */
2425static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
2426 struct file **fpin)
2427{
2428 if (trylock_page(page))
2429 return 1;
2430
2431 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
2432 return 0;
2433
2434 *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
2435 if (vmf->flags & FAULT_FLAG_KILLABLE) {
2436 if (__lock_page_killable(page)) {
2437 /*
2438 * We didn't have the right flags to drop the mmap_sem,
2439 * but all fault_handlers only check for fatal signals
2440 * if we return VM_FAULT_RETRY, so we need to drop the
2441 * mmap_sem here and return 0 if we don't have a fpin.
2442 */
2443 if (*fpin == NULL)
2444 up_read(&vmf->vma->vm_mm->mmap_sem);
2445 return 0;
2446 }
2447 } else
2448 __lock_page(page);
2449 return 1;
2450}
2451
2393 2452
2394/* 2453/*
2395 * Synchronous readahead happens when we don't even find 2454 * Synchronous readahead happens when we don't even find a page in the page
2396 * a page in the page cache at all. 2455 * cache at all. We don't want to perform IO under the mmap sem, so if we have
2456 * to drop the mmap sem we return the file that was pinned in order for us to do
2457 * that. If we didn't pin a file then we return NULL. The file that is
2458 * returned needs to be fput()'ed when we're done with it.
2397 */ 2459 */
2398static void do_sync_mmap_readahead(struct vm_fault *vmf) 2460static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
2399{ 2461{
2400 struct file *file = vmf->vma->vm_file; 2462 struct file *file = vmf->vma->vm_file;
2401 struct file_ra_state *ra = &file->f_ra; 2463 struct file_ra_state *ra = &file->f_ra;
2402 struct address_space *mapping = file->f_mapping; 2464 struct address_space *mapping = file->f_mapping;
2465 struct file *fpin = NULL;
2403 pgoff_t offset = vmf->pgoff; 2466 pgoff_t offset = vmf->pgoff;
2404 2467
2405 /* If we don't want any read-ahead, don't bother */ 2468 /* If we don't want any read-ahead, don't bother */
2406 if (vmf->vma->vm_flags & VM_RAND_READ) 2469 if (vmf->vma->vm_flags & VM_RAND_READ)
2407 return; 2470 return fpin;
2408 if (!ra->ra_pages) 2471 if (!ra->ra_pages)
2409 return; 2472 return fpin;
2410 2473
2411 if (vmf->vma->vm_flags & VM_SEQ_READ) { 2474 if (vmf->vma->vm_flags & VM_SEQ_READ) {
2475 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
2412 page_cache_sync_readahead(mapping, ra, file, offset, 2476 page_cache_sync_readahead(mapping, ra, file, offset,
2413 ra->ra_pages); 2477 ra->ra_pages);
2414 return; 2478 return fpin;
2415 } 2479 }
2416 2480
2417 /* Avoid banging the cache line if not needed */ 2481 /* Avoid banging the cache line if not needed */
@@ -2423,37 +2487,44 @@ static void do_sync_mmap_readahead(struct vm_fault *vmf)
2423 * stop bothering with read-ahead. It will only hurt. 2487 * stop bothering with read-ahead. It will only hurt.
2424 */ 2488 */
2425 if (ra->mmap_miss > MMAP_LOTSAMISS) 2489 if (ra->mmap_miss > MMAP_LOTSAMISS)
2426 return; 2490 return fpin;
2427 2491
2428 /* 2492 /*
2429 * mmap read-around 2493 * mmap read-around
2430 */ 2494 */
2495 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
2431 ra->start = max_t(long, 0, offset - ra->ra_pages / 2); 2496 ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
2432 ra->size = ra->ra_pages; 2497 ra->size = ra->ra_pages;
2433 ra->async_size = ra->ra_pages / 4; 2498 ra->async_size = ra->ra_pages / 4;
2434 ra_submit(ra, mapping, file); 2499 ra_submit(ra, mapping, file);
2500 return fpin;
2435} 2501}
2436 2502
2437/* 2503/*
2438 * Asynchronous readahead happens when we find the page and PG_readahead, 2504 * Asynchronous readahead happens when we find the page and PG_readahead,
2439 * so we want to possibly extend the readahead further.. 2505 * so we want to possibly extend the readahead further. We return the file that
2506 * was pinned if we have to drop the mmap_sem in order to do IO.
2440 */ 2507 */
2441static void do_async_mmap_readahead(struct vm_fault *vmf, 2508static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
2442 struct page *page) 2509 struct page *page)
2443{ 2510{
2444 struct file *file = vmf->vma->vm_file; 2511 struct file *file = vmf->vma->vm_file;
2445 struct file_ra_state *ra = &file->f_ra; 2512 struct file_ra_state *ra = &file->f_ra;
2446 struct address_space *mapping = file->f_mapping; 2513 struct address_space *mapping = file->f_mapping;
2514 struct file *fpin = NULL;
2447 pgoff_t offset = vmf->pgoff; 2515 pgoff_t offset = vmf->pgoff;
2448 2516
2449 /* If we don't want any read-ahead, don't bother */ 2517 /* If we don't want any read-ahead, don't bother */
2450 if (vmf->vma->vm_flags & VM_RAND_READ) 2518 if (vmf->vma->vm_flags & VM_RAND_READ)
2451 return; 2519 return fpin;
2452 if (ra->mmap_miss > 0) 2520 if (ra->mmap_miss > 0)
2453 ra->mmap_miss--; 2521 ra->mmap_miss--;
2454 if (PageReadahead(page)) 2522 if (PageReadahead(page)) {
2523 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
2455 page_cache_async_readahead(mapping, ra, file, 2524 page_cache_async_readahead(mapping, ra, file,
2456 page, offset, ra->ra_pages); 2525 page, offset, ra->ra_pages);
2526 }
2527 return fpin;
2457} 2528}
2458 2529
2459/** 2530/**
@@ -2485,6 +2556,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
2485{ 2556{
2486 int error; 2557 int error;
2487 struct file *file = vmf->vma->vm_file; 2558 struct file *file = vmf->vma->vm_file;
2559 struct file *fpin = NULL;
2488 struct address_space *mapping = file->f_mapping; 2560 struct address_space *mapping = file->f_mapping;
2489 struct file_ra_state *ra = &file->f_ra; 2561 struct file_ra_state *ra = &file->f_ra;
2490 struct inode *inode = mapping->host; 2562 struct inode *inode = mapping->host;
@@ -2506,25 +2578,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
2506 * We found the page, so try async readahead before 2578 * We found the page, so try async readahead before
2507 * waiting for the lock. 2579 * waiting for the lock.
2508 */ 2580 */
2509 do_async_mmap_readahead(vmf, page); 2581 fpin = do_async_mmap_readahead(vmf, page);
2510 } else if (!page) { 2582 } else if (!page) {
2511 /* No page in the page cache at all */ 2583 /* No page in the page cache at all */
2512 do_sync_mmap_readahead(vmf);
2513 count_vm_event(PGMAJFAULT); 2584 count_vm_event(PGMAJFAULT);
2514 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); 2585 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
2515 ret = VM_FAULT_MAJOR; 2586 ret = VM_FAULT_MAJOR;
2587 fpin = do_sync_mmap_readahead(vmf);
2516retry_find: 2588retry_find:
2517 page = pagecache_get_page(mapping, offset, 2589 page = pagecache_get_page(mapping, offset,
2518 FGP_CREAT|FGP_FOR_MMAP, 2590 FGP_CREAT|FGP_FOR_MMAP,
2519 vmf->gfp_mask); 2591 vmf->gfp_mask);
2520 if (!page) 2592 if (!page) {
2593 if (fpin)
2594 goto out_retry;
2521 return vmf_error(-ENOMEM); 2595 return vmf_error(-ENOMEM);
2596 }
2522 } 2597 }
2523 2598
2524 if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { 2599 if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
2525 put_page(page); 2600 goto out_retry;
2526 return ret | VM_FAULT_RETRY;
2527 }
2528 2601
2529 /* Did it get truncated? */ 2602 /* Did it get truncated? */
2530 if (unlikely(page->mapping != mapping)) { 2603 if (unlikely(page->mapping != mapping)) {
@@ -2542,6 +2615,16 @@ retry_find:
2542 goto page_not_uptodate; 2615 goto page_not_uptodate;
2543 2616
2544 /* 2617 /*
2618 * We've made it this far and we had to drop our mmap_sem, now is the
2619 * time to return to the upper layer and have it re-find the vma and
2620 * redo the fault.
2621 */
2622 if (fpin) {
2623 unlock_page(page);
2624 goto out_retry;
2625 }
2626
2627 /*
2545 * Found the page and have a reference on it. 2628 * Found the page and have a reference on it.
2546 * We must recheck i_size under page lock. 2629 * We must recheck i_size under page lock.
2547 */ 2630 */
@@ -2563,12 +2646,15 @@ page_not_uptodate:
2563 * and we need to check for errors. 2646 * and we need to check for errors.
2564 */ 2647 */
2565 ClearPageError(page); 2648 ClearPageError(page);
2649 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
2566 error = mapping->a_ops->readpage(file, page); 2650 error = mapping->a_ops->readpage(file, page);
2567 if (!error) { 2651 if (!error) {
2568 wait_on_page_locked(page); 2652 wait_on_page_locked(page);
2569 if (!PageUptodate(page)) 2653 if (!PageUptodate(page))
2570 error = -EIO; 2654 error = -EIO;
2571 } 2655 }
2656 if (fpin)
2657 goto out_retry;
2572 put_page(page); 2658 put_page(page);
2573 2659
2574 if (!error || error == AOP_TRUNCATED_PAGE) 2660 if (!error || error == AOP_TRUNCATED_PAGE)
@@ -2577,6 +2663,18 @@ page_not_uptodate:
2577 /* Things didn't work out. Return zero to tell the mm layer so. */ 2663 /* Things didn't work out. Return zero to tell the mm layer so. */
2578 shrink_readahead_size_eio(file, ra); 2664 shrink_readahead_size_eio(file, ra);
2579 return VM_FAULT_SIGBUS; 2665 return VM_FAULT_SIGBUS;
2666
2667out_retry:
2668 /*
2669 * We dropped the mmap_sem, we need to return to the fault handler to
2670 * re-find the vma and come back and find our hopefully still populated
2671 * page.
2672 */
2673 if (page)
2674 put_page(page);
2675 if (fpin)
2676 fput(fpin);
2677 return ret | VM_FAULT_RETRY;
2580} 2678}
2581EXPORT_SYMBOL(filemap_fault); 2679EXPORT_SYMBOL(filemap_fault);
2582 2680