aboutsummaryrefslogtreecommitdiffstats
path: root/mm/filemap.c
diff options
context:
space:
mode:
authorJosef Bacik <josef@toxicpanda.com>2019-03-13 14:44:14 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-03-15 14:21:25 -0400
commita75d4c33377277b6034dd1e2663bce444f952c14 (patch)
treed89129b17058130ae073290ee0db5b4b65da00b5 /mm/filemap.c
parenta4046c06be50a4f01d435aa7fe57514818e6cc82 (diff)
filemap: kill page_cache_read usage in filemap_fault
Patch series "drop the mmap_sem when doing IO in the fault path", v6. Now that we have proper isolation in place with cgroups2 we have started going through and fixing the various priority inversions. Most are all gone now, but this one is sort of weird since it's not necessarily a priority inversion that happens within the kernel, but rather because of something userspace does. We have giant applications that we want to protect, and parts of these giant applications do things like watch the system state to determine how healthy the box is for load balancing and such. This involves running 'ps' or other such utilities. These utilities will often walk /proc/<pid>/whatever, and these files can sometimes need to down_read(&task->mmap_sem). Not usually a big deal, but we noticed when we are stress testing that sometimes our protected application has latency spikes trying to get the mmap_sem for tasks that are in lower priority cgroups. This is because any down_write() on a semaphore essentially turns it into a mutex, so even if we currently have it held for reading, any new readers will not be allowed on to keep from starving the writer. This is fine, except a lower priority task could be stuck doing IO because it has been throttled to the point that its IO is taking much longer than normal. But because a higher priority group depends on this completing it is now stuck behind lower priority work. In order to avoid this particular priority inversion we want to use the existing retry mechanism to stop from holding the mmap_sem at all if we are going to do IO. This already exists in the read case sort of, but needed to be extended for more than just grabbing the page lock. With io.latency we throttle at submit_bio() time, so the readahead stuff can block and even page_cache_read can block, so all these paths need to have the mmap_sem dropped. The other big thing is ->page_mkwrite. 
btrfs is particularly shitty here because we have to reserve space for the dirty page, which can be a very expensive operation. We use the same retry method as the read path, and simply cache the page and verify the page is still set up properly the next pass through ->page_mkwrite(). I've tested these patches with xfstests and there are no regressions. This patch (of 3): If we do not have a page at filemap_fault time we'll do this weird forced page_cache_read thing to populate the page, and then drop it again and loop around and find it. This makes for 2 ways we can read a page in filemap_fault, and it's not really needed. Instead add a FGP_FOR_MMAP flag so that pagecache_get_page() will return an unlocked page that's in pagecache. Then use the normal page locking and readpage logic already in filemap_fault. This simplifies the no page in page cache case significantly. [akpm@linux-foundation.org: fix comment text] [josef@toxicpanda.com: don't unlock null page in FGP_FOR_MMAP case] Link: http://lkml.kernel.org/r/20190312201742.22935-1-josef@toxicpanda.com Link: http://lkml.kernel.org/r/20181211173801.29535-2-josef@toxicpanda.com Signed-off-by: Josef Bacik <josef@toxicpanda.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Andrew Morton <akpm@linux-foundation.org> Cc: Tejun Heo <tj@kernel.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Rik van Riel <riel@redhat.com> Cc: "Kirill A. Shutemov" <kirill@shutemov.name> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c75
1 file changed, 15 insertions(+), 60 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index ec6566ffbd90..64d014f940e9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1587,6 +1587,9 @@ EXPORT_SYMBOL(find_lock_entry);
1587 * @gfp_mask and added to the page cache and the VM's LRU 1587 * @gfp_mask and added to the page cache and the VM's LRU
1588 * list. The page is returned locked and with an increased 1588 * list. The page is returned locked and with an increased
1589 * refcount. 1589 * refcount.
1590 * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
1591 * its own locking dance if the page is already in cache, or unlock the page
1592 * before returning if we had to add the page to pagecache.
1590 * 1593 *
1591 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even 1594 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
1592 * if the GFP flags specified for FGP_CREAT are atomic. 1595 * if the GFP flags specified for FGP_CREAT are atomic.
@@ -1641,7 +1644,7 @@ no_page:
1641 if (!page) 1644 if (!page)
1642 return NULL; 1645 return NULL;
1643 1646
1644 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) 1647 if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
1645 fgp_flags |= FGP_LOCK; 1648 fgp_flags |= FGP_LOCK;
1646 1649
1647 /* Init accessed so avoid atomic mark_page_accessed later */ 1650 /* Init accessed so avoid atomic mark_page_accessed later */
@@ -1655,6 +1658,13 @@ no_page:
1655 if (err == -EEXIST) 1658 if (err == -EEXIST)
1656 goto repeat; 1659 goto repeat;
1657 } 1660 }
1661
1662 /*
1663 * add_to_page_cache_lru locks the page, and for mmap we expect
1664 * an unlocked page.
1665 */
1666 if (page && (fgp_flags & FGP_FOR_MMAP))
1667 unlock_page(page);
1658 } 1668 }
1659 1669
1660 return page; 1670 return page;
@@ -2379,41 +2389,6 @@ out:
2379EXPORT_SYMBOL(generic_file_read_iter); 2389EXPORT_SYMBOL(generic_file_read_iter);
2380 2390
2381#ifdef CONFIG_MMU 2391#ifdef CONFIG_MMU
2382/**
2383 * page_cache_read - adds requested page to the page cache if not already there
2384 * @file: file to read
2385 * @offset: page index
2386 * @gfp_mask: memory allocation flags
2387 *
2388 * This adds the requested page to the page cache if it isn't already there,
2389 * and schedules an I/O to read in its contents from disk.
2390 *
2391 * Return: %0 on success, negative error code otherwise.
2392 */
2393static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
2394{
2395 struct address_space *mapping = file->f_mapping;
2396 struct page *page;
2397 int ret;
2398
2399 do {
2400 page = __page_cache_alloc(gfp_mask);
2401 if (!page)
2402 return -ENOMEM;
2403
2404 ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
2405 if (ret == 0)
2406 ret = mapping->a_ops->readpage(file, page);
2407 else if (ret == -EEXIST)
2408 ret = 0; /* losing race to add is OK */
2409
2410 put_page(page);
2411
2412 } while (ret == AOP_TRUNCATED_PAGE);
2413
2414 return ret;
2415}
2416
2417#define MMAP_LOTSAMISS (100) 2392#define MMAP_LOTSAMISS (100)
2418 2393
2419/* 2394/*
@@ -2539,9 +2514,11 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
2539 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); 2514 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
2540 ret = VM_FAULT_MAJOR; 2515 ret = VM_FAULT_MAJOR;
2541retry_find: 2516retry_find:
2542 page = find_get_page(mapping, offset); 2517 page = pagecache_get_page(mapping, offset,
2518 FGP_CREAT|FGP_FOR_MMAP,
2519 vmf->gfp_mask);
2543 if (!page) 2520 if (!page)
2544 goto no_cached_page; 2521 return vmf_error(-ENOMEM);
2545 } 2522 }
2546 2523
2547 if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { 2524 if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
@@ -2578,28 +2555,6 @@ retry_find:
2578 vmf->page = page; 2555 vmf->page = page;
2579 return ret | VM_FAULT_LOCKED; 2556 return ret | VM_FAULT_LOCKED;
2580 2557
2581no_cached_page:
2582 /*
2583 * We're only likely to ever get here if MADV_RANDOM is in
2584 * effect.
2585 */
2586 error = page_cache_read(file, offset, vmf->gfp_mask);
2587
2588 /*
2589 * The page we want has now been added to the page cache.
2590 * In the unlikely event that someone removed it in the
2591 * meantime, we'll just come back here and read it again.
2592 */
2593 if (error >= 0)
2594 goto retry_find;
2595
2596 /*
2597 * An error return from page_cache_read can result if the
2598 * system is low on memory, or a problem occurs while trying
2599 * to schedule I/O.
2600 */
2601 return vmf_error(error);
2602
2603page_not_uptodate: 2558page_not_uptodate:
2604 /* 2559 /*
2605 * Umm, take care of errors if the page isn't up-to-date. 2560 * Umm, take care of errors if the page isn't up-to-date.