author     Mel Gorman <mgorman@suse.de>  2014-06-04 19:10:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-06-04 19:54:10 -0400
commit     2457aec63745e235bcafb7ef312b182d8682f0fc
tree       c658266ed5a8c1acd4f2028c8bf69ab2a7c8ba42  /mm/filemap.c
parent     e7470ee89f003634a88e7b5e5a7b65b3025987de
mm: non-atomically mark page accessed during page cache allocation where possible
aops->write_begin may allocate a new page and make it visible only to have mark_page_accessed called almost immediately after.  Once the page is visible the atomic operations are necessary, which is noticeable overhead when writing to an in-memory filesystem like tmpfs but should also be noticeable with fast storage.  The objective of the patch is to initialise the accessed information with non-atomic operations before the page is visible.

The bulk of filesystems directly or indirectly use grab_cache_page_write_begin or find_or_create_page for the initial allocation of a page cache page.  This patch adds an init_page_accessed() helper which behaves like the first call to mark_page_accessed() but may be called before the page is visible and can be done non-atomically.

The primary APIs of concern in this case are the following and are used by most filesystems.

	find_get_page
	find_lock_page
	find_or_create_page
	grab_cache_page_nowait
	grab_cache_page_write_begin

All of them are very similar in detail, so the patch creates a core helper pagecache_get_page() which takes a flags parameter that affects its behaviour, such as whether the page should be marked accessed or not.  The old API is preserved but is basically a thin wrapper around this core function.  Each of the filesystems is then updated to avoid calling mark_page_accessed when it is known that the VM interfaces have already done the job.

There is a slight snag in that the timing of the mark_page_accessed() has now changed, so in rare cases it's possible a page gets to the end of the LRU as PageReferenced where previously it might have been repromoted.  This is expected to be rare but it's worth the filesystem people thinking about it in case they see a problem with the timing change.  It is also the case that some filesystems may be marking pages accessed that previously did not, but it makes sense that filesystems have consistent behaviour in this regard.

The test case used to evaluate this is a simple dd of a large file done multiple times, with the file deleted on each iteration.  The size of the file is 1/10th physical memory to avoid dirty page balancing.  In the async case it will be possible that the workload completes without even hitting the disk and will have variable results, but it highlights the impact of mark_page_accessed for async IO.  The sync results are expected to be more stable.  The exception is tmpfs, where the normal case is for the "IO" to not hit the disk.

The test machine was single socket and UMA to avoid any scheduling or NUMA artifacts.  Throughput and wall times are presented for sync IO; only wall times are shown for async, as the granularity reported by dd and the variability are unsuitable for comparison.  As async results were variable due to writeback timings, I'm only reporting the maximum figures.  The sync results were stable enough to make the mean and stddev uninteresting.

The performance results are reported based on a run with no profiling.  Profile data is based on a separate run with oprofile running.

async dd
                                 3.15.0-rc3            3.15.0-rc3
                                    vanilla           accessed-v2
ext3    Max elapsed     13.9900 (  0.00%)     11.5900 ( 17.16%)
tmpfs   Max elapsed      0.5100 (  0.00%)      0.4900 (  3.92%)
btrfs   Max elapsed     12.8100 (  0.00%)     12.7800 (  0.23%)
ext4    Max elapsed     18.6000 (  0.00%)     13.3400 ( 28.28%)
xfs     Max elapsed     12.5600 (  0.00%)      2.0900 ( 83.36%)

The XFS figure is a bit strange as it managed to avoid a worst case by sheer luck, but the average figures looked reasonable.
        samples percentage
ext3    86107    0.9783  vmlinux-3.15.0-rc4-vanilla         mark_page_accessed
ext3    23833    0.2710  vmlinux-3.15.0-rc4-accessed-v3r25  mark_page_accessed
ext3     5036    0.0573  vmlinux-3.15.0-rc4-accessed-v3r25  init_page_accessed
ext4    64566    0.8961  vmlinux-3.15.0-rc4-vanilla         mark_page_accessed
ext4     5322    0.0713  vmlinux-3.15.0-rc4-accessed-v3r25  mark_page_accessed
ext4     2869    0.0384  vmlinux-3.15.0-rc4-accessed-v3r25  init_page_accessed
xfs     62126    1.7675  vmlinux-3.15.0-rc4-vanilla         mark_page_accessed
xfs      1904    0.0554  vmlinux-3.15.0-rc4-accessed-v3r25  init_page_accessed
xfs       103    0.0030  vmlinux-3.15.0-rc4-accessed-v3r25  mark_page_accessed
btrfs   10655    0.1338  vmlinux-3.15.0-rc4-vanilla         mark_page_accessed
btrfs    2020    0.0273  vmlinux-3.15.0-rc4-accessed-v3r25  init_page_accessed
btrfs     587    0.0079  vmlinux-3.15.0-rc4-accessed-v3r25  mark_page_accessed
tmpfs   59562    3.2628  vmlinux-3.15.0-rc4-vanilla         mark_page_accessed
tmpfs    1210    0.0696  vmlinux-3.15.0-rc4-accessed-v3r25  init_page_accessed
tmpfs      94    0.0054  vmlinux-3.15.0-rc4-accessed-v3r25  mark_page_accessed

[akpm@linux-foundation.org: don't run init_page_accessed() against an uninitialised pointer]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Tested-by: Prabhakar Lad <prabhakar.csengg@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
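
[Editorial note] As a rough illustration of the "thin wrapper" point above: once pagecache_get_page() exists, the legacy lookup APIs can be expressed as static inline wrappers.  The sketch below is illustrative only; the real wrappers live in include/linux/pagemap.h rather than in the mm/filemap.c diff shown here, and the exact flag combinations and gfp arguments used there may differ from what is shown.

/*
 * Illustrative sketch, not copied verbatim from the patch: how the old
 * page cache lookups can be expressed in terms of pagecache_get_page().
 */
static inline struct page *find_get_page(struct address_space *mapping,
                                         pgoff_t offset)
{
        /* Plain lookup: no lock, no creation, no accessed marking. */
        return pagecache_get_page(mapping, offset, 0, 0, 0);
}

static inline struct page *find_lock_page(struct address_space *mapping,
                                          pgoff_t offset)
{
        /* Same lookup, but the page is returned locked. */
        return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);
}

static inline struct page *find_or_create_page(struct address_space *mapping,
                                               pgoff_t offset, gfp_t gfp_mask)
{
        /*
         * Allocate and insert on a miss; FGP_ACCESSED lets a newly
         * allocated page have its accessed state initialised with
         * non-atomic ops before it becomes visible.
         */
        return pagecache_get_page(mapping, offset,
                                  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                  gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
}
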
Diffstat (limited to 'mm/filemap.c')
 mm/filemap.c | 202
 1 file changed, 75 insertions(+), 127 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 47d235b357a7..0fcd792103f3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -982,26 +982,6 @@ out:
 EXPORT_SYMBOL(find_get_entry);
 
 /**
- * find_get_page - find and get a page reference
- * @mapping: the address_space to search
- * @offset: the page index
- *
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
- *
- * Otherwise, %NULL is returned.
- */
-struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
-{
-        struct page *page = find_get_entry(mapping, offset);
-
-        if (radix_tree_exceptional_entry(page))
-                page = NULL;
-        return page;
-}
-EXPORT_SYMBOL(find_get_page);
-
-/**
  * find_lock_entry - locate, pin and lock a page cache entry
  * @mapping: the address_space to search
  * @offset: the page cache index
@@ -1038,66 +1018,84 @@ repeat:
 EXPORT_SYMBOL(find_lock_entry);
 
 /**
- * find_lock_page - locate, pin and lock a pagecache page
+ * pagecache_get_page - find and get a page reference
  * @mapping: the address_space to search
  * @offset: the page index
+ * @fgp_flags: PCG flags
+ * @gfp_mask: gfp mask to use if a page is to be allocated
  *
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
- *
- * Otherwise, %NULL is returned.
- *
- * find_lock_page() may sleep.
- */
-struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
-{
-        struct page *page = find_lock_entry(mapping, offset);
-
-        if (radix_tree_exceptional_entry(page))
-                page = NULL;
-        return page;
-}
-EXPORT_SYMBOL(find_lock_page);
-
-/**
- * find_or_create_page - locate or add a pagecache page
- * @mapping: the page's address_space
- * @index: the page's index into the mapping
- * @gfp_mask: page allocation mode
+ * Looks up the page cache slot at @mapping & @offset.
  *
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * PCG flags modify how the page is returned
  *
- * If the page is not present, a new page is allocated using @gfp_mask
- * and added to the page cache and the VM's LRU list. The page is
- * returned locked and with an increased refcount.
+ * FGP_ACCESSED: the page will be marked accessed
+ * FGP_LOCK: Page is return locked
+ * FGP_CREAT: If page is not present then a new page is allocated using
+ *                @gfp_mask and added to the page cache and the VM's LRU
+ *                list. The page is returned locked and with an increased
+ *                refcount. Otherwise, %NULL is returned.
  *
- * On memory exhaustion, %NULL is returned.
+ * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
+ * if the GFP flags specified for FGP_CREAT are atomic.
  *
- * find_or_create_page() may sleep, even if @gfp_flags specifies an
- * atomic allocation!
+ * If there is a page cache page, it is returned with an increased refcount.
  */
-struct page *find_or_create_page(struct address_space *mapping,
-                pgoff_t index, gfp_t gfp_mask)
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
+        int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
 {
         struct page *page;
-        int err;
+
 repeat:
-        page = find_lock_page(mapping, index);
-        if (!page) {
-                page = __page_cache_alloc(gfp_mask);
+        page = find_get_entry(mapping, offset);
+        if (radix_tree_exceptional_entry(page))
+                page = NULL;
+        if (!page)
+                goto no_page;
+
+        if (fgp_flags & FGP_LOCK) {
+                if (fgp_flags & FGP_NOWAIT) {
+                        if (!trylock_page(page)) {
+                                page_cache_release(page);
+                                return NULL;
+                        }
+                } else {
+                        lock_page(page);
+                }
+
+                /* Has the page been truncated? */
+                if (unlikely(page->mapping != mapping)) {
+                        unlock_page(page);
+                        page_cache_release(page);
+                        goto repeat;
+                }
+                VM_BUG_ON_PAGE(page->index != offset, page);
+        }
+
+        if (page && (fgp_flags & FGP_ACCESSED))
+                mark_page_accessed(page);
+
+no_page:
+        if (!page && (fgp_flags & FGP_CREAT)) {
+                int err;
+                if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+                        cache_gfp_mask |= __GFP_WRITE;
+                if (fgp_flags & FGP_NOFS) {
+                        cache_gfp_mask &= ~__GFP_FS;
+                        radix_gfp_mask &= ~__GFP_FS;
+                }
+
+                page = __page_cache_alloc(cache_gfp_mask);
                 if (!page)
                         return NULL;
-                /*
-                 * We want a regular kernel memory (not highmem or DMA etc)
-                 * allocation for the radix tree nodes, but we need to honour
-                 * the context-specific requirements the caller has asked for.
-                 * GFP_RECLAIM_MASK collects those requirements.
-                 */
-                err = add_to_page_cache_lru(page, mapping, index,
-                        (gfp_mask & GFP_RECLAIM_MASK));
+
+                if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+                        fgp_flags |= FGP_LOCK;
+
+                /* Init accessed so avoit atomic mark_page_accessed later */
+                if (fgp_flags & FGP_ACCESSED)
+                        init_page_accessed(page);
+
+                err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
                 if (unlikely(err)) {
                         page_cache_release(page);
                         page = NULL;
@@ -1105,9 +1103,10 @@ repeat:
                         goto repeat;
                 }
         }
+
         return page;
 }
-EXPORT_SYMBOL(find_or_create_page);
+EXPORT_SYMBOL(pagecache_get_page);
 
 /**
  * find_get_entries - gang pagecache lookup
@@ -1404,39 +1403,6 @@ repeat:
 }
 EXPORT_SYMBOL(find_get_pages_tag);
 
-/**
- * grab_cache_page_nowait - returns locked page at given index in given cache
- * @mapping: target address_space
- * @index: the page index
- *
- * Same as grab_cache_page(), but do not wait if the page is unavailable.
- * This is intended for speculative data generators, where the data can
- * be regenerated if the page couldn't be grabbed. This routine should
- * be safe to call while holding the lock for another page.
- *
- * Clear __GFP_FS when allocating the page to avoid recursion into the fs
- * and deadlock against the caller's locked page.
- */
-struct page *
-grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
-{
-        struct page *page = find_get_page(mapping, index);
-
-        if (page) {
-                if (trylock_page(page))
-                        return page;
-                page_cache_release(page);
-                return NULL;
-        }
-        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
-        if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
-                page_cache_release(page);
-                page = NULL;
-        }
-        return page;
-}
-EXPORT_SYMBOL(grab_cache_page_nowait);
-
 /*
  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
  * a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2406,7 +2372,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 {
         const struct address_space_operations *aops = mapping->a_ops;
 
-        mark_page_accessed(page);
         return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_end);
@@ -2488,34 +2453,18 @@ EXPORT_SYMBOL(generic_file_direct_write);
 struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                         pgoff_t index, unsigned flags)
 {
-        int status;
-        gfp_t gfp_mask;
         struct page *page;
-        gfp_t gfp_notmask = 0;
+        int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
 
-        gfp_mask = mapping_gfp_mask(mapping);
-        if (mapping_cap_account_dirty(mapping))
-                gfp_mask |= __GFP_WRITE;
         if (flags & AOP_FLAG_NOFS)
-                gfp_notmask = __GFP_FS;
-repeat:
-        page = find_lock_page(mapping, index);
+                fgp_flags |= FGP_NOFS;
+
+        page = pagecache_get_page(mapping, index, fgp_flags,
+                        mapping_gfp_mask(mapping),
+                        GFP_KERNEL);
         if (page)
-                goto found;
+                wait_for_stable_page(page);
 
-        page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
-        if (!page)
-                return NULL;
-        status = add_to_page_cache_lru(page, mapping, index,
-                                        GFP_KERNEL & ~gfp_notmask);
-        if (unlikely(status)) {
-                page_cache_release(page);
-                if (status == -EEXIST)
-                        goto repeat;
-                return NULL;
-        }
-found:
-        wait_for_stable_page(page);
         return page;
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2564,7 +2513,7 @@ again:
 
                 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                 &page, &fsdata);
-                if (unlikely(status))
+                if (unlikely(status < 0))
                         break;
 
                 if (mapping_writably_mapped(mapping))
@@ -2573,7 +2522,6 @@ again:
                 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                 flush_dcache_page(page);
 
-                mark_page_accessed(page);
                 status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                 page, fsdata);
                 if (unlikely(status < 0))
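
[Editorial note] The init_page_accessed() helper called from the FGP_CREAT path above is not defined in the mm/filemap.c diff; it is introduced elsewhere in the same patch (in mm/swap.c).  A minimal sketch of what such a helper amounts to, assuming the non-atomic __SetPageReferenced() flag setter, is:

/*
 * Sketch only: mark a not-yet-visible page as referenced with a
 * non-atomic bit operation, so the state normally set by the first
 * mark_page_accessed() call is already present when the page is
 * inserted into the page cache and its LRU list.
 */
void init_page_accessed(struct page *page)
{
        if (!PageReferenced(page))
                __SetPageReferenced(page);      /* non-atomic: page not yet visible */
}
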