From 9f4e41f4717832e34cca153ced62b4a1d7e26c0e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:15 -0800 Subject: mm: refactor truncate_complete_page() Move call of delete_from_page_cache() and page->mapping check out of truncate_complete_page() into the single caller - truncate_inode_page(). Also move page_mapped() check into truncate_complete_page(). That way it will be easier to batch operations. Also rename truncate_complete_page() to truncate_cleanup_page(). Link: http://lkml.kernel.org/r/20171010151937.26984-3-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index 2330223841fb..383a530d511e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -134,11 +134,17 @@ void do_invalidatepage(struct page *page, unsigned int offset, * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static int -truncate_complete_page(struct address_space *mapping, struct page *page) +static void +truncate_cleanup_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) - return -EIO; + if (page_mapped(page)) { + loff_t holelen; + + holelen = PageTransHuge(page) ? 
HPAGE_PMD_SIZE : PAGE_SIZE; + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_SHIFT, + holelen, 0); + } if (page_has_private(page)) do_invalidatepage(page, 0, PAGE_SIZE); @@ -150,8 +156,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) */ cancel_dirty_page(page); ClearPageMappedToDisk(page); - delete_from_page_cache(page); - return 0; } /* @@ -180,16 +184,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) int truncate_inode_page(struct address_space *mapping, struct page *page) { - loff_t holelen; VM_BUG_ON_PAGE(PageTail(page), page); - holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_SHIFT, - holelen, 0); - } - return truncate_complete_page(mapping, page); + if (page->mapping != mapping) + return -EIO; + + truncate_cleanup_page(mapping, page); + delete_from_page_cache(page); + return 0; } /* -- cgit v1.2.2 From aa65c29ce1b6e1990cd2c7d8004bbea7ff3aff38 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:33 -0800 Subject: mm: batch radix tree operations when truncating pages Currently we remove pages from the radix tree one by one. To speed up page cache truncation, lock several pages at once and free them in one go. This allows us to batch radix tree operations in a more efficient way and also save round-trips on mapping->tree_lock. As a result we gain about 20% speed improvement in page cache truncation. Data from a simple benchmark timing 10000 truncates of 1024 pages (on ext4 on ramdisk but the filesystem is barely visible in the profiles). 
The range shows 1% and 95% percentiles of the measured times: 4.14-rc2 4.14-rc2 + batched truncation 248-256 209-219 249-258 209-217 248-255 211-239 248-255 209-217 247-256 210-218 [jack@suse.cz: convert delete_from_page_cache_batch() to pagevec] Link: http://lkml.kernel.org/r/20171018111648.13714-1-jack@suse.cz [akpm@linux-foundation.org: move struct pagevec forward declaration to top-of-file] Link: http://lkml.kernel.org/r/20171010151937.26984-8-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index 383a530d511e..4a39a3150ee2 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -294,6 +294,14 @@ void truncate_inode_pages_range(struct address_space *mapping, while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* + * Pagevec array has exceptional entries and we may also fail + * to lock some pages. So we store pages that can be deleted + * in a new pagevec. 
+ */ + struct pagevec locked_pvec; + + pagevec_init(&locked_pvec, 0); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -315,9 +323,17 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - truncate_inode_page(mapping, page); - unlock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + pagevec_add(&locked_pvec, page); } + for (i = 0; i < pagevec_count(&locked_pvec); i++) + truncate_cleanup_page(mapping, locked_pvec.pages[i]); + delete_from_page_cache_batch(mapping, &locked_pvec); + for (i = 0; i < pagevec_count(&locked_pvec); i++) + unlock_page(locked_pvec.pages[i]); pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); cond_resched(); -- cgit v1.2.2 From c7df8ad2910e965a6241b6d8f52fd122e26b0315 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:41 -0800 Subject: mm, truncate: do not check mapping for every page being truncated During truncation, the mapping has already been checked for shmem and dax so it's known that workingset_update_node is required. This patch avoids the checks on mapping for each page being truncated. In all other cases, a lookup helper is used to determine if workingset_update_node() needs to be called. The one danger is that the API is slightly harder to use as calling workingset_update_node directly without checking for dax or shmem mappings could lead to surprises. However, the API rarely needs to be used and hopefully the comment is enough to give people the hint. 
sparsetruncate (tiny) 4.14.0-rc4 4.14.0-rc4 oneirq-v1r1 pickhelper-v1r1 Min Time 141.00 ( 0.00%) 140.00 ( 0.71%) 1st-qrtle Time 142.00 ( 0.00%) 141.00 ( 0.70%) 2nd-qrtle Time 142.00 ( 0.00%) 142.00 ( 0.00%) 3rd-qrtle Time 143.00 ( 0.00%) 143.00 ( 0.00%) Max-90% Time 144.00 ( 0.00%) 144.00 ( 0.00%) Max-95% Time 147.00 ( 0.00%) 145.00 ( 1.36%) Max-99% Time 195.00 ( 0.00%) 191.00 ( 2.05%) Max Time 230.00 ( 0.00%) 205.00 ( 10.87%) Amean Time 144.37 ( 0.00%) 143.82 ( 0.38%) Stddev Time 10.44 ( 0.00%) 9.00 ( 13.74%) Coeff Time 7.23 ( 0.00%) 6.26 ( 13.41%) Best99%Amean Time 143.72 ( 0.00%) 143.34 ( 0.26%) Best95%Amean Time 142.37 ( 0.00%) 142.00 ( 0.26%) Best90%Amean Time 142.19 ( 0.00%) 141.85 ( 0.24%) Best75%Amean Time 141.92 ( 0.00%) 141.58 ( 0.24%) Best50%Amean Time 141.69 ( 0.00%) 141.31 ( 0.27%) Best25%Amean Time 141.38 ( 0.00%) 140.97 ( 0.29%) As you'd expect, the gain is marginal but it can be detected. The differences in bonnie are all within the noise which is not surprising given the impact on the microbenchmark. radix_tree_update_node_t is a callback for some radix operations that optionally passes in a private field. The only user of the callback is workingset_update_node and as it no longer requires a mapping, the private field is removed. 
Link: http://lkml.kernel.org/r/20171018075952.10627-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index 4a39a3150ee2..02a0c0466c78 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -42,7 +42,7 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, if (*slot != entry) goto unlock; __radix_tree_replace(&mapping->page_tree, node, slot, NULL, - workingset_update_node, mapping); + workingset_update_node); mapping->nrexceptional--; unlock: spin_unlock_irq(&mapping->tree_lock); -- cgit v1.2.2 From f2187599189d94aeeee2fa5d9806186c7732ed37 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:44 -0800 Subject: mm, truncate: remove all exceptional entries from pagevec under one lock During truncate each entry in a pagevec is checked to see if it is an exceptional entry and if so, the shadow entry is cleaned up. This is potentially expensive as each exceptional entry for a mapping separately locks and unlocks the tree lock. This batches the operation such that any exceptional entries removed from a pagevec only acquire the mapping tree lock once. The corner case where this is more expensive is where there is only one exceptional entry but this is unlikely due to temporal locality and how it affects LRU ordering. Note that for truncations of small files created recently, this patch should show no gain because it only batches the handling of exceptional entries.
sparsetruncate (large) 4.14.0-rc4 4.14.0-rc4 pickhelper-v1r1 batchshadow-v1r1 Min Time 38.00 ( 0.00%) 27.00 ( 28.95%) 1st-qrtle Time 40.00 ( 0.00%) 28.00 ( 30.00%) 2nd-qrtle Time 44.00 ( 0.00%) 41.00 ( 6.82%) 3rd-qrtle Time 146.00 ( 0.00%) 147.00 ( -0.68%) Max-90% Time 153.00 ( 0.00%) 153.00 ( 0.00%) Max-95% Time 155.00 ( 0.00%) 156.00 ( -0.65%) Max-99% Time 181.00 ( 0.00%) 171.00 ( 5.52%) Amean Time 93.04 ( 0.00%) 88.43 ( 4.96%) Best99%Amean Time 92.08 ( 0.00%) 86.13 ( 6.46%) Best95%Amean Time 89.19 ( 0.00%) 83.13 ( 6.80%) Best90%Amean Time 85.60 ( 0.00%) 79.15 ( 7.53%) Best75%Amean Time 72.95 ( 0.00%) 65.09 ( 10.78%) Best50%Amean Time 39.86 ( 0.00%) 28.20 ( 29.25%) Best25%Amean Time 39.44 ( 0.00%) 27.70 ( 29.77%) bonnie 4.14.0-rc4 4.14.0-rc4 pickhelper-v1r1 batchshadow-v1r1 Hmean SeqCreate ops 71.92 ( 0.00%) 76.78 ( 6.76%) Hmean SeqCreate read 42.42 ( 0.00%) 45.01 ( 6.10%) Hmean SeqCreate del 26519.88 ( 0.00%) 27191.87 ( 2.53%) Hmean RandCreate ops 71.92 ( 0.00%) 76.95 ( 7.00%) Hmean RandCreate read 44.44 ( 0.00%) 49.23 ( 10.78%) Hmean RandCreate del 24948.62 ( 0.00%) 24764.97 ( -0.74%) Truncation of a large number of files shows a substantial gain with 99% of files being truncated 6.46% faster. 
bonnie shows a modest gain of 2.53% [jack@suse.cz: fix truncate_exceptional_pvec_entries()] Link: http://lkml.kernel.org/r/20171108164226.26788-1-jack@suse.cz Link: http://lkml.kernel.org/r/20171018075952.10627-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Signed-off-by: Jan Kara Reviewed-by: Jan Kara Acked-by: Johannes Weiner Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 91 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 28 deletions(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index 02a0c0466c78..c30e8fa3d063 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -25,44 +25,85 @@ #include #include "internal.h" -static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, - void *entry) +/* + * Regular page slots are stabilized by the page lock even without the tree + * itself locked. These unlocked entries need verification under the tree + * lock. + */ +static inline void __clear_shadow_entry(struct address_space *mapping, + pgoff_t index, void *entry) { struct radix_tree_node *node; void **slot; - spin_lock_irq(&mapping->tree_lock); - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; + return; if (*slot != entry) - goto unlock; + return; __radix_tree_replace(&mapping->page_tree, node, slot, NULL, workingset_update_node); mapping->nrexceptional--; -unlock: +} + +static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, + void *entry) +{ + spin_lock_irq(&mapping->tree_lock); + __clear_shadow_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); } /* - * Unconditionally remove exceptional entry. Usually called from truncate path. 
+ * Unconditionally remove exceptional entries. Usually called from truncate + * path. Note that the pagevec may be altered by this function by removing + * exceptional entries similar to what pagevec_remove_exceptionals does. */ -static void truncate_exceptional_entry(struct address_space *mapping, - pgoff_t index, void *entry) +static void truncate_exceptional_pvec_entries(struct address_space *mapping, + struct pagevec *pvec, pgoff_t *indices, + pgoff_t end) { + int i, j; + bool dax, lock; + /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; - if (dax_mapping(mapping)) { - dax_delete_mapping_entry(mapping, index); + for (j = 0; j < pagevec_count(pvec); j++) + if (radix_tree_exceptional_entry(pvec->pages[j])) + break; + + if (j == pagevec_count(pvec)) return; + + dax = dax_mapping(mapping); + lock = !dax && indices[j] < end; + if (lock) + spin_lock_irq(&mapping->tree_lock); + + for (i = j; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + pgoff_t index = indices[i]; + + if (!radix_tree_exceptional_entry(page)) { + pvec->pages[j++] = page; + continue; + } + + if (index >= end) + continue; + + if (unlikely(dax)) { + dax_delete_mapping_entry(mapping, index); + continue; + } + + __clear_shadow_entry(mapping, index, page); } - clear_shadow_entry(mapping, index, entry); + + if (lock) + spin_unlock_irq(&mapping->tree_lock); + pvec->nr = j; } /* @@ -310,11 +351,8 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { - truncate_exceptional_entry(mapping, index, - page); + if (radix_tree_exceptional_entry(page)) continue; - } if (!trylock_page(page)) continue; @@ -334,12 +372,11 @@ void truncate_inode_pages_range(struct address_space *mapping, delete_from_page_cache_batch(mapping, &locked_pvec); for (i = 0; i < pagevec_count(&locked_pvec); i++) unlock_page(locked_pvec.pages[i]); - pagevec_remove_exceptionals(&pvec); + 
truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); pagevec_release(&pvec); cond_resched(); index++; } - if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { @@ -397,6 +434,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); break; } + for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -408,11 +446,8 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) { - truncate_exceptional_entry(mapping, index, - page); + if (radix_tree_exceptional_entry(page)) continue; - } lock_page(page); WARN_ON(page_to_index(page) != index); @@ -420,7 +455,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } - pagevec_remove_exceptionals(&pvec); + truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); pagevec_release(&pvec); index++; } -- cgit v1.2.2 From 8667982014d6048e0b5e286b6247ff24f48d4cc6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:52 -0800 Subject: mm, pagevec: remove cold parameter for pagevecs Every pagevec_init user claims the pages being released are hot even in cases where it is unlikely the pages are hot. As no one cares about the hotness of pages being released to the allocator, just ditch the parameter. No performance impact is expected as the overhead is marginal. The parameter is removed simply because it is a bit stupid to have a useless parameter copied everywhere. 
Link: http://lkml.kernel.org/r/20171018075952.10627-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/truncate.c') diff --git a/mm/truncate.c b/mm/truncate.c index c30e8fa3d063..e4b4cf0f4070 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -330,7 +330,7 @@ void truncate_inode_pages_range(struct address_space *mapping, else end = (lend + 1) >> PAGE_SHIFT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), @@ -342,7 +342,7 @@ void truncate_inode_pages_range(struct address_space *mapping, */ struct pagevec locked_pvec; - pagevec_init(&locked_pvec, 0); + pagevec_init(&locked_pvec); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -553,7 +553,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unsigned long count = 0; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { @@ -683,7 +683,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (mapping->nrpages == 0 && mapping->nrexceptional == 0) goto out; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, -- cgit v1.2.2