summaryrefslogtreecommitdiffstats
path: root/mm/compaction.c
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.com>2016-12-14 18:04:07 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2016-12-14 19:04:07 -0500
commit73e64c51afc56d4863ae225e947ba2f16ad04487 (patch)
tree056f2d50f693c412866b885ca246b563376fd0fa /mm/compaction.c
parent4d1f0fb096aedea7bb5489af93498a82e467c480 (diff)
mm, compaction: allow compaction for GFP_NOFS requests
compaction has been disabled for GFP_NOFS and GFP_NOIO requests since the direct compaction was introduced by commit 56de7263fcf3 ("mm: compaction: direct compact when a high-order allocation fails"). The main reason is that the migration of page cache pages might recurse back to fs/io layer and we could potentially deadlock. This is overly conservative because all the anonymous memory is migrateable in the GFP_NOFS context just fine. This might be a large portion of the memory in many/most workkloads. Remove the GFP_NOFS restriction and make sure that we skip all fs pages (those with a mapping) while isolating pages to be migrated. We cannot consider clean fs pages because they might need a metadata update so only isolate pages without any mapping for nofs requests. The effect of this patch will be probably very limited in many/most workloads because higher order GFP_NOFS requests are quite rare, although different configurations might lead to very different results. David Chinner has mentioned a heavy metadata workload with 64kB block which to quote him: : Unfortunately, there was an era of cargo cult configuration tweaks in the : Ceph community that has resulted in a large number of production machines : with XFS filesystems configured this way. And a lot of them store large : numbers of small files and run under significant sustained memory : pressure. : : I slowly working towards getting rid of these high order allocations and : replacing them with the equivalent number of single page allocations, but : I haven't got that (complex) change working yet. We can do the following to simulate that workload: $ mkfs.xfs -f -n size=64k <dev> $ mount <dev> /mnt/scratch $ time ./fs_mark -D 10000 -S0 -n 100000 -s 0 -L 32 \ -d /mnt/scratch/0 -d /mnt/scratch/1 \ -d /mnt/scratch/2 -d /mnt/scratch/3 \ -d /mnt/scratch/4 -d /mnt/scratch/5 \ -d /mnt/scratch/6 -d /mnt/scratch/7 \ -d /mnt/scratch/8 -d /mnt/scratch/9 \ -d /mnt/scratch/10 -d /mnt/scratch/11 \ -d /mnt/scratch/12 -d /mnt/scratch/13 \ -d /mnt/scratch/14 -d /mnt/scratch/15 and indeed is hammers the system with many high order GFP_NOFS requests as per a simle tracepoint during the load: $ echo '!(gfp_flags & 0x80) && (gfp_flags &0x400000)' > $TRACE_MNT/events/kmem/mm_page_alloc/filter I am getting 5287609 order=0 37 order=1 1594905 order=2 3048439 order=3 6699207 order=4 66645 order=5 My testing was done in a kvm guest so performance numbers should be taken with a grain of salt but there seems to be a difference when the patch is applied: * Original kernel FSUse% Count Size Files/sec App Overhead 1 1600000 0 4300.1 20745838 3 3200000 0 4239.9 23849857 5 4800000 0 4243.4 25939543 6 6400000 0 4248.4 19514050 8 8000000 0 4262.1 20796169 9 9600000 0 4257.6 21288675 11 11200000 0 4259.7 19375120 13 12800000 0 4220.7 22734141 14 14400000 0 4238.5 31936458 16 16000000 0 4231.5 23409901 18 17600000 0 4045.3 23577700 19 19200000 0 2783.4 58299526 21 20800000 0 2678.2 40616302 23 22400000 0 2693.5 83973996 and xfs complaining about memory allocation not making progress [ 2304.372647] XFS: fs_mark(3289) possible memory allocation deadlock size 65624 in kmem_alloc (mode:0x2408240) [ 2304.443323] XFS: fs_mark(3285) possible memory allocation deadlock size 65728 in kmem_alloc (mode:0x2408240) [ 4796.772477] XFS: fs_mark(3424) possible memory allocation deadlock size 46936 in kmem_alloc (mode:0x2408240) [ 4796.775329] XFS: fs_mark(3423) possible memory allocation deadlock size 51416 in kmem_alloc (mode:0x2408240) [ 4797.388808] XFS: fs_mark(3424) possible memory allocation deadlock size 65728 in kmem_alloc (mode:0x2408240) * Patched kernel FSUse% Count Size Files/sec App Overhead 1 1600000 0 4289.1 19243934 3 3200000 0 4241.6 32828865 5 4800000 0 4248.7 32884693 6 6400000 0 4314.4 19608921 8 8000000 0 4269.9 24953292 9 9600000 0 4270.7 33235572 11 11200000 0 4346.4 40817101 13 12800000 0 4285.3 29972397 14 14400000 0 4297.2 20539765 16 16000000 0 4219.6 18596767 18 17600000 0 4273.8 49611187 19 19200000 0 4300.4 27944451 21 20800000 0 4270.6 22324585 22 22400000 0 4317.6 22650382 24 24000000 0 4065.2 22297964 So the dropdown at Count 19200000 didn't happen and there was only a single warning about allocation not making progress [ 3063.815003] XFS: fs_mark(3272) possible memory allocation deadlock size 65624 in kmem_alloc (mode:0x2408240) This suggests that the patch has helped even though there is not all that much of anonymous memory as the workload mostly generates fs metadata. I assume the success rate would be higher with more anonymous memory which should be the case in many workloads. [akpm@linux-foundation.org: fix comment] Link: http://lkml.kernel.org/r/20161012114721.31853-1-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/compaction.c')
-rw-r--r--mm/compaction.c17
1 files changed, 14 insertions, 3 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 223464227299..949198d01260 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
818 page_count(page) > page_mapcount(page)) 818 page_count(page) > page_mapcount(page))
819 goto isolate_fail; 819 goto isolate_fail;
820 820
821 /*
822 * Only allow to migrate anonymous pages in GFP_NOFS context
823 * because those do not depend on fs locks.
824 */
825 if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
826 goto isolate_fail;
827
821 /* If we already hold the lock, we can skip some rechecking */ 828 /* If we already hold the lock, we can skip some rechecking */
822 if (!locked) { 829 if (!locked) {
823 locked = compact_trylock_irqsave(zone_lru_lock(zone), 830 locked = compact_trylock_irqsave(zone_lru_lock(zone),
@@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1677 unsigned int alloc_flags, const struct alloc_context *ac, 1684 unsigned int alloc_flags, const struct alloc_context *ac,
1678 enum compact_priority prio) 1685 enum compact_priority prio)
1679{ 1686{
1680 int may_enter_fs = gfp_mask & __GFP_FS;
1681 int may_perform_io = gfp_mask & __GFP_IO; 1687 int may_perform_io = gfp_mask & __GFP_IO;
1682 struct zoneref *z; 1688 struct zoneref *z;
1683 struct zone *zone; 1689 struct zone *zone;
1684 enum compact_result rc = COMPACT_SKIPPED; 1690 enum compact_result rc = COMPACT_SKIPPED;
1685 1691
1686 /* Check if the GFP flags allow compaction */ 1692 /*
1687 if (!may_enter_fs || !may_perform_io) 1693 * Check if the GFP flags allow compaction - GFP_NOIO is really
1694 * tricky context because the migration might require IO
1695 */
1696 if (!may_perform_io)
1688 return COMPACT_SKIPPED; 1697 return COMPACT_SKIPPED;
1689 1698
1690 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); 1699 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
@@ -1751,6 +1760,7 @@ static void compact_node(int nid)
1751 .mode = MIGRATE_SYNC, 1760 .mode = MIGRATE_SYNC,
1752 .ignore_skip_hint = true, 1761 .ignore_skip_hint = true,
1753 .whole_zone = true, 1762 .whole_zone = true,
1763 .gfp_mask = GFP_KERNEL,
1754 }; 1764 };
1755 1765
1756 1766
@@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
1876 .classzone_idx = pgdat->kcompactd_classzone_idx, 1886 .classzone_idx = pgdat->kcompactd_classzone_idx,
1877 .mode = MIGRATE_SYNC_LIGHT, 1887 .mode = MIGRATE_SYNC_LIGHT,
1878 .ignore_skip_hint = true, 1888 .ignore_skip_hint = true,
1889 .gfp_mask = GFP_KERNEL,
1879 1890
1880 }; 1891 };
1881 trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, 1892 trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,