aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2012-08-21 19:16:17 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-08-21 19:45:03 -0400
commitc67fe3752abe6ab47639e2f9b836900c3dc3da84 (patch)
treec66f8f1c7a26c0277875e90107d9315f69ec2adf /mm/page_alloc.c
parentde74f1cc3b1e9730d9b58580cd11361d30cd182d (diff)
mm: compaction: Abort async compaction if locks are contended or taking too long
Jim Schutt reported a problem that pointed at compaction contending heavily on locks. The workload is straight-forward and in his own words; The systems in question have 24 SAS drives spread across 3 HBAs, running 24 Ceph OSD instances, one per drive. FWIW these servers are dual-socket Intel 5675 Xeons w/48 GB memory. I've got ~160 Ceph Linux clients doing dd simultaneously to a Ceph file system backed by 12 of these servers. Early in the test everything looks fine procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 31 15 0 287216 576 38606628 0 0 2 1158 2 14 1 3 95 0 0 27 15 0 225288 576 38583384 0 0 18 2222016 203357 134876 11 56 17 15 0 28 17 0 219256 576 38544736 0 0 11 2305932 203141 146296 11 49 23 17 0 6 18 0 215596 576 38552872 0 0 7 2363207 215264 166502 12 45 22 20 0 22 18 0 226984 576 38596404 0 0 3 2445741 223114 179527 12 43 23 22 0 and then it goes to pot procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 163 8 0 464308 576 36791368 0 0 11 22210 866 536 3 13 79 4 0 207 14 0 917752 576 36181928 0 0 712 1345376 134598 47367 7 90 1 2 0 123 12 0 685516 576 36296148 0 0 429 1386615 158494 60077 8 84 5 3 0 123 12 0 598572 576 36333728 0 0 1107 1233281 147542 62351 7 84 5 4 0 622 7 0 660768 576 36118264 0 0 557 1345548 151394 59353 7 85 4 3 0 223 11 0 283960 576 36463868 0 0 46 1107160 121846 33006 6 93 1 1 0 Note that system CPU usage is very high blocks being written out has dropped by 42%. He analysed this with perf and found perf record -g -a sleep 10 perf report --sort symbol --call-graph fractal,5 34.63% [k] _raw_spin_lock_irqsave | |--97.30%-- isolate_freepages | compaction_alloc | unmap_and_move | migrate_pages | compact_zone | compact_zone_order | try_to_compact_pages | __alloc_pages_direct_compact | __alloc_pages_slowpath | __alloc_pages_nodemask | alloc_pages_vma | do_huge_pmd_anonymous_page | handle_mm_fault | do_page_fault | page_fault | | | |--87.39%-- skb_copy_datagram_iovec | | tcp_recvmsg | | inet_recvmsg | | sock_recvmsg | | sys_recvfrom | | system_call | | __recv | | | | | --100.00%-- (nil) | | | --12.61%-- memcpy --2.70%-- [...] There was other data but primarily it is all showing that compaction is contended heavily on the zone->lock and zone->lru_lock. commit [b2eef8c0: mm: compaction: minimise the time IRQs are disabled while isolating pages for migration] noted that it was possible for migration to hold the lru_lock for an excessive amount of time. Very broadly speaking this patch expands the concept. This patch introduces compact_checklock_irqsave() to check if a lock is contended or the process needs to be scheduled. If either condition is true then async compaction is aborted and the caller is informed. The page allocator will fail a THP allocation if compaction failed due to contention. This patch also introduces compact_trylock_irqsave() which will acquire the lock only if it is not contended and the process does not need to schedule. Reported-by: Jim Schutt <jaschut@sandia.gov> Tested-by: Jim Schutt <jaschut@sandia.gov> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c17
1 files changed, 11 insertions, 6 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07f19248acb5..c66fb875104a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2102,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2102 struct zonelist *zonelist, enum zone_type high_zoneidx, 2102 struct zonelist *zonelist, enum zone_type high_zoneidx,
2103 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2103 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2104 int migratetype, bool sync_migration, 2104 int migratetype, bool sync_migration,
2105 bool *deferred_compaction, 2105 bool *contended_compaction, bool *deferred_compaction,
2106 unsigned long *did_some_progress) 2106 unsigned long *did_some_progress)
2107{ 2107{
2108 struct page *page; 2108 struct page *page;
@@ -2117,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2117 2117
2118 current->flags |= PF_MEMALLOC; 2118 current->flags |= PF_MEMALLOC;
2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2120 nodemask, sync_migration); 2120 nodemask, sync_migration,
2121 contended_compaction);
2121 current->flags &= ~PF_MEMALLOC; 2122 current->flags &= ~PF_MEMALLOC;
2122 if (*did_some_progress != COMPACT_SKIPPED) { 2123 if (*did_some_progress != COMPACT_SKIPPED) {
2123 2124
@@ -2163,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2163 struct zonelist *zonelist, enum zone_type high_zoneidx, 2164 struct zonelist *zonelist, enum zone_type high_zoneidx,
2164 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2165 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2165 int migratetype, bool sync_migration, 2166 int migratetype, bool sync_migration,
2166 bool *deferred_compaction, 2167 bool *contended_compaction, bool *deferred_compaction,
2167 unsigned long *did_some_progress) 2168 unsigned long *did_some_progress)
2168{ 2169{
2169 return NULL; 2170 return NULL;
@@ -2336,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2336 unsigned long did_some_progress; 2337 unsigned long did_some_progress;
2337 bool sync_migration = false; 2338 bool sync_migration = false;
2338 bool deferred_compaction = false; 2339 bool deferred_compaction = false;
2340 bool contended_compaction = false;
2339 2341
2340 /* 2342 /*
2341 * In the slowpath, we sanity check order to avoid ever trying to 2343 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2425,6 +2427,7 @@ rebalance:
2425 nodemask, 2427 nodemask,
2426 alloc_flags, preferred_zone, 2428 alloc_flags, preferred_zone,
2427 migratetype, sync_migration, 2429 migratetype, sync_migration,
2430 &contended_compaction,
2428 &deferred_compaction, 2431 &deferred_compaction,
2429 &did_some_progress); 2432 &did_some_progress);
2430 if (page) 2433 if (page)
@@ -2434,10 +2437,11 @@ rebalance:
2434 /* 2437 /*
2435 * If compaction is deferred for high-order allocations, it is because 2438 * If compaction is deferred for high-order allocations, it is because
2436 * sync compaction recently failed. In this is the case and the caller 2439 * sync compaction recently failed. In this is the case and the caller
2437 * has requested the system not be heavily disrupted, fail the 2440 * requested a movable allocation that does not heavily disrupt the
2438 * allocation now instead of entering direct reclaim 2441 * system then fail the allocation instead of entering direct reclaim.
2439 */ 2442 */
2440 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) 2443 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD))
2441 goto nopage; 2445 goto nopage;
2442 2446
2443 /* Try direct reclaim and then allocating */ 2447 /* Try direct reclaim and then allocating */
@@ -2508,6 +2512,7 @@ rebalance:
2508 nodemask, 2512 nodemask,
2509 alloc_flags, preferred_zone, 2513 alloc_flags, preferred_zone,
2510 migratetype, sync_migration, 2514 migratetype, sync_migration,
2515 &contended_compaction,
2511 &deferred_compaction, 2516 &deferred_compaction,
2512 &did_some_progress); 2517 &did_some_progress);
2513 if (page) 2518 if (page)