author     Mel Gorman <mgorman@suse.de>                     2012-08-21 19:16:17 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-08-21 19:45:03 -0400
commit     c67fe3752abe6ab47639e2f9b836900c3dc3da84 (patch)
tree       c66f8f1c7a26c0277875e90107d9315f69ec2adf /mm/page_alloc.c
parent     de74f1cc3b1e9730d9b58580cd11361d30cd182d (diff)
mm: compaction: Abort async compaction if locks are contended or taking too long
Jim Schutt reported a problem that pointed at compaction contending
heavily on locks. The workload is straightforward and, in his own words:
The systems in question have 24 SAS drives spread across 3 HBAs,
running 24 Ceph OSD instances, one per drive. FWIW these servers
are dual-socket Intel 5675 Xeons w/48 GB memory. I've got ~160
Ceph Linux clients doing dd simultaneously to a Ceph file system
backed by 12 of these servers.
Early in the test everything looks fine
procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
r b swpd free buff cache si so bi bo in cs us sy id wa st
31 15 0 287216 576 38606628 0 0 2 1158 2 14 1 3 95 0 0
27 15 0 225288 576 38583384 0 0 18 2222016 203357 134876 11 56 17 15 0
28 17 0 219256 576 38544736 0 0 11 2305932 203141 146296 11 49 23 17 0
6 18 0 215596 576 38552872 0 0 7 2363207 215264 166502 12 45 22 20 0
22 18 0 226984 576 38596404 0 0 3 2445741 223114 179527 12 43 23 22 0
and then it goes to pot
procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
r b swpd free buff cache si so bi bo in cs us sy id wa st
163 8 0 464308 576 36791368 0 0 11 22210 866 536 3 13 79 4 0
207 14 0 917752 576 36181928 0 0 712 1345376 134598 47367 7 90 1 2 0
123 12 0 685516 576 36296148 0 0 429 1386615 158494 60077 8 84 5 3 0
123 12 0 598572 576 36333728 0 0 1107 1233281 147542 62351 7 84 5 4 0
622 7 0 660768 576 36118264 0 0 557 1345548 151394 59353 7 85 4 3 0
223 11 0 283960 576 36463868 0 0 46 1107160 121846 33006 6 93 1 1 0
Note that system CPU usage is very high while the rate at which blocks are
being written out has dropped by 42%. He analysed this with perf and found:
perf record -g -a sleep 10
perf report --sort symbol --call-graph fractal,5
34.63% [k] _raw_spin_lock_irqsave
|
|--97.30%-- isolate_freepages
| compaction_alloc
| unmap_and_move
| migrate_pages
| compact_zone
| compact_zone_order
| try_to_compact_pages
| __alloc_pages_direct_compact
| __alloc_pages_slowpath
| __alloc_pages_nodemask
| alloc_pages_vma
| do_huge_pmd_anonymous_page
| handle_mm_fault
| do_page_fault
| page_fault
| |
| |--87.39%-- skb_copy_datagram_iovec
| | tcp_recvmsg
| | inet_recvmsg
| | sock_recvmsg
| | sys_recvfrom
| | system_call
| | __recv
| | |
| | --100.00%-- (nil)
| |
| --12.61%-- memcpy
--2.70%-- [...]
There was other data but primarily it all showed that compaction was
contending heavily on the zone->lock and the zone->lru_lock.
Commit b2eef8c0 ("mm: compaction: minimise the time IRQs are disabled
while isolating pages for migration") noted that it was possible for
migration to hold the lru_lock for an excessive amount of time. Very
broadly speaking, this patch expands on that concept.
This patch introduces compact_checklock_irqsave() to check if a lock
is contended or the process needs to be scheduled. If either condition
is true then async compaction is aborted and the caller is informed.
The page allocator will fail a THP allocation if compaction failed due
to contention. This patch also introduces compact_trylock_irqsave()
which will acquire the lock only if it is not contended and the process
does not need to schedule.
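To make the locking policy concrete, here is a minimal sketch of what such a
check-and-lock helper can look like. It is an illustration only, not the exact
code from this patch: the struct compact_control members "sync" and
"contended" used below are assumptions, and compact_trylock_irqsave() is
shown as a thin wrapper that simply starts out unlocked.

/*
 * Sketch (assumed context: mm/compaction.c). Take the lock unless it is
 * contended or a reschedule is pending. Async compaction gives up instead
 * of waiting and reports the contention back through cc->contended; sync
 * compaction yields the CPU and then takes the lock anyway. Returns true
 * if the lock is held on exit.
 */
static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
				      bool locked, struct compact_control *cc)
{
	if (need_resched() || spin_is_contended(lock)) {
		if (locked) {
			spin_unlock_irqrestore(lock, *flags);
			locked = false;
		}

		/* Async: abort and tell the caller we were contended. */
		if (!cc->sync) {		/* assumed member */
			if (cc->contended)	/* assumed member */
				*cc->contended = true;
			return false;
		}

		cond_resched();
	}

	if (!locked)
		spin_lock_irqsave(lock, *flags);
	return true;
}

/*
 * For async compaction this acquires the lock only when it is uncontended
 * and no reschedule is pending; otherwise it reports contention and backs off.
 */
#define compact_trylock_irqsave(lock, flags, cc) \
	compact_checklock_irqsave(lock, flags, false, cc)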
Reported-by: Jim Schutt <jaschut@sandia.gov>
Tested-by: Jim Schutt <jaschut@sandia.gov>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07f19248acb5..c66fb875104a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2102,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
 	int migratetype, bool sync_migration,
-	bool *deferred_compaction,
+	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
 	struct page *page;
@@ -2117,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-			nodemask, sync_migration);
+			nodemask, sync_migration,
+			contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 	if (*did_some_progress != COMPACT_SKIPPED) {
 
@@ -2163,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
 	int migratetype, bool sync_migration,
-	bool *deferred_compaction,
+	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
 	return NULL;
@@ -2336,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long did_some_progress;
 	bool sync_migration = false;
 	bool deferred_compaction = false;
+	bool contended_compaction = false;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2425,6 +2427,7 @@ rebalance:
 			nodemask,
 			alloc_flags, preferred_zone,
 			migratetype, sync_migration,
+			&contended_compaction,
 			&deferred_compaction,
 			&did_some_progress);
 	if (page)
@@ -2434,10 +2437,11 @@ rebalance:
 	/*
 	 * If compaction is deferred for high-order allocations, it is because
 	 * sync compaction recently failed. In this is the case and the caller
-	 * has requested the system not be heavily disrupted, fail the
-	 * allocation now instead of entering direct reclaim
+	 * requested a movable allocation that does not heavily disrupt the
+	 * system then fail the allocation instead of entering direct reclaim.
 	 */
-	if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+	if ((deferred_compaction || contended_compaction) &&
+			(gfp_mask & __GFP_NO_KSWAPD))
 		goto nopage;
 
 	/* Try direct reclaim and then allocating */
@@ -2508,6 +2512,7 @@ rebalance:
 			nodemask,
 			alloc_flags, preferred_zone,
 			migratetype, sync_migration,
+			&contended_compaction,
 			&deferred_compaction,
 			&did_some_progress);
 	if (page)