aboutsummaryrefslogtreecommitdiffstats
path: root/mm/compaction.c
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2012-08-21 19:16:17 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-08-21 19:45:03 -0400
commitc67fe3752abe6ab47639e2f9b836900c3dc3da84 (patch)
treec66f8f1c7a26c0277875e90107d9315f69ec2adf /mm/compaction.c
parentde74f1cc3b1e9730d9b58580cd11361d30cd182d (diff)
mm: compaction: Abort async compaction if locks are contended or taking too long
Jim Schutt reported a problem that pointed at compaction contending heavily on locks. The workload is straight-forward and in his own words; The systems in question have 24 SAS drives spread across 3 HBAs, running 24 Ceph OSD instances, one per drive. FWIW these servers are dual-socket Intel 5675 Xeons w/48 GB memory. I've got ~160 Ceph Linux clients doing dd simultaneously to a Ceph file system backed by 12 of these servers. Early in the test everything looks fine procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 31 15 0 287216 576 38606628 0 0 2 1158 2 14 1 3 95 0 0 27 15 0 225288 576 38583384 0 0 18 2222016 203357 134876 11 56 17 15 0 28 17 0 219256 576 38544736 0 0 11 2305932 203141 146296 11 49 23 17 0 6 18 0 215596 576 38552872 0 0 7 2363207 215264 166502 12 45 22 20 0 22 18 0 226984 576 38596404 0 0 3 2445741 223114 179527 12 43 23 22 0 and then it goes to pot procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 163 8 0 464308 576 36791368 0 0 11 22210 866 536 3 13 79 4 0 207 14 0 917752 576 36181928 0 0 712 1345376 134598 47367 7 90 1 2 0 123 12 0 685516 576 36296148 0 0 429 1386615 158494 60077 8 84 5 3 0 123 12 0 598572 576 36333728 0 0 1107 1233281 147542 62351 7 84 5 4 0 622 7 0 660768 576 36118264 0 0 557 1345548 151394 59353 7 85 4 3 0 223 11 0 283960 576 36463868 0 0 46 1107160 121846 33006 6 93 1 1 0 Note that system CPU usage is very high blocks being written out has dropped by 42%. He analysed this with perf and found perf record -g -a sleep 10 perf report --sort symbol --call-graph fractal,5 34.63% [k] _raw_spin_lock_irqsave | |--97.30%-- isolate_freepages | compaction_alloc | unmap_and_move | migrate_pages | compact_zone | compact_zone_order | try_to_compact_pages | __alloc_pages_direct_compact | __alloc_pages_slowpath | __alloc_pages_nodemask | alloc_pages_vma | do_huge_pmd_anonymous_page | handle_mm_fault | do_page_fault | page_fault | | | |--87.39%-- skb_copy_datagram_iovec | | tcp_recvmsg | | inet_recvmsg | | sock_recvmsg | | sys_recvfrom | | system_call | | __recv | | | | | --100.00%-- (nil) | | | --12.61%-- memcpy --2.70%-- [...] There was other data but primarily it is all showing that compaction is contended heavily on the zone->lock and zone->lru_lock. commit [b2eef8c0: mm: compaction: minimise the time IRQs are disabled while isolating pages for migration] noted that it was possible for migration to hold the lru_lock for an excessive amount of time. Very broadly speaking this patch expands the concept. This patch introduces compact_checklock_irqsave() to check if a lock is contended or the process needs to be scheduled. If either condition is true then async compaction is aborted and the caller is informed. The page allocator will fail a THP allocation if compaction failed due to contention. This patch also introduces compact_trylock_irqsave() which will acquire the lock only if it is not contended and the process does not need to schedule. Reported-by: Jim Schutt <jaschut@sandia.gov> Tested-by: Jim Schutt <jaschut@sandia.gov> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/compaction.c')
-rw-r--r--mm/compaction.c100
1 files changed, 79 insertions, 21 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index bcce7897e17a..7fcd3a52e68d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -51,6 +51,47 @@ static inline bool migrate_async_suitable(int migratetype)
51} 51}
52 52
53/* 53/*
54 * Compaction requires the taking of some coarse locks that are potentially
55 * very heavily contended. Check if the process needs to be scheduled or
56 * if the lock is contended. For async compaction, back out in the event
57 * if contention is severe. For sync compaction, schedule.
58 *
59 * Returns true if the lock is held.
60 * Returns false if the lock is released and compaction should abort
61 */
62static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
63 bool locked, struct compact_control *cc)
64{
65 if (need_resched() || spin_is_contended(lock)) {
66 if (locked) {
67 spin_unlock_irqrestore(lock, *flags);
68 locked = false;
69 }
70
71 /* async aborts if taking too long or contended */
72 if (!cc->sync) {
73 if (cc->contended)
74 *cc->contended = true;
75 return false;
76 }
77
78 cond_resched();
79 if (fatal_signal_pending(current))
80 return false;
81 }
82
83 if (!locked)
84 spin_lock_irqsave(lock, *flags);
85 return true;
86}
87
88static inline bool compact_trylock_irqsave(spinlock_t *lock,
89 unsigned long *flags, struct compact_control *cc)
90{
91 return compact_checklock_irqsave(lock, flags, false, cc);
92}
93
94/*
54 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 95 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
55 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 96 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
56 * pages inside of the pageblock (even though it may still end up isolating 97 * pages inside of the pageblock (even though it may still end up isolating
@@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
173} 214}
174 215
175/* Update the number of anon and file isolated pages in the zone */ 216/* Update the number of anon and file isolated pages in the zone */
176static void acct_isolated(struct zone *zone, struct compact_control *cc) 217static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
177{ 218{
178 struct page *page; 219 struct page *page;
179 unsigned int count[2] = { 0, }; 220 unsigned int count[2] = { 0, };
@@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
181 list_for_each_entry(page, &cc->migratepages, lru) 222 list_for_each_entry(page, &cc->migratepages, lru)
182 count[!!page_is_file_cache(page)]++; 223 count[!!page_is_file_cache(page)]++;
183 224
184 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); 225 /* If locked we can use the interrupt unsafe versions */
185 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); 226 if (locked) {
227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
229 } else {
230 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
231 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
232 }
186} 233}
187 234
188/* Similar to reclaim, but different enough that they don't share logic */ 235/* Similar to reclaim, but different enough that they don't share logic */
@@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
228 struct list_head *migratelist = &cc->migratepages; 275 struct list_head *migratelist = &cc->migratepages;
229 isolate_mode_t mode = 0; 276 isolate_mode_t mode = 0;
230 struct lruvec *lruvec; 277 struct lruvec *lruvec;
278 unsigned long flags;
279 bool locked;
231 280
232 /* 281 /*
233 * Ensure that there are not too many pages isolated from the LRU 282 * Ensure that there are not too many pages isolated from the LRU
@@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
247 296
248 /* Time to isolate some pages for migration */ 297 /* Time to isolate some pages for migration */
249 cond_resched(); 298 cond_resched();
250 spin_lock_irq(&zone->lru_lock); 299 spin_lock_irqsave(&zone->lru_lock, flags);
300 locked = true;
251 for (; low_pfn < end_pfn; low_pfn++) { 301 for (; low_pfn < end_pfn; low_pfn++) {
252 struct page *page; 302 struct page *page;
253 bool locked = true;
254 303
255 /* give a chance to irqs before checking need_resched() */ 304 /* give a chance to irqs before checking need_resched() */
256 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 305 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
257 spin_unlock_irq(&zone->lru_lock); 306 spin_unlock_irqrestore(&zone->lru_lock, flags);
258 locked = false; 307 locked = false;
259 } 308 }
260 if (need_resched() || spin_is_contended(&zone->lru_lock)) { 309
261 if (locked) 310 /* Check if it is ok to still hold the lock */
262 spin_unlock_irq(&zone->lru_lock); 311 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
263 cond_resched(); 312 locked, cc);
264 spin_lock_irq(&zone->lru_lock); 313 if (!locked)
265 if (fatal_signal_pending(current)) 314 break;
266 break;
267 } else if (!locked)
268 spin_lock_irq(&zone->lru_lock);
269 315
270 /* 316 /*
271 * migrate_pfn does not necessarily start aligned to a 317 * migrate_pfn does not necessarily start aligned to a
@@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
349 } 395 }
350 } 396 }
351 397
352 acct_isolated(zone, cc); 398 acct_isolated(zone, locked, cc);
353 399
354 spin_unlock_irq(&zone->lru_lock); 400 if (locked)
401 spin_unlock_irqrestore(&zone->lru_lock, flags);
355 402
356 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 403 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
357 404
@@ -461,7 +508,16 @@ static void isolate_freepages(struct zone *zone,
461 * are disabled 508 * are disabled
462 */ 509 */
463 isolated = 0; 510 isolated = 0;
464 spin_lock_irqsave(&zone->lock, flags); 511
512 /*
513 * The zone lock must be held to isolate freepages. This
514 * unfortunately this is a very coarse lock and can be
515 * heavily contended if there are parallel allocations
516 * or parallel compactions. For async compaction do not
517 * spin on the lock
518 */
519 if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
520 break;
465 if (suitable_migration_target(page)) { 521 if (suitable_migration_target(page)) {
466 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 522 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
467 isolated = isolate_freepages_block(pfn, end_pfn, 523 isolated = isolate_freepages_block(pfn, end_pfn,
@@ -773,7 +829,7 @@ out:
773 829
774static unsigned long compact_zone_order(struct zone *zone, 830static unsigned long compact_zone_order(struct zone *zone,
775 int order, gfp_t gfp_mask, 831 int order, gfp_t gfp_mask,
776 bool sync) 832 bool sync, bool *contended)
777{ 833{
778 struct compact_control cc = { 834 struct compact_control cc = {
779 .nr_freepages = 0, 835 .nr_freepages = 0,
@@ -782,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
782 .migratetype = allocflags_to_migratetype(gfp_mask), 838 .migratetype = allocflags_to_migratetype(gfp_mask),
783 .zone = zone, 839 .zone = zone,
784 .sync = sync, 840 .sync = sync,
841 .contended = contended,
785 }; 842 };
786 INIT_LIST_HEAD(&cc.freepages); 843 INIT_LIST_HEAD(&cc.freepages);
787 INIT_LIST_HEAD(&cc.migratepages); 844 INIT_LIST_HEAD(&cc.migratepages);
@@ -803,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
803 */ 860 */
804unsigned long try_to_compact_pages(struct zonelist *zonelist, 861unsigned long try_to_compact_pages(struct zonelist *zonelist,
805 int order, gfp_t gfp_mask, nodemask_t *nodemask, 862 int order, gfp_t gfp_mask, nodemask_t *nodemask,
806 bool sync) 863 bool sync, bool *contended)
807{ 864{
808 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 865 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
809 int may_enter_fs = gfp_mask & __GFP_FS; 866 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -827,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
827 nodemask) { 884 nodemask) {
828 int status; 885 int status;
829 886
830 status = compact_zone_order(zone, order, gfp_mask, sync); 887 status = compact_zone_order(zone, order, gfp_mask, sync,
888 contended);
831 rc = max(status, rc); 889 rc = max(status, rc);
832 890
833 /* If a normal allocation would succeed, stop compacting */ 891 /* If a normal allocation would succeed, stop compacting */