mm: compaction: Abort async compaction if locks are contended or taking too long

Jim Schutt reported a problem that pointed at compaction contending heavily on locks. The workload is straight-forward and in his own words; The systems in question have 24 SAS drives spread across 3 HBAs, running 24 Ceph OSD instances, one per drive. FWIW these servers are dual-socket Intel 5675 Xeons w/48 GB memory. I've got ~160 Ceph Linux clients doing dd simultaneously to a Ceph file system backed by 12 of these servers. Early in the test everything looks fine procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 31 15 0 287216 576 38606628 0 0 2 1158 2 14 1 3 95 0 0 27 15 0 225288 576 38583384 0 0 18 2222016 203357 134876 11 56 17 15 0 28 17 0 219256 576 38544736 0 0 11 2305932 203141 146296 11 49 23 17 0 6 18 0 215596 576 38552872 0 0 7 2363207 215264 166502 12 45 22 20 0 22 18 0 226984 576 38596404 0 0 3 2445741 223114 179527 12 43 23 22 0 and then it goes to pot procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 163 8 0 464308 576 36791368 0 0 11 22210 866 536 3 13 79 4 0 207 14 0 917752 576 36181928 0 0 712 1345376 134598 47367 7 90 1 2 0 123 12 0 685516 576 36296148 0 0 429 1386615 158494 60077 8 84 5 3 0 123 12 0 598572 576 36333728 0 0 1107 1233281 147542 62351 7 84 5 4 0 622 7 0 660768 576 36118264 0 0 557 1345548 151394 59353 7 85 4 3 0 223 11 0 283960 576 36463868 0 0 46 1107160 121846 33006 6 93 1 1 0 Note that system CPU usage is very high blocks being written out has dropped by 42%. 
He analysed this with perf and found perf record -g -a sleep 10 perf report --sort symbol --call-graph fractal,5 34.63% [k] _raw_spin_lock_irqsave | |--97.30%-- isolate_freepages | compaction_alloc | unmap_and_move | migrate_pages | compact_zone | compact_zone_order | try_to_compact_pages | __alloc_pages_direct_compact | __alloc_pages_slowpath | __alloc_pages_nodemask | alloc_pages_vma | do_huge_pmd_anonymous_page | handle_mm_fault | do_page_fault | page_fault | | | |--87.39%-- skb_copy_datagram_iovec | | tcp_recvmsg | | inet_recvmsg | | sock_recvmsg | | sys_recvfrom | | system_call | | __recv | | | | | --100.00%-- (nil) | | | --12.61%-- memcpy --2.70%-- [...] There was other data but primarily it is all showing that compaction is contended heavily on the zone->lock and zone->lru_lock. commit [b2eef8c0: mm: compaction: minimise the time IRQs are disabled while isolating pages for migration] noted that it was possible for migration to hold the lru_lock for an excessive amount of time. Very broadly speaking this patch expands the concept. This patch introduces compact_checklock_irqsave() to check if a lock is contended or the process needs to be scheduled. If either condition is true then async compaction is aborted and the caller is informed. The page allocator will fail a THP allocation if compaction failed due to contention. This patch also introduces compact_trylock_irqsave() which will acquire the lock only if it is not contended and the process does not need to schedule. Reported-by: Jim Schutt <jaschut@sandia.gov> Tested-by: Jim Schutt <jaschut@sandia.gov> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Mel Gorman <mgorman@suse.de> 2012-08-21 19:16:17 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-08-21 19:45:03 -0400
commit: c67fe3752abe6ab47639e2f9b836900c3dc3da84 (patch)
tree: c66f8f1c7a26c0277875e90107d9315f69ec2adf /mm/compaction.c
parent: de74f1cc3b1e9730d9b58580cd11361d30cd182d (diff)
1 files changed, 79 insertions, 21 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index bcce7897e17a..7fcd3a52e68d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -51,6 +51,47 @@ static inline bool migrate_async_suitable(int migratetype)
 }
 /*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. Check if the process needs to be scheduled or
+ * if the lock is contended. For async compaction, back out in the event
+ * that contention is severe. For sync compaction, schedule.
+ *
+ * Returns true if the lock is held.
+ * Returns false if the lock is released and compaction should abort
+ */
+static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+                                      bool locked, struct compact_control *cc)
+{
+        if (need_resched() || spin_is_contended(lock)) {
+                if (locked) {
+                        spin_unlock_irqrestore(lock, *flags);
+                        locked = false;
+                }
+                /* async aborts if taking too long or contended */
+                if (!cc->sync) {
+                        if (cc->contended)
+                                *cc->contended = true;
+                        return false;
+                }
+                cond_resched();
+                if (fatal_signal_pending(current))
+                        return false;
+        }
+        if (!locked)
+                spin_lock_irqsave(lock, *flags);
+        return true;
+}
+static inline bool compact_trylock_irqsave(spinlock_t *lock,
+                        unsigned long *flags, struct compact_control *cc)
+{
+        return compact_checklock_irqsave(lock, flags, false, cc);
+}
+/*
 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
 * pages inside of the pageblock (even though it may still end up isolating
@@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
 }
 /* Update the number of anon and file isolated pages in the zone */
-static void acct_isolated(struct zone *zone, struct compact_control *cc)
+static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
 {
        struct page *page;
        unsigned int count[2] = { 0, };
@@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
        list_for_each_entry(page, &cc->migratepages, lru)
                count[!!page_is_file_cache(page)]++;
-        __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+        /* If locked we can use the interrupt unsafe versions */
-        __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+        if (locked) {
+                __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+                __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+        } else {
+                mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+                mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+        }
 }
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
        struct list_head *migratelist = &cc->migratepages;
        isolate_mode_t mode = 0;
        struct lruvec *lruvec;
+        unsigned long flags;
+        bool locked;
        /*
         * Ensure that there are not too many pages isolated from the LRU
@@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
        /* Time to isolate some pages for migration */
        cond_resched();
-        spin_lock_irq(&zone->lru_lock);
+        spin_lock_irqsave(&zone->lru_lock, flags);
+        locked = true;
        for (; low_pfn < end_pfn; low_pfn++) {
                struct page *page;
-                bool locked = true;
                /* give a chance to irqs before checking need_resched() */
                if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
-                        spin_unlock_irq(&zone->lru_lock);
+                        spin_unlock_irqrestore(&zone->lru_lock, flags);
                        locked = false;
                }
-                if (need_resched() || spin_is_contended(&zone->lru_lock)) {
-                        if (locked)
+                /* Check if it is ok to still hold the lock */
-                                spin_unlock_irq(&zone->lru_lock);
+                locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
-                        cond_resched();
+                                                                locked, cc);
-                        spin_lock_irq(&zone->lru_lock);
+                if (!locked)
-                        if (fatal_signal_pending(current))
+                        break;
-                                break;
-                } else if (!locked)
-                        spin_lock_irq(&zone->lru_lock);
                /*
                 * migrate_pfn does not necessarily start aligned to a
@@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
                }
        }
-        acct_isolated(zone, cc);
+        acct_isolated(zone, locked, cc);
-        spin_unlock_irq(&zone->lru_lock);
+        if (locked)
+                spin_unlock_irqrestore(&zone->lru_lock, flags);
        trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -461,7 +508,16 @@ static void isolate_freepages(struct zone *zone,
                 * are disabled
                 */
                isolated = 0;
-                spin_lock_irqsave(&zone->lock, flags);
+                /*
+                 * The zone lock must be held to isolate freepages. This
+                 * unfortunately is a very coarse lock and can be
+                 * heavily contended if there are parallel allocations
+                 * or parallel compactions. For async compaction do not
+                 * spin on the lock
+                 */
+                if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
+                        break;
                if (suitable_migration_target(page)) {
                        end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
                        isolated = isolate_freepages_block(pfn, end_pfn,
@@ -773,7 +829,7 @@ out:
 static unsigned long compact_zone_order(struct zone *zone,
                                 int order, gfp_t gfp_mask,
-                                 bool sync)
+                                 bool sync, bool *contended)
 {
        struct compact_control cc = {
                .nr_freepages = 0,
@@ -782,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
                .migratetype = allocflags_to_migratetype(gfp_mask),
                .zone = zone,
                .sync = sync,
+                .contended = contended,
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
@@ -803,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
 */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                        bool sync)
+                        bool sync, bool *contended)
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        int may_enter_fs = gfp_mask & __GFP_FS;
@@ -827,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
                                                                nodemask) {
                int status;
-                status = compact_zone_order(zone, order, gfp_mask, sync);
+                status = compact_zone_order(zone, order, gfp_mask, sync,
+                                                contended);
                rc = max(status, rc);
                /* If a normal allocation would succeed, stop compacting */
author	Mel Gorman <mgorman@suse.de>	2012-08-21 19:16:17 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-08-21 19:45:03 -0400
commit	c67fe3752abe6ab47639e2f9b836900c3dc3da84 (patch)
tree	c66f8f1c7a26c0277875e90107d9315f69ec2adf /mm/compaction.c
parent	de74f1cc3b1e9730d9b58580cd11361d30cd182d (diff)

diff --git a/mm/compaction.c b/mm/compaction.c index bcce7897e17a..7fcd3a52e68d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c
@@ -51,6 +51,47 @@ static inline bool migrate_async_suitable(int migratetype)
51	}	51	}
52		52
53	/*	53	/*
		54	* Compaction requires the taking of some coarse locks that are potentially
		55	* very heavily contended. Check if the process needs to be scheduled or
		56	* if the lock is contended. For async compaction, back out in the event
		57	* that contention is severe. For sync compaction, schedule.
		58	*
		59	* Returns true if the lock is held.
		60	* Returns false if the lock is released and compaction should abort
		61	*/
		62	static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
		63	bool locked, struct compact_control *cc)
		64	{
		65	if (need_resched() || spin_is_contended(lock)) {
		66	if (locked) {
		67	spin_unlock_irqrestore(lock, *flags);
		68	locked = false;
		69	}
		70
		71	/* async aborts if taking too long or contended */
		72	if (!cc->sync) {
		73	if (cc->contended)
		74	*cc->contended = true;
		75	return false;
		76	}
		77
		78	cond_resched();
		79	if (fatal_signal_pending(current))
		80	return false;
		81	}
		82
		83	if (!locked)
		84	spin_lock_irqsave(lock, *flags);
		85	return true;
		86	}
		87
		88	static inline bool compact_trylock_irqsave(spinlock_t *lock,
		89	unsigned long *flags, struct compact_control *cc)
		90	{
		91	return compact_checklock_irqsave(lock, flags, false, cc);
		92	}
		93
		94	/*
54	* Isolate free pages onto a private freelist. Caller must hold zone->lock.	95	* Isolate free pages onto a private freelist. Caller must hold zone->lock.
55	* If @strict is true, will abort returning 0 on any invalid PFNs or non-free	96	* If @strict is true, will abort returning 0 on any invalid PFNs or non-free
56	* pages inside of the pageblock (even though it may still end up isolating	97	* pages inside of the pageblock (even though it may still end up isolating
@@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
173	}	214	}
174		215
175	/* Update the number of anon and file isolated pages in the zone */	216	/* Update the number of anon and file isolated pages in the zone */
176	static void acct_isolated(struct zone *zone, struct compact_control *cc)	217	static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
177	{	218	{
178	struct page *page;	219	struct page *page;
179	unsigned int count[2] = { 0, };	220	unsigned int count[2] = { 0, };
@@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
181	list_for_each_entry(page, &cc->migratepages, lru)	222	list_for_each_entry(page, &cc->migratepages, lru)
182	count[!!page_is_file_cache(page)]++;	223	count[!!page_is_file_cache(page)]++;
183		224
184	__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);	225	/* If locked we can use the interrupt unsafe versions */
185	__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);	226	if (locked) {
		227	__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
		228	__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
		229	} else {
		230	mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
		231	mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
		232	}
186	}	233	}
187		234
188	/* Similar to reclaim, but different enough that they don't share logic */	235	/* Similar to reclaim, but different enough that they don't share logic */
@@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
228	struct list_head *migratelist = &cc->migratepages;	275	struct list_head *migratelist = &cc->migratepages;
229	isolate_mode_t mode = 0;	276	isolate_mode_t mode = 0;
230	struct lruvec *lruvec;	277	struct lruvec *lruvec;
		278	unsigned long flags;
		279	bool locked;
231		280
232	/*	281	/*
233	* Ensure that there are not too many pages isolated from the LRU	282	* Ensure that there are not too many pages isolated from the LRU
@@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
247		296
248	/* Time to isolate some pages for migration */	297	/* Time to isolate some pages for migration */
249	cond_resched();	298	cond_resched();
250	spin_lock_irq(&zone->lru_lock);	299	spin_lock_irqsave(&zone->lru_lock, flags);
		300	locked = true;
251	for (; low_pfn < end_pfn; low_pfn++) {	301	for (; low_pfn < end_pfn; low_pfn++) {
252	struct page *page;	302	struct page *page;
253	bool locked = true;
254		303
255	/* give a chance to irqs before checking need_resched() */	304	/* give a chance to irqs before checking need_resched() */
256	if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {	305	if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
257	spin_unlock_irq(&zone->lru_lock);	306	spin_unlock_irqrestore(&zone->lru_lock, flags);
258	locked = false;	307	locked = false;
259	}	308	}
260	if (need_resched() || spin_is_contended(&zone->lru_lock)) {	309
261	if (locked)	310	/* Check if it is ok to still hold the lock */
262	spin_unlock_irq(&zone->lru_lock);	311	locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
263	cond_resched();	312	locked, cc);
264	spin_lock_irq(&zone->lru_lock);	313	if (!locked)
265	if (fatal_signal_pending(current))	314	break;
266	break;
267	} else if (!locked)
268	spin_lock_irq(&zone->lru_lock);
269		315
270	/*	316	/*
271	* migrate_pfn does not necessarily start aligned to a	317	* migrate_pfn does not necessarily start aligned to a
@@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
349	}	395	}
350	}	396	}
351		397
352	acct_isolated(zone, cc);	398	acct_isolated(zone, locked, cc);
353		399
354	spin_unlock_irq(&zone->lru_lock);	400	if (locked)
		401	spin_unlock_irqrestore(&zone->lru_lock, flags);
355		402
356	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);	403	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
357		404
@@ -461,7 +508,16 @@ static void isolate_freepages(struct zone *zone,
461	* are disabled	508	* are disabled
462	*/	509	*/
463	isolated = 0;	510	isolated = 0;
464	spin_lock_irqsave(&zone->lock, flags);	511
		512	/*
		513	* The zone lock must be held to isolate freepages. This
		514	* unfortunately is a very coarse lock and can be
		515	* heavily contended if there are parallel allocations
		516	* or parallel compactions. For async compaction do not
		517	* spin on the lock
		518	*/
		519	if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
		520	break;
465	if (suitable_migration_target(page)) {	521	if (suitable_migration_target(page)) {
466	end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);	522	end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
467	isolated = isolate_freepages_block(pfn, end_pfn,	523	isolated = isolate_freepages_block(pfn, end_pfn,
@@ -773,7 +829,7 @@ out:
773		829
774	static unsigned long compact_zone_order(struct zone *zone,	830	static unsigned long compact_zone_order(struct zone *zone,
775	int order, gfp_t gfp_mask,	831	int order, gfp_t gfp_mask,
776	bool sync)	832	bool sync, bool *contended)
777	{	833	{
778	struct compact_control cc = {	834	struct compact_control cc = {
779	.nr_freepages = 0,	835	.nr_freepages = 0,
@@ -782,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
782	.migratetype = allocflags_to_migratetype(gfp_mask),	838	.migratetype = allocflags_to_migratetype(gfp_mask),
783	.zone = zone,	839	.zone = zone,
784	.sync = sync,	840	.sync = sync,
		841	.contended = contended,
785	};	842	};
786	INIT_LIST_HEAD(&cc.freepages);	843	INIT_LIST_HEAD(&cc.freepages);
787	INIT_LIST_HEAD(&cc.migratepages);	844	INIT_LIST_HEAD(&cc.migratepages);
@@ -803,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
803	*/	860	*/
804	unsigned long try_to_compact_pages(struct zonelist *zonelist,	861	unsigned long try_to_compact_pages(struct zonelist *zonelist,
805	int order, gfp_t gfp_mask, nodemask_t *nodemask,	862	int order, gfp_t gfp_mask, nodemask_t *nodemask,
806	bool sync)	863	bool sync, bool *contended)
807	{	864	{
808	enum zone_type high_zoneidx = gfp_zone(gfp_mask);	865	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
809	int may_enter_fs = gfp_mask & __GFP_FS;	866	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -827,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
827	nodemask) {	884	nodemask) {
828	int status;	885	int status;
829		886
830	status = compact_zone_order(zone, order, gfp_mask, sync);	887	status = compact_zone_order(zone, order, gfp_mask, sync,
		888	contended);
831	rc = max(status, rc);	889	rc = max(status, rc);
832		890
833	/* If a normal allocation would succeed, stop compacting */	891	/* If a normal allocation would succeed, stop compacting */