author    Mel Gorman <mgorman@suse.de>	2013-01-11 17:32:16 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>	2013-01-11 17:54:56 -0500
commit    8fb74b9fb2b182d54beee592350d9ea1f325917a (patch)
tree      ccdcf36cfedb0cf54268226ebde75330e4882539 /mm/compaction.c
parent    c0a3a20b6c4b5229ef5d26fd9b1c4b1957632aa7 (diff)
mm: compaction: partially revert capture of suitable high-order page
Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting
for POLLIN on a local TCP socket.  It was easier to trigger if there was
disk IO and dirty pages at the same time and he bisected it to commit
1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page
immediately when it is made available").

The intention of that patch was to improve high-order allocations under
memory pressure after changes made to reclaim in 3.6 drastically hurt THP
allocations, but the approach was flawed.  For Eric, the problem was that
page->pfmemalloc was not being cleared for captured pages, leading to a
poor interaction with swap-over-NFS support that caused the packets to be
dropped.  However, I identified a few more problems with the patch,
including the fact that it can increase contention on zone->lock in some
cases, which could result in async direct compaction being aborted early.

In retrospect the capture patch took the wrong approach.  What it should
have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it
was allocating for THP and avoided races that way.  While the patch was
shown to improve allocation success rates at the time, the benefit is
marginal given the relative complexity, and it should be revisited from
scratch in the context of the other reclaim-related changes that have
taken place since the patch was first written and tested.

This patch partially reverts commit 1fb3f8ca0e92 ("mm: compaction:
capture a suitable high-order page immediately when it is made
available").

Reported-and-tested-by: Eric Wong <normalperson@yhbt.net>
Tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
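For reference, a condensed sketch of the termination check that this revert
restores in compact_finished(): rather than waiting for a captured page,
compaction is declared done as soon as the free lists already hold what the
request needs.  All identifiers are taken from the hunks below; this is a
reading aid, not new code introduced by the patch.

	/* Direct compactor: is a suitable page already free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];

		/* Done if a free page of the right migratetype exists */
		if (!list_empty(&area->free_list[cc->migratetype]))
			return COMPACT_PARTIAL;

		/* Done if any free page would convert the whole pageblock */
		if (cc->order >= pageblock_order && area->nr_free)
			return COMPACT_PARTIAL;
	}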
Diffstat (limited to 'mm/compaction.c')
-rw-r--r--	mm/compaction.c	92
1 file changed, 13 insertions, 79 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index f8f5c111b7d7..c62bd063d766 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
+	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
@@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	if (cc->page) {
-		/* Was a suitable page captured? */
-		if (*cc->page)
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		struct free_area *area = &zone->free_area[order];
+
+		/* Job done if page is free of the right migratetype */
+		if (!list_empty(&area->free_list[cc->migratetype]))
+			return COMPACT_PARTIAL;
+
+		/* Job done if allocation would set block type */
+		if (cc->order >= pageblock_order && area->nr_free)
 			return COMPACT_PARTIAL;
-	} else {
-		unsigned int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct free_area *area = &zone->free_area[cc->order];
-			/* Job done if page is free of the right migratetype */
-			if (!list_empty(&area->free_list[cc->migratetype]))
-				return COMPACT_PARTIAL;
-
-			/* Job done if allocation would set block type */
-			if (cc->order >= pageblock_order && area->nr_free)
-				return COMPACT_PARTIAL;
-		}
 	}
 
 	return COMPACT_CONTINUE;
@@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	return COMPACT_CONTINUE;
 }
 
-static void compact_capture_page(struct compact_control *cc)
-{
-	unsigned long flags;
-	int mtype, mtype_low, mtype_high;
-
-	if (!cc->page || *cc->page)
-		return;
-
-	/*
-	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
-	 * regardless of the migratetype of the freelist is is captured from.
-	 * This is fine because the order for a high-order MIGRATE_MOVABLE
-	 * allocation is typically at least a pageblock size and overall
-	 * fragmentation is not impaired. Other allocation types must
-	 * capture pages from their own migratelist because otherwise they
-	 * could pollute other pageblocks like MIGRATE_MOVABLE with
-	 * difficult to move pages and making fragmentation worse overall.
-	 */
-	if (cc->migratetype == MIGRATE_MOVABLE) {
-		mtype_low = 0;
-		mtype_high = MIGRATE_PCPTYPES;
-	} else {
-		mtype_low = cc->migratetype;
-		mtype_high = cc->migratetype + 1;
-	}
-
-	/* Speculatively examine the free lists without zone lock */
-	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
-		int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct page *page;
-			struct free_area *area;
-			area = &(cc->zone->free_area[order]);
-			if (list_empty(&area->free_list[mtype]))
-				continue;
-
-			/* Take the lock and attempt capture of the page */
-			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
-				return;
-			if (!list_empty(&area->free_list[mtype])) {
-				page = list_entry(area->free_list[mtype].next,
-							struct page, lru);
-				if (capture_free_page(page, cc->order, mtype)) {
-					spin_unlock_irqrestore(&cc->zone->lock,
-								flags);
-					*cc->page = page;
-					return;
-				}
-			}
-			spin_unlock_irqrestore(&cc->zone->lock, flags);
-		}
-	}
-}
-
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
@@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
-
-		/* Capture a page now if it is a suitable size */
-		compact_capture_page(cc);
 	}
 
 out:
@@ -1069,8 +1007,7 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended,
-				 struct page **page)
+				 bool sync, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended, page);
+						contended);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
@@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -1203,7 +1138,6 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
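Callers have to drop the captured-page argument to match the new signatures
above.  A minimal illustrative call site follows; the variable names are
assumptions and the real caller (__alloc_pages_direct_compact() in
mm/page_alloc.c) is updated in the same commit but lies outside this
diffstat.

	/* Illustrative only: names here are assumptions, not the actual
	 * mm/page_alloc.c code, which is not part of this file's diff. */
	bool contended = false;
	unsigned long rc;

	rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
				  sync_migration, &contended);
	/* No captured page is handed back any more; on COMPACT_PARTIAL the
	 * caller simply retries the normal allocation path. */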