author    Mel Gorman <mgorman@suse.de>	2013-01-11 17:32:16 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>	2013-01-11 17:54:56 -0500
commit    8fb74b9fb2b182d54beee592350d9ea1f325917a (patch)
tree      ccdcf36cfedb0cf54268226ebde75330e4882539 /mm/compaction.c
parent    c0a3a20b6c4b5229ef5d26fd9b1c4b1957632aa7 (diff)
mm: compaction: partially revert capture of suitable high-order page
Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting
for POLLIN on a local TCP socket.  It was easier to trigger if there was
disk IO and dirty pages at the same time and he bisected it to commit
1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page
immediately when it is made available").

The intention of that patch was to improve high-order allocations under
memory pressure after changes made to reclaim in 3.6 drastically hurt THP
allocations, but the approach was flawed.  For Eric, the problem was that
page->pfmemalloc was not being cleared for captured pages, leading to a
poor interaction with swap-over-NFS support that caused the packets to be
dropped.  However, I identified a few more problems with the patch,
including the fact that it can increase contention on zone->lock in some
cases, which could result in async direct compaction being aborted early.

In retrospect the capture patch took the wrong approach.  What it should
have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it
was allocating for THP and avoided races that way.  While the patch was
shown to improve allocation success rates at the time, the benefit is
marginal given the relative complexity, and it should be revisited from
scratch in the context of the other reclaim-related changes that have
taken place since the patch was first written and tested.

This patch partially reverts commit 1fb3f8ca0e92 ("mm: compaction:
capture a suitable high-order page immediately when it is made
available").

Reported-and-tested-by: Eric Wong <normalperson@yhbt.net>
Tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
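For reference, a condensed sketch of the termination check that this revert
restores in compact_finished(): rather than waiting for a captured page,
compaction is declared done as soon as the free lists already hold what the
request needs.  All identifiers are taken from the hunks below; this is a
reading aid, not new code introduced by the patch.

	/* Direct compactor: is a suitable page already free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];

		/* Done if a free page of the right migratetype exists */
		if (!list_empty(&area->free_list[cc->migratetype]))
			return COMPACT_PARTIAL;

		/* Done if any free page would convert the whole pageblock */
		if (cc->order >= pageblock_order && area->nr_free)
			return COMPACT_PARTIAL;
	}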
Diffstat (limited to 'mm/compaction.c')
-rw-r--r--	mm/compaction.c	92
1 file changed, 13 insertions, 79 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index f8f5c111b7d7..c62bd063d766 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
+	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
@@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	if (cc->page) {
-		/* Was a suitable page captured? */
-		if (*cc->page)
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		struct free_area *area = &zone->free_area[order];
+
+		/* Job done if page is free of the right migratetype */
+		if (!list_empty(&area->free_list[cc->migratetype]))
+			return COMPACT_PARTIAL;
+
+		/* Job done if allocation would set block type */
+		if (cc->order >= pageblock_order && area->nr_free)
 			return COMPACT_PARTIAL;
-	} else {
-		unsigned int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct free_area *area = &zone->free_area[cc->order];
-			/* Job done if page is free of the right migratetype */
-			if (!list_empty(&area->free_list[cc->migratetype]))
-				return COMPACT_PARTIAL;
-
-			/* Job done if allocation would set block type */
-			if (cc->order >= pageblock_order && area->nr_free)
-				return COMPACT_PARTIAL;
-		}
 	}
 
 	return COMPACT_CONTINUE;
@@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	return COMPACT_CONTINUE;
 }
 
-static void compact_capture_page(struct compact_control *cc)
-{
-	unsigned long flags;
-	int mtype, mtype_low, mtype_high;
-
-	if (!cc->page || *cc->page)
-		return;
-
-	/*
-	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
-	 * regardless of the migratetype of the freelist is is captured from.
-	 * This is fine because the order for a high-order MIGRATE_MOVABLE
-	 * allocation is typically at least a pageblock size and overall
-	 * fragmentation is not impaired. Other allocation types must
-	 * capture pages from their own migratelist because otherwise they
-	 * could pollute other pageblocks like MIGRATE_MOVABLE with
-	 * difficult to move pages and making fragmentation worse overall.
-	 */
-	if (cc->migratetype == MIGRATE_MOVABLE) {
-		mtype_low = 0;
-		mtype_high = MIGRATE_PCPTYPES;
-	} else {
-		mtype_low = cc->migratetype;
-		mtype_high = cc->migratetype + 1;
-	}
-
-	/* Speculatively examine the free lists without zone lock */
-	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
-		int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct page *page;
-			struct free_area *area;
-			area = &(cc->zone->free_area[order]);
-			if (list_empty(&area->free_list[mtype]))
-				continue;
-
-			/* Take the lock and attempt capture of the page */
-			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
-				return;
-			if (!list_empty(&area->free_list[mtype])) {
-				page = list_entry(area->free_list[mtype].next,
-							struct page, lru);
-				if (capture_free_page(page, cc->order, mtype)) {
-					spin_unlock_irqrestore(&cc->zone->lock,
-								flags);
-					*cc->page = page;
-					return;
-				}
-			}
-			spin_unlock_irqrestore(&cc->zone->lock, flags);
-		}
-	}
-}
-
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
@@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
-
-		/* Capture a page now if it is a suitable size */
-		compact_capture_page(cc);
 	}
 
 out:
@@ -1069,8 +1007,7 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended,
-				 struct page **page)
+				 bool sync, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended, page);
+						contended);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
@@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -1203,7 +1138,6 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
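Callers have to drop the captured-page argument to match the new signatures
above.  A minimal illustrative call site follows; the variable names are
assumptions and the real caller (__alloc_pages_direct_compact() in
mm/page_alloc.c) is updated in the same commit but lies outside this
diffstat.

	/* Illustrative only: names here are assumptions, not the actual
	 * mm/page_alloc.c code, which is not part of this file's diff. */
	bool contended = false;
	unsigned long rc;

	rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
				  sync_migration, &contended);
	/* No captured page is handed back any more; on COMPACT_PARTIAL the
	 * caller simply retries the normal allocation path. */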