author    Mel Gorman <mgorman@suse.de>                        2013-01-11 17:32:16 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>     2013-01-11 17:54:56 -0500
commit    8fb74b9fb2b182d54beee592350d9ea1f325917a (patch)
tree      ccdcf36cfedb0cf54268226ebde75330e4882539
parent    c0a3a20b6c4b5229ef5d26fd9b1c4b1957632aa7 (diff)
mm: compaction: partially revert capture of suitable high-order page
Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting for POLLIN on a local TCP socket.  It was easier to trigger if there was disk IO and dirty pages at the same time and he bisected it to commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available").

The intention of that patch was to improve high-order allocations under memory pressure after changes made to reclaim in 3.6 drastically hurt THP allocations but the approach was flawed.  For Eric, the problem was that page->pfmemalloc was not being cleared for captured pages leading to a poor interaction with swap-over-NFS support causing the packets to be dropped.  However, I identified a few more problems with the patch including the fact that it can increase contention on zone->lock in some cases which could result in async direct compaction being aborted early.

In retrospect the capture patch took the wrong approach.  What it should have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it was allocating for THP and avoided races that way.  While the patch was showing to improve allocation success rates at the time, the benefit is marginal given the relative complexity and it should be revisited from scratch in the context of the other reclaim-related changes that have taken place since the patch was first written and tested.

This patch partially reverts commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available").

Reported-and-tested-by: Eric Wong <normalperson@yhbt.net>
Tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
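For context, the following is a minimal, illustrative userspace sketch of the kind of workload described in the report: a process ppoll()ing for POLLIN on a loopback TCP socket. It is not Eric Wong's actual reproducer; the real trigger also required concurrent disk IO, dirty pages and enough memory pressure for compaction to capture pages, so that received data could be dropped because page->pfmemalloc had not been cleared. Error handling is omitted for brevity.

/* Illustrative only: loosely modelled on the reported workload, not the
 * actual reproducer. */
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = { .sin_family = AF_INET };
	socklen_t alen = sizeof(addr);
	struct pollfd pfd;
	int lfd, cfd, afd;

	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = 0;		/* bind to any free port */

	lfd = socket(AF_INET, SOCK_STREAM, 0);
	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
	listen(lfd, 1);
	getsockname(lfd, (struct sockaddr *)&addr, &alen);

	cfd = socket(AF_INET, SOCK_STREAM, 0);
	connect(cfd, (struct sockaddr *)&addr, sizeof(addr));
	afd = accept(lfd, NULL, NULL);

	write(cfd, "x", 1);		/* data the receiver waits for */

	pfd.fd = afd;
	pfd.events = POLLIN;

	/* On the affected kernels this wait could stall even though data
	 * was sent, when the delivering skb came from a captured page
	 * that still had page->pfmemalloc set and was dropped. */
	if (ppoll(&pfd, 1, NULL, NULL) == 1 && (pfd.revents & POLLIN))
		puts("POLLIN delivered");

	close(afd);
	close(cfd);
	close(lfd);
	return 0;
}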
-rw-r--r--   include/linux/compaction.h    4
-rw-r--r--   include/linux/mm.h            1
-rw-r--r--   mm/compaction.c              92
-rw-r--r--   mm/internal.h                 1
-rw-r--r--   mm/page_alloc.c              35
5 files changed, 23 insertions(+), 110 deletions(-)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 6ecb6dc2f303..cc7bddeaf553 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended, struct page **page);
+			bool sync, bool *contended);
 extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -75,7 +75,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 63204078f72b..66e2f7c61e5c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -455,7 +455,6 @@ void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
 int split_free_page(struct page *page);
-int capture_free_page(struct page *page, int alloc_order, int migratetype);
 
 /*
  * Compound pages have a destructor function. Provide a
diff --git a/mm/compaction.c b/mm/compaction.c
index f8f5c111b7d7..c62bd063d766 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
+	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
@@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	if (cc->page) {
-		/* Was a suitable page captured? */
-		if (*cc->page)
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		struct free_area *area = &zone->free_area[order];
+
+		/* Job done if page is free of the right migratetype */
+		if (!list_empty(&area->free_list[cc->migratetype]))
+			return COMPACT_PARTIAL;
+
+		/* Job done if allocation would set block type */
+		if (cc->order >= pageblock_order && area->nr_free)
 			return COMPACT_PARTIAL;
-	} else {
-		unsigned int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct free_area *area = &zone->free_area[cc->order];
-			/* Job done if page is free of the right migratetype */
-			if (!list_empty(&area->free_list[cc->migratetype]))
-				return COMPACT_PARTIAL;
-
-			/* Job done if allocation would set block type */
-			if (cc->order >= pageblock_order && area->nr_free)
-				return COMPACT_PARTIAL;
-		}
 	}
 
 	return COMPACT_CONTINUE;
@@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	return COMPACT_CONTINUE;
 }
 
-static void compact_capture_page(struct compact_control *cc)
-{
-	unsigned long flags;
-	int mtype, mtype_low, mtype_high;
-
-	if (!cc->page || *cc->page)
-		return;
-
-	/*
-	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
-	 * regardless of the migratetype of the freelist is is captured from.
-	 * This is fine because the order for a high-order MIGRATE_MOVABLE
-	 * allocation is typically at least a pageblock size and overall
-	 * fragmentation is not impaired. Other allocation types must
-	 * capture pages from their own migratelist because otherwise they
-	 * could pollute other pageblocks like MIGRATE_MOVABLE with
-	 * difficult to move pages and making fragmentation worse overall.
-	 */
-	if (cc->migratetype == MIGRATE_MOVABLE) {
-		mtype_low = 0;
-		mtype_high = MIGRATE_PCPTYPES;
-	} else {
-		mtype_low = cc->migratetype;
-		mtype_high = cc->migratetype + 1;
-	}
-
-	/* Speculatively examine the free lists without zone lock */
-	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
-		int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct page *page;
-			struct free_area *area;
-			area = &(cc->zone->free_area[order]);
-			if (list_empty(&area->free_list[mtype]))
-				continue;
-
-			/* Take the lock and attempt capture of the page */
-			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
-				return;
-			if (!list_empty(&area->free_list[mtype])) {
-				page = list_entry(area->free_list[mtype].next,
-							struct page, lru);
-				if (capture_free_page(page, cc->order, mtype)) {
-					spin_unlock_irqrestore(&cc->zone->lock,
-									flags);
-					*cc->page = page;
-					return;
-				}
-			}
-			spin_unlock_irqrestore(&cc->zone->lock, flags);
-		}
-	}
-}
-
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
@@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
-
-		/* Capture a page now if it is a suitable size */
-		compact_capture_page(cc);
 	}
 
 out:
@@ -1069,8 +1007,7 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended,
-				 struct page **page)
+				 bool sync, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended, page);
+						contended);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
@@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -1203,7 +1138,6 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/internal.h b/mm/internal.h
index d597f94cc205..9ba21100ebf3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,7 +135,6 @@ struct compact_control {
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
 	bool contended;			/* True if a lock was contended */
-	struct page **page;		/* Page captured of requested size */
 };
 
 unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c957805a7f0e..df2022ff0c8a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1384,14 +1384,8 @@ void split_page(struct page *page, unsigned int order)
 		set_page_refcounted(page + i);
 }
 
-/*
- * Similar to the split_page family of functions except that the page
- * required at the given order and being isolated now to prevent races
- * with parallel allocators
- */
-int capture_free_page(struct page *page, int alloc_order, int migratetype)
+static int __isolate_free_page(struct page *page, unsigned int order)
 {
-	unsigned int order;
 	unsigned long watermark;
 	struct zone *zone;
 	int mt;
@@ -1399,7 +1393,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	BUG_ON(!PageBuddy(page));
 
 	zone = page_zone(page);
-	order = page_order(page);
 	mt = get_pageblock_migratetype(page);
 
 	if (mt != MIGRATE_ISOLATE) {
@@ -1408,7 +1401,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
 			return 0;
 
-		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+		__mod_zone_freepage_state(zone, -(1UL << order), mt);
 	}
 
 	/* Remove page from free list */
@@ -1416,11 +1409,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	if (alloc_order != order)
-		expand(zone, page, alloc_order, order,
-			&zone->free_area[order], migratetype);
-
-	/* Set the pageblock if the captured page is at least a pageblock */
+	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
@@ -1431,7 +1420,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		}
 	}
 
-	return 1UL << alloc_order;
+	return 1UL << order;
 }
 
 /*
@@ -1449,10 +1438,9 @@ int split_free_page(struct page *page)
 	unsigned int order;
 	int nr_pages;
 
-	BUG_ON(!PageBuddy(page));
 	order = page_order(page);
 
-	nr_pages = capture_free_page(page, order, 0);
+	nr_pages = __isolate_free_page(page, order);
 	if (!nr_pages)
 		return 0;
 
@@ -2136,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
-	struct page *page = NULL;
-
 	if (!order)
 		return NULL;
 
@@ -2149,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration,
-						contended_compaction, &page);
+						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
-	/* If compaction captured a page, prep and use it */
-	if (page) {
-		prep_new_page(page, order, gfp_mask);
-		goto got_page;
-	}
-
 	if (*did_some_progress != COMPACT_SKIPPED) {
+		struct page *page;
+
 		/* Page migration frees to the PCP lists but we want merging */
 		drain_pages(get_cpu());
 		put_cpu();
@@ -2168,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
 				preferred_zone, migratetype);
 		if (page) {
-got_page:
 			preferred_zone->compact_blockskip_flush = false;
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;