aboutsummaryrefslogtreecommitdiffstats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2012-05-29 18:06:19 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-05-29 19:22:19 -0400
commitc53919adc045bf803252e912f23028a68525753d (patch)
tree352361d84125b06ca890f3a2b3568512cc14c458 /mm/vmscan.c
parente709ffd6169ccd259eb5874e853303e91e94e829 (diff)
mm: vmscan: remove lumpy reclaim
This series removes lumpy reclaim and some stalling logic that was unintentionally being used by memory compaction. The end result is that stalling on dirty pages during page reclaim now depends on wait_iff_congested(). Four kernels were compared 3.3.0 vanilla 3.4.0-rc2 vanilla 3.4.0-rc2 lumpyremove-v2 is patch one from this series 3.4.0-rc2 nosync-v2r3 is the full series Removing lumpy reclaim saves almost 900 bytes of text whereas the full series removes 1200 bytes. text data bss dec hex filename 6740375 1927944 2260992 10929311 a6c49f vmlinux-3.4.0-rc2-vanilla 6739479 1927944 2260992 10928415 a6c11f vmlinux-3.4.0-rc2-lumpyremove-v2 6739159 1927944 2260992 10928095 a6bfdf vmlinux-3.4.0-rc2-nosync-v2 There are behaviour changes in the series and so tests were run with monitoring of ftrace events. This disrupts results so the performance results are distorted but the new behaviour should be clearer. fs-mark running in a threaded configuration showed little of interest as it did not push reclaim aggressively FS-Mark Multi Threaded 3.3.0-vanilla rc2-vanilla lumpyremove-v2r3 nosync-v2r3 Files/s min 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) Files/s mean 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) Files/s stddev 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Files/s max 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) 3.20 ( 0.00%) Overhead min 508667.00 ( 0.00%) 521350.00 (-2.49%) 544292.00 (-7.00%) 547168.00 (-7.57%) Overhead mean 551185.00 ( 0.00%) 652690.73 (-18.42%) 991208.40 (-79.83%) 570130.53 (-3.44%) Overhead stddev 18200.69 ( 0.00%) 331958.29 (-1723.88%) 1579579.43 (-8578.68%) 9576.81 (47.38%) Overhead max 576775.00 ( 0.00%) 1846634.00 (-220.17%) 6901055.00 (-1096.49%) 585675.00 (-1.54%) MMTests Statistics: duration Sys Time Running Test (seconds) 309.90 300.95 307.33 298.95 User+Sys Time Running Test (seconds) 319.32 309.67 315.69 307.51 Total Elapsed Time (seconds) 1187.85 1193.09 1191.98 1193.73 MMTests Statistics: vmstat Page 
Ins 80532 82212 81420 79480 Page Outs 111434984 111456240 111437376 111582628 Swap Ins 0 0 0 0 Swap Outs 0 0 0 0 Direct pages scanned 44881 27889 27453 34843 Kswapd pages scanned 25841428 25860774 25861233 25843212 Kswapd pages reclaimed 25841393 25860741 25861199 25843179 Direct pages reclaimed 44881 27889 27453 34843 Kswapd efficiency 99% 99% 99% 99% Kswapd velocity 21754.791 21675.460 21696.029 21649.127 Direct efficiency 100% 100% 100% 100% Direct velocity 37.783 23.375 23.031 29.188 Percentage direct scans 0% 0% 0% 0% ftrace showed that there was no stalling on writeback or pages submitted for IO from reclaim context. postmark was similar and while it was more interesting, it also did not push reclaim heavily. POSTMARK 3.3.0-vanilla rc2-vanilla lumpyremove-v2r3 nosync-v2r3 Transactions per second: 16.00 ( 0.00%) 20.00 (25.00%) 18.00 (12.50%) 17.00 ( 6.25%) Data megabytes read per second: 18.80 ( 0.00%) 24.27 (29.10%) 22.26 (18.40%) 20.54 ( 9.26%) Data megabytes written per second: 35.83 ( 0.00%) 46.25 (29.08%) 42.42 (18.39%) 39.14 ( 9.24%) Files created alone per second: 28.00 ( 0.00%) 38.00 (35.71%) 34.00 (21.43%) 30.00 ( 7.14%) Files create/transact per second: 8.00 ( 0.00%) 10.00 (25.00%) 9.00 (12.50%) 8.00 ( 0.00%) Files deleted alone per second: 556.00 ( 0.00%) 1224.00 (120.14%) 3062.00 (450.72%) 6124.00 (1001.44%) Files delete/transact per second: 8.00 ( 0.00%) 10.00 (25.00%) 9.00 (12.50%) 8.00 ( 0.00%) MMTests Statistics: duration Sys Time Running Test (seconds) 113.34 107.99 109.73 108.72 User+Sys Time Running Test (seconds) 145.51 139.81 143.32 143.55 Total Elapsed Time (seconds) 1159.16 899.23 980.17 1062.27 MMTests Statistics: vmstat Page Ins 13710192 13729032 13727944 13760136 Page Outs 43071140 42987228 42733684 42931624 Swap Ins 0 0 0 0 Swap Outs 0 0 0 0 Direct pages scanned 0 0 0 0 Kswapd pages scanned 9941613 9937443 9939085 9929154 Kswapd pages reclaimed 9940926 9936751 9938397 9928465 Direct pages reclaimed 0 0 0 0 Kswapd efficiency 99% 99% 
99% 99% Kswapd velocity 8576.567 11051.058 10140.164 9347.109 Direct efficiency 100% 100% 100% 100% Direct velocity 0.000 0.000 0.000 0.000 It looks like here that the full series regresses performance but as ftrace showed no usage of wait_iff_congested() or sync reclaim I am assuming it's a disruption due to monitoring. Other data such as memory usage, page IO, swap IO all looked similar. Running a benchmark with a plain DD showed nothing very interesting. The full series stalled in wait_iff_congested() slightly less but stall times on vanilla kernels were marginal. Running a benchmark that hammered on file-backed mappings showed stalls due to congestion but not in sync writebacks MICRO 3.3.0-vanilla rc2-vanilla lumpyremove-v2r3 nosync-v2r3 MMTests Statistics: duration Sys Time Running Test (seconds) 308.13 294.50 298.75 299.53 User+Sys Time Running Test (seconds) 330.45 316.28 318.93 320.79 Total Elapsed Time (seconds) 1814.90 1833.88 1821.14 1832.91 MMTests Statistics: vmstat Page Ins 108712 120708 97224 110344 Page Outs 155514576 156017404 155813676 156193256 Swap Ins 0 0 0 0 Swap Outs 0 0 0 0 Direct pages scanned 2599253 1550480 2512822 2414760 Kswapd pages scanned 69742364 71150694 68839041 69692533 Kswapd pages reclaimed 34824488 34773341 34796602 34799396 Direct pages reclaimed 53693 94750 61792 75205 Kswapd efficiency 49% 48% 50% 49% Kswapd velocity 38427.662 38797.901 37799.972 38022.889 Direct efficiency 2% 6% 2% 3% Direct velocity 1432.174 845.464 1379.807 1317.446 Percentage direct scans 3% 2% 3% 3% Page writes by reclaim 0 0 0 0 Page writes file 0 0 0 0 Page writes anon 0 0 0 0 Page reclaim immediate 0 0 0 1218 Page rescued immediate 0 0 0 0 Slabs scanned 15360 16384 13312 16384 Direct inode steals 0 0 0 0 Kswapd inode steals 4340 4327 1630 4323 FTrace Reclaim Statistics: congestion_wait Direct number congest waited 0 0 0 0 Direct time congest waited 0ms 0ms 0ms 0ms Direct full congest waited 0 0 0 0 Direct number conditional waited 900 870 754 789 
Direct time conditional waited 0ms 0ms 0ms 20ms Direct full conditional waited 0 0 0 0 KSwapd number congest waited 2106 2308 2116 1915 KSwapd time congest waited 139924ms 157832ms 125652ms 132516ms KSwapd full congest waited 1346 1530 1202 1278 KSwapd number conditional waited 12922 16320 10943 14670 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 Reclaim statistics are not radically changed. The stall times in kswapd are massive but it is clear that it is due to calls to congestion_wait() and that is almost certainly the call in balance_pgdat(). Otherwise stalls due to dirty pages are non-existent. I ran a benchmark that stressed high-order allocation. This is a very artificial load but was used in the past to evaluate lumpy reclaim and compaction. Generally I look at allocation success rates and latency figures. STRESS-HIGHALLOC 3.3.0-vanilla rc2-vanilla lumpyremove-v2r3 nosync-v2r3 Pass 1 81.00 ( 0.00%) 28.00 (-53.00%) 24.00 (-57.00%) 28.00 (-53.00%) Pass 2 82.00 ( 0.00%) 39.00 (-43.00%) 38.00 (-44.00%) 43.00 (-39.00%) while Rested 88.00 ( 0.00%) 87.00 (-1.00%) 88.00 ( 0.00%) 88.00 ( 0.00%) MMTests Statistics: duration Sys Time Running Test (seconds) 740.93 681.42 685.14 684.87 User+Sys Time Running Test (seconds) 2922.65 3269.52 3281.35 3279.44 Total Elapsed Time (seconds) 1161.73 1152.49 1159.55 1161.44 MMTests Statistics: vmstat Page Ins 4486020 2807256 2855944 2876244 Page Outs 7261600 7973688 7975320 7986120 Swap Ins 31694 0 0 0 Swap Outs 98179 0 0 0 Direct pages scanned 53494 57731 34406 113015 Kswapd pages scanned 6271173 1287481 1278174 1219095 Kswapd pages reclaimed 2029240 1281025 1260708 1201583 Direct pages reclaimed 1468 14564 16649 92456 Kswapd efficiency 32% 99% 98% 98% Kswapd velocity 5398.133 1117.130 1102.302 1049.641 Direct efficiency 2% 25% 48% 81% Direct velocity 46.047 50.092 29.672 97.306 Percentage direct scans 0% 4% 2% 8% Page writes by reclaim 1616049 0 0 0 Page writes file 1517870 0 0 0 Page writes 
anon 98179 0 0 0 Page reclaim immediate 103778 27339 9796 17831 Page rescued immediate 0 0 0 0 Slabs scanned 1096704 986112 980992 998400 Direct inode steals 223 215040 216736 247881 Kswapd inode steals 175331 61548 68444 63066 Kswapd skipped wait 21991 0 1 0 THP fault alloc 1 135 125 134 THP collapse alloc 393 311 228 236 THP splits 25 13 7 8 THP fault fallback 0 0 0 0 THP collapse fail 3 5 7 7 Compaction stalls 865 1270 1422 1518 Compaction success 370 401 353 383 Compaction failures 495 869 1069 1135 Compaction pages moved 870155 3828868 4036106 4423626 Compaction move failure 26429 23865 29742 27514 Success rates are completely hosed for 3.4-rc2 which is almost certainly due to commit fe2c2a106663 ("vmscan: reclaim at order 0 when compaction is enabled"). I expected this would happen for kswapd and impair allocation success rates (https://lkml.org/lkml/2012/1/25/166) but I did not anticipate this much of a difference: 80% less scanning, 37% less reclaim by kswapd. In comparison, reclaim/compaction is not aggressive and gives up easily which is the intended behaviour. hugetlbfs uses __GFP_REPEAT and would be much more aggressive about reclaim/compaction than THP allocations are. The stress test above is allocating like neither THP nor hugetlbfs but is much closer to THP. Mainline is now impaired in terms of high order allocation under heavy load although I do not know to what degree as I did not test with __GFP_REPEAT. Keep this in mind for bugs related to hugepage pool resizing, THP allocation and high order atomic allocation failures from network devices. 
In terms of congestion throttling, I see the following for this test: FTrace Reclaim Statistics: congestion_wait Direct number congest waited 3 0 0 0 Direct time congest waited 0ms 0ms 0ms 0ms Direct full congest waited 0 0 0 0 Direct number conditional waited 957 512 1081 1075 Direct time conditional waited 0ms 0ms 0ms 0ms Direct full conditional waited 0 0 0 0 KSwapd number congest waited 36 4 3 5 KSwapd time congest waited 3148ms 400ms 300ms 500ms KSwapd full congest waited 30 4 3 5 KSwapd number conditional waited 88514 197 332 542 KSwapd time conditional waited 4980ms 0ms 0ms 0ms KSwapd full conditional waited 49 0 0 0 The "conditional waited" times are the most interesting as this is directly impacted by the number of dirty pages encountered during scan. As lumpy reclaim is no longer scanning contiguous ranges, it is finding fewer dirty pages. This brings wait times from about 5 seconds to 0. kswapd itself is still calling congestion_wait() so it'll still stall but it's a lot less. In terms of the type of IO we were doing, I see this: FTrace Reclaim Statistics: mm_vmscan_writepage Direct writes anon sync 0 0 0 0 Direct writes anon async 0 0 0 0 Direct writes file sync 0 0 0 0 Direct writes file async 0 0 0 0 Direct writes mixed sync 0 0 0 0 Direct writes mixed async 0 0 0 0 KSwapd writes anon sync 0 0 0 0 KSwapd writes anon async 91682 0 0 0 KSwapd writes file sync 0 0 0 0 KSwapd writes file async 822629 0 0 0 KSwapd writes mixed sync 0 0 0 0 KSwapd writes mixed async 0 0 0 0 In 3.2, kswapd was doing a bunch of async writes of pages but reclaim/compaction was never reaching a point where it was doing sync IO. This does not guarantee that reclaim/compaction was not calling wait_on_page_writeback() but I would consider it unlikely. It indicates that merging patches 2 and 3 to stop reclaim/compaction calling wait_on_page_writeback() should be safe. This patch: Lumpy reclaim had a purpose but in the minds of some, it was to kick the system so hard it thrashed. 
For others the purpose was to complicate vmscan.c. Over time it was given softer shoes and a nicer attitude but memory compaction needs to step up and replace it so this patch sends lumpy reclaim to the farm. The tracepoint format changes for isolating LRU pages with this patch applied. Furthermore, reclaim/compaction can no longer queue dirty pages in pageout() if the underlying BDI is congested. Lumpy reclaim used this logic and reclaim/compaction was using it in error. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Rik van Riel <riel@redhat.com> Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Cc: Hugh Dickins <hughd@google.com> Cc: Ying Han <yinghan@google.com> Cc: Andy Whitcroft <apw@shadowen.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c144
1 files changed, 15 insertions, 129 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ca46080bb074..546d02ce90ee 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -58,9 +58,6 @@
58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages 58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
59 * RECLAIM_MODE_ASYNC: Do not block 59 * RECLAIM_MODE_ASYNC: Do not block
60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback 60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of 61 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone 62 * order-0 pages and then compact the zone
66 */ 63 */
@@ -68,7 +65,6 @@ typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) 65#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) 66#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) 67#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) 68#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
73 69
74struct scan_control { 70struct scan_control {
@@ -367,27 +363,17 @@ out:
367static void set_reclaim_mode(int priority, struct scan_control *sc, 363static void set_reclaim_mode(int priority, struct scan_control *sc,
368 bool sync) 364 bool sync)
369{ 365{
366 /* Sync reclaim used only for compaction */
370 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; 367 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
371 368
372 /* 369 /*
373 * Initially assume we are entering either lumpy reclaim or 370 * Restrict reclaim/compaction to costly allocations or when
374 * reclaim/compaction.Depending on the order, we will either set the
375 * sync mode or just reclaim order-0 pages later.
376 */
377 if (COMPACTION_BUILD)
378 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
379 else
380 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
381
382 /*
383 * Avoid using lumpy reclaim or reclaim/compaction if possible by
384 * restricting when its set to either costly allocations or when
385 * under memory pressure 371 * under memory pressure
386 */ 372 */
387 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 373 if (COMPACTION_BUILD && sc->order &&
388 sc->reclaim_mode |= syncmode; 374 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
389 else if (sc->order && priority < DEF_PRIORITY - 2) 375 priority < DEF_PRIORITY - 2))
390 sc->reclaim_mode |= syncmode; 376 sc->reclaim_mode = RECLAIM_MODE_COMPACTION | syncmode;
391 else 377 else
392 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; 378 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
393} 379}
@@ -416,10 +402,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
416 return 1; 402 return 1;
417 if (bdi == current->backing_dev_info) 403 if (bdi == current->backing_dev_info)
418 return 1; 404 return 1;
419
420 /* lumpy reclaim for hugepage often need a lot of write */
421 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
422 return 1;
423 return 0; 405 return 0;
424} 406}
425 407
@@ -710,10 +692,6 @@ static enum page_references page_check_references(struct page *page,
710 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); 692 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
711 referenced_page = TestClearPageReferenced(page); 693 referenced_page = TestClearPageReferenced(page);
712 694
713 /* Lumpy reclaim - ignore references */
714 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
715 return PAGEREF_RECLAIM;
716
717 /* 695 /*
718 * Mlock lost the isolation race with us. Let try_to_unmap() 696 * Mlock lost the isolation race with us. Let try_to_unmap()
719 * move the page to the unevictable list. 697 * move the page to the unevictable list.
@@ -824,7 +802,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
824 wait_on_page_writeback(page); 802 wait_on_page_writeback(page);
825 else { 803 else {
826 unlock_page(page); 804 unlock_page(page);
827 goto keep_lumpy; 805 goto keep_reclaim_mode;
828 } 806 }
829 } 807 }
830 808
@@ -908,7 +886,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
908 goto activate_locked; 886 goto activate_locked;
909 case PAGE_SUCCESS: 887 case PAGE_SUCCESS:
910 if (PageWriteback(page)) 888 if (PageWriteback(page))
911 goto keep_lumpy; 889 goto keep_reclaim_mode;
912 if (PageDirty(page)) 890 if (PageDirty(page))
913 goto keep; 891 goto keep;
914 892
@@ -1008,7 +986,7 @@ keep_locked:
1008 unlock_page(page); 986 unlock_page(page);
1009keep: 987keep:
1010 reset_reclaim_mode(sc); 988 reset_reclaim_mode(sc);
1011keep_lumpy: 989keep_reclaim_mode:
1012 list_add(&page->lru, &ret_pages); 990 list_add(&page->lru, &ret_pages);
1013 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 991 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1014 } 992 }
@@ -1064,11 +1042,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1064 if (!all_lru_mode && !!page_is_file_cache(page) != file) 1042 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1065 return ret; 1043 return ret;
1066 1044
1067 /* 1045 /* Do not give back unevictable pages for compaction */
1068 * When this function is being called for lumpy reclaim, we
1069 * initially look into all LRU pages, active, inactive and
1070 * unevictable; only give shrink_page_list evictable pages.
1071 */
1072 if (PageUnevictable(page)) 1046 if (PageUnevictable(page))
1073 return ret; 1047 return ret;
1074 1048
@@ -1153,9 +1127,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1153 struct lruvec *lruvec; 1127 struct lruvec *lruvec;
1154 struct list_head *src; 1128 struct list_head *src;
1155 unsigned long nr_taken = 0; 1129 unsigned long nr_taken = 0;
1156 unsigned long nr_lumpy_taken = 0;
1157 unsigned long nr_lumpy_dirty = 0;
1158 unsigned long nr_lumpy_failed = 0;
1159 unsigned long scan; 1130 unsigned long scan;
1160 int lru = LRU_BASE; 1131 int lru = LRU_BASE;
1161 1132
@@ -1168,10 +1139,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1168 1139
1169 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1140 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1170 struct page *page; 1141 struct page *page;
1171 unsigned long pfn;
1172 unsigned long end_pfn;
1173 unsigned long page_pfn;
1174 int zone_id;
1175 1142
1176 page = lru_to_page(src); 1143 page = lru_to_page(src);
1177 prefetchw_prev_lru_page(page, src, flags); 1144 prefetchw_prev_lru_page(page, src, flags);
@@ -1193,84 +1160,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1193 default: 1160 default:
1194 BUG(); 1161 BUG();
1195 } 1162 }
1196
1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue;
1199
1200 /*
1201 * Attempt to take all pages in the order aligned region
1202 * surrounding the tag page. Only take those pages of
1203 * the same active state as that tag page. We may safely
1204 * round the target page pfn down to the requested order
1205 * as the mem_map is guaranteed valid out to MAX_ORDER,
1206 * where that page is in a different zone we will detect
1207 * it from its zone id and abort this block scan.
1208 */
1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page;
1215
1216 /* The target page is in the block, ignore it. */
1217 if (unlikely(pfn == page_pfn))
1218 continue;
1219
1220 /* Avoid holes within the zone. */
1221 if (unlikely(!pfn_valid_within(pfn)))
1222 break;
1223
1224 cursor_page = pfn_to_page(pfn);
1225
1226 /* Check that we have not crossed a zone boundary. */
1227 if (unlikely(page_zone_id(cursor_page) != zone_id))
1228 break;
1229
1230 /*
1231 * If we don't have enough swap space, reclaiming of
1232 * anon page which don't already have a swap slot is
1233 * pointless.
1234 */
1235 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1236 !PageSwapCache(cursor_page))
1237 break;
1238
1239 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1240 unsigned int isolated_pages;
1241
1242 mem_cgroup_lru_del(cursor_page);
1243 list_move(&cursor_page->lru, dst);
1244 isolated_pages = hpage_nr_pages(cursor_page);
1245 nr_taken += isolated_pages;
1246 nr_lumpy_taken += isolated_pages;
1247 if (PageDirty(cursor_page))
1248 nr_lumpy_dirty += isolated_pages;
1249 scan++;
1250 pfn += isolated_pages - 1;
1251 } else {
1252 /*
1253 * Check if the page is freed already.
1254 *
1255 * We can't use page_count() as that
1256 * requires compound_head and we don't
1257 * have a pin on the page here. If a
1258 * page is tail, we may or may not
1259 * have isolated the head, so assume
1260 * it's not free, it'd be tricky to
1261 * track the head status without a
1262 * page pin.
1263 */
1264 if (!PageTail(cursor_page) &&
1265 !atomic_read(&cursor_page->_count))
1266 continue;
1267 break;
1268 }
1269 }
1270
1271 /* If we break out of the loop above, lumpy reclaim failed */
1272 if (pfn < end_pfn)
1273 nr_lumpy_failed++;
1274 } 1163 }
1275 1164
1276 *nr_scanned = scan; 1165 *nr_scanned = scan;
@@ -1278,7 +1167,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1278 trace_mm_vmscan_lru_isolate(sc->order, 1167 trace_mm_vmscan_lru_isolate(sc->order,
1279 nr_to_scan, scan, 1168 nr_to_scan, scan,
1280 nr_taken, 1169 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1282 mode, file); 1170 mode, file);
1283 return nr_taken; 1171 return nr_taken;
1284} 1172}
@@ -1466,13 +1354,13 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1466 int priority, 1354 int priority,
1467 struct scan_control *sc) 1355 struct scan_control *sc)
1468{ 1356{
1469 int lumpy_stall_priority; 1357 int stall_priority;
1470 1358
1471 /* kswapd should not stall on sync IO */ 1359 /* kswapd should not stall on sync IO */
1472 if (current_is_kswapd()) 1360 if (current_is_kswapd())
1473 return false; 1361 return false;
1474 1362
1475 /* Only stall on lumpy reclaim */ 1363 /* Only stall for memory compaction */
1476 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) 1364 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1477 return false; 1365 return false;
1478 1366
@@ -1487,11 +1375,11 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1487 * priority to be much higher before stalling. 1375 * priority to be much higher before stalling.
1488 */ 1376 */
1489 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 1377 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1490 lumpy_stall_priority = DEF_PRIORITY; 1378 stall_priority = DEF_PRIORITY;
1491 else 1379 else
1492 lumpy_stall_priority = DEF_PRIORITY / 3; 1380 stall_priority = DEF_PRIORITY / 3;
1493 1381
1494 return priority <= lumpy_stall_priority; 1382 return priority <= stall_priority;
1495} 1383}
1496 1384
1497/* 1385/*
@@ -1523,8 +1411,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1523 } 1411 }
1524 1412
1525 set_reclaim_mode(priority, sc, false); 1413 set_reclaim_mode(priority, sc, false);
1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1527 isolate_mode |= ISOLATE_ACTIVE;
1528 1414
1529 lru_add_drain(); 1415 lru_add_drain();
1530 1416