author     Mel Gorman <mgorman@techsingularity.net>          2016-07-28 18:47:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2016-07-28 19:07:41 -0400
commit     5a1c84b404a7176b8b36e2a0041b6f0adb3151a3 (patch)
tree       ff98e242c5d4d3a24ca49f6ddc707028aeb938f9
parent     bb4cc2bea6df7854d629bff114ca03237cc718d6 (diff)
mm: remove reclaim and compaction retry approximations
If per-zone LRU accounting is available then there is no point
approximating whether reclaim and compaction should retry based on pgdat
statistics. This is effectively a revert of "mm, vmstat: remove zone
and node double accounting by approximating retries" with the difference
that inactive/active stats are still available. This preserves the
history of why the approximation was retried and why it had to be
reverted to handle OOM kills on 32-bit systems.
Link: http://lkml.kernel.org/r/1469110261-7365-4-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
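To make the hunks below easier to follow, here is a minimal, self-contained userspace sketch of the per-zone retry heuristic that should_reclaim_retry() is left with after this patch. The struct model_zone, the model_* helper names, the sample numbers and the simplified watermark comparison are illustrative assumptions only, not kernel API; in the kernel the same inputs come from zone_reclaimable_pages(), zone_page_state_snapshot() on NR_FREE_PAGES / NR_ZONE_WRITE_PENDING, and __zone_watermark_ok().

/*
 * Standalone model of the per-zone retry decision: names, fields and
 * sample numbers are illustrative, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES	16
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

struct model_zone {
	unsigned long inactive_file;	/* NR_ZONE_INACTIVE_FILE */
	unsigned long active_file;	/* NR_ZONE_ACTIVE_FILE */
	unsigned long free_pages;	/* NR_FREE_PAGES */
	unsigned long write_pending;	/* NR_ZONE_WRITE_PENDING */
	unsigned long min_wmark;	/* min watermark, in pages */
};

/* Per-zone reclaimable estimate; file LRU only, i.e. no swap configured. */
static unsigned long model_reclaimable(const struct model_zone *z)
{
	return z->inactive_file + z->active_file;
}

/*
 * Retry reclaim if freeing everything still on this zone's LRUs could
 * satisfy the watermark, discounted by how many retries already failed.
 * Returns false when the zone looks hopeless (the caller would move
 * towards OOM).
 */
static bool model_should_retry(const struct model_zone *z,
			       int no_progress_loops, bool made_progress)
{
	unsigned long reclaimable = model_reclaimable(z);
	unsigned long available = reclaimable;

	available -= DIV_ROUND_UP(no_progress_loops * available,
				  MAX_RECLAIM_RETRIES);
	available += z->free_pages;

	/* Simplified stand-in for __zone_watermark_ok() */
	if (available <= z->min_wmark)
		return false;

	/*
	 * If no progress was made and most reclaimable pages are dirty or
	 * under writeback, wait for the flushers rather than give up
	 * (congestion_wait() in the kernel) before retrying.
	 */
	if (!made_progress && 2 * z->write_pending > reclaimable)
		printf("mostly write-pending: throttle before retrying\n");

	return true;
}

int main(void)
{
	struct model_zone z = {
		.inactive_file = 4096, .active_file = 2048,
		.free_pages = 512, .write_pending = 3500, .min_wmark = 1024,
	};

	for (int loops = 0; loops <= MAX_RECLAIM_RETRIES; loops++)
		printf("loops=%2d retry=%d\n", loops,
		       model_should_retry(&z, loops, false));
	return 0;
}

The point of the model: with exact per-zone LRU and write-pending counters, there is no need to approximate from pgdat totals or to special-case lowmem requests, which is exactly what the hunks below delete.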
-rw-r--r--   include/linux/mmzone.h |  1
-rw-r--r--   include/linux/swap.h   |  1
-rw-r--r--   mm/compaction.c        | 20
-rw-r--r--   mm/migrate.c           |  2
-rw-r--r--   mm/page-writeback.c    |  5
-rw-r--r--   mm/page_alloc.c        | 49
-rw-r--r--   mm/vmscan.c            | 18
-rw-r--r--   mm/vmstat.c            |  1
8 files changed, 39 insertions, 58 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1a813ad335f4..ca0fbc483441 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -116,6 +116,7 @@ enum zone_stat_item {
 	NR_ZONE_INACTIVE_FILE,
 	NR_ZONE_ACTIVE_FILE,
 	NR_ZONE_UNEVICTABLE,
+	NR_ZONE_WRITE_PENDING,	/* Count of dirty, writeback and unstable pages */
 	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index cc753c639e3d..b17cc4830fa6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -307,6 +307,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 					    struct vm_area_struct *vma);
 
 /* linux/mm/vmscan.c */
+extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
diff --git a/mm/compaction.c b/mm/compaction.c
index cd93ea24c565..e5995f38d677 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1438,11 +1438,6 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 {
 	struct zone *zone;
 	struct zoneref *z;
-	pg_data_t *last_pgdat = NULL;
-
-	/* Do not retry compaction for zone-constrained allocations */
-	if (ac->high_zoneidx < ZONE_NORMAL)
-		return false;
 
 	/*
 	 * Make sure at least one zone would pass __compaction_suitable if we continue
@@ -1453,27 +1448,14 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 		unsigned long available;
 		enum compact_result compact_result;
 
-		if (last_pgdat == zone->zone_pgdat)
-			continue;
-
-		/*
-		 * This over-estimates the number of pages available for
-		 * reclaim/compaction but walking the LRU would take too
-		 * long. The consequences are that compaction may retry
-		 * longer than it should for a zone-constrained allocation
-		 * request.
-		 */
-		last_pgdat = zone->zone_pgdat;
-		available = pgdat_reclaimable_pages(zone->zone_pgdat) / order;
-
 		/*
 		 * Do not consider all the reclaimable memory because we do not
 		 * want to trash just for a single high order allocation which
 		 * is even not guaranteed to appear even if __compaction_suitable
 		 * is happy about the watermark check.
 		 */
+		available = zone_reclaimable_pages(zone) / order;
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
-		available = min(zone->managed_pages, available);
 		compact_result = __compaction_suitable(zone, order, alloc_flags,
 				ac_classzone_idx(ac), available);
 		if (compact_result != COMPACT_SKIPPED &&
diff --git a/mm/migrate.c b/mm/migrate.c
index ed2f85e61de1..ed0268268e93 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -513,7 +513,9 @@ int migrate_page_move_mapping(struct address_space *mapping,
 		}
 		if (dirty && mapping_cap_account_dirty(mapping)) {
 			__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
+			__dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
 			__inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
+			__inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
 		}
 	}
 	local_irq_enable();
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7b5920a3500f..f4cd7d8005c9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2462,6 +2462,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 
 		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
 		__inc_node_page_state(page, NR_FILE_DIRTY);
+		__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		__inc_node_page_state(page, NR_DIRTIED);
 		__inc_wb_stat(wb, WB_RECLAIMABLE);
 		__inc_wb_stat(wb, WB_DIRTIED);
@@ -2483,6 +2484,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
 	if (mapping_cap_account_dirty(mapping)) {
 		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
 		dec_node_page_state(page, NR_FILE_DIRTY);
+		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		dec_wb_stat(wb, WB_RECLAIMABLE);
 		task_io_account_cancelled_write(PAGE_SIZE);
 	}
@@ -2739,6 +2741,7 @@ int clear_page_dirty_for_io(struct page *page)
 		if (TestClearPageDirty(page)) {
 			mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
 			dec_node_page_state(page, NR_FILE_DIRTY);
+			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 			dec_wb_stat(wb, WB_RECLAIMABLE);
 			ret = 1;
 		}
@@ -2785,6 +2788,7 @@ int test_clear_page_writeback(struct page *page)
 	if (ret) {
 		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
 		dec_node_page_state(page, NR_WRITEBACK);
+		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		inc_node_page_state(page, NR_WRITTEN);
 	}
 	unlock_page_memcg(page);
@@ -2839,6 +2843,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 	if (!ret) {
 		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
 		inc_node_page_state(page, NR_WRITEBACK);
+		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 	}
 	unlock_page_memcg(page);
 	return ret;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 759cfa8cbbeb..dfdb608f7b3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3402,7 +3402,6 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 {
 	struct zone *zone;
 	struct zoneref *z;
-	pg_data_t *current_pgdat = NULL;
 
 	/*
 	 * Make sure we converge to OOM if we cannot make any progress
@@ -3412,15 +3411,6 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		return false;
 
 	/*
-	 * Blindly retry lowmem allocation requests that are often ignored by
-	 * the OOM killer up to MAX_RECLAIM_RETRIES as we not have a reliable
-	 * and fast means of calculating reclaimable, dirty and writeback pages
-	 * in eligible zones.
-	 */
-	if (ac->high_zoneidx < ZONE_NORMAL)
-		goto out;
-
-	/*
 	 * Keep reclaiming pages while there is a chance this will lead
 	 * somewhere. If none of the target zones can satisfy our allocation
 	 * request even if all reclaimable pages are considered then we are
@@ -3430,38 +3420,18 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 					ac->nodemask) {
 		unsigned long available;
 		unsigned long reclaimable;
-		int zid;
 
-		if (current_pgdat == zone->zone_pgdat)
-			continue;
-
-		current_pgdat = zone->zone_pgdat;
-		available = reclaimable = pgdat_reclaimable_pages(current_pgdat);
+		available = reclaimable = zone_reclaimable_pages(zone);
 		available -= DIV_ROUND_UP(no_progress_loops * available,
 					  MAX_RECLAIM_RETRIES);
-
-		/* Account for all free pages on eligible zones */
-		for (zid = 0; zid <= zone_idx(zone); zid++) {
-			struct zone *acct_zone = &current_pgdat->node_zones[zid];
-
-			available += zone_page_state_snapshot(acct_zone, NR_FREE_PAGES);
-		}
+		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 
 		/*
 		 * Would the allocation succeed if we reclaimed the whole
-		 * available? This is approximate because there is no
-		 * accurate count of reclaimable pages per zone.
+		 * available?
 		 */
-		for (zid = 0; zid <= zone_idx(zone); zid++) {
-			struct zone *check_zone = &current_pgdat->node_zones[zid];
-			unsigned long estimate;
-
-			estimate = min(check_zone->managed_pages, available);
-			if (!__zone_watermark_ok(check_zone, order,
-				min_wmark_pages(check_zone), ac_classzone_idx(ac),
-				alloc_flags, estimate))
-				continue;
-
+		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+				ac_classzone_idx(ac), alloc_flags, available)) {
 			/*
 			 * If we didn't make any progress and have a lot of
 			 * dirty + writeback pages then we should wait for
@@ -3471,16 +3441,15 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 			if (!did_some_progress) {
 				unsigned long write_pending;
 
-				write_pending =
-					node_page_state(current_pgdat, NR_WRITEBACK) +
-					node_page_state(current_pgdat, NR_FILE_DIRTY);
+				write_pending = zone_page_state_snapshot(zone,
+							NR_ZONE_WRITE_PENDING);
 
 				if (2 * write_pending > reclaimable) {
 					congestion_wait(BLK_RW_ASYNC, HZ/10);
 					return true;
 				}
 			}
-out:
+
 		/*
 		 * Memory allocation/reclaim might be called from a WQ
 		 * context and the current implementation of the WQ
@@ -4361,6 +4330,7 @@ void show_free_areas(unsigned int filter)
 			" active_file:%lukB"
 			" inactive_file:%lukB"
 			" unevictable:%lukB"
+			" writepending:%lukB"
 			" present:%lukB"
 			" managed:%lukB"
 			" mlocked:%lukB"
@@ -4383,6 +4353,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
 			K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
 			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
 			K(zone->present_pages),
 			K(zone->managed_pages),
 			K(zone_page_state(zone, NR_MLOCK)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 222d5403dd4b..134381a20099 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -194,6 +194,24 @@ static bool sane_reclaim(struct scan_control *sc)
 }
 #endif
 
+/*
+ * This misses isolated pages which are not accounted for to save counters.
+ * As the data only determines if reclaim or compaction continues, it is
+ * not expected that isolated pages will be a dominating factor.
+ */
+unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+	unsigned long nr;
+
+	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
+		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
+	if (get_nr_swap_pages() > 0)
+		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+
+	return nr;
+}
+
 unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
 {
 	unsigned long nr;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 053075ac67b8..89cec42d19ff 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -926,6 +926,7 @@ const char * const vmstat_text[] = {
 	"nr_zone_inactive_file",
 	"nr_zone_active_file",
 	"nr_zone_unevictable",
+	"nr_zone_write_pending",
 	"nr_mlock",
 	"nr_slab_reclaimable",
 	"nr_slab_unreclaimable",