diff options
author | Mel Gorman <mgorman@techsingularity.net> | 2016-07-28 18:46:11 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-28 19:07:41 -0400 |
commit | 281e37265f2826ed401d84d6790226448ef3f0e8 (patch) | |
tree | a29b375b754c242f29082cd9e0df1a48c8109ac2 | |
parent | 1e6b10857f91685c60c341703ece4ae9bb775cf3 (diff) |
mm, page_alloc: consider dirtyable memory in terms of nodes
Historically, dirty pages were spread among zones, but now that LRUs are
per-node it is more appropriate to consider dirty pages in a node.
Link: http://lkml.kernel.org/r/1467970510-21195-17-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/mmzone.h | 12 | ||||
-rw-r--r-- | include/linux/writeback.h | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 91 | ||||
-rw-r--r-- | mm/page_alloc.c | 26 |
4 files changed, 79 insertions, 52 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 62f477d6cfe8..fae2fe3c6942 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -363,12 +363,6 @@ struct zone { | |||
363 | struct pglist_data *zone_pgdat; | 363 | struct pglist_data *zone_pgdat; |
364 | struct per_cpu_pageset __percpu *pageset; | 364 | struct per_cpu_pageset __percpu *pageset; |
365 | 365 | ||
366 | /* | ||
367 | * This is a per-zone reserve of pages that are not available | ||
368 | * to userspace allocations. | ||
369 | */ | ||
370 | unsigned long totalreserve_pages; | ||
371 | |||
372 | #ifndef CONFIG_SPARSEMEM | 366 | #ifndef CONFIG_SPARSEMEM |
373 | /* | 367 | /* |
374 | * Flags for a pageblock_nr_pages block. See pageblock-flags.h. | 368 | * Flags for a pageblock_nr_pages block. See pageblock-flags.h. |
@@ -687,6 +681,12 @@ typedef struct pglist_data { | |||
687 | /* Number of pages migrated during the rate limiting time interval */ | 681 | /* Number of pages migrated during the rate limiting time interval */ |
688 | unsigned long numabalancing_migrate_nr_pages; | 682 | unsigned long numabalancing_migrate_nr_pages; |
689 | #endif | 683 | #endif |
684 | /* | ||
685 | * This is a per-node reserve of pages that are not available | ||
686 | * to userspace allocations. | ||
687 | */ | ||
688 | unsigned long totalreserve_pages; | ||
689 | |||
690 | /* Write-intensive fields used by page reclaim */ | 690 | /* Write-intensive fields used by page reclaim */ |
691 | ZONE_PADDING(_pad1_) | 691 | ZONE_PADDING(_pad1_) |
692 | spinlock_t lru_lock; | 692 | spinlock_t lru_lock; |
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 717e6149e753..fc1e16c25a29 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -320,7 +320,7 @@ void laptop_mode_timer_fn(unsigned long data); | |||
320 | static inline void laptop_sync_completion(void) { } | 320 | static inline void laptop_sync_completion(void) { } |
321 | #endif | 321 | #endif |
322 | void throttle_vm_writeout(gfp_t gfp_mask); | 322 | void throttle_vm_writeout(gfp_t gfp_mask); |
323 | bool zone_dirty_ok(struct zone *zone); | 323 | bool node_dirty_ok(struct pglist_data *pgdat); |
324 | int wb_domain_init(struct wb_domain *dom, gfp_t gfp); | 324 | int wb_domain_init(struct wb_domain *dom, gfp_t gfp); |
325 | #ifdef CONFIG_CGROUP_WRITEBACK | 325 | #ifdef CONFIG_CGROUP_WRITEBACK |
326 | void wb_domain_exit(struct wb_domain *dom); | 326 | void wb_domain_exit(struct wb_domain *dom); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0ada2b2954b0..f7c0fb993fb9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -267,26 +267,35 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, | |||
267 | */ | 267 | */ |
268 | 268 | ||
269 | /** | 269 | /** |
270 | * zone_dirtyable_memory - number of dirtyable pages in a zone | 270 | * node_dirtyable_memory - number of dirtyable pages in a node |
271 | * @zone: the zone | 271 | * @pgdat: the node |
272 | * | 272 | * |
273 | * Returns the zone's number of pages potentially available for dirty | 273 | * Returns the node's number of pages potentially available for dirty |
274 | * page cache. This is the base value for the per-zone dirty limits. | 274 | * page cache. This is the base value for the per-node dirty limits. |
275 | */ | 275 | */ |
276 | static unsigned long zone_dirtyable_memory(struct zone *zone) | 276 | static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) |
277 | { | 277 | { |
278 | unsigned long nr_pages; | 278 | unsigned long nr_pages = 0; |
279 | int z; | ||
280 | |||
281 | for (z = 0; z < MAX_NR_ZONES; z++) { | ||
282 | struct zone *zone = pgdat->node_zones + z; | ||
283 | |||
284 | if (!populated_zone(zone)) | ||
285 | continue; | ||
286 | |||
287 | nr_pages += zone_page_state(zone, NR_FREE_PAGES); | ||
288 | } | ||
279 | 289 | ||
280 | nr_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
281 | /* | 290 | /* |
282 | * Pages reserved for the kernel should not be considered | 291 | * Pages reserved for the kernel should not be considered |
283 | * dirtyable, to prevent a situation where reclaim has to | 292 | * dirtyable, to prevent a situation where reclaim has to |
284 | * clean pages in order to balance the zones. | 293 | * clean pages in order to balance the zones. |
285 | */ | 294 | */ |
286 | nr_pages -= min(nr_pages, zone->totalreserve_pages); | 295 | nr_pages -= min(nr_pages, pgdat->totalreserve_pages); |
287 | 296 | ||
288 | nr_pages += node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE); | 297 | nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); |
289 | nr_pages += node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE); | 298 | nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); |
290 | 299 | ||
291 | return nr_pages; | 300 | return nr_pages; |
292 | } | 301 | } |
@@ -299,13 +308,24 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
299 | int i; | 308 | int i; |
300 | 309 | ||
301 | for_each_node_state(node, N_HIGH_MEMORY) { | 310 | for_each_node_state(node, N_HIGH_MEMORY) { |
302 | for (i = 0; i < MAX_NR_ZONES; i++) { | 311 | for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { |
303 | struct zone *z = &NODE_DATA(node)->node_zones[i]; | 312 | struct zone *z; |
313 | unsigned long dirtyable; | ||
314 | |||
315 | if (!is_highmem_idx(i)) | ||
316 | continue; | ||
317 | |||
318 | z = &NODE_DATA(node)->node_zones[i]; | ||
319 | dirtyable = zone_page_state(z, NR_FREE_PAGES) + | ||
320 | zone_page_state(z, NR_ZONE_LRU_FILE); | ||
304 | 321 | ||
305 | if (is_highmem(z)) | 322 | /* watch for underflows */ |
306 | x += zone_dirtyable_memory(z); | 323 | dirtyable -= min(dirtyable, high_wmark_pages(z)); |
324 | |||
325 | x += dirtyable; | ||
307 | } | 326 | } |
308 | } | 327 | } |
328 | |||
309 | /* | 329 | /* |
310 | * Unreclaimable memory (kernel memory or anonymous memory | 330 | * Unreclaimable memory (kernel memory or anonymous memory |
311 | * without swap) can bring down the dirtyable pages below | 331 | * without swap) can bring down the dirtyable pages below |
@@ -445,23 +465,23 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
445 | } | 465 | } |
446 | 466 | ||
447 | /** | 467 | /** |
448 | * zone_dirty_limit - maximum number of dirty pages allowed in a zone | 468 | * node_dirty_limit - maximum number of dirty pages allowed in a node |
449 | * @zone: the zone | 469 | * @pgdat: the node |
450 | * | 470 | * |
451 | * Returns the maximum number of dirty pages allowed in a zone, based | 471 | * Returns the maximum number of dirty pages allowed in a node, based |
452 | * on the zone's dirtyable memory. | 472 | * on the node's dirtyable memory. |
453 | */ | 473 | */ |
454 | static unsigned long zone_dirty_limit(struct zone *zone) | 474 | static unsigned long node_dirty_limit(struct pglist_data *pgdat) |
455 | { | 475 | { |
456 | unsigned long zone_memory = zone_dirtyable_memory(zone); | 476 | unsigned long node_memory = node_dirtyable_memory(pgdat); |
457 | struct task_struct *tsk = current; | 477 | struct task_struct *tsk = current; |
458 | unsigned long dirty; | 478 | unsigned long dirty; |
459 | 479 | ||
460 | if (vm_dirty_bytes) | 480 | if (vm_dirty_bytes) |
461 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * | 481 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * |
462 | zone_memory / global_dirtyable_memory(); | 482 | node_memory / global_dirtyable_memory(); |
463 | else | 483 | else |
464 | dirty = vm_dirty_ratio * zone_memory / 100; | 484 | dirty = vm_dirty_ratio * node_memory / 100; |
465 | 485 | ||
466 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) | 486 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) |
467 | dirty += dirty / 4; | 487 | dirty += dirty / 4; |
@@ -470,19 +490,30 @@ static unsigned long zone_dirty_limit(struct zone *zone) | |||
470 | } | 490 | } |
471 | 491 | ||
472 | /** | 492 | /** |
473 | * zone_dirty_ok - tells whether a zone is within its dirty limits | 493 | * node_dirty_ok - tells whether a node is within its dirty limits |
474 | * @zone: the zone to check | 494 | * @pgdat: the node to check |
475 | * | 495 | * |
476 | * Returns %true when the dirty pages in @zone are within the zone's | 496 | * Returns %true when the dirty pages in @pgdat are within the node's |
477 | * dirty limit, %false if the limit is exceeded. | 497 | * dirty limit, %false if the limit is exceeded. |
478 | */ | 498 | */ |
479 | bool zone_dirty_ok(struct zone *zone) | 499 | bool node_dirty_ok(struct pglist_data *pgdat) |
480 | { | 500 | { |
481 | unsigned long limit = zone_dirty_limit(zone); | 501 | int z; |
502 | unsigned long limit = node_dirty_limit(pgdat); | ||
503 | unsigned long nr_pages = 0; | ||
504 | |||
505 | for (z = 0; z < MAX_NR_ZONES; z++) { | ||
506 | struct zone *zone = pgdat->node_zones + z; | ||
507 | |||
508 | if (!populated_zone(zone)) | ||
509 | continue; | ||
510 | |||
511 | nr_pages += zone_page_state(zone, NR_FILE_DIRTY); | ||
512 | nr_pages += zone_page_state(zone, NR_UNSTABLE_NFS); | ||
513 | nr_pages += zone_page_state(zone, NR_WRITEBACK); | ||
514 | } | ||
482 | 515 | ||
483 | return zone_page_state(zone, NR_FILE_DIRTY) + | 516 | return nr_pages <= limit; |
484 | zone_page_state(zone, NR_UNSTABLE_NFS) + | ||
485 | zone_page_state(zone, NR_WRITEBACK) <= limit; | ||
486 | } | 517 | } |
487 | 518 | ||
488 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 519 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 749b3c358ead..73b018df6e42 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -2912,31 +2912,24 @@ zonelist_scan: | |||
2912 | } | 2912 | } |
2913 | /* | 2913 | /* |
2914 | * When allocating a page cache page for writing, we | 2914 | * When allocating a page cache page for writing, we |
2915 | * want to get it from a zone that is within its dirty | 2915 | * want to get it from a node that is within its dirty |
2916 | * limit, such that no single zone holds more than its | 2916 | * limit, such that no single node holds more than its |
2917 | * proportional share of globally allowed dirty pages. | 2917 | * proportional share of globally allowed dirty pages. |
2918 | * The dirty limits take into account the zone's | 2918 | * The dirty limits take into account the node's |
2919 | * lowmem reserves and high watermark so that kswapd | 2919 | * lowmem reserves and high watermark so that kswapd |
2920 | * should be able to balance it without having to | 2920 | * should be able to balance it without having to |
2921 | * write pages from its LRU list. | 2921 | * write pages from its LRU list. |
2922 | * | 2922 | * |
2923 | * This may look like it could increase pressure on | ||
2924 | * lower zones by failing allocations in higher zones | ||
2925 | * before they are full. But the pages that do spill | ||
2926 | * over are limited as the lower zones are protected | ||
2927 | * by this very same mechanism. It should not become | ||
2928 | * a practical burden to them. | ||
2929 | * | ||
2930 | * XXX: For now, allow allocations to potentially | 2923 | * XXX: For now, allow allocations to potentially |
2931 | * exceed the per-zone dirty limit in the slowpath | 2924 | * exceed the per-node dirty limit in the slowpath |
2932 | * (spread_dirty_pages unset) before going into reclaim, | 2925 | * (spread_dirty_pages unset) before going into reclaim, |
2933 | * which is important when on a NUMA setup the allowed | 2926 | * which is important when on a NUMA setup the allowed |
2934 | * zones are together not big enough to reach the | 2927 | * nodes are together not big enough to reach the |
2935 | * global limit. The proper fix for these situations | 2928 | * global limit. The proper fix for these situations |
2936 | * will require awareness of zones in the | 2929 | * will require awareness of nodes in the |
2937 | * dirty-throttling and the flusher threads. | 2930 | * dirty-throttling and the flusher threads. |
2938 | */ | 2931 | */ |
2939 | if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) | 2932 | if (ac->spread_dirty_pages && !node_dirty_ok(zone->zone_pgdat)) |
2940 | continue; | 2933 | continue; |
2941 | 2934 | ||
2942 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 2935 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
@@ -6701,6 +6694,9 @@ static void calculate_totalreserve_pages(void) | |||
6701 | enum zone_type i, j; | 6694 | enum zone_type i, j; |
6702 | 6695 | ||
6703 | for_each_online_pgdat(pgdat) { | 6696 | for_each_online_pgdat(pgdat) { |
6697 | |||
6698 | pgdat->totalreserve_pages = 0; | ||
6699 | |||
6704 | for (i = 0; i < MAX_NR_ZONES; i++) { | 6700 | for (i = 0; i < MAX_NR_ZONES; i++) { |
6705 | struct zone *zone = pgdat->node_zones + i; | 6701 | struct zone *zone = pgdat->node_zones + i; |
6706 | long max = 0; | 6702 | long max = 0; |
@@ -6717,7 +6713,7 @@ static void calculate_totalreserve_pages(void) | |||
6717 | if (max > zone->managed_pages) | 6713 | if (max > zone->managed_pages) |
6718 | max = zone->managed_pages; | 6714 | max = zone->managed_pages; |
6719 | 6715 | ||
6720 | zone->totalreserve_pages = max; | 6716 | pgdat->totalreserve_pages += max; |
6721 | 6717 | ||
6722 | reserve_pages += max; | 6718 | reserve_pages += max; |
6723 | } | 6719 | } |