author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-25 19:00:17 -0400
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-25 19:00:17 -0400
commit | e4bc13adfd016fc1036838170288b5680d1a98b0 (patch)
tree | 8d2cb749397749439732f3a827cb7f2336408337 /mm/vmscan.c
parent | ad90fb97515b732bc27a0109baa10af636c3c8cd (diff)
parent | 3e1534cf4a2a8278e811e7c84a79da1a02347b8b (diff)
Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
Pull cgroup writeback support from Jens Axboe:
"This is the big pull request for adding cgroup writeback support.
This code has been in development for a long time, and it has been
simmering in for-next for a good chunk of this cycle too. This is one
of those problems that has been talked about for at least half a
decade; finally there's a solution and code to go with it.
Also see last week's writeup on LWN:
http://lwn.net/Articles/648292/"
* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
writeback, blkio: add documentation for cgroup writeback support
vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
writeback: do foreign inode detection iff cgroup writeback is enabled
v9fs: fix error handling in v9fs_session_init()
bdi: fix wrong error return value in cgwb_create()
buffer: remove unusued 'ret' variable
writeback: disassociate inodes from dying bdi_writebacks
writeback: implement foreign cgroup inode bdi_writeback switching
writeback: add lockdep annotation to inode_to_wb()
writeback: use unlocked_inode_to_wb transaction in inode_congested()
writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
writeback: implement [locked_]inode_to_wb_and_lock_list()
writeback: implement foreign cgroup inode detection
writeback: make writeback_control track the inode being written back
writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
writeback: implement memcg writeback domain based throttling
writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
writeback: implement memcg wb_domain
writeback: update wb_over_bg_thresh() to use wb_domain aware operations
...
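In the commit list above, SB_I_CGROUPWB replaces FS_CGROUP_WRITEBACK as the way a filesystem opts into cgroup-aware writeback, moving the opt-in from the file_system_type flags to the superblock. A minimal sketch of that opt-in, assuming a hypothetical examplefs (the SB_I_CGROUPWB flag and the s_iflags field are real; the filesystem and function name are not):

#include <linux/fs.h>

/*
 * Sketch only: a hypothetical filesystem opting into cgroup-aware
 * writeback.  SB_I_CGROUPWB is the per-superblock flag that this series
 * introduces in place of the FS_CGROUP_WRITEBACK fs_flags bit.
 */
static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_iflags |= SB_I_CGROUPWB;	/* attribute dirty pages to the dirtying cgroup */

	/* ... the rest of the usual fill_super work is elided ... */
	return 0;
}

A filesystem that does not set the flag keeps the old behaviour: its writeback remains attributed to the bdi's root writeback context.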
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 79
1 file changed, 58 insertions(+), 21 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 19ef01e90ac4..e61445dce04e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+	struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+	if (!memcg)
+		return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+	if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+		return true;
+#endif
+	return false;
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+	return true;
+}
 #endif
 
 static unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page)
 	return page_count(page) - page_has_private(page) == 2;
 }
 
-static int may_write_to_queue(struct backing_dev_info *bdi,
-			      struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
 {
 	if (current->flags & PF_SWAPWRITE)
 		return 1;
-	if (!bdi_write_congested(bdi))
+	if (!inode_write_congested(inode))
 		return 1;
-	if (bdi == current->backing_dev_info)
+	if (inode_to_bdi(inode) == current->backing_dev_info)
 		return 1;
 	return 0;
 }
@@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+	if (!may_write_to_inode(mapping->host, sc))
 		return PAGE_KEEP;
 
 	if (clear_page_dirty_for_io(page)) {
@@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 static int __remove_mapping(struct address_space *mapping, struct page *page,
 			    bool reclaimed)
 {
+	unsigned long flags;
+	struct mem_cgroup *memcg;
+
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	spin_lock_irq(&mapping->tree_lock);
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		swp_entry_t swap = { .val = page_private(page) };
 		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page);
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 		swapcache_free(swap);
 	} else {
 		void (*freepage)(struct page *);
@@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		if (reclaimed && page_is_file_cache(page) &&
 		    !mapping_exiting(mapping))
 			shadow = workingset_eviction(mapping, page);
-		__delete_from_page_cache(page, shadow);
-		spin_unlock_irq(&mapping->tree_lock);
+		__delete_from_page_cache(page, shadow, memcg);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 
 		if (freepage != NULL)
 			freepage(page);
@@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	return 1;
 
 cannot_free:
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 
@@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		mapping = page_mapping(page);
 		if (((dirty || writeback) && mapping &&
-		     bdi_write_congested(inode_to_bdi(mapping->host))) ||
+		     inode_write_congested(mapping->host)) ||
 		    (writeback && PageReclaim(page)))
 			nr_congested++;
 
@@ -935,10 +972,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * note that the LRU is being scanned too quickly and the
 		 * caller can stall after page list has been processed.
 		 *
-		 * 2) Global reclaim encounters a page, memcg encounters a
-		 *    page that is not marked for immediate reclaim or
-		 *    the caller does not have __GFP_IO. In this case mark
-		 *    the page for immediate reclaim and continue scanning.
+		 * 2) Global or new memcg reclaim encounters a page that is
+		 *    not marked for immediate reclaim or the caller does not
+		 *    have __GFP_IO. In this case mark the page for immediate
+		 *    reclaim and continue scanning.
 		 *
 		 * __GFP_IO is checked because a loop driver thread might
 		 * enter reclaim, and deadlock if it waits on a page for
@@ -952,7 +989,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
 		 * may_enter_fs here is liable to OOM on them.
 		 *
-		 * 3) memcg encounters a page that is not already marked
+		 * 3) Legacy memcg encounters a page that is not already marked
 		 *    PageReclaim. memcg does not have any dirty pages
 		 *    throttling so we could easily OOM just because too many
 		 *    pages are in writeback and there is nothing else to
@@ -967,7 +1004,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto keep_locked;
 
 			/* Case 2 above */
-			} else if (global_reclaim(sc) ||
+			} else if (sane_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -1416,7 +1453,7 @@ static int too_many_isolated(struct zone *zone, int file,
 	if (current_is_kswapd())
 		return 0;
 
-	if (!global_reclaim(sc))
+	if (!sane_reclaim(sc))
 		return 0;
 
 	if (file) {
@@ -1608,10 +1645,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		set_bit(ZONE_WRITEBACK, &zone->flags);
 
 	/*
-	 * memcg will stall in page writeback so only consider forcibly
-	 * stalling for global reclaim
+	 * Legacy memcg will stall in page writeback so avoid forcibly
+	 * stalling here.
 	 */
-	if (global_reclaim(sc)) {
+	if (sane_reclaim(sc)) {
 		/*
 		 * Tag a zone as congested if all the dirty pages scanned were
 		 * backed by a congested BDI and wait_iff_congested will stall.
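The __remove_mapping() hunks above all repeat one shape: pin the page's memcg, take mapping->tree_lock with interrupts saved, do the update, then unlock and unpin in reverse order at every exit path. A minimal sketch of that bracketing, with a hypothetical example_remove_page_state() standing in for the real call sites:

#include <linux/fs.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/spinlock.h>

/*
 * Sketch only, not part of the patch: the page's memcg is pinned with
 * mem_cgroup_begin_page_stat() before mapping->tree_lock is taken, so
 * the radix-tree update and the per-memcg page statistics stay
 * consistent, and the pin is dropped only after the lock is released.
 */
static void example_remove_page_state(struct address_space *mapping,
				      struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;

	memcg = mem_cgroup_begin_page_stat(page);	/* pin page->mem_cgroup */
	spin_lock_irqsave(&mapping->tree_lock, flags);

	/* ... radix-tree and page accounting updates would go here ... */

	spin_unlock_irqrestore(&mapping->tree_lock, flags);
	mem_cgroup_end_page_stat(memcg);		/* unpin once the lock is dropped */
}

Keeping mem_cgroup_end_page_stat() outside the tree_lock critical section mirrors the ordering the patch uses at each exit (swap cache, page cache and cannot_free).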