author	Linus Torvalds <torvalds@linux-foundation.org>	2015-06-25 19:00:17 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-06-25 19:00:17 -0400
commit	e4bc13adfd016fc1036838170288b5680d1a98b0
tree	8d2cb749397749439732f3a827cb7f2336408337 /mm/vmscan.c
parent	ad90fb97515b732bc27a0109baa10af636c3c8cd
parent	3e1534cf4a2a8278e811e7c84a79da1a02347b8b
Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
Pull cgroup writeback support from Jens Axboe:
 "This is the big pull request for adding cgroup writeback support.  This
  code has been in development for a long time, and it has been simmering
  in for-next for a good chunk of this cycle too.  This is one of those
  problems that has been talked about for at least half a decade, finally
  there's a solution and code to go with it.

  Also see last weeks writeup on LWN:

        http://lwn.net/Articles/648292/"

* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
  writeback, blkio: add documentation for cgroup writeback support
  vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
  writeback: do foreign inode detection iff cgroup writeback is enabled
  v9fs: fix error handling in v9fs_session_init()
  bdi: fix wrong error return value in cgwb_create()
  buffer: remove unusued 'ret' variable
  writeback: disassociate inodes from dying bdi_writebacks
  writeback: implement foreign cgroup inode bdi_writeback switching
  writeback: add lockdep annotation to inode_to_wb()
  writeback: use unlocked_inode_to_wb transaction in inode_congested()
  writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
  writeback: implement [locked_]inode_to_wb_and_lock_list()
  writeback: implement foreign cgroup inode detection
  writeback: make writeback_control track the inode being written back
  writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
  mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
  writeback: implement memcg writeback domain based throttling
  writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
  writeback: implement memcg wb_domain
  writeback: update wb_over_bg_thresh() to use wb_domain aware operations
  ...
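As background for the SB_I_CGROUPWB entry in the shortlog above: a filesystem opts into cgroup-aware writeback by flagging its superblock at mount time. The sketch below is a minimal, hypothetical illustration of that opt-in, assuming the flag is set on sb->s_iflags during mount; examplefs and its fill_super helper are invented names, not code from this merge.

#include <linux/fs.h>

/* Hypothetical mount helper; only the flag assignment is the point here. */
static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	/*
	 * Advertise cgroup writeback support so dirty pages and writeback
	 * IO on this superblock are attributed to the memcg/blkcg that
	 * dirtied them (illustrative only, not part of this merge).
	 */
	sb->s_iflags |= SB_I_CGROUPWB;

	return 0;
}

Filesystems that never set the flag keep the previous behaviour, with all writeback issued through the root bdi_writeback.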
Diffstat (limited to 'mm/vmscan.c')
 mm/vmscan.c | 79
 1 file changed, 58 insertions(+), 21 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 19ef01e90ac4..e61445dce04e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+	struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+	if (!memcg)
+		return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+	if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+		return true;
+#endif
+	return false;
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+	return true;
+}
 #endif
 
 static unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page)
 	return page_count(page) - page_has_private(page) == 2;
 }
 
-static int may_write_to_queue(struct backing_dev_info *bdi,
-			      struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
 {
 	if (current->flags & PF_SWAPWRITE)
 		return 1;
-	if (!bdi_write_congested(bdi))
+	if (!inode_write_congested(inode))
 		return 1;
-	if (bdi == current->backing_dev_info)
+	if (inode_to_bdi(inode) == current->backing_dev_info)
 		return 1;
 	return 0;
 }
@@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+	if (!may_write_to_inode(mapping->host, sc))
 		return PAGE_KEEP;
 
 	if (clear_page_dirty_for_io(page)) {
@@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 static int __remove_mapping(struct address_space *mapping, struct page *page,
 			    bool reclaimed)
 {
+	unsigned long flags;
+	struct mem_cgroup *memcg;
+
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	spin_lock_irq(&mapping->tree_lock);
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		swp_entry_t swap = { .val = page_private(page) };
 		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page);
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 		swapcache_free(swap);
 	} else {
 		void (*freepage)(struct page *);
@@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		if (reclaimed && page_is_file_cache(page) &&
 		    !mapping_exiting(mapping))
 			shadow = workingset_eviction(mapping, page);
-		__delete_from_page_cache(page, shadow);
-		spin_unlock_irq(&mapping->tree_lock);
+		__delete_from_page_cache(page, shadow, memcg);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 
 		if (freepage != NULL)
 			freepage(page);
@@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	return 1;
 
 cannot_free:
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 
@@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		mapping = page_mapping(page);
 		if (((dirty || writeback) && mapping &&
-		     bdi_write_congested(inode_to_bdi(mapping->host))) ||
+		     inode_write_congested(mapping->host)) ||
 		    (writeback && PageReclaim(page)))
 			nr_congested++;
 
@@ -935,10 +972,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			 * note that the LRU is being scanned too quickly and the
 			 * caller can stall after page list has been processed.
 			 *
-			 * 2) Global reclaim encounters a page, memcg encounters a
-			 *    page that is not marked for immediate reclaim or
-			 *    the caller does not have __GFP_IO. In this case mark
-			 *    the page for immediate reclaim and continue scanning.
+			 * 2) Global or new memcg reclaim encounters a page that is
+			 *    not marked for immediate reclaim or the caller does not
+			 *    have __GFP_IO. In this case mark the page for immediate
+			 *    reclaim and continue scanning.
 			 *
 			 * __GFP_IO is checked because a loop driver thread might
 			 * enter reclaim, and deadlock if it waits on a page for
@@ -952,7 +989,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
 			 * may_enter_fs here is liable to OOM on them.
 			 *
-			 * 3) memcg encounters a page that is not already marked
+			 * 3) Legacy memcg encounters a page that is not already marked
 			 *    PageReclaim. memcg does not have any dirty pages
 			 *    throttling so we could easily OOM just because too many
 			 *    pages are in writeback and there is nothing else to
@@ -967,7 +1004,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto keep_locked;
 
 			/* Case 2 above */
-			} else if (global_reclaim(sc) ||
+			} else if (sane_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -1416,7 +1453,7 @@ static int too_many_isolated(struct zone *zone, int file,
 	if (current_is_kswapd())
 		return 0;
 
-	if (!global_reclaim(sc))
+	if (!sane_reclaim(sc))
 		return 0;
 
 	if (file) {
@@ -1608,10 +1645,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		set_bit(ZONE_WRITEBACK, &zone->flags);
 
 	/*
-	 * memcg will stall in page writeback so only consider forcibly
-	 * stalling for global reclaim
+	 * Legacy memcg will stall in page writeback so avoid forcibly
+	 * stalling here.
 	 */
-	if (global_reclaim(sc)) {
+	if (sane_reclaim(sc)) {
 		/*
 		 * Tag a zone as congested if all the dirty pages scanned were
 		 * backed by a congested BDI and wait_iff_congested will stall.