author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-25 19:00:17 -0400
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-25 19:00:17 -0400
commit | e4bc13adfd016fc1036838170288b5680d1a98b0 (patch)
tree | 8d2cb749397749439732f3a827cb7f2336408337 /mm/vmscan.c
parent | ad90fb97515b732bc27a0109baa10af636c3c8cd (diff)
parent | 3e1534cf4a2a8278e811e7c84a79da1a02347b8b (diff)
Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
Pull cgroup writeback support from Jens Axboe:
"This is the big pull request for adding cgroup writeback support.
This code has been in development for a long time, and it has been
simmering in for-next for a good chunk of this cycle too. This is one
of those problems that has been talked about for at least half a
decade; finally there's a solution and code to go with it.
Also see last week's writeup on LWN:
http://lwn.net/Articles/648292/"
* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
writeback, blkio: add documentation for cgroup writeback support
vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
writeback: do foreign inode detection iff cgroup writeback is enabled
v9fs: fix error handling in v9fs_session_init()
bdi: fix wrong error return value in cgwb_create()
buffer: remove unusued 'ret' variable
writeback: disassociate inodes from dying bdi_writebacks
writeback: implement foreign cgroup inode bdi_writeback switching
writeback: add lockdep annotation to inode_to_wb()
writeback: use unlocked_inode_to_wb transaction in inode_congested()
writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
writeback: implement [locked_]inode_to_wb_and_lock_list()
writeback: implement foreign cgroup inode detection
writeback: make writeback_control track the inode being written back
writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
writeback: implement memcg writeback domain based throttling
writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
writeback: implement memcg wb_domain
writeback: update wb_over_bg_thresh() to use wb_domain aware operations
...
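In the commit list above, SB_I_CGROUPWB replaces FS_CGROUP_WRITEBACK as the way a filesystem opts into cgroup-aware writeback, moving the opt-in from the file_system_type flags to the superblock. A minimal sketch of that opt-in, assuming a hypothetical examplefs (the SB_I_CGROUPWB flag and the s_iflags field are real; the filesystem and function name are not):

#include <linux/fs.h>

/*
 * Sketch only: a hypothetical filesystem opting into cgroup-aware
 * writeback.  SB_I_CGROUPWB is the per-superblock flag that this series
 * introduces in place of the FS_CGROUP_WRITEBACK fs_flags bit.
 */
static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_iflags |= SB_I_CGROUPWB;	/* attribute dirty pages to the dirtying cgroup */

	/* ... the rest of the usual fill_super work is elided ... */
	return 0;
}

A filesystem that does not set the flag keeps the old behaviour: its writeback remains attributed to the bdi's root writeback context.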
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 79
1 file changed, 58 insertions(+), 21 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 19ef01e90ac4..e61445dce04e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+	struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+	if (!memcg)
+		return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+	if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+		return true;
+#endif
+	return false;
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+	return true;
+}
 #endif
 
 static unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page)
 	return page_count(page) - page_has_private(page) == 2;
 }
 
-static int may_write_to_queue(struct backing_dev_info *bdi,
-			      struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
 {
 	if (current->flags & PF_SWAPWRITE)
 		return 1;
-	if (!bdi_write_congested(bdi))
+	if (!inode_write_congested(inode))
 		return 1;
-	if (bdi == current->backing_dev_info)
+	if (inode_to_bdi(inode) == current->backing_dev_info)
 		return 1;
 	return 0;
 }
@@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+	if (!may_write_to_inode(mapping->host, sc))
 		return PAGE_KEEP;
 
 	if (clear_page_dirty_for_io(page)) {
@@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 static int __remove_mapping(struct address_space *mapping, struct page *page,
 			    bool reclaimed)
 {
+	unsigned long flags;
+	struct mem_cgroup *memcg;
+
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	spin_lock_irq(&mapping->tree_lock);
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		swp_entry_t swap = { .val = page_private(page) };
 		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page);
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 		swapcache_free(swap);
 	} else {
 		void (*freepage)(struct page *);
@@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		if (reclaimed && page_is_file_cache(page) &&
 		    !mapping_exiting(mapping))
 			shadow = workingset_eviction(mapping, page);
-		__delete_from_page_cache(page, shadow);
-		spin_unlock_irq(&mapping->tree_lock);
+		__delete_from_page_cache(page, shadow, memcg);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 
 		if (freepage != NULL)
 			freepage(page);
@@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	return 1;
 
 cannot_free:
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 
@@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		mapping = page_mapping(page);
 		if (((dirty || writeback) && mapping &&
-		     bdi_write_congested(inode_to_bdi(mapping->host))) ||
+		     inode_write_congested(mapping->host)) ||
 		    (writeback && PageReclaim(page)))
 			nr_congested++;
 
@@ -935,10 +972,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * note that the LRU is being scanned too quickly and the
 		 * caller can stall after page list has been processed.
 		 *
-		 * 2) Global reclaim encounters a page, memcg encounters a
-		 *    page that is not marked for immediate reclaim or
-		 *    the caller does not have __GFP_IO. In this case mark
-		 *    the page for immediate reclaim and continue scanning.
+		 * 2) Global or new memcg reclaim encounters a page that is
+		 *    not marked for immediate reclaim or the caller does not
+		 *    have __GFP_IO. In this case mark the page for immediate
+		 *    reclaim and continue scanning.
 		 *
 		 * __GFP_IO is checked because a loop driver thread might
 		 * enter reclaim, and deadlock if it waits on a page for
@@ -952,7 +989,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
 		 * may_enter_fs here is liable to OOM on them.
 		 *
-		 * 3) memcg encounters a page that is not already marked
+		 * 3) Legacy memcg encounters a page that is not already marked
 		 *    PageReclaim. memcg does not have any dirty pages
 		 *    throttling so we could easily OOM just because too many
 		 *    pages are in writeback and there is nothing else to
@@ -967,7 +1004,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto keep_locked;
 
 			/* Case 2 above */
-			} else if (global_reclaim(sc) ||
+			} else if (sane_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -1416,7 +1453,7 @@ static int too_many_isolated(struct zone *zone, int file,
 	if (current_is_kswapd())
 		return 0;
 
-	if (!global_reclaim(sc))
+	if (!sane_reclaim(sc))
 		return 0;
 
 	if (file) {
@@ -1608,10 +1645,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		set_bit(ZONE_WRITEBACK, &zone->flags);
 
 	/*
-	 * memcg will stall in page writeback so only consider forcibly
-	 * stalling for global reclaim
+	 * Legacy memcg will stall in page writeback so avoid forcibly
+	 * stalling here.
 	 */
-	if (global_reclaim(sc)) {
+	if (sane_reclaim(sc)) {
 		/*
 		 * Tag a zone as congested if all the dirty pages scanned were
 		 * backed by a congested BDI and wait_iff_congested will stall.
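The __remove_mapping() hunks above all repeat one shape: pin the page's memcg, take mapping->tree_lock with interrupts saved, do the update, then unlock and unpin in reverse order at every exit path. A minimal sketch of that bracketing, with a hypothetical example_remove_page_state() standing in for the real call sites:

#include <linux/fs.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/spinlock.h>

/*
 * Sketch only, not part of the patch: the page's memcg is pinned with
 * mem_cgroup_begin_page_stat() before mapping->tree_lock is taken, so
 * the radix-tree update and the per-memcg page statistics stay
 * consistent, and the pin is dropped only after the lock is released.
 */
static void example_remove_page_state(struct address_space *mapping,
				      struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;

	memcg = mem_cgroup_begin_page_stat(page);	/* pin page->mem_cgroup */
	spin_lock_irqsave(&mapping->tree_lock, flags);

	/* ... radix-tree and page accounting updates would go here ... */

	spin_unlock_irqrestore(&mapping->tree_lock, flags);
	mem_cgroup_end_page_stat(memcg);		/* unpin once the lock is dropped */
}

Keeping mem_cgroup_end_page_stat() outside the tree_lock critical section mirrors the ordering the patch uses at each exit (swap cache, page cache and cannot_free).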