summaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2015-05-28 14:50:53 -0400
committerJens Axboe <axboe@fb.com>2015-06-02 10:40:20 -0400
commit682aa8e1a6a1504a4caaa62e6c2c9daae3757210 (patch)
treed2471a86d2a286ef8076f629f9709310d68f4e95 /include/linux
parent87e1d789bf55b12fa7c1cdce024499aee3bc0af0 (diff)
writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
The mechanism for detecting whether an inode should switch its wb (bdi_writeback) association is now in place. This patch build the framework for the actual switching. This patch adds a new inode flag I_WB_SWITCHING, which has two functions. First, the easy one, it ensures that there's only one switching in progress for a give inode. Second, it's used as a mechanism to synchronize wb stat updates. The two stats, WB_RECLAIMABLE and WB_WRITEBACK, aren't event counters but track the current number of dirty pages and pages under writeback respectively. As such, when an inode is moved from one wb to another, the inode's portion of those stats have to be transferred together; unfortunately, this is a bit tricky as those stat updates are percpu operations which are performed without holding any lock in some places. This patch solves the problem in a similar way as memcg. Each such lockless stat updates are wrapped in transaction surrounded by unlocked_inode_to_wb_begin/end(). During normal operation, they map to rcu_read_lock/unlock(); however, if I_WB_SWITCHING is asserted, mapping->tree_lock is grabbed across the transaction. In turn, the switching path sets I_WB_SWITCHING and waits for a RCU grace period to pass before actually starting to switch, which guarantees that all stat update paths are synchronizing against mapping->tree_lock. This patch still doesn't implement the actual switching. v3: Updated on top of the recent cancel_dirty_page() updates. unlocked_inode_to_wb_begin() now nests inside mem_cgroup_begin_page_stat() to match the locking order. v2: The i_wb access transaction will be used for !stat accesses too. Function names and comments updated accordingly. 
s/inode_wb_stat_unlocked_{begin|end}/unlocked_inode_to_wb_{begin|end}/ s/switch_wb/switch_wbs/ Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Jan Kara <jack@suse.cz> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Greg Thelen <gthelen@google.com> Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/backing-dev.h54
-rw-r--r--include/linux/fs.h6
-rw-r--r--include/linux/mm.h3
3 files changed, 62 insertions, 1 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1d2489a6536..73ffa32e58ee 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -332,6 +332,50 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
332 return inode->i_wb; 332 return inode->i_wb;
333} 333}
334 334
335/**
336 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
337 * @inode: target inode
338 * @lockedp: temp bool output param, to be passed to the end function
339 *
340 * The caller wants to access the wb associated with @inode but isn't
341 * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This
342 * function determines the wb associated with @inode and ensures that the
343 * association doesn't change until the transaction is finished with
344 * unlocked_inode_to_wb_end().
345 *
346 * The caller must call unlocked_inode_to_wb_end() with *@lockedp
347 * afterwards and can't sleep during transaction. IRQ may or may not be
348 * disabled on return.
349 */
350static inline struct bdi_writeback *
351unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
352{
353 rcu_read_lock();
354
355 /*
356 * Paired with store_release in inode_switch_wb_work_fn() and
357 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
358 */
359 *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
360
361 if (unlikely(*lockedp))
362 spin_lock_irq(&inode->i_mapping->tree_lock);
363 return inode_to_wb(inode);
364}
365
366/**
367 * unlocked_inode_to_wb_end - end inode wb access transaction
368 * @inode: target inode
369 * @locked: *@lockedp from unlocked_inode_to_wb_begin()
370 */
371static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
372{
373 if (unlikely(locked))
374 spin_unlock_irq(&inode->i_mapping->tree_lock);
375
376 rcu_read_unlock();
377}
378
335struct wb_iter { 379struct wb_iter {
336 int start_blkcg_id; 380 int start_blkcg_id;
337 struct radix_tree_iter tree_iter; 381 struct radix_tree_iter tree_iter;
@@ -420,6 +464,16 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
420 return &inode_to_bdi(inode)->wb; 464 return &inode_to_bdi(inode)->wb;
421} 465}
422 466
467static inline struct bdi_writeback *
468unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
469{
470 return inode_to_wb(inode);
471}
472
473static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
474{
475}
476
423static inline void wb_memcg_offline(struct mem_cgroup *memcg) 477static inline void wb_memcg_offline(struct mem_cgroup *memcg)
424{ 478{
425} 479}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 740126d7c44e..b5e1dcfbc5e3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1815,6 +1815,11 @@ struct super_operations {
1815 * 1815 *
1816 * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). 1816 * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit().
1817 * 1817 *
1818 * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
1819 * synchronize competing switching instances and to tell
1820 * wb stat updates to grab mapping->tree_lock. See
1821 * inode_switch_wb_work_fn() for details.
1822 *
1818 * Q: What is the difference between I_WILL_FREE and I_FREEING? 1823 * Q: What is the difference between I_WILL_FREE and I_FREEING?
1819 */ 1824 */
1820#define I_DIRTY_SYNC (1 << 0) 1825#define I_DIRTY_SYNC (1 << 0)
@@ -1834,6 +1839,7 @@ struct super_operations {
1834#define I_DIRTY_TIME (1 << 11) 1839#define I_DIRTY_TIME (1 << 11)
1835#define __I_DIRTY_TIME_EXPIRED 12 1840#define __I_DIRTY_TIME_EXPIRED 12
1836#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) 1841#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
1842#define I_WB_SWITCH (1 << 13)
1837 1843
1838#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) 1844#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
1839#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) 1845#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f48d979ced4b..4024543b4203 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@ struct anon_vma_chain;
27struct file_ra_state; 27struct file_ra_state;
28struct user_struct; 28struct user_struct;
29struct writeback_control; 29struct writeback_control;
30struct bdi_writeback;
30 31
31#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ 32#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */
32extern unsigned long max_mapnr; 33extern unsigned long max_mapnr;
@@ -1214,7 +1215,7 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
1214void account_page_dirtied(struct page *page, struct address_space *mapping, 1215void account_page_dirtied(struct page *page, struct address_space *mapping,
1215 struct mem_cgroup *memcg); 1216 struct mem_cgroup *memcg);
1216void account_page_cleaned(struct page *page, struct address_space *mapping, 1217void account_page_cleaned(struct page *page, struct address_space *mapping,
1217 struct mem_cgroup *memcg); 1218 struct mem_cgroup *memcg, struct bdi_writeback *wb);
1218int set_page_dirty(struct page *page); 1219int set_page_dirty(struct page *page);
1219int set_page_dirty_lock(struct page *page); 1220int set_page_dirty_lock(struct page *page);
1220void cancel_dirty_page(struct page *page); 1221void cancel_dirty_page(struct page *page);