author		Tejun Heo <tj@kernel.org>	2015-05-28 14:50:53 -0400
committer	Jens Axboe <axboe@fb.com>	2015-06-02 10:40:20 -0400
commit		682aa8e1a6a1504a4caaa62e6c2c9daae3757210 (patch)
tree		d2471a86d2a286ef8076f629f9709310d68f4e95 /fs/fs-writeback.c
parent		87e1d789bf55b12fa7c1cdce024499aee3bc0af0 (diff)
writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
The mechanism for detecting whether an inode should switch its wb
(bdi_writeback) association is now in place.  This patch builds the
framework for the actual switching.

This patch adds a new inode flag I_WB_SWITCH, which has two functions.
First, the easy one: it ensures that there's only one switching in
progress for a given inode.  Second, it's used as a mechanism to
synchronize wb stat updates.

The two stats, WB_RECLAIMABLE and WB_WRITEBACK, aren't event counters
but track the current number of dirty pages and pages under writeback
respectively.  As such, when an inode is moved from one wb to another,
the inode's portion of those stats has to be transferred together;
unfortunately, this is a bit tricky as those stat updates are percpu
operations which are performed without holding any lock in some places.

This patch solves the problem in a similar way as memcg.  Each such
lockless stat update is wrapped in a transaction surrounded by
unlocked_inode_to_wb_begin/end().  During normal operation, they map to
rcu_read_lock/unlock(); however, if I_WB_SWITCH is asserted,
mapping->tree_lock is grabbed across the transaction.

In turn, the switching path sets I_WB_SWITCH and waits for an RCU grace
period to pass before actually starting to switch, which guarantees
that all stat update paths are synchronizing against
mapping->tree_lock.

This patch still doesn't implement the actual switching.

v3: Updated on top of the recent cancel_dirty_page() updates.
    unlocked_inode_to_wb_begin() now nests inside
    mem_cgroup_begin_page_stat() to match the locking order.

v2: The i_wb access transaction will be used for !stat accesses too.
    Function names and comments updated accordingly.

    s/inode_wb_stat_unlocked_{begin|end}/unlocked_inode_to_wb_{begin|end}/
    s/switch_wb/switch_wbs/

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
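For context, the transaction helpers themselves are added to include/linux/backing-dev.h by this patch, so they do not appear in the fs/fs-writeback.c diff below. The following is a minimal sketch of their expected shape, assuming the rcu_read_lock()/tree_lock fallback described above; the exact definitions are an assumption here, not part of this hunk.

/*
 * Sketch only: assumed shape of the stat-update transaction helpers
 * described in the commit message (real versions live in
 * include/linux/backing-dev.h).
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
{
	rcu_read_lock();

	/*
	 * Paired with the store_release that clears I_WB_SWITCH: seeing
	 * the flag clear guarantees that the new i_wb is also visible.
	 */
	*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;

	/* a switch is in flight: synchronize through mapping->tree_lock */
	if (unlikely(*lockedp))
		spin_lock_irq(&inode->i_mapping->tree_lock);

	return inode->i_wb;
}

static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
{
	if (unlikely(locked))
		spin_unlock_irq(&inode->i_mapping->tree_lock);

	rcu_read_unlock();
}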
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--	fs/fs-writeback.c	117
1 file changed, 111 insertions(+), 6 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f67b956fb321..08f5496fcf1b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -308,6 +308,115 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
 	return locked_inode_to_wb_and_lock_list(inode);
 }
 
+struct inode_switch_wbs_context {
+	struct inode		*inode;
+	struct bdi_writeback	*new_wb;
+
+	struct rcu_head		rcu_head;
+	struct work_struct	work;
+};
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+	struct inode_switch_wbs_context *isw =
+		container_of(work, struct inode_switch_wbs_context, work);
+	struct inode *inode = isw->inode;
+	struct bdi_writeback *new_wb = isw->new_wb;
+
+	/*
+	 * By the time control reaches here, RCU grace period has passed
+	 * since I_WB_SWITCH assertion and all wb stat update transactions
+	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+	 * synchronizing against mapping->tree_lock.
+	 */
+	spin_lock(&inode->i_lock);
+
+	inode->i_wb_frn_winner = 0;
+	inode->i_wb_frn_avg_time = 0;
+	inode->i_wb_frn_history = 0;
+
+	/*
+	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
+	 */
+	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+	spin_unlock(&inode->i_lock);
+
+	iput(inode);
+	wb_put(new_wb);
+	kfree(isw);
+}
+
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+	struct inode_switch_wbs_context *isw = container_of(rcu_head,
+				struct inode_switch_wbs_context, rcu_head);
+
+	/* needs to grab bh-unsafe locks, bounce to work item */
+	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+	schedule_work(&isw->work);
+}
+
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id.  The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct cgroup_subsys_state *memcg_css;
+	struct inode_switch_wbs_context *isw;
+
+	/* noop if seems to be already in progress */
+	if (inode->i_state & I_WB_SWITCH)
+		return;
+
+	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+	if (!isw)
+		return;
+
+	/* find and pin the new wb */
+	rcu_read_lock();
+	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+	if (memcg_css)
+		isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+	rcu_read_unlock();
+	if (!isw->new_wb)
+		goto out_free;
+
+	/* while holding I_WB_SWITCH, no one else can update the association */
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
+	inode->i_state |= I_WB_SWITCH;
+	spin_unlock(&inode->i_lock);
+
+	ihold(inode);
+	isw->inode = inode;
+
+	/*
+	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
+	 * the RCU protected stat update paths to grab the mapping's
+	 * tree_lock so that stat transfer can synchronize against them.
+	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+	 */
+	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+	return;
+
+out_free:
+	if (isw->new_wb)
+		wb_put(isw->new_wb);
+	kfree(isw);
+}
+
 /**
  * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
  * @wbc: writeback_control of interest
@@ -433,12 +542,8 @@ void wbc_detach_inode(struct writeback_control *wbc)
 		 * is okay.  The main goal is avoiding keeping an inode on
 		 * the wrong wb for an extended period of time.
 		 */
-		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) {
-			/* switch */
-			max_id = 0;
-			avg_time = 0;
-			history = 0;
-		}
+		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+			inode_switch_wbs(inode, max_id);
 	}
 
 	/*
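
For illustration, a stat-update site is expected to wrap its lockless percpu counter update in the transaction roughly as sketched below. The function name example_dec_reclaimable() is made up for this example, and dec_wb_stat()/WB_RECLAIMABLE are assumed to be the existing per-wb stat helpers from earlier in this series; the real call sites are the WB_RECLAIMABLE/WB_WRITEBACK update paths touched elsewhere in the patchset.

/*
 * Illustrative sketch only: how a lockless wb stat update would be
 * wrapped in the unlocked i_wb transaction so that an in-flight switch
 * can synchronize against it via mapping->tree_lock.
 */
static void example_dec_reclaimable(struct inode *inode)
{
	struct bdi_writeback *wb;
	bool locked;

	wb = unlocked_inode_to_wb_begin(inode, &locked);
	dec_wb_stat(wb, WB_RECLAIMABLE);	/* percpu update, no extra lock */
	unlocked_inode_to_wb_end(inode, locked);
}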