-rw-r--r--  fs/buffer.c                  4
-rw-r--r--  fs/fs-writeback.c          177
-rw-r--r--  fs/mpage.c                   1
-rw-r--r--  include/linux/fs.h           5
-rw-r--r--  include/linux/writeback.h   16
5 files changed, 200 insertions, 3 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index d883c799fb45..aca687f966d7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3040,8 +3040,10 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh,
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
-	if (wbc)
+	if (wbc) {
 		wbc_init_bio(wbc, bio);
+		wbc_account_io(wbc, bh->b_page, bh->b_size);
+	}
 
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 755e8ef8d1f0..f98d40333c85 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -214,6 +214,20 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+/* parameters for foreign inode detection, see wbc_detach_inode() */
+#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, up to 8 secs w/ 16bit */
+#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
+#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */
+
+#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
+#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
+					/* each slot's duration is 2s / 16 */
+#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
+					/* if foreign slots >= 8, switch */
+#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
+					/* one round can affect up to 5 slots */
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
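For orientation, the constants above work out as follows: with WB_FRN_TIME_SHIFT = 13, one second is 2^13 = 8192 time units, so a 16-bit average IO time saturates at 65535 / 8192, roughly 8 seconds. WB_FRN_TIME_PERIOD = 2 * 8192 = 16384 units (about 2 s), WB_FRN_HIST_UNIT = 16384 / 16 = 1024 units (about 1/8 s per history slot), WB_FRN_HIST_THR_SLOTS = 16 / 2 = 8 (more than half of the history slots must be foreign before a switch), and WB_FRN_HIST_MAX_SLOTS = 8 / 2 + 1 = 5, so a single writeback round can shift at most 5 slots into the history.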
@@ -258,24 +272,183 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 					 struct inode *inode)
 {
 	wbc->wb = inode_to_wb(inode);
+	wbc->inode = inode;
+
+	wbc->wb_id = wbc->wb->memcg_css->id;
+	wbc->wb_lcand_id = inode->i_wb_frn_winner;
+	wbc->wb_tcand_id = 0;
+	wbc->wb_bytes = 0;
+	wbc->wb_lcand_bytes = 0;
+	wbc->wb_tcand_bytes = 0;
+
 	wb_get(wbc->wb);
 	spin_unlock(&inode->i_lock);
 }
 
 /**
- * wbc_detach_inode - disassociate wbc from its target inode
- * @wbc: writeback_control of interest
+ * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
+ * @wbc: writeback_control of the just finished writeback
  *
  * To be called after a writeback attempt of an inode finishes and undoes
  * wbc_attach_and_unlock_inode(). Can be called under any context.
+ *
+ * As concurrent write sharing of an inode is expected to be very rare and
+ * memcg only tracks page ownership on a first-use basis, severely confining
+ * the usefulness of such sharing, cgroup writeback tracks ownership
+ * per-inode. While the support for concurrent write sharing of an inode
+ * is deemed unnecessary, an inode being written to by different cgroups at
+ * different points in time is a lot more common, and, more importantly,
+ * charging only by first use can too readily lead to grossly incorrect
+ * behaviors (a single foreign page can lead to gigabytes of writeback being
+ * incorrectly attributed).
+ *
+ * To resolve this issue, cgroup writeback detects the majority dirtier of
+ * an inode and transfers the ownership to it. To avoid unnecessary
+ * oscillation, the detection mechanism keeps track of history and gives
+ * out the switch verdict only if the foreign usage pattern is stable over
+ * a certain amount of time and/or writeback attempts.
+ *
+ * On each writeback attempt, @wbc tries to detect the majority writer
+ * using the Boyer-Moore majority vote algorithm. In addition to the byte
+ * count from the majority voting, it also counts the bytes written for the
+ * current wb and the last round's winner wb (max of last round's current
+ * wb, the winner from two rounds ago, and the last round's majority
+ * candidate). Keeping track of the historical winner helps the algorithm
+ * semi-reliably detect the most active writer even when it's not the
+ * absolute majority.
+ *
+ * Once the winner of the round is determined, whether the winner is
+ * foreign or not and how much IO time the round consumed is recorded in
+ * inode->i_wb_frn_history. If the amount of recorded foreign IO time is
+ * over a certain threshold, the switch verdict is given.
  */
 void wbc_detach_inode(struct writeback_control *wbc)
 {
+	struct bdi_writeback *wb = wbc->wb;
+	struct inode *inode = wbc->inode;
+	u16 history = inode->i_wb_frn_history;
+	unsigned long avg_time = inode->i_wb_frn_avg_time;
+	unsigned long max_bytes, max_time;
+	int max_id;
+
+	/* pick the winner of this round */
+	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
+	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_id;
+		max_bytes = wbc->wb_bytes;
+	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_lcand_id;
+		max_bytes = wbc->wb_lcand_bytes;
+	} else {
+		max_id = wbc->wb_tcand_id;
+		max_bytes = wbc->wb_tcand_bytes;
+	}
+
+	/*
+	 * Calculate the amount of IO time the winner consumed and fold it
+	 * into the running average kept per inode. If the consumed IO
+	 * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
+	 * deciding whether to switch or not. This is to prevent one-off
+	 * small dirtiers from skewing the verdict.
+	 */
+	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
+				wb->avg_write_bandwidth);
+	if (avg_time)
+		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
+			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
+	else
+		avg_time = max_time;	/* immediate catch up on first run */
+
+	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
+		int slots;
+
+		/*
+		 * The switch verdict is reached if foreign wbs consume
+		 * more than a certain proportion of IO time in a
+		 * WB_FRN_TIME_PERIOD. This is loosely tracked by a 16-slot
+		 * history mask where each bit represents one sixteenth of
+		 * the period. Determine the number of slots to shift into
+		 * history from @max_time.
+		 */
+		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
+			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
+		history <<= slots;
+		if (wbc->wb_id != max_id)
+			history |= (1U << slots) - 1;
+
+		/*
+		 * Switch if the current wb isn't the consistent winner.
+		 * If there are multiple closely competing dirtiers, the
+		 * inode may switch across them repeatedly over time, which
+		 * is okay. The main goal is avoiding keeping an inode on
+		 * the wrong wb for an extended period of time.
+		 */
+		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) {
+			/* switch */
+			max_id = 0;
+			avg_time = 0;
+			history = 0;
+		}
+	}
+
+	/*
+	 * Multiple instances of this function may race to update the
+	 * following fields but we don't mind occasional inaccuracies.
+	 */
+	inode->i_wb_frn_winner = max_id;
+	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
+	inode->i_wb_frn_history = history;
+
 	wb_put(wbc->wb);
 	wbc->wb = NULL;
 }
 
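To make the verdict arithmetic above easier to follow, here is a minimal userspace sketch of the same per-round calculation. It assumes avg_write_bandwidth is expressed in pages per second (as in the writeback bandwidth estimation code); the function name frn_round, the 4 MB round size and the 25600 pages/s bandwidth are illustrative only, and the kernel helpers DIV_ROUND_UP(), min() and hweight32() are replaced with standard C equivalents.

#include <stdint.h>
#include <stdio.h>

#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13 time units */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
#define WB_FRN_HIST_SLOTS	16
#define WB_FRN_HIST_UNIT	((2 * (1UL << WB_FRN_TIME_SHIFT)) / WB_FRN_HIST_SLOTS)
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
#define PAGE_SHIFT		12
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/*
 * One simulated detach round: convert the round winner's byte count into
 * IO time, fold it into the running average, shift slots into the history
 * and return 1 once the foreign-switch threshold is crossed.
 */
static int frn_round(uint16_t *history, unsigned long *avg_time,
		     unsigned long max_bytes, int winner_is_foreign,
		     unsigned long bw_pages_per_sec)
{
	unsigned long max_time, slots;

	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
				bw_pages_per_sec);
	if (*avg_time)
		*avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
			     (*avg_time >> WB_FRN_TIME_AVG_SHIFT);
	else
		*avg_time = max_time;		/* first round catches up */

	if (max_time < *avg_time / WB_FRN_TIME_CUT_DIV)
		return 0;			/* one-off small round, ignore */

	slots = DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT);
	if (slots > WB_FRN_HIST_MAX_SLOTS)
		slots = WB_FRN_HIST_MAX_SLOTS;
	*history <<= slots;
	if (winner_is_foreign)
		*history |= (uint16_t)((1U << slots) - 1);

	/* switch once more than half of the recent slots were foreign */
	return __builtin_popcount(*history) > WB_FRN_HIST_THR_SLOTS;
}

int main(void)
{
	uint16_t history = 0;
	unsigned long avg = 0;
	int round;

	/* a foreign cgroup writes 4MB per round at 25600 pages/s (~100MB/s) */
	for (round = 1; round <= 16; round++) {
		if (frn_round(&history, &avg, 4UL << 20, 1, 25600)) {
			printf("switch verdict after round %d, history %#x\n",
			       round, (unsigned int)history);
			return 0;
		}
	}
	printf("no switch, history %#x\n", (unsigned int)history);
	return 0;
}

With those inputs each round consumes about 328 time units (well under one 1024-unit slot), so one foreign slot is pushed per round and the popcount crosses the 8-slot threshold on the ninth consecutive foreign round.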
 /**
+ * wbc_account_io - account IO issued during writeback
+ * @wbc: writeback_control of the writeback in progress
+ * @page: page being written out
+ * @bytes: number of bytes being written out
+ *
+ * @bytes from @page are about to be written out during the writeback
+ * controlled by @wbc. Keep the book for foreign inode detection. See
+ * wbc_detach_inode().
+ */
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes)
+{
+	int id;
+
+	/*
+	 * The pageout() path doesn't attach @wbc to the inode being written
+	 * out. This is intentional as we don't want the function to block
+	 * behind a slow cgroup. Ultimately, we want pageout() to kick off
+	 * regular writeback instead of writing things out itself.
+	 */
+	if (!wbc->wb)
+		return;
+
+	rcu_read_lock();
+	id = mem_cgroup_css_from_page(page)->id;
+	rcu_read_unlock();
+
+	if (id == wbc->wb_id) {
+		wbc->wb_bytes += bytes;
+		return;
+	}
+
+	if (id == wbc->wb_lcand_id)
+		wbc->wb_lcand_bytes += bytes;
+
+	/* Boyer-Moore majority vote algorithm */
+	if (!wbc->wb_tcand_bytes)
+		wbc->wb_tcand_id = id;
+	if (id == wbc->wb_tcand_id)
+		wbc->wb_tcand_bytes += bytes;
+	else
+		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
+}
+
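The voting step in wbc_account_io() above is the Boyer-Moore majority vote, weighted by bytes rather than by element count: the candidate's weight grows when the incoming id matches it and shrinks when it doesn't, so only an id contributing a byte majority can survive the whole pass. A small self-contained illustration (the struct and function names are made up for the example, not kernel API):

#include <stddef.h>
#include <stdio.h>

/* byte-weighted Boyer-Moore majority vote, mirroring the tcand logic above */
struct vote {
	int tcand_id;		/* current candidate id */
	size_t tcand_bytes;	/* candidate's surviving byte weight */
};

static void vote_account(struct vote *v, int id, size_t bytes)
{
	if (!v->tcand_bytes)
		v->tcand_id = id;
	if (id == v->tcand_id)
		v->tcand_bytes += bytes;
	else
		v->tcand_bytes -= bytes < v->tcand_bytes ? bytes : v->tcand_bytes;
}

int main(void)
{
	/* pages written by cgroups 3, 5 and 7; cgroup 7 holds a byte majority */
	static const struct { int id; size_t bytes; } io[] = {
		{ 3, 4096 }, { 7, 8192 }, { 5, 4096 }, { 7, 8192 }, { 7, 4096 },
	};
	struct vote v = { 0, 0 };
	size_t i;

	for (i = 0; i < sizeof(io) / sizeof(io[0]); i++)
		vote_account(&v, io[i].id, io[i].bytes);

	printf("surviving candidate %d with %zu bytes\n", v.tcand_id, v.tcand_bytes);
	return 0;
}

Here cgroup 7 contributes 20480 of the 28672 bytes, so it is the surviving candidate; wbc_detach_inode() then compares the candidate's byte count against the current wb's and the last winner's counts before declaring the round's winner.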
+/**
  * inode_congested - test whether an inode is congested
  * @inode: inode to test for congestion
  * @cong_bits: mask of WB_[a]sync_congested bits to test
diff --git a/fs/mpage.c b/fs/mpage.c
index 388fde6ac255..ca0244b69de8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -614,6 +614,7 @@ alloc_new:
 	 * the confused fail path above (OOM) will be very confused when
 	 * it finds all bh marked clean (i.e. it will not write anything)
 	 */
+	wbc_account_io(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
 		bio = mpage_bio_submit(WRITE, bio);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 67a42ec95065..740126d7c44e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -638,6 +638,11 @@ struct inode {
 	struct list_head	i_wb_list;	/* backing dev IO list */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int			i_wb_frn_winner;
+	u16			i_wb_frn_avg_time;
+	u16			i_wb_frn_history;
 #endif
 	struct list_head	i_lru;		/* inode LRU list */
 	struct list_head	i_sb_list;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 8f964e558af5..b333c945e571 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -88,6 +88,15 @@ struct writeback_control {
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
+	struct inode *inode;		/* inode being written out */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int wb_id;			/* current wb id */
+	int wb_lcand_id;		/* last foreign candidate wb id */
+	int wb_tcand_id;		/* this foreign candidate wb id */
+	size_t wb_bytes;		/* bytes written by current wb */
+	size_t wb_lcand_bytes;		/* bytes written by last candidate */
+	size_t wb_tcand_bytes;		/* bytes written by this candidate */
 #endif
 };
 
@@ -187,6 +196,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 					 struct inode *inode)
 	__releases(&inode->i_lock);
 void wbc_detach_inode(struct writeback_control *wbc);
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -285,6 +296,11 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 {
 }
 
+static inline void wbc_account_io(struct writeback_control *wbc,
+				  struct page *page, size_t bytes)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*