diff options
-rw-r--r-- | fs/buffer.c | 4 | ||||
-rw-r--r-- | fs/fs-writeback.c | 177 | ||||
-rw-r--r-- | fs/mpage.c | 1 | ||||
-rw-r--r-- | include/linux/fs.h | 5 | ||||
-rw-r--r-- | include/linux/writeback.h | 16 |
5 files changed, 200 insertions, 3 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index d883c799fb45..aca687f966d7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -3040,8 +3040,10 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, | |||
3040 | */ | 3040 | */ |
3041 | bio = bio_alloc(GFP_NOIO, 1); | 3041 | bio = bio_alloc(GFP_NOIO, 1); |
3042 | 3042 | ||
3043 | if (wbc) | 3043 | if (wbc) { |
3044 | wbc_init_bio(wbc, bio); | 3044 | wbc_init_bio(wbc, bio); |
3045 | wbc_account_io(wbc, bh->b_page, bh->b_size); | ||
3046 | } | ||
3045 | 3047 | ||
3046 | bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 3048 | bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
3047 | bio->bi_bdev = bh->b_bdev; | 3049 | bio->bi_bdev = bh->b_bdev; |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 755e8ef8d1f0..f98d40333c85 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -214,6 +214,20 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, | |||
214 | 214 | ||
215 | #ifdef CONFIG_CGROUP_WRITEBACK | 215 | #ifdef CONFIG_CGROUP_WRITEBACK |
216 | 216 | ||
217 | /* parameters for foreign inode detection, see wbc_detach_inode() */ | ||
218 | #define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, up to 8 secs w/ 16bit */ | ||
219 | #define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ | ||
220 | #define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ | ||
221 | #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ | ||
222 | |||
223 | #define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ | ||
224 | #define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) | ||
225 | /* each slot's duration is 2s / 16 */ | ||
226 | #define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) | ||
227 | /* if foreign slots > 8, switch */ | ||
228 | #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) | ||
229 | /* one round can affect up to 5 slots */ | ||
230 | |||
217 | void __inode_attach_wb(struct inode *inode, struct page *page) | 231 | void __inode_attach_wb(struct inode *inode, struct page *page) |
218 | { | 232 | { |
219 | struct backing_dev_info *bdi = inode_to_bdi(inode); | 233 | struct backing_dev_info *bdi = inode_to_bdi(inode); |
@@ -258,24 +272,183 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, | |||
258 | struct inode *inode) | 272 | struct inode *inode) |
259 | { | 273 | { |
260 | wbc->wb = inode_to_wb(inode); | 274 | wbc->wb = inode_to_wb(inode); |
275 | wbc->inode = inode; | ||
276 | |||
277 | wbc->wb_id = wbc->wb->memcg_css->id; | ||
278 | wbc->wb_lcand_id = inode->i_wb_frn_winner; | ||
279 | wbc->wb_tcand_id = 0; | ||
280 | wbc->wb_bytes = 0; | ||
281 | wbc->wb_lcand_bytes = 0; | ||
282 | wbc->wb_tcand_bytes = 0; | ||
283 | |||
261 | wb_get(wbc->wb); | 284 | wb_get(wbc->wb); |
262 | spin_unlock(&inode->i_lock); | 285 | spin_unlock(&inode->i_lock); |
263 | } | 286 | } |
264 | 287 | ||
265 | /** | 288 | /** |
266 | * wbc_detach_inode - disassociate wbc from its target inode | 289 | * wbc_detach_inode - disassociate wbc from inode and perform foreign detection |
267 | * @wbc: writeback_control of interest | 290 | * @wbc: writeback_control of the just finished writeback |
268 | * | 291 | * |
269 | * To be called after a writeback attempt of an inode finishes and undoes | 292 | * To be called after a writeback attempt of an inode finishes and undoes |
270 | * wbc_attach_and_unlock_inode(). Can be called under any context. | 293 | * wbc_attach_and_unlock_inode(). Can be called under any context. |
294 | * | ||
295 | * As concurrent write sharing of an inode is expected to be very rare and | ||
296 | * memcg only tracks page ownership on first-use basis severely confining | ||
297 | * the usefulness of such sharing, cgroup writeback tracks ownership | ||
298 | * per-inode. While the support for concurrent write sharing of an inode | ||
299 | * is deemed unnecessary, an inode being written to by different cgroups at | ||
300 | * different points in time is a lot more common, and, more importantly, | ||
301 | * charging only by first-use can too readily lead to grossly incorrect | ||
302 | * behaviors (single foreign page can lead to gigabytes of writeback to be | ||
303 | * incorrectly attributed). | ||
304 | * | ||
305 | * To resolve this issue, cgroup writeback detects the majority dirtier of | ||
306 | * an inode and transfers the ownership to it. To avoid unnecessary | ||
307 | * oscillation, the detection mechanism keeps track of history and gives | ||
308 | * out the switch verdict only if the foreign usage pattern is stable over | ||
309 | * a certain amount of time and/or writeback attempts. | ||
310 | * | ||
311 | * On each writeback attempt, @wbc tries to detect the majority writer | ||
312 | * using Boyer-Moore majority vote algorithm. In addition to the byte | ||
313 | * count from the majority voting, it also counts the bytes written for the | ||
314 | * current wb and the last round's winner wb (max of last round's current | ||
315 | * wb, the winner from two rounds ago, and the last round's majority | ||
316 | * candidate). Keeping track of the historical winner helps the algorithm | ||
317 | * to semi-reliably detect the most active writer even when it's not the | ||
318 | * absolute majority. | ||
319 | * | ||
320 | * Once the winner of the round is determined, whether the winner is | ||
321 | * foreign or not and how much IO time the round consumed is recorded in | ||
322 | * inode->i_wb_frn_history. If the amount of recorded foreign IO time is | ||
323 | * over a certain threshold, the switch verdict is given. | ||
271 | */ | 324 | */ |
272 | void wbc_detach_inode(struct writeback_control *wbc) | 325 | void wbc_detach_inode(struct writeback_control *wbc) |
273 | { | 326 | { |
327 | struct bdi_writeback *wb = wbc->wb; | ||
328 | struct inode *inode = wbc->inode; | ||
329 | u16 history = inode->i_wb_frn_history; | ||
330 | unsigned long avg_time = inode->i_wb_frn_avg_time; | ||
331 | unsigned long max_bytes, max_time; | ||
332 | int max_id; | ||
333 | |||
334 | /* pick the winner of this round */ | ||
335 | if (wbc->wb_bytes >= wbc->wb_lcand_bytes && | ||
336 | wbc->wb_bytes >= wbc->wb_tcand_bytes) { | ||
337 | max_id = wbc->wb_id; | ||
338 | max_bytes = wbc->wb_bytes; | ||
339 | } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { | ||
340 | max_id = wbc->wb_lcand_id; | ||
341 | max_bytes = wbc->wb_lcand_bytes; | ||
342 | } else { | ||
343 | max_id = wbc->wb_tcand_id; | ||
344 | max_bytes = wbc->wb_tcand_bytes; | ||
345 | } | ||
346 | |||
347 | /* | ||
348 | * Calculate the amount of IO time the winner consumed and fold it | ||
349 | * into the running average kept per inode. If the consumed IO | ||
350 | * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for | ||
351 | * deciding whether to switch or not. This is to prevent one-off | ||
352 | * small dirtiers from skewing the verdict. | ||
353 | */ | ||
354 | max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, | ||
355 | wb->avg_write_bandwidth); | ||
356 | if (avg_time) | ||
357 | avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - | ||
358 | (avg_time >> WB_FRN_TIME_AVG_SHIFT); | ||
359 | else | ||
360 | avg_time = max_time; /* immediate catch up on first run */ | ||
361 | |||
362 | if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { | ||
363 | int slots; | ||
364 | |||
365 | /* | ||
366 | * The switch verdict is reached if foreign wb's consume | ||
367 | * more than a certain proportion of IO time in a | ||
368 | * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot | ||
369 | * history mask where each bit represents one sixteenth of | ||
370 | * the period. Determine the number of slots to shift into | ||
371 | * history from @max_time. | ||
372 | */ | ||
373 | slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), | ||
374 | (unsigned long)WB_FRN_HIST_MAX_SLOTS); | ||
375 | history <<= slots; | ||
376 | if (wbc->wb_id != max_id) | ||
377 | history |= (1U << slots) - 1; | ||
378 | |||
379 | /* | ||
380 | * Switch if the current wb isn't the consistent winner. | ||
381 | * If there are multiple closely competing dirtiers, the | ||
382 | * inode may switch across them repeatedly over time, which | ||
383 | * is okay. The main goal is avoiding keeping an inode on | ||
384 | * the wrong wb for an extended period of time. | ||
385 | */ | ||
386 | if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) { | ||
387 | /* switch */ | ||
388 | max_id = 0; | ||
389 | avg_time = 0; | ||
390 | history = 0; | ||
391 | } | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * Multiple instances of this function may race to update the | ||
396 | * following fields but we don't mind occasional inaccuracies. | ||
397 | */ | ||
398 | inode->i_wb_frn_winner = max_id; | ||
399 | inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); | ||
400 | inode->i_wb_frn_history = history; | ||
401 | |||
274 | wb_put(wbc->wb); | 402 | wb_put(wbc->wb); |
275 | wbc->wb = NULL; | 403 | wbc->wb = NULL; |
276 | } | 404 | } |
277 | 405 | ||
278 | /** | 406 | /** |
407 | * wbc_account_io - account IO issued during writeback | ||
408 | * @wbc: writeback_control of the writeback in progress | ||
409 | * @page: page being written out | ||
410 | * @bytes: number of bytes being written out | ||
411 | * | ||
412 | * @bytes from @page are about to be written out during the writeback | ||
413 | * controlled by @wbc. Keep the book for foreign inode detection. See | ||
414 | * wbc_detach_inode(). | ||
415 | */ | ||
416 | void wbc_account_io(struct writeback_control *wbc, struct page *page, | ||
417 | size_t bytes) | ||
418 | { | ||
419 | int id; | ||
420 | |||
421 | /* | ||
422 | * pageout() path doesn't attach @wbc to the inode being written | ||
423 | * out. This is intentional as we don't want the function to block | ||
424 | * behind a slow cgroup. Ultimately, we want pageout() to kick off | ||
425 | * regular writeback instead of writing things out itself. | ||
426 | */ | ||
427 | if (!wbc->wb) | ||
428 | return; | ||
429 | |||
430 | rcu_read_lock(); | ||
431 | id = mem_cgroup_css_from_page(page)->id; | ||
432 | rcu_read_unlock(); | ||
433 | |||
434 | if (id == wbc->wb_id) { | ||
435 | wbc->wb_bytes += bytes; | ||
436 | return; | ||
437 | } | ||
438 | |||
439 | if (id == wbc->wb_lcand_id) | ||
440 | wbc->wb_lcand_bytes += bytes; | ||
441 | |||
442 | /* Boyer-Moore majority vote algorithm */ | ||
443 | if (!wbc->wb_tcand_bytes) | ||
444 | wbc->wb_tcand_id = id; | ||
445 | if (id == wbc->wb_tcand_id) | ||
446 | wbc->wb_tcand_bytes += bytes; | ||
447 | else | ||
448 | wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); | ||
449 | } | ||
450 | |||
451 | /** | ||
279 | * inode_congested - test whether an inode is congested | 452 | * inode_congested - test whether an inode is congested |
280 | * @inode: inode to test for congestion | 453 | * @inode: inode to test for congestion |
281 | * @cong_bits: mask of WB_[a]sync_congested bits to test | 454 | * @cong_bits: mask of WB_[a]sync_congested bits to test |
diff --git a/fs/mpage.c b/fs/mpage.c index 388fde6ac255..ca0244b69de8 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
@@ -614,6 +614,7 @@ alloc_new: | |||
614 | * the confused fail path above (OOM) will be very confused when | 614 | * the confused fail path above (OOM) will be very confused when |
615 | * it finds all bh marked clean (i.e. it will not write anything) | 615 | * it finds all bh marked clean (i.e. it will not write anything) |
616 | */ | 616 | */ |
617 | wbc_account_io(wbc, page, PAGE_SIZE); | ||
617 | length = first_unmapped << blkbits; | 618 | length = first_unmapped << blkbits; |
618 | if (bio_add_page(bio, page, length, 0) < length) { | 619 | if (bio_add_page(bio, page, length, 0) < length) { |
619 | bio = mpage_bio_submit(WRITE, bio); | 620 | bio = mpage_bio_submit(WRITE, bio); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 67a42ec95065..740126d7c44e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -638,6 +638,11 @@ struct inode { | |||
638 | struct list_head i_wb_list; /* backing dev IO list */ | 638 | struct list_head i_wb_list; /* backing dev IO list */ |
639 | #ifdef CONFIG_CGROUP_WRITEBACK | 639 | #ifdef CONFIG_CGROUP_WRITEBACK |
640 | struct bdi_writeback *i_wb; /* the associated cgroup wb */ | 640 | struct bdi_writeback *i_wb; /* the associated cgroup wb */ |
641 | |||
642 | /* foreign inode detection, see wbc_detach_inode() */ | ||
643 | int i_wb_frn_winner; | ||
644 | u16 i_wb_frn_avg_time; | ||
645 | u16 i_wb_frn_history; | ||
641 | #endif | 646 | #endif |
642 | struct list_head i_lru; /* inode LRU list */ | 647 | struct list_head i_lru; /* inode LRU list */ |
643 | struct list_head i_sb_list; | 648 | struct list_head i_sb_list; |
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8f964e558af5..b333c945e571 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -88,6 +88,15 @@ struct writeback_control { | |||
88 | unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ | 88 | unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ |
89 | #ifdef CONFIG_CGROUP_WRITEBACK | 89 | #ifdef CONFIG_CGROUP_WRITEBACK |
90 | struct bdi_writeback *wb; /* wb this writeback is issued under */ | 90 | struct bdi_writeback *wb; /* wb this writeback is issued under */ |
91 | struct inode *inode; /* inode being written out */ | ||
92 | |||
93 | /* foreign inode detection, see wbc_detach_inode() */ | ||
94 | int wb_id; /* current wb id */ | ||
95 | int wb_lcand_id; /* last foreign candidate wb id */ | ||
96 | int wb_tcand_id; /* this foreign candidate wb id */ | ||
97 | size_t wb_bytes; /* bytes written by current wb */ | ||
98 | size_t wb_lcand_bytes; /* bytes written by last candidate */ | ||
99 | size_t wb_tcand_bytes; /* bytes written by this candidate */ | ||
91 | #endif | 100 | #endif |
92 | }; | 101 | }; |
93 | 102 | ||
@@ -187,6 +196,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, | |||
187 | struct inode *inode) | 196 | struct inode *inode) |
188 | __releases(&inode->i_lock); | 197 | __releases(&inode->i_lock); |
189 | void wbc_detach_inode(struct writeback_control *wbc); | 198 | void wbc_detach_inode(struct writeback_control *wbc); |
199 | void wbc_account_io(struct writeback_control *wbc, struct page *page, | ||
200 | size_t bytes); | ||
190 | 201 | ||
191 | /** | 202 | /** |
192 | * inode_attach_wb - associate an inode with its wb | 203 | * inode_attach_wb - associate an inode with its wb |
@@ -285,6 +296,11 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) | |||
285 | { | 296 | { |
286 | } | 297 | } |
287 | 298 | ||
299 | static inline void wbc_account_io(struct writeback_control *wbc, | ||
300 | struct page *page, size_t bytes) | ||
301 | { | ||
302 | } | ||
303 | |||
288 | #endif /* CONFIG_CGROUP_WRITEBACK */ | 304 | #endif /* CONFIG_CGROUP_WRITEBACK */ |
289 | 305 | ||
290 | /* | 306 | /* |