-rw-r--r--   fs/block_dev.c                    |  16
-rw-r--r--   fs/btrfs/extent_io.c              |   2
-rw-r--r--   fs/ext4/inode.c                   |   4
-rw-r--r--   fs/fs-writeback.c                 | 373
-rw-r--r--   fs/inode.c                        |   5
-rw-r--r--   fs/nfs/write.c                    |   3
-rw-r--r--   include/linux/backing-dev.h       |   8
-rw-r--r--   include/linux/writeback.h         |  55
-rw-r--r--   include/trace/events/btrfs.h      |   6
-rw-r--r--   include/trace/events/ext4.h       |   6
-rw-r--r--   include/trace/events/writeback.h  | 183
-rw-r--r--   mm/backing-dev.c                  |  82
-rw-r--r--   mm/filemap.c                      |   6
-rw-r--r--   mm/page-writeback.c               | 280
-rw-r--r--   mm/rmap.c                         |   4
15 files changed, 747 insertions, 286 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index c62fb84944d5..f55aad4d1611 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode) | |||
44 | { | 44 | { |
45 | return &BDEV_I(inode)->bdev; | 45 | return &BDEV_I(inode)->bdev; |
46 | } | 46 | } |
47 | |||
48 | EXPORT_SYMBOL(I_BDEV); | 47 | EXPORT_SYMBOL(I_BDEV); |
49 | 48 | ||
50 | /* | 49 | /* |
51 | * move the inode from it's current bdi to the a new bdi. if the inode is dirty | 50 | * Move the inode from its current bdi to a new bdi. If the inode is dirty we |
52 | * we need to move it onto the dirty list of @dst so that the inode is always | 51 | * need to move it onto the dirty list of @dst so that the inode is always on |
53 | * on the right list. | 52 | * the right list. |
54 | */ | 53 | */ |
55 | static void bdev_inode_switch_bdi(struct inode *inode, | 54 | static void bdev_inode_switch_bdi(struct inode *inode, |
56 | struct backing_dev_info *dst) | 55 | struct backing_dev_info *dst) |
57 | { | 56 | { |
58 | spin_lock(&inode_wb_list_lock); | 57 | struct backing_dev_info *old = inode->i_data.backing_dev_info; |
58 | |||
59 | if (unlikely(dst == old)) /* deadlock avoidance */ | ||
60 | return; | ||
61 | bdi_lock_two(&old->wb, &dst->wb); | ||
59 | spin_lock(&inode->i_lock); | 62 | spin_lock(&inode->i_lock); |
60 | inode->i_data.backing_dev_info = dst; | 63 | inode->i_data.backing_dev_info = dst; |
61 | if (inode->i_state & I_DIRTY) | 64 | if (inode->i_state & I_DIRTY) |
62 | list_move(&inode->i_wb_list, &dst->wb.b_dirty); | 65 | list_move(&inode->i_wb_list, &dst->wb.b_dirty); |
63 | spin_unlock(&inode->i_lock); | 66 | spin_unlock(&inode->i_lock); |
64 | spin_unlock(&inode_wb_list_lock); | 67 | spin_unlock(&old->wb.list_lock); |
68 | spin_unlock(&dst->wb.list_lock); | ||
65 | } | 69 | } |
66 | 70 | ||
67 | static sector_t max_block(struct block_device *bdev) | 71 | static sector_t max_block(struct block_device *bdev) |
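With the global inode_wb_list_lock split into per-bdi wb->list_lock, bdev_inode_switch_bdi() now has to hold the list locks of both the old and the new bdi, hence the new bdi_lock_two() helper (declared in include/linux/backing-dev.h below, implemented in mm/backing-dev.c, which is in the diffstat but outside this section) and the early return when both bdis are the same. A minimal sketch of how such a helper can take two locks of the same class without ABBA deadlock, assuming the usual order-by-address convention (illustration only, not necessarily the patch's exact code):

	static void lock_two_wb_lists(struct bdi_writeback *wb1,
				      struct bdi_writeback *wb2)
	{
		/* lower address first => one global acquisition order */
		if (wb1 < wb2) {
			spin_lock(&wb1->list_lock);
			spin_lock_nested(&wb2->list_lock, 1);
		} else {
			spin_lock(&wb2->list_lock);
			spin_lock_nested(&wb1->list_lock, 1);
		}
	}

The dst == old check matters for the same reason: "ordering" a lock against itself and taking it twice would deadlock, so the switch is simply skipped when the inode already lives on the destination bdi.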
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7055d11c1efd..561262d35689 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -2551,7 +2551,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, | |||
2551 | }; | 2551 | }; |
2552 | struct writeback_control wbc_writepages = { | 2552 | struct writeback_control wbc_writepages = { |
2553 | .sync_mode = wbc->sync_mode, | 2553 | .sync_mode = wbc->sync_mode, |
2554 | .older_than_this = NULL, | ||
2555 | .nr_to_write = 64, | 2554 | .nr_to_write = 64, |
2556 | .range_start = page_offset(page) + PAGE_CACHE_SIZE, | 2555 | .range_start = page_offset(page) + PAGE_CACHE_SIZE, |
2557 | .range_end = (loff_t)-1, | 2556 | .range_end = (loff_t)-1, |
@@ -2584,7 +2583,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, | |||
2584 | }; | 2583 | }; |
2585 | struct writeback_control wbc_writepages = { | 2584 | struct writeback_control wbc_writepages = { |
2586 | .sync_mode = mode, | 2585 | .sync_mode = mode, |
2587 | .older_than_this = NULL, | ||
2588 | .nr_to_write = nr_pages * 2, | 2586 | .nr_to_write = nr_pages * 2, |
2589 | .range_start = start, | 2587 | .range_start = start, |
2590 | .range_end = end + 1, | 2588 | .range_end = end + 1, |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 678cde834f19..3e5191f9f398 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -2741,7 +2741,7 @@ static int write_cache_pages_da(struct address_space *mapping, | |||
2741 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 2741 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
2742 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 2742 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
2743 | 2743 | ||
2744 | if (wbc->sync_mode == WB_SYNC_ALL) | 2744 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2745 | tag = PAGECACHE_TAG_TOWRITE; | 2745 | tag = PAGECACHE_TAG_TOWRITE; |
2746 | else | 2746 | else |
2747 | tag = PAGECACHE_TAG_DIRTY; | 2747 | tag = PAGECACHE_TAG_DIRTY; |
@@ -2973,7 +2973,7 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2973 | } | 2973 | } |
2974 | 2974 | ||
2975 | retry: | 2975 | retry: |
2976 | if (wbc->sync_mode == WB_SYNC_ALL) | 2976 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2977 | tag_pages_for_writeback(mapping, index, end); | 2977 | tag_pages_for_writeback(mapping, index, end); |
2978 | 2978 | ||
2979 | while (!ret && wbc->nr_to_write > 0) { | 2979 | while (!ret && wbc->nr_to_write > 0) { |
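The two ext4 hunks extend the existing WB_SYNC_ALL livelock avoidance to the new wbc->tagged_writepages case: pages that are dirty when writeback starts are tagged in one quick radix-tree pass, and the slow writing loop then only visits tagged pages, so pages dirtied later are left to a future writeback round. A rough sketch of the tag selection in a ->writepages path (the page-writing loop is elided; tag_pages_for_writeback() and the PAGECACHE_TAG_* tags are existing pagecache interfaces, the function name here is made up):

	static int example_writepages(struct address_space *mapping,
				      struct writeback_control *wbc)
	{
		pgoff_t index = wbc->range_start >> PAGE_CACHE_SHIFT;
		pgoff_t end = wbc->range_end >> PAGE_CACHE_SHIFT;
		int tag;

		if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
			/* quick pass: mark what is dirty right now */
			tag_pages_for_writeback(mapping, index, end);
			tag = PAGECACHE_TAG_TOWRITE;
		} else {
			tag = PAGECACHE_TAG_DIRTY;
		}
		/* slow pass: look pages up by @tag and write them out */
		return 0;
	}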
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b8c507ca42f7..1599aa985fe2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -35,7 +35,9 @@ | |||
35 | struct wb_writeback_work { | 35 | struct wb_writeback_work { |
36 | long nr_pages; | 36 | long nr_pages; |
37 | struct super_block *sb; | 37 | struct super_block *sb; |
38 | unsigned long *older_than_this; | ||
38 | enum writeback_sync_modes sync_mode; | 39 | enum writeback_sync_modes sync_mode; |
40 | unsigned int tagged_writepages:1; | ||
39 | unsigned int for_kupdate:1; | 41 | unsigned int for_kupdate:1; |
40 | unsigned int range_cyclic:1; | 42 | unsigned int range_cyclic:1; |
41 | unsigned int for_background:1; | 43 | unsigned int for_background:1; |
@@ -180,12 +182,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) | |||
180 | */ | 182 | */ |
181 | void inode_wb_list_del(struct inode *inode) | 183 | void inode_wb_list_del(struct inode *inode) |
182 | { | 184 | { |
183 | spin_lock(&inode_wb_list_lock); | 185 | struct backing_dev_info *bdi = inode_to_bdi(inode); |
186 | |||
187 | spin_lock(&bdi->wb.list_lock); | ||
184 | list_del_init(&inode->i_wb_list); | 188 | list_del_init(&inode->i_wb_list); |
185 | spin_unlock(&inode_wb_list_lock); | 189 | spin_unlock(&bdi->wb.list_lock); |
186 | } | 190 | } |
187 | 191 | ||
188 | |||
189 | /* | 192 | /* |
190 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the | 193 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the |
191 | * furthest end of its superblock's dirty-inode list. | 194 | * furthest end of its superblock's dirty-inode list. |
@@ -195,11 +198,9 @@ void inode_wb_list_del(struct inode *inode) | |||
195 | * the case then the inode must have been redirtied while it was being written | 198 | * the case then the inode must have been redirtied while it was being written |
196 | * out and we don't reset its dirtied_when. | 199 | * out and we don't reset its dirtied_when. |
197 | */ | 200 | */ |
198 | static void redirty_tail(struct inode *inode) | 201 | static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) |
199 | { | 202 | { |
200 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | 203 | assert_spin_locked(&wb->list_lock); |
201 | |||
202 | assert_spin_locked(&inode_wb_list_lock); | ||
203 | if (!list_empty(&wb->b_dirty)) { | 204 | if (!list_empty(&wb->b_dirty)) { |
204 | struct inode *tail; | 205 | struct inode *tail; |
205 | 206 | ||
@@ -213,11 +214,9 @@ static void redirty_tail(struct inode *inode) | |||
213 | /* | 214 | /* |
214 | * requeue inode for re-scanning after bdi->b_io list is exhausted. | 215 | * requeue inode for re-scanning after bdi->b_io list is exhausted. |
215 | */ | 216 | */ |
216 | static void requeue_io(struct inode *inode) | 217 | static void requeue_io(struct inode *inode, struct bdi_writeback *wb) |
217 | { | 218 | { |
218 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | 219 | assert_spin_locked(&wb->list_lock); |
219 | |||
220 | assert_spin_locked(&inode_wb_list_lock); | ||
221 | list_move(&inode->i_wb_list, &wb->b_more_io); | 220 | list_move(&inode->i_wb_list, &wb->b_more_io); |
222 | } | 221 | } |
223 | 222 | ||
@@ -225,7 +224,7 @@ static void inode_sync_complete(struct inode *inode) | |||
225 | { | 224 | { |
226 | /* | 225 | /* |
227 | * Prevent speculative execution through | 226 | * Prevent speculative execution through |
228 | * spin_unlock(&inode_wb_list_lock); | 227 | * spin_unlock(&wb->list_lock); |
229 | */ | 228 | */ |
230 | 229 | ||
231 | smp_mb(); | 230 | smp_mb(); |
@@ -250,15 +249,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) | |||
250 | /* | 249 | /* |
251 | * Move expired dirty inodes from @delaying_queue to @dispatch_queue. | 250 | * Move expired dirty inodes from @delaying_queue to @dispatch_queue. |
252 | */ | 251 | */ |
253 | static void move_expired_inodes(struct list_head *delaying_queue, | 252 | static int move_expired_inodes(struct list_head *delaying_queue, |
254 | struct list_head *dispatch_queue, | 253 | struct list_head *dispatch_queue, |
255 | unsigned long *older_than_this) | 254 | unsigned long *older_than_this) |
256 | { | 255 | { |
257 | LIST_HEAD(tmp); | 256 | LIST_HEAD(tmp); |
258 | struct list_head *pos, *node; | 257 | struct list_head *pos, *node; |
259 | struct super_block *sb = NULL; | 258 | struct super_block *sb = NULL; |
260 | struct inode *inode; | 259 | struct inode *inode; |
261 | int do_sb_sort = 0; | 260 | int do_sb_sort = 0; |
261 | int moved = 0; | ||
262 | 262 | ||
263 | while (!list_empty(delaying_queue)) { | 263 | while (!list_empty(delaying_queue)) { |
264 | inode = wb_inode(delaying_queue->prev); | 264 | inode = wb_inode(delaying_queue->prev); |
@@ -269,12 +269,13 @@ static void move_expired_inodes(struct list_head *delaying_queue, | |||
269 | do_sb_sort = 1; | 269 | do_sb_sort = 1; |
270 | sb = inode->i_sb; | 270 | sb = inode->i_sb; |
271 | list_move(&inode->i_wb_list, &tmp); | 271 | list_move(&inode->i_wb_list, &tmp); |
272 | moved++; | ||
272 | } | 273 | } |
273 | 274 | ||
274 | /* just one sb in list, splice to dispatch_queue and we're done */ | 275 | /* just one sb in list, splice to dispatch_queue and we're done */ |
275 | if (!do_sb_sort) { | 276 | if (!do_sb_sort) { |
276 | list_splice(&tmp, dispatch_queue); | 277 | list_splice(&tmp, dispatch_queue); |
277 | return; | 278 | goto out; |
278 | } | 279 | } |
279 | 280 | ||
280 | /* Move inodes from one superblock together */ | 281 | /* Move inodes from one superblock together */ |
@@ -286,6 +287,8 @@ static void move_expired_inodes(struct list_head *delaying_queue, | |||
286 | list_move(&inode->i_wb_list, dispatch_queue); | 287 | list_move(&inode->i_wb_list, dispatch_queue); |
287 | } | 288 | } |
288 | } | 289 | } |
290 | out: | ||
291 | return moved; | ||
289 | } | 292 | } |
290 | 293 | ||
291 | /* | 294 | /* |
@@ -301,9 +304,11 @@ static void move_expired_inodes(struct list_head *delaying_queue, | |||
301 | */ | 304 | */ |
302 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) | 305 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) |
303 | { | 306 | { |
304 | assert_spin_locked(&inode_wb_list_lock); | 307 | int moved; |
308 | assert_spin_locked(&wb->list_lock); | ||
305 | list_splice_init(&wb->b_more_io, &wb->b_io); | 309 | list_splice_init(&wb->b_more_io, &wb->b_io); |
306 | move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); | 310 | moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); |
311 | trace_writeback_queue_io(wb, older_than_this, moved); | ||
307 | } | 312 | } |
308 | 313 | ||
309 | static int write_inode(struct inode *inode, struct writeback_control *wbc) | 314 | static int write_inode(struct inode *inode, struct writeback_control *wbc) |
@@ -316,7 +321,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) | |||
316 | /* | 321 | /* |
317 | * Wait for writeback on an inode to complete. | 322 | * Wait for writeback on an inode to complete. |
318 | */ | 323 | */ |
319 | static void inode_wait_for_writeback(struct inode *inode) | 324 | static void inode_wait_for_writeback(struct inode *inode, |
325 | struct bdi_writeback *wb) | ||
320 | { | 326 | { |
321 | DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); | 327 | DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); |
322 | wait_queue_head_t *wqh; | 328 | wait_queue_head_t *wqh; |
@@ -324,15 +330,15 @@ static void inode_wait_for_writeback(struct inode *inode) | |||
324 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); | 330 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); |
325 | while (inode->i_state & I_SYNC) { | 331 | while (inode->i_state & I_SYNC) { |
326 | spin_unlock(&inode->i_lock); | 332 | spin_unlock(&inode->i_lock); |
327 | spin_unlock(&inode_wb_list_lock); | 333 | spin_unlock(&wb->list_lock); |
328 | __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); | 334 | __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); |
329 | spin_lock(&inode_wb_list_lock); | 335 | spin_lock(&wb->list_lock); |
330 | spin_lock(&inode->i_lock); | 336 | spin_lock(&inode->i_lock); |
331 | } | 337 | } |
332 | } | 338 | } |
333 | 339 | ||
334 | /* | 340 | /* |
335 | * Write out an inode's dirty pages. Called under inode_wb_list_lock and | 341 | * Write out an inode's dirty pages. Called under wb->list_lock and |
336 | * inode->i_lock. Either the caller has an active reference on the inode or | 342 | * inode->i_lock. Either the caller has an active reference on the inode or |
337 | * the inode has I_WILL_FREE set. | 343 | * the inode has I_WILL_FREE set. |
338 | * | 344 | * |
@@ -343,13 +349,15 @@ static void inode_wait_for_writeback(struct inode *inode) | |||
343 | * livelocks, etc. | 349 | * livelocks, etc. |
344 | */ | 350 | */ |
345 | static int | 351 | static int |
346 | writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | 352 | writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, |
353 | struct writeback_control *wbc) | ||
347 | { | 354 | { |
348 | struct address_space *mapping = inode->i_mapping; | 355 | struct address_space *mapping = inode->i_mapping; |
356 | long nr_to_write = wbc->nr_to_write; | ||
349 | unsigned dirty; | 357 | unsigned dirty; |
350 | int ret; | 358 | int ret; |
351 | 359 | ||
352 | assert_spin_locked(&inode_wb_list_lock); | 360 | assert_spin_locked(&wb->list_lock); |
353 | assert_spin_locked(&inode->i_lock); | 361 | assert_spin_locked(&inode->i_lock); |
354 | 362 | ||
355 | if (!atomic_read(&inode->i_count)) | 363 | if (!atomic_read(&inode->i_count)) |
@@ -367,14 +375,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
367 | * completed a full scan of b_io. | 375 | * completed a full scan of b_io. |
368 | */ | 376 | */ |
369 | if (wbc->sync_mode != WB_SYNC_ALL) { | 377 | if (wbc->sync_mode != WB_SYNC_ALL) { |
370 | requeue_io(inode); | 378 | requeue_io(inode, wb); |
379 | trace_writeback_single_inode_requeue(inode, wbc, | ||
380 | nr_to_write); | ||
371 | return 0; | 381 | return 0; |
372 | } | 382 | } |
373 | 383 | ||
374 | /* | 384 | /* |
375 | * It's a data-integrity sync. We must wait. | 385 | * It's a data-integrity sync. We must wait. |
376 | */ | 386 | */ |
377 | inode_wait_for_writeback(inode); | 387 | inode_wait_for_writeback(inode, wb); |
378 | } | 388 | } |
379 | 389 | ||
380 | BUG_ON(inode->i_state & I_SYNC); | 390 | BUG_ON(inode->i_state & I_SYNC); |
@@ -383,7 +393,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
383 | inode->i_state |= I_SYNC; | 393 | inode->i_state |= I_SYNC; |
384 | inode->i_state &= ~I_DIRTY_PAGES; | 394 | inode->i_state &= ~I_DIRTY_PAGES; |
385 | spin_unlock(&inode->i_lock); | 395 | spin_unlock(&inode->i_lock); |
386 | spin_unlock(&inode_wb_list_lock); | 396 | spin_unlock(&wb->list_lock); |
387 | 397 | ||
388 | ret = do_writepages(mapping, wbc); | 398 | ret = do_writepages(mapping, wbc); |
389 | 399 | ||
@@ -414,10 +424,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
414 | ret = err; | 424 | ret = err; |
415 | } | 425 | } |
416 | 426 | ||
417 | spin_lock(&inode_wb_list_lock); | 427 | spin_lock(&wb->list_lock); |
418 | spin_lock(&inode->i_lock); | 428 | spin_lock(&inode->i_lock); |
419 | inode->i_state &= ~I_SYNC; | 429 | inode->i_state &= ~I_SYNC; |
420 | if (!(inode->i_state & I_FREEING)) { | 430 | if (!(inode->i_state & I_FREEING)) { |
431 | /* | ||
432 | * Sync livelock prevention. Each inode is tagged and synced in | ||
433 | * one shot. If still dirty, it will be redirty_tail()'ed below. | ||
434 | * Update the dirty time to prevent enqueue and sync it again. | ||
435 | */ | ||
436 | if ((inode->i_state & I_DIRTY) && | ||
437 | (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) | ||
438 | inode->dirtied_when = jiffies; | ||
439 | |||
421 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | 440 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { |
422 | /* | 441 | /* |
423 | * We didn't write back all the pages. nfs_writepages() | 442 | * We didn't write back all the pages. nfs_writepages() |
@@ -428,7 +447,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
428 | /* | 447 | /* |
429 | * slice used up: queue for next turn | 448 | * slice used up: queue for next turn |
430 | */ | 449 | */ |
431 | requeue_io(inode); | 450 | requeue_io(inode, wb); |
432 | } else { | 451 | } else { |
433 | /* | 452 | /* |
434 | * Writeback blocked by something other than | 453 | * Writeback blocked by something other than |
@@ -437,7 +456,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
437 | * retrying writeback of the dirty page/inode | 456 | * retrying writeback of the dirty page/inode |
438 | * that cannot be performed immediately. | 457 | * that cannot be performed immediately. |
439 | */ | 458 | */ |
440 | redirty_tail(inode); | 459 | redirty_tail(inode, wb); |
441 | } | 460 | } |
442 | } else if (inode->i_state & I_DIRTY) { | 461 | } else if (inode->i_state & I_DIRTY) { |
443 | /* | 462 | /* |
@@ -446,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
446 | * submission or metadata updates after data IO | 465 | * submission or metadata updates after data IO |
447 | * completion. | 466 | * completion. |
448 | */ | 467 | */ |
449 | redirty_tail(inode); | 468 | redirty_tail(inode, wb); |
450 | } else { | 469 | } else { |
451 | /* | 470 | /* |
452 | * The inode is clean. At this point we either have | 471 | * The inode is clean. At this point we either have |
@@ -457,9 +476,41 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
457 | } | 476 | } |
458 | } | 477 | } |
459 | inode_sync_complete(inode); | 478 | inode_sync_complete(inode); |
479 | trace_writeback_single_inode(inode, wbc, nr_to_write); | ||
460 | return ret; | 480 | return ret; |
461 | } | 481 | } |
462 | 482 | ||
483 | static long writeback_chunk_size(struct backing_dev_info *bdi, | ||
484 | struct wb_writeback_work *work) | ||
485 | { | ||
486 | long pages; | ||
487 | |||
488 | /* | ||
489 | * WB_SYNC_ALL mode does livelock avoidance by syncing dirty | ||
490 | * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX | ||
491 | * here avoids calling into writeback_inodes_wb() more than once. | ||
492 | * | ||
493 | * The intended call sequence for WB_SYNC_ALL writeback is: | ||
494 | * | ||
495 | * wb_writeback() | ||
496 | * writeback_sb_inodes() <== called only once | ||
497 | * write_cache_pages() <== called once for each inode | ||
498 | * (quickly) tag currently dirty pages | ||
499 | * (maybe slowly) sync all tagged pages | ||
500 | */ | ||
501 | if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) | ||
502 | pages = LONG_MAX; | ||
503 | else { | ||
504 | pages = min(bdi->avg_write_bandwidth / 2, | ||
505 | global_dirty_limit / DIRTY_SCOPE); | ||
506 | pages = min(pages, work->nr_pages); | ||
507 | pages = round_down(pages + MIN_WRITEBACK_PAGES, | ||
508 | MIN_WRITEBACK_PAGES); | ||
509 | } | ||
510 | |||
511 | return pages; | ||
512 | } | ||
513 | |||
463 | /* | 514 | /* |
464 | * Write a portion of b_io inodes which belong to @sb. | 515 | * Write a portion of b_io inodes which belong to @sb. |
465 | * | 516 | * |
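The new writeback_chunk_size() above replaces the old fixed MAX_WRITEBACK_PAGES slice of 1024 pages with one scaled to the device: half a second's worth of the bdi's estimated bandwidth, capped by global_dirty_limit/DIRTY_SCOPE and by the remaining work, then rounded to a MIN_WRITEBACK_PAGES multiple. A worked example with assumed numbers (4K pages, so MIN_WRITEBACK_PAGES = 1024):

	/*
	 * Assumed inputs:
	 *   bdi->avg_write_bandwidth = 25600 pages/s   (~100 MB/s)
	 *   global_dirty_limit       = 51200 pages     (~200 MB)
	 *   work->nr_pages           = 10000
	 *
	 * pages = min(25600 / 2, 51200 / DIRTY_SCOPE)  = min(12800, 6400) = 6400
	 * pages = min(6400, 10000)                     = 6400
	 * pages = round_down(6400 + 1024, 1024)        = 7168
	 *
	 * i.e. roughly a quarter second of I/O per inode slice for this device,
	 * instead of the old hard-coded 1024 pages. WB_SYNC_ALL and
	 * tagged_writepages work still gets LONG_MAX so each inode's tagged
	 * pages are written in one shot.
	 */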
@@ -467,24 +518,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
467 | * inodes. Otherwise write only ones which go sequentially | 518 | * inodes. Otherwise write only ones which go sequentially |
468 | * in reverse order. | 519 | * in reverse order. |
469 | * | 520 | * |
470 | * Return 1, if the caller writeback routine should be | 521 | * Return the number of pages and/or inodes written. |
471 | * interrupted. Otherwise return 0. | ||
472 | */ | 522 | */ |
473 | static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, | 523 | static long writeback_sb_inodes(struct super_block *sb, |
474 | struct writeback_control *wbc, bool only_this_sb) | 524 | struct bdi_writeback *wb, |
525 | struct wb_writeback_work *work) | ||
475 | { | 526 | { |
527 | struct writeback_control wbc = { | ||
528 | .sync_mode = work->sync_mode, | ||
529 | .tagged_writepages = work->tagged_writepages, | ||
530 | .for_kupdate = work->for_kupdate, | ||
531 | .for_background = work->for_background, | ||
532 | .range_cyclic = work->range_cyclic, | ||
533 | .range_start = 0, | ||
534 | .range_end = LLONG_MAX, | ||
535 | }; | ||
536 | unsigned long start_time = jiffies; | ||
537 | long write_chunk; | ||
538 | long wrote = 0; /* count both pages and inodes */ | ||
539 | |||
476 | while (!list_empty(&wb->b_io)) { | 540 | while (!list_empty(&wb->b_io)) { |
477 | long pages_skipped; | ||
478 | struct inode *inode = wb_inode(wb->b_io.prev); | 541 | struct inode *inode = wb_inode(wb->b_io.prev); |
479 | 542 | ||
480 | if (inode->i_sb != sb) { | 543 | if (inode->i_sb != sb) { |
481 | if (only_this_sb) { | 544 | if (work->sb) { |
482 | /* | 545 | /* |
483 | * We only want to write back data for this | 546 | * We only want to write back data for this |
484 | * superblock, move all inodes not belonging | 547 | * superblock, move all inodes not belonging |
485 | * to it back onto the dirty list. | 548 | * to it back onto the dirty list. |
486 | */ | 549 | */ |
487 | redirty_tail(inode); | 550 | redirty_tail(inode, wb); |
488 | continue; | 551 | continue; |
489 | } | 552 | } |
490 | 553 | ||
@@ -493,7 +556,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, | |||
493 | * Bounce back to the caller to unpin this and | 556 | * Bounce back to the caller to unpin this and |
494 | * pin the next superblock. | 557 | * pin the next superblock. |
495 | */ | 558 | */ |
496 | return 0; | 559 | break; |
497 | } | 560 | } |
498 | 561 | ||
499 | /* | 562 | /* |
@@ -504,95 +567,91 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, | |||
504 | spin_lock(&inode->i_lock); | 567 | spin_lock(&inode->i_lock); |
505 | if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { | 568 | if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { |
506 | spin_unlock(&inode->i_lock); | 569 | spin_unlock(&inode->i_lock); |
507 | requeue_io(inode); | 570 | redirty_tail(inode, wb); |
508 | continue; | 571 | continue; |
509 | } | 572 | } |
510 | |||
511 | /* | ||
512 | * Was this inode dirtied after sync_sb_inodes was called? | ||
513 | * This keeps sync from extra jobs and livelock. | ||
514 | */ | ||
515 | if (inode_dirtied_after(inode, wbc->wb_start)) { | ||
516 | spin_unlock(&inode->i_lock); | ||
517 | return 1; | ||
518 | } | ||
519 | |||
520 | __iget(inode); | 573 | __iget(inode); |
574 | write_chunk = writeback_chunk_size(wb->bdi, work); | ||
575 | wbc.nr_to_write = write_chunk; | ||
576 | wbc.pages_skipped = 0; | ||
521 | 577 | ||
522 | pages_skipped = wbc->pages_skipped; | 578 | writeback_single_inode(inode, wb, &wbc); |
523 | writeback_single_inode(inode, wbc); | 579 | |
524 | if (wbc->pages_skipped != pages_skipped) { | 580 | work->nr_pages -= write_chunk - wbc.nr_to_write; |
581 | wrote += write_chunk - wbc.nr_to_write; | ||
582 | if (!(inode->i_state & I_DIRTY)) | ||
583 | wrote++; | ||
584 | if (wbc.pages_skipped) { | ||
525 | /* | 585 | /* |
526 | * writeback is not making progress due to locked | 586 | * writeback is not making progress due to locked |
527 | * buffers. Skip this inode for now. | 587 | * buffers. Skip this inode for now. |
528 | */ | 588 | */ |
529 | redirty_tail(inode); | 589 | redirty_tail(inode, wb); |
530 | } | 590 | } |
531 | spin_unlock(&inode->i_lock); | 591 | spin_unlock(&inode->i_lock); |
532 | spin_unlock(&inode_wb_list_lock); | 592 | spin_unlock(&wb->list_lock); |
533 | iput(inode); | 593 | iput(inode); |
534 | cond_resched(); | 594 | cond_resched(); |
535 | spin_lock(&inode_wb_list_lock); | 595 | spin_lock(&wb->list_lock); |
536 | if (wbc->nr_to_write <= 0) { | 596 | /* |
537 | wbc->more_io = 1; | 597 | * bail out to wb_writeback() often enough to check |
538 | return 1; | 598 | * background threshold and other termination conditions. |
599 | */ | ||
600 | if (wrote) { | ||
601 | if (time_is_before_jiffies(start_time + HZ / 10UL)) | ||
602 | break; | ||
603 | if (work->nr_pages <= 0) | ||
604 | break; | ||
539 | } | 605 | } |
540 | if (!list_empty(&wb->b_more_io)) | ||
541 | wbc->more_io = 1; | ||
542 | } | 606 | } |
543 | /* b_io is empty */ | 607 | return wrote; |
544 | return 1; | ||
545 | } | 608 | } |
546 | 609 | ||
547 | void writeback_inodes_wb(struct bdi_writeback *wb, | 610 | static long __writeback_inodes_wb(struct bdi_writeback *wb, |
548 | struct writeback_control *wbc) | 611 | struct wb_writeback_work *work) |
549 | { | 612 | { |
550 | int ret = 0; | 613 | unsigned long start_time = jiffies; |
551 | 614 | long wrote = 0; | |
552 | if (!wbc->wb_start) | ||
553 | wbc->wb_start = jiffies; /* livelock avoidance */ | ||
554 | spin_lock(&inode_wb_list_lock); | ||
555 | if (!wbc->for_kupdate || list_empty(&wb->b_io)) | ||
556 | queue_io(wb, wbc->older_than_this); | ||
557 | 615 | ||
558 | while (!list_empty(&wb->b_io)) { | 616 | while (!list_empty(&wb->b_io)) { |
559 | struct inode *inode = wb_inode(wb->b_io.prev); | 617 | struct inode *inode = wb_inode(wb->b_io.prev); |
560 | struct super_block *sb = inode->i_sb; | 618 | struct super_block *sb = inode->i_sb; |
561 | 619 | ||
562 | if (!grab_super_passive(sb)) { | 620 | if (!grab_super_passive(sb)) { |
563 | requeue_io(inode); | 621 | requeue_io(inode, wb); |
564 | continue; | 622 | continue; |
565 | } | 623 | } |
566 | ret = writeback_sb_inodes(sb, wb, wbc, false); | 624 | wrote += writeback_sb_inodes(sb, wb, work); |
567 | drop_super(sb); | 625 | drop_super(sb); |
568 | 626 | ||
569 | if (ret) | 627 | /* refer to the same tests at the end of writeback_sb_inodes */ |
570 | break; | 628 | if (wrote) { |
629 | if (time_is_before_jiffies(start_time + HZ / 10UL)) | ||
630 | break; | ||
631 | if (work->nr_pages <= 0) | ||
632 | break; | ||
633 | } | ||
571 | } | 634 | } |
572 | spin_unlock(&inode_wb_list_lock); | ||
573 | /* Leave any unwritten inodes on b_io */ | 635 | /* Leave any unwritten inodes on b_io */ |
636 | return wrote; | ||
574 | } | 637 | } |
575 | 638 | ||
576 | static void __writeback_inodes_sb(struct super_block *sb, | 639 | long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) |
577 | struct bdi_writeback *wb, struct writeback_control *wbc) | ||
578 | { | 640 | { |
579 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 641 | struct wb_writeback_work work = { |
642 | .nr_pages = nr_pages, | ||
643 | .sync_mode = WB_SYNC_NONE, | ||
644 | .range_cyclic = 1, | ||
645 | }; | ||
580 | 646 | ||
581 | spin_lock(&inode_wb_list_lock); | 647 | spin_lock(&wb->list_lock); |
582 | if (!wbc->for_kupdate || list_empty(&wb->b_io)) | 648 | if (list_empty(&wb->b_io)) |
583 | queue_io(wb, wbc->older_than_this); | 649 | queue_io(wb, NULL); |
584 | writeback_sb_inodes(sb, wb, wbc, true); | 650 | __writeback_inodes_wb(wb, &work); |
585 | spin_unlock(&inode_wb_list_lock); | 651 | spin_unlock(&wb->list_lock); |
586 | } | ||
587 | 652 | ||
588 | /* | 653 | return nr_pages - work.nr_pages; |
589 | * The maximum number of pages to writeout in a single bdi flush/kupdate | 654 | } |
590 | * operation. We do this so we don't hold I_SYNC against an inode for | ||
591 | * enormous amounts of time, which would block a userspace task which has | ||
592 | * been forced to throttle against that inode. Also, the code reevaluates | ||
593 | * the dirty each time it has written this many pages. | ||
594 | */ | ||
595 | #define MAX_WRITEBACK_PAGES 1024 | ||
596 | 655 | ||
597 | static inline bool over_bground_thresh(void) | 656 | static inline bool over_bground_thresh(void) |
598 | { | 657 | { |
@@ -605,6 +664,16 @@ static inline bool over_bground_thresh(void) | |||
605 | } | 664 | } |
606 | 665 | ||
607 | /* | 666 | /* |
667 | * Called under wb->list_lock. If there are multiple wb per bdi, | ||
668 | * only the flusher working on the first wb should do it. | ||
669 | */ | ||
670 | static void wb_update_bandwidth(struct bdi_writeback *wb, | ||
671 | unsigned long start_time) | ||
672 | { | ||
673 | __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); | ||
674 | } | ||
675 | |||
676 | /* | ||
608 | * Explicit flushing or periodic writeback of "old" data. | 677 | * Explicit flushing or periodic writeback of "old" data. |
609 | * | 678 | * |
610 | * Define "old": the first time one of an inode's pages is dirtied, we mark the | 679 | * Define "old": the first time one of an inode's pages is dirtied, we mark the |
@@ -622,47 +691,16 @@ static inline bool over_bground_thresh(void) | |||
622 | static long wb_writeback(struct bdi_writeback *wb, | 691 | static long wb_writeback(struct bdi_writeback *wb, |
623 | struct wb_writeback_work *work) | 692 | struct wb_writeback_work *work) |
624 | { | 693 | { |
625 | struct writeback_control wbc = { | 694 | unsigned long wb_start = jiffies; |
626 | .sync_mode = work->sync_mode, | 695 | long nr_pages = work->nr_pages; |
627 | .older_than_this = NULL, | ||
628 | .for_kupdate = work->for_kupdate, | ||
629 | .for_background = work->for_background, | ||
630 | .range_cyclic = work->range_cyclic, | ||
631 | }; | ||
632 | unsigned long oldest_jif; | 696 | unsigned long oldest_jif; |
633 | long wrote = 0; | ||
634 | long write_chunk; | ||
635 | struct inode *inode; | 697 | struct inode *inode; |
698 | long progress; | ||
636 | 699 | ||
637 | if (wbc.for_kupdate) { | 700 | oldest_jif = jiffies; |
638 | wbc.older_than_this = &oldest_jif; | 701 | work->older_than_this = &oldest_jif; |
639 | oldest_jif = jiffies - | ||
640 | msecs_to_jiffies(dirty_expire_interval * 10); | ||
641 | } | ||
642 | if (!wbc.range_cyclic) { | ||
643 | wbc.range_start = 0; | ||
644 | wbc.range_end = LLONG_MAX; | ||
645 | } | ||
646 | 702 | ||
647 | /* | 703 | spin_lock(&wb->list_lock); |
648 | * WB_SYNC_ALL mode does livelock avoidance by syncing dirty | ||
649 | * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX | ||
650 | * here avoids calling into writeback_inodes_wb() more than once. | ||
651 | * | ||
652 | * The intended call sequence for WB_SYNC_ALL writeback is: | ||
653 | * | ||
654 | * wb_writeback() | ||
655 | * __writeback_inodes_sb() <== called only once | ||
656 | * write_cache_pages() <== called once for each inode | ||
657 | * (quickly) tag currently dirty pages | ||
658 | * (maybe slowly) sync all tagged pages | ||
659 | */ | ||
660 | if (wbc.sync_mode == WB_SYNC_NONE) | ||
661 | write_chunk = MAX_WRITEBACK_PAGES; | ||
662 | else | ||
663 | write_chunk = LONG_MAX; | ||
664 | |||
665 | wbc.wb_start = jiffies; /* livelock avoidance */ | ||
666 | for (;;) { | 704 | for (;;) { |
667 | /* | 705 | /* |
668 | * Stop writeback when nr_pages has been consumed | 706 | * Stop writeback when nr_pages has been consumed |
@@ -687,52 +725,54 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
687 | if (work->for_background && !over_bground_thresh()) | 725 | if (work->for_background && !over_bground_thresh()) |
688 | break; | 726 | break; |
689 | 727 | ||
690 | wbc.more_io = 0; | 728 | if (work->for_kupdate) { |
691 | wbc.nr_to_write = write_chunk; | 729 | oldest_jif = jiffies - |
692 | wbc.pages_skipped = 0; | 730 | msecs_to_jiffies(dirty_expire_interval * 10); |
731 | work->older_than_this = &oldest_jif; | ||
732 | } | ||
693 | 733 | ||
694 | trace_wbc_writeback_start(&wbc, wb->bdi); | 734 | trace_writeback_start(wb->bdi, work); |
735 | if (list_empty(&wb->b_io)) | ||
736 | queue_io(wb, work->older_than_this); | ||
695 | if (work->sb) | 737 | if (work->sb) |
696 | __writeback_inodes_sb(work->sb, wb, &wbc); | 738 | progress = writeback_sb_inodes(work->sb, wb, work); |
697 | else | 739 | else |
698 | writeback_inodes_wb(wb, &wbc); | 740 | progress = __writeback_inodes_wb(wb, work); |
699 | trace_wbc_writeback_written(&wbc, wb->bdi); | 741 | trace_writeback_written(wb->bdi, work); |
700 | 742 | ||
701 | work->nr_pages -= write_chunk - wbc.nr_to_write; | 743 | wb_update_bandwidth(wb, wb_start); |
702 | wrote += write_chunk - wbc.nr_to_write; | ||
703 | 744 | ||
704 | /* | 745 | /* |
705 | * If we consumed everything, see if we have more | 746 | * Did we write something? Try for more |
747 | * | ||
748 | * Dirty inodes are moved to b_io for writeback in batches. | ||
749 | * The completion of the current batch does not necessarily | ||
750 | * mean the overall work is done. So we keep looping as long | ||
751 | * as made some progress on cleaning pages or inodes. | ||
706 | */ | 752 | */ |
707 | if (wbc.nr_to_write <= 0) | 753 | if (progress) |
708 | continue; | 754 | continue; |
709 | /* | 755 | /* |
710 | * Didn't write everything and we don't have more IO, bail | 756 | * No more inodes for IO, bail |
711 | */ | 757 | */ |
712 | if (!wbc.more_io) | 758 | if (list_empty(&wb->b_more_io)) |
713 | break; | 759 | break; |
714 | /* | 760 | /* |
715 | * Did we write something? Try for more | ||
716 | */ | ||
717 | if (wbc.nr_to_write < write_chunk) | ||
718 | continue; | ||
719 | /* | ||
720 | * Nothing written. Wait for some inode to | 761 | * Nothing written. Wait for some inode to |
721 | * become available for writeback. Otherwise | 762 | * become available for writeback. Otherwise |
722 | * we'll just busyloop. | 763 | * we'll just busyloop. |
723 | */ | 764 | */ |
724 | spin_lock(&inode_wb_list_lock); | ||
725 | if (!list_empty(&wb->b_more_io)) { | 765 | if (!list_empty(&wb->b_more_io)) { |
766 | trace_writeback_wait(wb->bdi, work); | ||
726 | inode = wb_inode(wb->b_more_io.prev); | 767 | inode = wb_inode(wb->b_more_io.prev); |
727 | trace_wbc_writeback_wait(&wbc, wb->bdi); | ||
728 | spin_lock(&inode->i_lock); | 768 | spin_lock(&inode->i_lock); |
729 | inode_wait_for_writeback(inode); | 769 | inode_wait_for_writeback(inode, wb); |
730 | spin_unlock(&inode->i_lock); | 770 | spin_unlock(&inode->i_lock); |
731 | } | 771 | } |
732 | spin_unlock(&inode_wb_list_lock); | ||
733 | } | 772 | } |
773 | spin_unlock(&wb->list_lock); | ||
734 | 774 | ||
735 | return wrote; | 775 | return nr_pages - work->nr_pages; |
736 | } | 776 | } |
737 | 777 | ||
738 | /* | 778 | /* |
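Since more_io and the per-iteration nr_to_write bookkeeping are gone from struct writeback_control, the retry logic in wb_writeback() is now driven purely by the progress count returned from writeback_sb_inodes() / __writeback_inodes_wb() (pages written plus inodes cleaned). Condensed, the new loop shape is roughly (illustration of the control flow above, not a verbatim copy):

	spin_lock(&wb->list_lock);
	for (;;) {
		/* stop when nr_pages is consumed, other work is pending for a
		 * kupdate/background pass, or background writeback has dropped
		 * below the dirty threshold (checks elided here) */

		if (list_empty(&wb->b_io))
			queue_io(wb, work->older_than_this);
		progress = work->sb ? writeback_sb_inodes(work->sb, wb, work)
				    : __writeback_inodes_wb(wb, work);
		wb_update_bandwidth(wb, wb_start);

		if (progress)			/* cleaned pages or inodes: go again */
			continue;
		if (list_empty(&wb->b_more_io))	/* no more inodes for IO */
			break;
		inode = wb_inode(wb->b_more_io.prev);	/* wait, don't busyloop */
		spin_lock(&inode->i_lock);
		inode_wait_for_writeback(inode, wb);
		spin_unlock(&inode->i_lock);
	}
	spin_unlock(&wb->list_lock);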
@@ -1063,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) | |||
1063 | } | 1103 | } |
1064 | 1104 | ||
1065 | spin_unlock(&inode->i_lock); | 1105 | spin_unlock(&inode->i_lock); |
1066 | spin_lock(&inode_wb_list_lock); | 1106 | spin_lock(&bdi->wb.list_lock); |
1067 | inode->dirtied_when = jiffies; | 1107 | inode->dirtied_when = jiffies; |
1068 | list_move(&inode->i_wb_list, &bdi->wb.b_dirty); | 1108 | list_move(&inode->i_wb_list, &bdi->wb.b_dirty); |
1069 | spin_unlock(&inode_wb_list_lock); | 1109 | spin_unlock(&bdi->wb.list_lock); |
1070 | 1110 | ||
1071 | if (wakeup_bdi) | 1111 | if (wakeup_bdi) |
1072 | bdi_wakeup_thread_delayed(bdi); | 1112 | bdi_wakeup_thread_delayed(bdi); |
@@ -1162,10 +1202,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) | |||
1162 | { | 1202 | { |
1163 | DECLARE_COMPLETION_ONSTACK(done); | 1203 | DECLARE_COMPLETION_ONSTACK(done); |
1164 | struct wb_writeback_work work = { | 1204 | struct wb_writeback_work work = { |
1165 | .sb = sb, | 1205 | .sb = sb, |
1166 | .sync_mode = WB_SYNC_NONE, | 1206 | .sync_mode = WB_SYNC_NONE, |
1167 | .done = &done, | 1207 | .tagged_writepages = 1, |
1168 | .nr_pages = nr, | 1208 | .done = &done, |
1209 | .nr_pages = nr, | ||
1169 | }; | 1210 | }; |
1170 | 1211 | ||
1171 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 1212 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
@@ -1267,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb); | |||
1267 | */ | 1308 | */ |
1268 | int write_inode_now(struct inode *inode, int sync) | 1309 | int write_inode_now(struct inode *inode, int sync) |
1269 | { | 1310 | { |
1311 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | ||
1270 | int ret; | 1312 | int ret; |
1271 | struct writeback_control wbc = { | 1313 | struct writeback_control wbc = { |
1272 | .nr_to_write = LONG_MAX, | 1314 | .nr_to_write = LONG_MAX, |
@@ -1279,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync) | |||
1279 | wbc.nr_to_write = 0; | 1321 | wbc.nr_to_write = 0; |
1280 | 1322 | ||
1281 | might_sleep(); | 1323 | might_sleep(); |
1282 | spin_lock(&inode_wb_list_lock); | 1324 | spin_lock(&wb->list_lock); |
1283 | spin_lock(&inode->i_lock); | 1325 | spin_lock(&inode->i_lock); |
1284 | ret = writeback_single_inode(inode, &wbc); | 1326 | ret = writeback_single_inode(inode, wb, &wbc); |
1285 | spin_unlock(&inode->i_lock); | 1327 | spin_unlock(&inode->i_lock); |
1286 | spin_unlock(&inode_wb_list_lock); | 1328 | spin_unlock(&wb->list_lock); |
1287 | if (sync) | 1329 | if (sync) |
1288 | inode_sync_wait(inode); | 1330 | inode_sync_wait(inode); |
1289 | return ret; | 1331 | return ret; |
@@ -1303,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now); | |||
1303 | */ | 1345 | */ |
1304 | int sync_inode(struct inode *inode, struct writeback_control *wbc) | 1346 | int sync_inode(struct inode *inode, struct writeback_control *wbc) |
1305 | { | 1347 | { |
1348 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | ||
1306 | int ret; | 1349 | int ret; |
1307 | 1350 | ||
1308 | spin_lock(&inode_wb_list_lock); | 1351 | spin_lock(&wb->list_lock); |
1309 | spin_lock(&inode->i_lock); | 1352 | spin_lock(&inode->i_lock); |
1310 | ret = writeback_single_inode(inode, wbc); | 1353 | ret = writeback_single_inode(inode, wb, wbc); |
1311 | spin_unlock(&inode->i_lock); | 1354 | spin_unlock(&inode->i_lock); |
1312 | spin_unlock(&inode_wb_list_lock); | 1355 | spin_unlock(&wb->list_lock); |
1313 | return ret; | 1356 | return ret; |
1314 | } | 1357 | } |
1315 | EXPORT_SYMBOL(sync_inode); | 1358 | EXPORT_SYMBOL(sync_inode); |
diff --git a/fs/inode.c b/fs/inode.c index 96c77b81167c..a48fa5355fb4 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -37,7 +37,7 @@ | |||
37 | * inode->i_sb->s_inode_lru, inode->i_lru | 37 | * inode->i_sb->s_inode_lru, inode->i_lru |
38 | * inode_sb_list_lock protects: | 38 | * inode_sb_list_lock protects: |
39 | * sb->s_inodes, inode->i_sb_list | 39 | * sb->s_inodes, inode->i_sb_list |
40 | * inode_wb_list_lock protects: | 40 | * bdi->wb.list_lock protects: |
41 | * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list | 41 | * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list |
42 | * inode_hash_lock protects: | 42 | * inode_hash_lock protects: |
43 | * inode_hashtable, inode->i_hash | 43 | * inode_hashtable, inode->i_hash |
@@ -48,7 +48,7 @@ | |||
48 | * inode->i_lock | 48 | * inode->i_lock |
49 | * inode->i_sb->s_inode_lru_lock | 49 | * inode->i_sb->s_inode_lru_lock |
50 | * | 50 | * |
51 | * inode_wb_list_lock | 51 | * bdi->wb.list_lock |
52 | * inode->i_lock | 52 | * inode->i_lock |
53 | * | 53 | * |
54 | * inode_hash_lock | 54 | * inode_hash_lock |
@@ -65,7 +65,6 @@ static struct hlist_head *inode_hashtable __read_mostly; | |||
65 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); | 65 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); |
66 | 66 | ||
67 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); | 67 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); |
68 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); | ||
69 | 68 | ||
70 | /* | 69 | /* |
71 | * Empty aops. Can be used for the cases where the user does not | 70 | * Empty aops. Can be used for the cases where the user does not |
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 08579312c57b..00e37501fa3b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
@@ -1566,8 +1566,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
1566 | int status; | 1566 | int status; |
1567 | bool sync = true; | 1567 | bool sync = true; |
1568 | 1568 | ||
1569 | if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || | 1569 | if (wbc->sync_mode == WB_SYNC_NONE) |
1570 | wbc->for_background) | ||
1571 | sync = false; | 1570 | sync = false; |
1572 | 1571 | ||
1573 | status = pnfs_layoutcommit_inode(inode, sync); | 1572 | status = pnfs_layoutcommit_inode(inode, sync); |
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 96f4094b706d..a008982e7c08 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h | |||
@@ -40,6 +40,7 @@ typedef int (congested_fn)(void *, int); | |||
40 | enum bdi_stat_item { | 40 | enum bdi_stat_item { |
41 | BDI_RECLAIMABLE, | 41 | BDI_RECLAIMABLE, |
42 | BDI_WRITEBACK, | 42 | BDI_WRITEBACK, |
43 | BDI_WRITTEN, | ||
43 | NR_BDI_STAT_ITEMS | 44 | NR_BDI_STAT_ITEMS |
44 | }; | 45 | }; |
45 | 46 | ||
@@ -57,6 +58,7 @@ struct bdi_writeback { | |||
57 | struct list_head b_dirty; /* dirty inodes */ | 58 | struct list_head b_dirty; /* dirty inodes */ |
58 | struct list_head b_io; /* parked for writeback */ | 59 | struct list_head b_io; /* parked for writeback */ |
59 | struct list_head b_more_io; /* parked for more writeback */ | 60 | struct list_head b_more_io; /* parked for more writeback */ |
61 | spinlock_t list_lock; /* protects the b_* lists */ | ||
60 | }; | 62 | }; |
61 | 63 | ||
62 | struct backing_dev_info { | 64 | struct backing_dev_info { |
@@ -71,6 +73,11 @@ struct backing_dev_info { | |||
71 | 73 | ||
72 | struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; | 74 | struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; |
73 | 75 | ||
76 | unsigned long bw_time_stamp; /* last time write bw is updated */ | ||
77 | unsigned long written_stamp; /* pages written at bw_time_stamp */ | ||
78 | unsigned long write_bandwidth; /* the estimated write bandwidth */ | ||
79 | unsigned long avg_write_bandwidth; /* further smoothed write bw */ | ||
80 | |||
74 | struct prop_local_percpu completions; | 81 | struct prop_local_percpu completions; |
75 | int dirty_exceeded; | 82 | int dirty_exceeded; |
76 | 83 | ||
@@ -106,6 +113,7 @@ int bdi_writeback_thread(void *data); | |||
106 | int bdi_has_dirty_io(struct backing_dev_info *bdi); | 113 | int bdi_has_dirty_io(struct backing_dev_info *bdi); |
107 | void bdi_arm_supers_timer(void); | 114 | void bdi_arm_supers_timer(void); |
108 | void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); | 115 | void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); |
116 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); | ||
109 | 117 | ||
110 | extern spinlock_t bdi_lock; | 118 | extern spinlock_t bdi_lock; |
111 | extern struct list_head bdi_list; | 119 | extern struct list_head bdi_list; |
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 17e7ccc322a5..f1bfa12ea246 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -7,9 +7,39 @@ | |||
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | #include <linux/fs.h> | 8 | #include <linux/fs.h> |
9 | 9 | ||
10 | struct backing_dev_info; | 10 | /* |
11 | * The 1/4 region under the global dirty thresh is for smooth dirty throttling: | ||
12 | * | ||
13 | * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) | ||
14 | * | ||
15 | * The 1/16 region above the global dirty limit will be put to maximum pauses: | ||
16 | * | ||
17 | * (limit, limit + limit/DIRTY_MAXPAUSE_AREA) | ||
18 | * | ||
19 | * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put | ||
20 | * to loops: | ||
21 | * | ||
22 | * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA) | ||
23 | * | ||
24 | * Further beyond, all dirtier tasks will enter a loop waiting (possibly long | ||
25 | * time) for the dirty pages to drop, unless written enough pages. | ||
26 | * | ||
27 | * The global dirty threshold is normally equal to the global dirty limit, | ||
28 | * except when the system suddenly allocates a lot of anonymous memory and | ||
29 | * knocks down the global dirty threshold quickly, in which case the global | ||
30 | * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. | ||
31 | */ | ||
32 | #define DIRTY_SCOPE 8 | ||
33 | #define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) | ||
34 | #define DIRTY_MAXPAUSE_AREA 16 | ||
35 | #define DIRTY_PASSGOOD_AREA 8 | ||
11 | 36 | ||
12 | extern spinlock_t inode_wb_list_lock; | 37 | /* |
38 | * 4MB minimal write chunk size | ||
39 | */ | ||
40 | #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) | ||
41 | |||
42 | struct backing_dev_info; | ||
13 | 43 | ||
14 | /* | 44 | /* |
15 | * fs/fs-writeback.c | 45 | * fs/fs-writeback.c |
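To make the new constants concrete, here is the arithmetic for an assumed global dirty limit of 160000 pages (~625 MB with 4K pages), with thresh == limit in the steady state:

	/*
	 * Worked example (assumed numbers):
	 *   thresh = limit = 160000 pages
	 *
	 *   smooth-throttling scope: (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
	 *                          = (160000 - 160000/4, 160000)
	 *                          = (120000, 160000)
	 *   max-pause area:          (limit, limit + limit/DIRTY_MAXPAUSE_AREA)
	 *                          = (160000, 170000)
	 *   pass-good area:          (limit + limit/16, limit + limit/DIRTY_PASSGOOD_AREA)
	 *                          = (170000, 180000)
	 *
	 * MIN_WRITEBACK_PAGES with 4K pages (PAGE_CACHE_SHIFT == 12):
	 *   4096UL >> (12 - 10) = 1024 pages = 4 MB
	 */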
@@ -26,11 +56,6 @@ enum writeback_sync_modes { | |||
26 | */ | 56 | */ |
27 | struct writeback_control { | 57 | struct writeback_control { |
28 | enum writeback_sync_modes sync_mode; | 58 | enum writeback_sync_modes sync_mode; |
29 | unsigned long *older_than_this; /* If !NULL, only write back inodes | ||
30 | older than this */ | ||
31 | unsigned long wb_start; /* Time writeback_inodes_wb was | ||
32 | called. This is needed to avoid | ||
33 | extra jobs and livelock */ | ||
34 | long nr_to_write; /* Write this many pages, and decrement | 59 | long nr_to_write; /* Write this many pages, and decrement |
35 | this for each page written */ | 60 | this for each page written */ |
36 | long pages_skipped; /* Pages which were not written */ | 61 | long pages_skipped; /* Pages which were not written */ |
@@ -43,13 +68,11 @@ struct writeback_control { | |||
43 | loff_t range_start; | 68 | loff_t range_start; |
44 | loff_t range_end; | 69 | loff_t range_end; |
45 | 70 | ||
46 | unsigned nonblocking:1; /* Don't get stuck on request queues */ | ||
47 | unsigned encountered_congestion:1; /* An output: a queue is full */ | ||
48 | unsigned for_kupdate:1; /* A kupdate writeback */ | 71 | unsigned for_kupdate:1; /* A kupdate writeback */ |
49 | unsigned for_background:1; /* A background writeback */ | 72 | unsigned for_background:1; /* A background writeback */ |
73 | unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ | ||
50 | unsigned for_reclaim:1; /* Invoked from the page allocator */ | 74 | unsigned for_reclaim:1; /* Invoked from the page allocator */ |
51 | unsigned range_cyclic:1; /* range_start is cyclic */ | 75 | unsigned range_cyclic:1; /* range_start is cyclic */ |
52 | unsigned more_io:1; /* more io to be dispatched */ | ||
53 | }; | 76 | }; |
54 | 77 | ||
55 | /* | 78 | /* |
@@ -62,8 +85,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); | |||
62 | int writeback_inodes_sb_if_idle(struct super_block *); | 85 | int writeback_inodes_sb_if_idle(struct super_block *); |
63 | int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); | 86 | int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); |
64 | void sync_inodes_sb(struct super_block *); | 87 | void sync_inodes_sb(struct super_block *); |
65 | void writeback_inodes_wb(struct bdi_writeback *wb, | 88 | long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); |
66 | struct writeback_control *wbc); | ||
67 | long wb_do_writeback(struct bdi_writeback *wb, int force_wait); | 89 | long wb_do_writeback(struct bdi_writeback *wb, int force_wait); |
68 | void wakeup_flusher_threads(long nr_pages); | 90 | void wakeup_flusher_threads(long nr_pages); |
69 | 91 | ||
@@ -94,6 +116,8 @@ static inline void laptop_sync_completion(void) { } | |||
94 | #endif | 116 | #endif |
95 | void throttle_vm_writeout(gfp_t gfp_mask); | 117 | void throttle_vm_writeout(gfp_t gfp_mask); |
96 | 118 | ||
119 | extern unsigned long global_dirty_limit; | ||
120 | |||
97 | /* These are exported to sysctl. */ | 121 | /* These are exported to sysctl. */ |
98 | extern int dirty_background_ratio; | 122 | extern int dirty_background_ratio; |
99 | extern unsigned long dirty_background_bytes; | 123 | extern unsigned long dirty_background_bytes; |
@@ -128,6 +152,13 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); | |||
128 | unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, | 152 | unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, |
129 | unsigned long dirty); | 153 | unsigned long dirty); |
130 | 154 | ||
155 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
156 | unsigned long thresh, | ||
157 | unsigned long dirty, | ||
158 | unsigned long bdi_thresh, | ||
159 | unsigned long bdi_dirty, | ||
160 | unsigned long start_time); | ||
161 | |||
131 | void page_writeback_init(void); | 162 | void page_writeback_init(void); |
132 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 163 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
133 | unsigned long nr_pages_dirtied); | 164 | unsigned long nr_pages_dirtied); |
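__bdi_update_bandwidth() itself lives in mm/page-writeback.c (listed in the diffstat but outside this section); the new bdi fields above suggest its shape: periodically sample the BDI_WRITTEN counter and turn the delta into a smoothed pages-per-second estimate that writeback_chunk_size() can consume. A simplified sketch under those assumptions; the sampling interval and smoothing weights here are guesses, not the patch's actual values:

	static void example_update_bandwidth(struct backing_dev_info *bdi,
					     unsigned long now)
	{
		unsigned long elapsed = now - bdi->bw_time_stamp;
		unsigned long written, bw;

		if (elapsed < HZ / 5)		/* assumed ~200ms sampling period */
			return;

		written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
		/* pages completed since the last sample, scaled to pages/second */
		bw = (written - bdi->written_stamp) * HZ / elapsed;

		/* keep most of the previous estimate, fold in the new sample */
		bdi->write_bandwidth = (7 * bdi->write_bandwidth + bw) / 8;
		bdi->avg_write_bandwidth = (7 * bdi->avg_write_bandwidth +
					    bdi->write_bandwidth) / 8;

		bdi->written_stamp = written;
		bdi->bw_time_stamp = now;
	}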
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 4114129f0794..b31702ac15be 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h | |||
@@ -284,7 +284,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage, | |||
284 | __field( long, pages_skipped ) | 284 | __field( long, pages_skipped ) |
285 | __field( loff_t, range_start ) | 285 | __field( loff_t, range_start ) |
286 | __field( loff_t, range_end ) | 286 | __field( loff_t, range_end ) |
287 | __field( char, nonblocking ) | ||
288 | __field( char, for_kupdate ) | 287 | __field( char, for_kupdate ) |
289 | __field( char, for_reclaim ) | 288 | __field( char, for_reclaim ) |
290 | __field( char, range_cyclic ) | 289 | __field( char, range_cyclic ) |
@@ -299,7 +298,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage, | |||
299 | __entry->pages_skipped = wbc->pages_skipped; | 298 | __entry->pages_skipped = wbc->pages_skipped; |
300 | __entry->range_start = wbc->range_start; | 299 | __entry->range_start = wbc->range_start; |
301 | __entry->range_end = wbc->range_end; | 300 | __entry->range_end = wbc->range_end; |
302 | __entry->nonblocking = wbc->nonblocking; | ||
303 | __entry->for_kupdate = wbc->for_kupdate; | 301 | __entry->for_kupdate = wbc->for_kupdate; |
304 | __entry->for_reclaim = wbc->for_reclaim; | 302 | __entry->for_reclaim = wbc->for_reclaim; |
305 | __entry->range_cyclic = wbc->range_cyclic; | 303 | __entry->range_cyclic = wbc->range_cyclic; |
@@ -310,13 +308,13 @@ DECLARE_EVENT_CLASS(btrfs__writepage, | |||
310 | 308 | ||
311 | TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " | 309 | TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " |
312 | "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " | 310 | "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " |
313 | "range_end = %llu, nonblocking = %d, for_kupdate = %d, " | 311 | "range_end = %llu, for_kupdate = %d, " |
314 | "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", | 312 | "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", |
315 | show_root_type(__entry->root_objectid), | 313 | show_root_type(__entry->root_objectid), |
316 | (unsigned long)__entry->ino, __entry->index, | 314 | (unsigned long)__entry->ino, __entry->index, |
317 | __entry->nr_to_write, __entry->pages_skipped, | 315 | __entry->nr_to_write, __entry->pages_skipped, |
318 | __entry->range_start, __entry->range_end, | 316 | __entry->range_start, __entry->range_end, |
319 | __entry->nonblocking, __entry->for_kupdate, | 317 | __entry->for_kupdate, |
320 | __entry->for_reclaim, __entry->range_cyclic, | 318 | __entry->for_reclaim, __entry->range_cyclic, |
321 | (unsigned long)__entry->writeback_index) | 319 | (unsigned long)__entry->writeback_index) |
322 | ); | 320 | ); |
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 5ce2b2f5f524..6363193a3418 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h | |||
@@ -380,7 +380,6 @@ TRACE_EVENT(ext4_da_writepages_result, | |||
380 | __field( int, pages_written ) | 380 | __field( int, pages_written ) |
381 | __field( long, pages_skipped ) | 381 | __field( long, pages_skipped ) |
382 | __field( int, sync_mode ) | 382 | __field( int, sync_mode ) |
383 | __field( char, more_io ) | ||
384 | __field( pgoff_t, writeback_index ) | 383 | __field( pgoff_t, writeback_index ) |
385 | ), | 384 | ), |
386 | 385 | ||
@@ -391,16 +390,15 @@ TRACE_EVENT(ext4_da_writepages_result, | |||
391 | __entry->pages_written = pages_written; | 390 | __entry->pages_written = pages_written; |
392 | __entry->pages_skipped = wbc->pages_skipped; | 391 | __entry->pages_skipped = wbc->pages_skipped; |
393 | __entry->sync_mode = wbc->sync_mode; | 392 | __entry->sync_mode = wbc->sync_mode; |
394 | __entry->more_io = wbc->more_io; | ||
395 | __entry->writeback_index = inode->i_mapping->writeback_index; | 393 | __entry->writeback_index = inode->i_mapping->writeback_index; |
396 | ), | 394 | ), |
397 | 395 | ||
398 | TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " | 396 | TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " |
399 | " more_io %d sync_mode %d writeback_index %lu", | 397 | "sync_mode %d writeback_index %lu", |
400 | MAJOR(__entry->dev), MINOR(__entry->dev), | 398 | MAJOR(__entry->dev), MINOR(__entry->dev), |
401 | (unsigned long) __entry->ino, __entry->ret, | 399 | (unsigned long) __entry->ino, __entry->ret, |
402 | __entry->pages_written, __entry->pages_skipped, | 400 | __entry->pages_written, __entry->pages_skipped, |
403 | __entry->more_io, __entry->sync_mode, | 401 | __entry->sync_mode, |
404 | (unsigned long) __entry->writeback_index) | 402 | (unsigned long) __entry->writeback_index) |
405 | ); | 403 | ); |
406 | 404 | ||
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 4e249b927eaa..6bca4cc0063c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h | |||
@@ -8,6 +8,19 @@ | |||
8 | #include <linux/device.h> | 8 | #include <linux/device.h> |
9 | #include <linux/writeback.h> | 9 | #include <linux/writeback.h> |
10 | 10 | ||
11 | #define show_inode_state(state) \ | ||
12 | __print_flags(state, "|", \ | ||
13 | {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ | ||
14 | {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ | ||
15 | {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ | ||
16 | {I_NEW, "I_NEW"}, \ | ||
17 | {I_WILL_FREE, "I_WILL_FREE"}, \ | ||
18 | {I_FREEING, "I_FREEING"}, \ | ||
19 | {I_CLEAR, "I_CLEAR"}, \ | ||
20 | {I_SYNC, "I_SYNC"}, \ | ||
21 | {I_REFERENCED, "I_REFERENCED"} \ | ||
22 | ) | ||
23 | |||
11 | struct wb_writeback_work; | 24 | struct wb_writeback_work; |
12 | 25 | ||
13 | DECLARE_EVENT_CLASS(writeback_work_class, | 26 | DECLARE_EVENT_CLASS(writeback_work_class, |
@@ -49,6 +62,9 @@ DEFINE_EVENT(writeback_work_class, name, \ | |||
49 | DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); | 62 | DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); |
50 | DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); | 63 | DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); |
51 | DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); | 64 | DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); |
65 | DEFINE_WRITEBACK_WORK_EVENT(writeback_start); | ||
66 | DEFINE_WRITEBACK_WORK_EVENT(writeback_written); | ||
67 | DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); | ||
52 | 68 | ||
53 | TRACE_EVENT(writeback_pages_written, | 69 | TRACE_EVENT(writeback_pages_written, |
54 | TP_PROTO(long pages_written), | 70 | TP_PROTO(long pages_written), |
@@ -88,6 +104,30 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register); | |||
88 | DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); | 104 | DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); |
89 | DEFINE_WRITEBACK_EVENT(writeback_thread_start); | 105 | DEFINE_WRITEBACK_EVENT(writeback_thread_start); |
90 | DEFINE_WRITEBACK_EVENT(writeback_thread_stop); | 106 | DEFINE_WRITEBACK_EVENT(writeback_thread_stop); |
107 | DEFINE_WRITEBACK_EVENT(balance_dirty_start); | ||
108 | DEFINE_WRITEBACK_EVENT(balance_dirty_wait); | ||
109 | |||
110 | TRACE_EVENT(balance_dirty_written, | ||
111 | |||
112 | TP_PROTO(struct backing_dev_info *bdi, int written), | ||
113 | |||
114 | TP_ARGS(bdi, written), | ||
115 | |||
116 | TP_STRUCT__entry( | ||
117 | __array(char, name, 32) | ||
118 | __field(int, written) | ||
119 | ), | ||
120 | |||
121 | TP_fast_assign( | ||
122 | strncpy(__entry->name, dev_name(bdi->dev), 32); | ||
123 | __entry->written = written; | ||
124 | ), | ||
125 | |||
126 | TP_printk("bdi %s written %d", | ||
127 | __entry->name, | ||
128 | __entry->written | ||
129 | ) | ||
130 | ); | ||
91 | 131 | ||
92 | DECLARE_EVENT_CLASS(wbc_class, | 132 | DECLARE_EVENT_CLASS(wbc_class, |
93 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), | 133 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), |
@@ -101,8 +141,6 @@ DECLARE_EVENT_CLASS(wbc_class, | |||
101 | __field(int, for_background) | 141 | __field(int, for_background) |
102 | __field(int, for_reclaim) | 142 | __field(int, for_reclaim) |
103 | __field(int, range_cyclic) | 143 | __field(int, range_cyclic) |
104 | __field(int, more_io) | ||
105 | __field(unsigned long, older_than_this) | ||
106 | __field(long, range_start) | 144 | __field(long, range_start) |
107 | __field(long, range_end) | 145 | __field(long, range_end) |
108 | ), | 146 | ), |
@@ -116,15 +154,12 @@ DECLARE_EVENT_CLASS(wbc_class, | |||
116 | __entry->for_background = wbc->for_background; | 154 | __entry->for_background = wbc->for_background; |
117 | __entry->for_reclaim = wbc->for_reclaim; | 155 | __entry->for_reclaim = wbc->for_reclaim; |
118 | __entry->range_cyclic = wbc->range_cyclic; | 156 | __entry->range_cyclic = wbc->range_cyclic; |
119 | __entry->more_io = wbc->more_io; | ||
120 | __entry->older_than_this = wbc->older_than_this ? | ||
121 | *wbc->older_than_this : 0; | ||
122 | __entry->range_start = (long)wbc->range_start; | 157 | __entry->range_start = (long)wbc->range_start; |
123 | __entry->range_end = (long)wbc->range_end; | 158 | __entry->range_end = (long)wbc->range_end; |
124 | ), | 159 | ), |
125 | 160 | ||
126 | TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " | 161 | TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " |
127 | "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " | 162 | "bgrd=%d reclm=%d cyclic=%d " |
128 | "start=0x%lx end=0x%lx", | 163 | "start=0x%lx end=0x%lx", |
129 | __entry->name, | 164 | __entry->name, |
130 | __entry->nr_to_write, | 165 | __entry->nr_to_write, |
@@ -134,8 +169,6 @@ DECLARE_EVENT_CLASS(wbc_class, | |||
134 | __entry->for_background, | 169 | __entry->for_background, |
135 | __entry->for_reclaim, | 170 | __entry->for_reclaim, |
136 | __entry->range_cyclic, | 171 | __entry->range_cyclic, |
137 | __entry->more_io, | ||
138 | __entry->older_than_this, | ||
139 | __entry->range_start, | 172 | __entry->range_start, |
140 | __entry->range_end) | 173 | __entry->range_end) |
141 | ) | 174 | ) |
@@ -144,14 +177,79 @@ DECLARE_EVENT_CLASS(wbc_class, | |||
144 | DEFINE_EVENT(wbc_class, name, \ | 177 | DEFINE_EVENT(wbc_class, name, \ |
145 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ | 178 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ |
146 | TP_ARGS(wbc, bdi)) | 179 | TP_ARGS(wbc, bdi)) |
147 | DEFINE_WBC_EVENT(wbc_writeback_start); | ||
148 | DEFINE_WBC_EVENT(wbc_writeback_written); | ||
149 | DEFINE_WBC_EVENT(wbc_writeback_wait); | ||
150 | DEFINE_WBC_EVENT(wbc_balance_dirty_start); | ||
151 | DEFINE_WBC_EVENT(wbc_balance_dirty_written); | ||
152 | DEFINE_WBC_EVENT(wbc_balance_dirty_wait); | ||
153 | DEFINE_WBC_EVENT(wbc_writepage); | 180 | DEFINE_WBC_EVENT(wbc_writepage); |
154 | 181 | ||
182 | TRACE_EVENT(writeback_queue_io, | ||
183 | TP_PROTO(struct bdi_writeback *wb, | ||
184 | unsigned long *older_than_this, | ||
185 | int moved), | ||
186 | TP_ARGS(wb, older_than_this, moved), | ||
187 | TP_STRUCT__entry( | ||
188 | __array(char, name, 32) | ||
189 | __field(unsigned long, older) | ||
190 | __field(long, age) | ||
191 | __field(int, moved) | ||
192 | ), | ||
193 | TP_fast_assign( | ||
194 | strncpy(__entry->name, dev_name(wb->bdi->dev), 32); | ||
195 | __entry->older = older_than_this ? *older_than_this : 0; | ||
196 | __entry->age = older_than_this ? | ||
197 | (jiffies - *older_than_this) * 1000 / HZ : -1; | ||
198 | __entry->moved = moved; | ||
199 | ), | ||
200 | TP_printk("bdi %s: older=%lu age=%ld enqueue=%d", | ||
201 | __entry->name, | ||
202 | __entry->older, /* older_than_this in jiffies */ | ||
203 | __entry->age, /* older_than_this in relative milliseconds */ | ||
204 | __entry->moved) | ||
205 | ); | ||
206 | |||
207 | TRACE_EVENT(global_dirty_state, | ||
208 | |||
209 | TP_PROTO(unsigned long background_thresh, | ||
210 | unsigned long dirty_thresh | ||
211 | ), | ||
212 | |||
213 | TP_ARGS(background_thresh, | ||
214 | dirty_thresh | ||
215 | ), | ||
216 | |||
217 | TP_STRUCT__entry( | ||
218 | __field(unsigned long, nr_dirty) | ||
219 | __field(unsigned long, nr_writeback) | ||
220 | __field(unsigned long, nr_unstable) | ||
221 | __field(unsigned long, background_thresh) | ||
222 | __field(unsigned long, dirty_thresh) | ||
223 | __field(unsigned long, dirty_limit) | ||
224 | __field(unsigned long, nr_dirtied) | ||
225 | __field(unsigned long, nr_written) | ||
226 | ), | ||
227 | |||
228 | TP_fast_assign( | ||
229 | __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); | ||
230 | __entry->nr_writeback = global_page_state(NR_WRITEBACK); | ||
231 | __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); | ||
232 | __entry->nr_dirtied = global_page_state(NR_DIRTIED); | ||
233 | __entry->nr_written = global_page_state(NR_WRITTEN); | ||
234 | __entry->background_thresh = background_thresh; | ||
235 | __entry->dirty_thresh = dirty_thresh; | ||
236 | __entry->dirty_limit = global_dirty_limit; | ||
237 | ), | ||
238 | |||
239 | TP_printk("dirty=%lu writeback=%lu unstable=%lu " | ||
240 | "bg_thresh=%lu thresh=%lu limit=%lu " | ||
241 | "dirtied=%lu written=%lu", | ||
242 | __entry->nr_dirty, | ||
243 | __entry->nr_writeback, | ||
244 | __entry->nr_unstable, | ||
245 | __entry->background_thresh, | ||
246 | __entry->dirty_thresh, | ||
247 | __entry->dirty_limit, | ||
248 | __entry->nr_dirtied, | ||
249 | __entry->nr_written | ||
250 | ) | ||
251 | ); | ||
252 | |||
155 | DECLARE_EVENT_CLASS(writeback_congest_waited_template, | 253 | DECLARE_EVENT_CLASS(writeback_congest_waited_template, |
156 | 254 | ||
157 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), | 255 | TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), |
@@ -187,6 +285,63 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, | |||
187 | TP_ARGS(usec_timeout, usec_delayed) | 285 | TP_ARGS(usec_timeout, usec_delayed) |
188 | ); | 286 | ); |
189 | 287 | ||
288 | DECLARE_EVENT_CLASS(writeback_single_inode_template, | ||
289 | |||
290 | TP_PROTO(struct inode *inode, | ||
291 | struct writeback_control *wbc, | ||
292 | unsigned long nr_to_write | ||
293 | ), | ||
294 | |||
295 | TP_ARGS(inode, wbc, nr_to_write), | ||
296 | |||
297 | TP_STRUCT__entry( | ||
298 | __array(char, name, 32) | ||
299 | __field(unsigned long, ino) | ||
300 | __field(unsigned long, state) | ||
301 | __field(unsigned long, age) | ||
302 | __field(unsigned long, writeback_index) | ||
303 | __field(long, nr_to_write) | ||
304 | __field(unsigned long, wrote) | ||
305 | ), | ||
306 | |||
307 | TP_fast_assign( | ||
308 | strncpy(__entry->name, | ||
309 | dev_name(inode->i_mapping->backing_dev_info->dev), 32); | ||
310 | __entry->ino = inode->i_ino; | ||
311 | __entry->state = inode->i_state; | ||
312 | __entry->age = (jiffies - inode->dirtied_when) * | ||
313 | 1000 / HZ; | ||
314 | __entry->writeback_index = inode->i_mapping->writeback_index; | ||
315 | __entry->nr_to_write = nr_to_write; | ||
316 | __entry->wrote = nr_to_write - wbc->nr_to_write; | ||
317 | ), | ||
318 | |||
319 | TP_printk("bdi %s: ino=%lu state=%s age=%lu " | ||
320 | "index=%lu to_write=%ld wrote=%lu", | ||
321 | __entry->name, | ||
322 | __entry->ino, | ||
323 | show_inode_state(__entry->state), | ||
324 | __entry->age, | ||
325 | __entry->writeback_index, | ||
326 | __entry->nr_to_write, | ||
327 | __entry->wrote | ||
328 | ) | ||
329 | ); | ||
330 | |||
331 | DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue, | ||
332 | TP_PROTO(struct inode *inode, | ||
333 | struct writeback_control *wbc, | ||
334 | unsigned long nr_to_write), | ||
335 | TP_ARGS(inode, wbc, nr_to_write) | ||
336 | ); | ||
337 | |||
338 | DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, | ||
339 | TP_PROTO(struct inode *inode, | ||
340 | struct writeback_control *wbc, | ||
341 | unsigned long nr_to_write), | ||
342 | TP_ARGS(inode, wbc, nr_to_write) | ||
343 | ); | ||
344 | |||
190 | #endif /* _TRACE_WRITEBACK_H */ | 345 | #endif /* _TRACE_WRITEBACK_H */ |
191 | 346 | ||
192 | /* This part must be outside protection */ | 347 | /* This part must be outside protection */ |
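The writeback_queue_io event added above reports how many expired inodes were moved from b_dirty to b_io in one batch. Its call site is in fs/fs-writeback.c, which is part of this series but not shown in this hunk; a minimal sketch of such a call site (the queue_io()/move_expired_inodes() names and signatures here are assumptions for illustration only):

    /* Sketch only -- the real implementation lives in fs/fs-writeback.c. */
    static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
    {
            int moved;

            assert_spin_locked(&wb->list_lock);
            list_splice_init(&wb->b_more_io, &wb->b_io);
            moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
            trace_writeback_queue_io(wb, older_than_this, moved);
    }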
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8290b1e88257..d6edf8d14f9c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; | |||
45 | static int bdi_sync_supers(void *); | 45 | static int bdi_sync_supers(void *); |
46 | static void sync_supers_timer_fn(unsigned long); | 46 | static void sync_supers_timer_fn(unsigned long); |
47 | 47 | ||
48 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
49 | { | ||
50 | if (wb1 < wb2) { | ||
51 | spin_lock(&wb1->list_lock); | ||
52 | spin_lock_nested(&wb2->list_lock, 1); | ||
53 | } else { | ||
54 | spin_lock(&wb2->list_lock); | ||
55 | spin_lock_nested(&wb1->list_lock, 1); | ||
56 | } | ||
57 | } | ||
58 | |||
48 | #ifdef CONFIG_DEBUG_FS | 59 | #ifdef CONFIG_DEBUG_FS |
49 | #include <linux/debugfs.h> | 60 | #include <linux/debugfs.h> |
50 | #include <linux/seq_file.h> | 61 | #include <linux/seq_file.h> |
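bdi_lock_two() always takes the two list_locks in ascending address order (using spin_lock_nested() for the inner lock to keep lockdep quiet), so any pair of callers nests them consistently and an ABBA deadlock cannot occur. A hypothetical caller that moves a dirty inode between two writeback lists could look like this (the helper itself is illustrative and not part of the patch):

    static void move_dirty_inode(struct inode *inode,
                                 struct bdi_writeback *old,
                                 struct bdi_writeback *new)
    {
            bdi_lock_two(old, new);         /* lower-addressed lock taken first */
            spin_lock(&inode->i_lock);
            if (inode->i_state & I_DIRTY)
                    list_move(&inode->i_wb_list, &new->b_dirty);
            spin_unlock(&inode->i_lock);
            spin_unlock(&old->list_lock);
            spin_unlock(&new->list_lock);
    }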
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
67 | struct inode *inode; | 78 | struct inode *inode; |
68 | 79 | ||
69 | nr_dirty = nr_io = nr_more_io = 0; | 80 | nr_dirty = nr_io = nr_more_io = 0; |
70 | spin_lock(&inode_wb_list_lock); | 81 | spin_lock(&wb->list_lock); |
71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 82 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | nr_dirty++; | 83 | nr_dirty++; |
73 | list_for_each_entry(inode, &wb->b_io, i_wb_list) | 84 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
74 | nr_io++; | 85 | nr_io++; |
75 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 86 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
76 | nr_more_io++; | 87 | nr_more_io++; |
77 | spin_unlock(&inode_wb_list_lock); | 88 | spin_unlock(&wb->list_lock); |
78 | 89 | ||
79 | global_dirty_limits(&background_thresh, &dirty_thresh); | 90 | global_dirty_limits(&background_thresh, &dirty_thresh); |
80 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 91 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
81 | 92 | ||
82 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 93 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
83 | seq_printf(m, | 94 | seq_printf(m, |
84 | "BdiWriteback: %8lu kB\n" | 95 | "BdiWriteback: %10lu kB\n" |
85 | "BdiReclaimable: %8lu kB\n" | 96 | "BdiReclaimable: %10lu kB\n" |
86 | "BdiDirtyThresh: %8lu kB\n" | 97 | "BdiDirtyThresh: %10lu kB\n" |
87 | "DirtyThresh: %8lu kB\n" | 98 | "DirtyThresh: %10lu kB\n" |
88 | "BackgroundThresh: %8lu kB\n" | 99 | "BackgroundThresh: %10lu kB\n" |
89 | "b_dirty: %8lu\n" | 100 | "BdiWritten: %10lu kB\n" |
90 | "b_io: %8lu\n" | 101 | "BdiWriteBandwidth: %10lu kBps\n" |
91 | "b_more_io: %8lu\n" | 102 | "b_dirty: %10lu\n" |
92 | "bdi_list: %8u\n" | 103 | "b_io: %10lu\n" |
93 | "state: %8lx\n", | 104 | "b_more_io: %10lu\n" |
105 | "bdi_list: %10u\n" | ||
106 | "state: %10lx\n", | ||
94 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 107 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
95 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), | 108 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), |
96 | K(bdi_thresh), K(dirty_thresh), | 109 | K(bdi_thresh), |
97 | K(background_thresh), nr_dirty, nr_io, nr_more_io, | 110 | K(dirty_thresh), |
111 | K(background_thresh), | ||
112 | (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), | ||
113 | (unsigned long) K(bdi->write_bandwidth), | ||
114 | nr_dirty, | ||
115 | nr_io, | ||
116 | nr_more_io, | ||
98 | !list_empty(&bdi->bdi_list), bdi->state); | 117 | !list_empty(&bdi->bdi_list), bdi->state); |
99 | #undef K | 118 | #undef K |
100 | 119 | ||
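With the two new fields, reading a bdi's debugfs stats file (conventionally /sys/kernel/debug/bdi/<dev>/stats; every number below is made up purely to illustrate the widened %10lu columns) would produce output along these lines:

    BdiWriteback:             768 kB
    BdiReclaimable:         12288 kB
    BdiDirtyThresh:        102400 kB
    DirtyThresh:           204800 kB
    BackgroundThresh:      102400 kB
    BdiWritten:           4194304 kB
    BdiWriteBandwidth:     102400 kBps
    b_dirty:                   23
    b_io:                       4
    b_more_io:                  0
    bdi_list:                   1
    state:                      8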
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) | |||
249 | return wb_has_dirty_io(&bdi->wb); | 268 | return wb_has_dirty_io(&bdi->wb); |
250 | } | 269 | } |
251 | 270 | ||
252 | static void bdi_flush_io(struct backing_dev_info *bdi) | ||
253 | { | ||
254 | struct writeback_control wbc = { | ||
255 | .sync_mode = WB_SYNC_NONE, | ||
256 | .older_than_this = NULL, | ||
257 | .range_cyclic = 1, | ||
258 | .nr_to_write = 1024, | ||
259 | }; | ||
260 | |||
261 | writeback_inodes_wb(&bdi->wb, &wbc); | ||
262 | } | ||
263 | |||
264 | /* | 271 | /* |
265 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() | 272 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() |
266 | * or we risk deadlocking on ->s_umount. The longer term solution would be | 273 | * or we risk deadlocking on ->s_umount. The longer term solution would be |
@@ -446,9 +453,10 @@ static int bdi_forker_thread(void *ptr) | |||
446 | if (IS_ERR(task)) { | 453 | if (IS_ERR(task)) { |
447 | /* | 454 | /* |
448 | * If thread creation fails, force writeout of | 455 | * If thread creation fails, force writeout of |
449 | * the bdi from the thread. | 456 | * the bdi from the thread. Hopefully 1024 is |
457 | * large enough for efficient IO. | ||
450 | */ | 458 | */ |
451 | bdi_flush_io(bdi); | 459 | writeback_inodes_wb(&bdi->wb, 1024); |
452 | } else { | 460 | } else { |
453 | /* | 461 | /* |
454 | * The spinlock makes sure we do not lose | 462 | * The spinlock makes sure we do not lose |
@@ -629,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | |||
629 | INIT_LIST_HEAD(&wb->b_dirty); | 637 | INIT_LIST_HEAD(&wb->b_dirty); |
630 | INIT_LIST_HEAD(&wb->b_io); | 638 | INIT_LIST_HEAD(&wb->b_io); |
631 | INIT_LIST_HEAD(&wb->b_more_io); | 639 | INIT_LIST_HEAD(&wb->b_more_io); |
640 | spin_lock_init(&wb->list_lock); | ||
632 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); | 641 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); |
633 | } | 642 | } |
634 | 643 | ||
644 | /* | ||
645 | * Initial write bandwidth: 100 MB/s | ||
646 | */ | ||
647 | #define INIT_BW (100 << (20 - PAGE_SHIFT)) | ||
648 | |||
635 | int bdi_init(struct backing_dev_info *bdi) | 649 | int bdi_init(struct backing_dev_info *bdi) |
636 | { | 650 | { |
637 | int i, err; | 651 | int i, err; |
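INIT_BW is expressed in pages per second so it can seed bdi->write_bandwidth directly. A quick worked example, assuming the common 4 KiB page size (PAGE_SHIFT == 12):

    100 << (20 - PAGE_SHIFT) = 100 << 8 = 25600 pages/s
    25600 pages/s * 4 KiB/page = 102400 KiB/s = 100 MiB/s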
@@ -654,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi) | |||
654 | } | 668 | } |
655 | 669 | ||
656 | bdi->dirty_exceeded = 0; | 670 | bdi->dirty_exceeded = 0; |
671 | |||
672 | bdi->bw_time_stamp = jiffies; | ||
673 | bdi->written_stamp = 0; | ||
674 | |||
675 | bdi->write_bandwidth = INIT_BW; | ||
676 | bdi->avg_write_bandwidth = INIT_BW; | ||
677 | |||
657 | err = prop_local_init_percpu(&bdi->completions); | 678 | err = prop_local_init_percpu(&bdi->completions); |
658 | 679 | ||
659 | if (err) { | 680 | if (err) { |
@@ -677,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
677 | if (bdi_has_dirty_io(bdi)) { | 698 | if (bdi_has_dirty_io(bdi)) { |
678 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | 699 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
679 | 700 | ||
680 | spin_lock(&inode_wb_list_lock); | 701 | bdi_lock_two(&bdi->wb, dst); |
681 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | 702 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
682 | list_splice(&bdi->wb.b_io, &dst->b_io); | 703 | list_splice(&bdi->wb.b_io, &dst->b_io); |
683 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | 704 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
684 | spin_unlock(&inode_wb_list_lock); | 705 | spin_unlock(&bdi->wb.list_lock); |
706 | spin_unlock(&dst->list_lock); | ||
685 | } | 707 | } |
686 | 708 | ||
687 | bdi_unregister(bdi); | 709 | bdi_unregister(bdi); |
diff --git a/mm/filemap.c b/mm/filemap.c index 10a171113273..867d40222ec7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -78,7 +78,7 @@ | |||
78 | * ->i_mutex (generic_file_buffered_write) | 78 | * ->i_mutex (generic_file_buffered_write) |
79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) | 79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
80 | * | 80 | * |
81 | * inode_wb_list_lock | 81 | * bdi->wb.list_lock |
82 | * sb_lock (fs/fs-writeback.c) | 82 | * sb_lock (fs/fs-writeback.c) |
83 | * ->mapping->tree_lock (__sync_single_inode) | 83 | * ->mapping->tree_lock (__sync_single_inode) |
84 | * | 84 | * |
@@ -96,9 +96,9 @@ | |||
96 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) | 96 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) |
97 | * ->private_lock (page_remove_rmap->set_page_dirty) | 97 | * ->private_lock (page_remove_rmap->set_page_dirty) |
98 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 98 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
99 | * inode_wb_list_lock (page_remove_rmap->set_page_dirty) | 99 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
100 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) | 100 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
101 | * inode_wb_list_lock (zap_pte_range->set_page_dirty) | 101 | * bdi.wb->list_lock (zap_pte_range->set_page_dirty) |
102 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 102 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
104 | * | 104 | * |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d8767b381b9c..d1960744f881 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -37,6 +37,16 @@ | |||
37 | #include <trace/events/writeback.h> | 37 | #include <trace/events/writeback.h> |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * Sleep at most 200ms at a time in balance_dirty_pages(). | ||
41 | */ | ||
42 | #define MAX_PAUSE max(HZ/5, 1) | ||
43 | |||
44 | /* | ||
45 | * Estimate write bandwidth at 200ms intervals. | ||
46 | */ | ||
47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) | ||
48 | |||
49 | /* | ||
40 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 50 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
41 | * will look to see if it needs to force writeback or throttling. | 51 | * will look to see if it needs to force writeback or throttling. |
42 | */ | 52 | */ |
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode); | |||
111 | 121 | ||
112 | /* End of sysctl-exported parameters */ | 122 | /* End of sysctl-exported parameters */ |
113 | 123 | ||
124 | unsigned long global_dirty_limit; | ||
114 | 125 | ||
115 | /* | 126 | /* |
116 | * Scale the writeback cache size proportional to the relative writeout speeds. | 127 | * Scale the writeback cache size proportional to the relative writeout speeds. |
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
219 | */ | 230 | */ |
220 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | 231 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
221 | { | 232 | { |
233 | __inc_bdi_stat(bdi, BDI_WRITTEN); | ||
222 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, | 234 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, |
223 | bdi->max_prop_frac); | 235 | bdi->max_prop_frac); |
224 | } | 236 | } |
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk) | |||
244 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | 256 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
245 | long *numerator, long *denominator) | 257 | long *numerator, long *denominator) |
246 | { | 258 | { |
247 | if (bdi_cap_writeback_dirty(bdi)) { | 259 | prop_fraction_percpu(&vm_completions, &bdi->completions, |
248 | prop_fraction_percpu(&vm_completions, &bdi->completions, | ||
249 | numerator, denominator); | 260 | numerator, denominator); |
250 | } else { | ||
251 | *numerator = 0; | ||
252 | *denominator = 1; | ||
253 | } | ||
254 | } | 261 | } |
255 | 262 | ||
256 | static inline void task_dirties_fraction(struct task_struct *tsk, | 263 | static inline void task_dirties_fraction(struct task_struct *tsk, |
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
274 | * effectively curb the growth of dirty pages. Light dirtiers with high enough | 281 | * effectively curb the growth of dirty pages. Light dirtiers with high enough |
275 | * dirty threshold may never get throttled. | 282 | * dirty threshold may never get throttled. |
276 | */ | 283 | */ |
284 | #define TASK_LIMIT_FRACTION 8 | ||
277 | static unsigned long task_dirty_limit(struct task_struct *tsk, | 285 | static unsigned long task_dirty_limit(struct task_struct *tsk, |
278 | unsigned long bdi_dirty) | 286 | unsigned long bdi_dirty) |
279 | { | 287 | { |
280 | long numerator, denominator; | 288 | long numerator, denominator; |
281 | unsigned long dirty = bdi_dirty; | 289 | unsigned long dirty = bdi_dirty; |
282 | u64 inv = dirty >> 3; | 290 | u64 inv = dirty / TASK_LIMIT_FRACTION; |
283 | 291 | ||
284 | task_dirties_fraction(tsk, &numerator, &denominator); | 292 | task_dirties_fraction(tsk, &numerator, &denominator); |
285 | inv *= numerator; | 293 | inv *= numerator; |
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk, | |||
290 | return max(dirty, bdi_dirty/2); | 298 | return max(dirty, bdi_dirty/2); |
291 | } | 299 | } |
292 | 300 | ||
301 | /* Minimum limit for any task */ | ||
302 | static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) | ||
303 | { | ||
304 | return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; | ||
305 | } | ||
306 | |||
293 | /* | 307 | /* |
294 | * | 308 | * |
295 | */ | 309 | */ |
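With TASK_LIMIT_FRACTION = 8, task_dirty_limit() and task_min_dirty_limit() bracket a per-task window just below the bdi threshold. A worked example with illustrative numbers:

    /* bdi_dirty = 80000 pages, so inv starts at 80000 / 8 = 10000 */
    task doing ~all of the recent dirtying (numerator == denominator):
            limit = 80000 - 10000      = 70000 pages   /* == task_min_dirty_limit(80000) */
    task doing ~10% of the recent dirtying (numerator/denominator == 1/10):
            limit = 80000 - 10000 / 10 = 79000 pages

So the heaviest dirtier is throttled roughly 12.5% below the bdi threshold, while light dirtiers keep almost the full threshold.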
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void) | |||
397 | return x + 1; /* Ensure that we never return 0 */ | 411 | return x + 1; /* Ensure that we never return 0 */ |
398 | } | 412 | } |
399 | 413 | ||
414 | static unsigned long hard_dirty_limit(unsigned long thresh) | ||
415 | { | ||
416 | return max(thresh, global_dirty_limit); | ||
417 | } | ||
418 | |||
400 | /* | 419 | /* |
401 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | 420 | * global_dirty_limits - background-writeback and dirty-throttling thresholds |
402 | * | 421 | * |
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
435 | } | 454 | } |
436 | *pbackground = background; | 455 | *pbackground = background; |
437 | *pdirty = dirty; | 456 | *pdirty = dirty; |
457 | trace_global_dirty_state(background, dirty); | ||
438 | } | 458 | } |
439 | 459 | ||
440 | /* | 460 | /** |
441 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold | 461 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold |
462 | * @bdi: the backing_dev_info to query | ||
463 | * @dirty: global dirty limit in pages | ||
442 | * | 464 | * |
443 | * Allocate high/low dirty limits to fast/slow devices, in order to prevent | 465 | * Returns @bdi's dirty limit in pages. The term "dirty" in the context of |
466 | * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. | ||
467 | * The "limit" in the name is not taken as a hard limit by | ||
468 | * balance_dirty_pages(). | ||
469 | * | ||
470 | * It allocates high/low dirty limits to fast/slow devices, in order to prevent | ||
444 | * - starving fast devices | 471 | * - starving fast devices |
445 | * - piling up dirty pages (that will take long time to sync) on slow devices | 472 | * - piling up dirty pages (that will take long time to sync) on slow devices |
446 | * | 473 | * |
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
468 | return bdi_dirty; | 495 | return bdi_dirty; |
469 | } | 496 | } |
470 | 497 | ||
498 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | ||
499 | unsigned long elapsed, | ||
500 | unsigned long written) | ||
501 | { | ||
502 | const unsigned long period = roundup_pow_of_two(3 * HZ); | ||
503 | unsigned long avg = bdi->avg_write_bandwidth; | ||
504 | unsigned long old = bdi->write_bandwidth; | ||
505 | u64 bw; | ||
506 | |||
507 | /* | ||
508 | * bw = written * HZ / elapsed | ||
509 | * | ||
510 | * bw * elapsed + write_bandwidth * (period - elapsed) | ||
511 | * write_bandwidth = --------------------------------------------------- | ||
512 | * period | ||
513 | */ | ||
514 | bw = written - bdi->written_stamp; | ||
515 | bw *= HZ; | ||
516 | if (unlikely(elapsed > period)) { | ||
517 | do_div(bw, elapsed); | ||
518 | avg = bw; | ||
519 | goto out; | ||
520 | } | ||
521 | bw += (u64)bdi->write_bandwidth * (period - elapsed); | ||
522 | bw >>= ilog2(period); | ||
523 | |||
524 | /* | ||
525 | * one more level of smoothing, for filtering out sudden spikes | ||
526 | */ | ||
527 | if (avg > old && old >= (unsigned long)bw) | ||
528 | avg -= (avg - old) >> 3; | ||
529 | |||
530 | if (avg < old && old <= (unsigned long)bw) | ||
531 | avg += (old - avg) >> 3; | ||
532 | |||
533 | out: | ||
534 | bdi->write_bandwidth = bw; | ||
535 | bdi->avg_write_bandwidth = avg; | ||
536 | } | ||
537 | |||
538 | /* | ||
539 | * The global dirtyable memory and dirty threshold could be suddenly knocked | ||
540 | * down by a large amount (eg. on the startup of KVM in a swapless system). | ||
541 | * This may throw the system into a deep dirty-exceeded state and throttle | ||
542 | * heavy and light dirtiers alike. To retain good responsiveness, maintain | ||
543 | * global_dirty_limit so that it tracks the knocked-down dirty threshold | ||
544 | * slowly downwards rather than dropping to it at once. | ||
545 | */ | ||
546 | static void update_dirty_limit(unsigned long thresh, unsigned long dirty) | ||
547 | { | ||
548 | unsigned long limit = global_dirty_limit; | ||
549 | |||
550 | /* | ||
551 | * Follow up in one step. | ||
552 | */ | ||
553 | if (limit < thresh) { | ||
554 | limit = thresh; | ||
555 | goto update; | ||
556 | } | ||
557 | |||
558 | /* | ||
559 | * Follow down slowly. Use the higher one as the target, because thresh | ||
560 | * may drop below dirty. This is exactly the reason to introduce | ||
561 | * global_dirty_limit which is guaranteed to lie above the dirty pages. | ||
562 | */ | ||
563 | thresh = max(thresh, dirty); | ||
564 | if (limit > thresh) { | ||
565 | limit -= (limit - thresh) >> 5; | ||
566 | goto update; | ||
567 | } | ||
568 | return; | ||
569 | update: | ||
570 | global_dirty_limit = limit; | ||
571 | } | ||
572 | |||
573 | static void global_update_bandwidth(unsigned long thresh, | ||
574 | unsigned long dirty, | ||
575 | unsigned long now) | ||
576 | { | ||
577 | static DEFINE_SPINLOCK(dirty_lock); | ||
578 | static unsigned long update_time; | ||
579 | |||
580 | /* | ||
581 | * check locklessly first to optimize away locking most of the time | ||
582 | */ | ||
583 | if (time_before(now, update_time + BANDWIDTH_INTERVAL)) | ||
584 | return; | ||
585 | |||
586 | spin_lock(&dirty_lock); | ||
587 | if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { | ||
588 | update_dirty_limit(thresh, dirty); | ||
589 | update_time = now; | ||
590 | } | ||
591 | spin_unlock(&dirty_lock); | ||
592 | } | ||
593 | |||
594 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
595 | unsigned long thresh, | ||
596 | unsigned long dirty, | ||
597 | unsigned long bdi_thresh, | ||
598 | unsigned long bdi_dirty, | ||
599 | unsigned long start_time) | ||
600 | { | ||
601 | unsigned long now = jiffies; | ||
602 | unsigned long elapsed = now - bdi->bw_time_stamp; | ||
603 | unsigned long written; | ||
604 | |||
605 | /* | ||
606 | * rate-limit, only update once every 200ms. | ||
607 | */ | ||
608 | if (elapsed < BANDWIDTH_INTERVAL) | ||
609 | return; | ||
610 | |||
611 | written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); | ||
612 | |||
613 | /* | ||
614 | * Skip quiet periods when disk bandwidth is under-utilized. | ||
615 | * (at least 1s idle time between two flusher runs) | ||
616 | */ | ||
617 | if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) | ||
618 | goto snapshot; | ||
619 | |||
620 | if (thresh) | ||
621 | global_update_bandwidth(thresh, dirty, now); | ||
622 | |||
623 | bdi_update_write_bandwidth(bdi, elapsed, written); | ||
624 | |||
625 | snapshot: | ||
626 | bdi->written_stamp = written; | ||
627 | bdi->bw_time_stamp = now; | ||
628 | } | ||
629 | |||
630 | static void bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
631 | unsigned long thresh, | ||
632 | unsigned long dirty, | ||
633 | unsigned long bdi_thresh, | ||
634 | unsigned long bdi_dirty, | ||
635 | unsigned long start_time) | ||
636 | { | ||
637 | if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) | ||
638 | return; | ||
639 | spin_lock(&bdi->wb.list_lock); | ||
640 | __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, | ||
641 | start_time); | ||
642 | spin_unlock(&bdi->wb.list_lock); | ||
643 | } | ||
644 | |||
471 | /* | 645 | /* |
472 | * balance_dirty_pages() must be called by processes which are generating dirty | 646 | * balance_dirty_pages() must be called by processes which are generating dirty |
473 | * data. It looks at the number of dirty pages in the machine and will force | 647 | * data. It looks at the number of dirty pages in the machine and will force |
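The write-bandwidth estimate above is a jiffies-weighted moving average over roughly 3 seconds, with the period rounded up to a power of two so the final division becomes a shift. A worked example, assuming HZ = 1000 and 4 KiB pages (both assumptions, not fixed by the patch):

    period = roundup_pow_of_two(3 * HZ) = 4096 jiffies
    /* one 200 ms sample in which 2048 pages (8 MiB) were written,
     * starting from write_bandwidth = 25600 pages/s (the 100 MB/s INIT_BW) */
    bw  = 2048 * 1000                      =   2048000
    bw += 25600 * (4096 - 200)             = 101785600
    bw >>= ilog2(4096)   /* i.e. / 4096 */ =     24850 pages/s  (~97 MiB/s)

A single 200 ms sample at an instantaneous 40 MiB/s therefore pulls the estimate only about 5% of the way toward the new rate (elapsed/period = 200/4096), and avg_write_bandwidth smooths even that step in further 1/8 increments.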
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
478 | static void balance_dirty_pages(struct address_space *mapping, | 652 | static void balance_dirty_pages(struct address_space *mapping, |
479 | unsigned long write_chunk) | 653 | unsigned long write_chunk) |
480 | { | 654 | { |
481 | long nr_reclaimable, bdi_nr_reclaimable; | 655 | unsigned long nr_reclaimable, bdi_nr_reclaimable; |
482 | long nr_writeback, bdi_nr_writeback; | 656 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ |
657 | unsigned long bdi_dirty; | ||
483 | unsigned long background_thresh; | 658 | unsigned long background_thresh; |
484 | unsigned long dirty_thresh; | 659 | unsigned long dirty_thresh; |
485 | unsigned long bdi_thresh; | 660 | unsigned long bdi_thresh; |
661 | unsigned long task_bdi_thresh; | ||
662 | unsigned long min_task_bdi_thresh; | ||
486 | unsigned long pages_written = 0; | 663 | unsigned long pages_written = 0; |
487 | unsigned long pause = 1; | 664 | unsigned long pause = 1; |
488 | bool dirty_exceeded = false; | 665 | bool dirty_exceeded = false; |
666 | bool clear_dirty_exceeded = true; | ||
489 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 667 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
668 | unsigned long start_time = jiffies; | ||
490 | 669 | ||
491 | for (;;) { | 670 | for (;;) { |
492 | struct writeback_control wbc = { | ||
493 | .sync_mode = WB_SYNC_NONE, | ||
494 | .older_than_this = NULL, | ||
495 | .nr_to_write = write_chunk, | ||
496 | .range_cyclic = 1, | ||
497 | }; | ||
498 | |||
499 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 671 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
500 | global_page_state(NR_UNSTABLE_NFS); | 672 | global_page_state(NR_UNSTABLE_NFS); |
501 | nr_writeback = global_page_state(NR_WRITEBACK); | 673 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); |
502 | 674 | ||
503 | global_dirty_limits(&background_thresh, &dirty_thresh); | 675 | global_dirty_limits(&background_thresh, &dirty_thresh); |
504 | 676 | ||
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
507 | * catch-up. This avoids (excessively) small writeouts | 679 | * catch-up. This avoids (excessively) small writeouts |
508 | * when the bdi limits are ramping up. | 680 | * when the bdi limits are ramping up. |
509 | */ | 681 | */ |
510 | if (nr_reclaimable + nr_writeback <= | 682 | if (nr_dirty <= (background_thresh + dirty_thresh) / 2) |
511 | (background_thresh + dirty_thresh) / 2) | ||
512 | break; | 683 | break; |
513 | 684 | ||
514 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 685 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
515 | bdi_thresh = task_dirty_limit(current, bdi_thresh); | 686 | min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); |
687 | task_bdi_thresh = task_dirty_limit(current, bdi_thresh); | ||
516 | 688 | ||
517 | /* | 689 | /* |
518 | * In order to avoid the stacked BDI deadlock we need | 690 | * In order to avoid the stacked BDI deadlock we need |
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
524 | * actually dirty; with m+n sitting in the percpu | 696 | * actually dirty; with m+n sitting in the percpu |
525 | * deltas. | 697 | * deltas. |
526 | */ | 698 | */ |
527 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | 699 | if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { |
528 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | 700 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
529 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | 701 | bdi_dirty = bdi_nr_reclaimable + |
702 | bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
530 | } else { | 703 | } else { |
531 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 704 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
532 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | 705 | bdi_dirty = bdi_nr_reclaimable + |
706 | bdi_stat(bdi, BDI_WRITEBACK); | ||
533 | } | 707 | } |
534 | 708 | ||
535 | /* | 709 | /* |
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
538 | * bdi or process from holding back light ones; The latter is | 712 | * bdi or process from holding back light ones; The latter is |
539 | * the last resort safeguard. | 713 | * the last resort safeguard. |
540 | */ | 714 | */ |
541 | dirty_exceeded = | 715 | dirty_exceeded = (bdi_dirty > task_bdi_thresh) || |
542 | (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) | 716 | (nr_dirty > dirty_thresh); |
543 | || (nr_reclaimable + nr_writeback > dirty_thresh); | 717 | clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && |
718 | (nr_dirty <= dirty_thresh); | ||
544 | 719 | ||
545 | if (!dirty_exceeded) | 720 | if (!dirty_exceeded) |
546 | break; | 721 | break; |
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
548 | if (!bdi->dirty_exceeded) | 723 | if (!bdi->dirty_exceeded) |
549 | bdi->dirty_exceeded = 1; | 724 | bdi->dirty_exceeded = 1; |
550 | 725 | ||
726 | bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, | ||
727 | bdi_thresh, bdi_dirty, start_time); | ||
728 | |||
551 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 729 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. |
552 | * Unstable writes are a feature of certain networked | 730 | * Unstable writes are a feature of certain networked |
553 | * filesystems (i.e. NFS) in which data may have been | 731 | * filesystems (i.e. NFS) in which data may have been |
@@ -557,17 +735,40 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
557 | * threshold otherwise wait until the disk writes catch | 735 | * threshold otherwise wait until the disk writes catch |
558 | * up. | 736 | * up. |
559 | */ | 737 | */ |
560 | trace_wbc_balance_dirty_start(&wbc, bdi); | 738 | trace_balance_dirty_start(bdi); |
561 | if (bdi_nr_reclaimable > bdi_thresh) { | 739 | if (bdi_nr_reclaimable > task_bdi_thresh) { |
562 | writeback_inodes_wb(&bdi->wb, &wbc); | 740 | pages_written += writeback_inodes_wb(&bdi->wb, |
563 | pages_written += write_chunk - wbc.nr_to_write; | 741 | write_chunk); |
564 | trace_wbc_balance_dirty_written(&wbc, bdi); | 742 | trace_balance_dirty_written(bdi, pages_written); |
565 | if (pages_written >= write_chunk) | 743 | if (pages_written >= write_chunk) |
566 | break; /* We've done our duty */ | 744 | break; /* We've done our duty */ |
567 | } | 745 | } |
568 | trace_wbc_balance_dirty_wait(&wbc, bdi); | ||
569 | __set_current_state(TASK_UNINTERRUPTIBLE); | 746 | __set_current_state(TASK_UNINTERRUPTIBLE); |
570 | io_schedule_timeout(pause); | 747 | io_schedule_timeout(pause); |
748 | trace_balance_dirty_wait(bdi); | ||
749 | |||
750 | dirty_thresh = hard_dirty_limit(dirty_thresh); | ||
751 | /* | ||
752 | * max-pause area. If the dirty limit is exceeded but we are still | ||
753 | * within this area, do not sleep for more than 200ms: (a) 8 pages per | ||
754 | * 200ms is typically more than enough to curb heavy dirtiers; | ||
755 | * (b) the pause time limit makes the dirtiers more responsive. | ||
756 | */ | ||
757 | if (nr_dirty < dirty_thresh + | ||
758 | dirty_thresh / DIRTY_MAXPAUSE_AREA && | ||
759 | time_after(jiffies, start_time + MAX_PAUSE)) | ||
760 | break; | ||
761 | /* | ||
762 | * pass-good area. When some bdi gets blocked (eg. NFS server | ||
763 | * not responding), or write bandwidth dropped dramatically due | ||
764 | * to concurrent reads, or dirty threshold suddenly dropped and | ||
765 | * the dirty pages cannot be brought down anytime soon (eg. on | ||
766 | * slow USB stick), at least let go of the good bdi's. | ||
767 | */ | ||
768 | if (nr_dirty < dirty_thresh + | ||
769 | dirty_thresh / DIRTY_PASSGOOD_AREA && | ||
770 | bdi_dirty < bdi_thresh) | ||
771 | break; | ||
571 | 772 | ||
572 | /* | 773 | /* |
573 | * Increase the delay for each loop, up to our previous | 774 | * Increase the delay for each loop, up to our previous |
@@ -578,7 +779,8 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
578 | pause = HZ / 10; | 779 | pause = HZ / 10; |
579 | } | 780 | } |
580 | 781 | ||
581 | if (!dirty_exceeded && bdi->dirty_exceeded) | 782 | /* Clear dirty_exceeded flag only when no task can exceed the limit */ |
783 | if (clear_dirty_exceeded && bdi->dirty_exceeded) | ||
582 | bdi->dirty_exceeded = 0; | 784 | bdi->dirty_exceeded = 0; |
583 | 785 | ||
584 | if (writeback_in_progress(bdi)) | 786 | if (writeback_in_progress(bdi)) |
@@ -626,9 +828,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | |||
626 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 828 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
627 | unsigned long nr_pages_dirtied) | 829 | unsigned long nr_pages_dirtied) |
628 | { | 830 | { |
831 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
629 | unsigned long ratelimit; | 832 | unsigned long ratelimit; |
630 | unsigned long *p; | 833 | unsigned long *p; |
631 | 834 | ||
835 | if (!bdi_cap_account_dirty(bdi)) | ||
836 | return; | ||
837 | |||
632 | ratelimit = ratelimit_pages; | 838 | ratelimit = ratelimit_pages; |
633 | if (mapping->backing_dev_info->dirty_exceeded) | 839 | if (mapping->backing_dev_info->dirty_exceeded) |
634 | ratelimit = 8; | 840 | ratelimit = 8; |
@@ -892,12 +1098,12 @@ int write_cache_pages(struct address_space *mapping, | |||
892 | range_whole = 1; | 1098 | range_whole = 1; |
893 | cycled = 1; /* ignore range_cyclic tests */ | 1099 | cycled = 1; /* ignore range_cyclic tests */ |
894 | } | 1100 | } |
895 | if (wbc->sync_mode == WB_SYNC_ALL) | 1101 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
896 | tag = PAGECACHE_TAG_TOWRITE; | 1102 | tag = PAGECACHE_TAG_TOWRITE; |
897 | else | 1103 | else |
898 | tag = PAGECACHE_TAG_DIRTY; | 1104 | tag = PAGECACHE_TAG_DIRTY; |
899 | retry: | 1105 | retry: |
900 | if (wbc->sync_mode == WB_SYNC_ALL) | 1106 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
901 | tag_pages_for_writeback(mapping, index, end); | 1107 | tag_pages_for_writeback(mapping, index, end); |
902 | done_index = index; | 1108 | done_index = index; |
903 | while (!done && (index <= end)) { | 1109 | while (!done && (index <= end)) { |
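The new wbc->tagged_writepages flag lets a WB_SYNC_NONE pass use the same tag-and-write protocol (tag_pages_for_writeback() plus PAGECACHE_TAG_TOWRITE) that WB_SYNC_ALL already uses, so the sweep works against a snapshot of the dirty pages and cannot be livelocked by concurrent dirtiers. A hypothetical caller, with field values chosen only for illustration:

    struct writeback_control wbc = {
            .sync_mode         = WB_SYNC_NONE,
            .tagged_writepages = 1,         /* snapshot dirty pages via the TOWRITE tag */
            .nr_to_write       = LONG_MAX,
            .range_start       = 0,
            .range_end         = LLONG_MAX,
    };
    int err = do_writepages(mapping, &wbc); /* generic path reaches write_cache_pages() */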
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -31,11 +31,11 @@ | |||
31 | * mmlist_lock (in mmput, drain_mmlist and others) | 31 | * mmlist_lock (in mmput, drain_mmlist and others) |
32 | * mapping->private_lock (in __set_page_dirty_buffers) | 32 | * mapping->private_lock (in __set_page_dirty_buffers) |
33 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) | 33 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
34 | * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | 34 | * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 35 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_wb_list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
39 | * | 39 | * |
40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 41 | * ->tasklist_lock |