Diffstat (limited to 'fs/buffer.c')
-rw-r--r--    fs/buffer.c    117
1 file changed, 62 insertions(+), 55 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2caf..698c6b2cc462 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -54,23 +55,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 }
 EXPORT_SYMBOL(init_buffer);
 
-static int sync_buffer(void *word)
+static int sleep_on_buffer(void *word)
 {
-        struct block_device *bd;
-        struct buffer_head *bh
-                = container_of(word, struct buffer_head, b_state);
-
-        smp_mb();
-        bd = bh->b_bdev;
-        if (bd)
-                blk_run_address_space(bd->bd_inode->i_mapping);
         io_schedule();
         return 0;
 }
 
 void __lock_buffer(struct buffer_head *bh)
 {
-        wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
+        wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
                                                         TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_buffer);
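For context, sleep_on_buffer() is only reached through the lock_buffer()/wait_on_buffer() wrappers in include/linux/buffer_head.h, which fall back to __lock_buffer()/__wait_on_buffer() when the buffer is contended. A minimal sketch of the usual caller pattern, modelled on sync_dirty_buffer() (the helper name below is made up, not part of this diff):

    #include <linux/fs.h>
    #include <linux/buffer_head.h>

    /* Sketch: write one buffer synchronously; the task sleeps in
     * sleep_on_buffer() whenever BH_Lock is held by someone else. */
    static int write_buffer_and_wait(struct buffer_head *bh)
    {
            lock_buffer(bh);                /* may call __lock_buffer() */
            get_bh(bh);                     /* end_buffer_write_sync() drops a ref */
            bh->b_end_io = end_buffer_write_sync;
            submit_bh(WRITE_SYNC, bh);
            wait_on_buffer(bh);             /* may call __wait_on_buffer() */
            return buffer_uptodate(bh) ? 0 : -EIO;
    }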
@@ -90,7 +83,7 @@ EXPORT_SYMBOL(unlock_buffer);
  */
 void __wait_on_buffer(struct buffer_head * bh)
 {
-        wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
+        wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__wait_on_buffer);
 
@@ -277,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev)
         invalidate_bh_lrus();
         lru_add_drain_all();    /* make sure all lru add caches are flushed */
         invalidate_mapping_pages(mapping, 0, -1);
+        /* 99% of the time, we don't need to flush the cleancache on the bdev.
+         * But, for the strange corners, lets be cautious
+         */
+        cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
@@ -749,10 +746,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
         struct buffer_head *bh;
         struct list_head tmp;
-        struct address_space *mapping, *prev_mapping = NULL;
+        struct address_space *mapping;
         int err = 0, err2;
+        struct blk_plug plug;
 
         INIT_LIST_HEAD(&tmp);
+        blk_start_plug(&plug);
 
         spin_lock(lock);
         while (!list_empty(list)) {
@@ -775,7 +774,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
                                  * still in flight on potentially older
                                  * contents.
                                  */
-                                write_dirty_buffer(bh, WRITE_SYNC_PLUG);
+                                write_dirty_buffer(bh, WRITE_SYNC);
 
                                 /*
                                  * Kick off IO for the previous mapping. Note
@@ -783,16 +782,16 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
                                  * wait_on_buffer() will do that for us
                                  * through sync_buffer().
                                  */
-                                if (prev_mapping && prev_mapping != mapping)
-                                        blk_run_address_space(prev_mapping);
-                                prev_mapping = mapping;
-
                                 brelse(bh);
                                 spin_lock(lock);
                         }
                 }
         }
 
+        spin_unlock(lock);
+        blk_finish_plug(&plug);
+        spin_lock(lock);
+
         while (!list_empty(&tmp)) {
                 bh = BH_ENTRY(tmp.prev);
                 get_bh(bh);
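The prev_mapping/blk_run_address_space() bookkeeping removed above is replaced by the on-stack plug started and finished in the surrounding hunks. A minimal sketch of that plugging pattern on its own (the helper name is made up; write_dirty_buffer() is the real API this function uses):

    #include <linux/blkdev.h>
    #include <linux/buffer_head.h>

    /* Sketch: batch a burst of buffer writes behind an on-stack plug instead
     * of explicitly unplugging each address_space afterwards. */
    static void write_buffers_plugged(struct buffer_head **bhs, int nr)
    {
            struct blk_plug plug;
            int i;

            blk_start_plug(&plug);          /* queue requests without dispatching them */
            for (i = 0; i < nr; i++)
                    write_dirty_buffer(bhs[i], WRITE_SYNC);
            blk_finish_plug(&plug);         /* push the batched requests to the device */
    }

Note that fsync_buffers_list() flushes the plug outside the spinlock, so the batched writes are issued before the wait loop that follows.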
@@ -1144,7 +1143,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * inode list.
  *
  * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and the global inode_lock.
+ * mapping->tree_lock and mapping->host->i_lock.
  */
 void mark_buffer_dirty(struct buffer_head *bh)
 {
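That lock list is caller-facing documentation only; mark_buffer_dirty() acquires and releases the locks internally. A sketch of the typical metadata-update pattern around it (the myfs_ name is hypothetical):

    #include <linux/string.h>
    #include <linux/buffer_head.h>

    /* Sketch: fill a metadata buffer and hand it to writeback;
     * mark_buffer_dirty() takes the locks named in the comment above. */
    static void myfs_update_block(struct buffer_head *bh, const void *src, size_t len)
    {
            lock_buffer(bh);
            memcpy(bh->b_data, src, len);
            set_buffer_uptodate(bh);
            unlock_buffer(bh);
            mark_buffer_dirty(bh);          /* redirties the page and the owning inode */
    }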
@@ -1614,14 +1613,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
  * prevents this contention from occurring.
  *
  * If block_write_full_page() is called with wbc->sync_mode ==
- * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
- * causes the writes to be flagged as synchronous writes, but the
- * block device queue will NOT be unplugged, since usually many pages
- * will be pushed to the out before the higher-level caller actually
- * waits for the writes to be completed. The various wait functions,
- * such as wait_on_writeback_range() will ultimately call sync_page()
- * which will ultimately call blk_run_backing_dev(), which will end up
- * unplugging the device queue.
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * causes the writes to be flagged as synchronous writes.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
                         get_block_t *get_block, struct writeback_control *wbc,
@@ -1634,7 +1627,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
         const unsigned blocksize = 1 << inode->i_blkbits;
         int nr_underway = 0;
         int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
-                        WRITE_SYNC_PLUG : WRITE);
+                        WRITE_SYNC : WRITE);
 
         BUG_ON(!PageLocked(page));
 
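For context on how write_op is consumed, a sketch (not part of this diff) of roughly what __block_write_full_page() does further down: every buffer on the page that was set up for async write is submitted with that flag, so WB_SYNC_ALL writeback now issues WRITE_SYNC requests and relies on block-layer plugging for batching.

    /* Sketch: submit each async-write buffer on a page with the chosen rw
     * flag. The helper name is made up; the real function open-codes this. */
    static int submit_page_buffers(struct buffer_head *head, int write_op)
    {
            struct buffer_head *bh = head;
            int nr_underway = 0;

            do {
                    struct buffer_head *next = bh->b_this_page;

                    if (buffer_async_write(bh)) {
                            submit_bh(write_op, bh);        /* WRITE or WRITE_SYNC */
                            nr_underway++;
                    }
                    bh = next;
            } while (bh != head);

            return nr_underway;
    }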
@@ -2343,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write);
  * page lock we can determine safely if the page is beyond EOF. If it is not
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
  */
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-                   get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+                         get_block_t get_block)
 {
         struct page *page = vmf->page;
         struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
         unsigned long end;
         loff_t size;
-        int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+        int ret;
 
         lock_page(page);
         size = i_size_read(inode);
         if ((page->mapping != inode->i_mapping) ||
             (page_offset(page) > size)) {
-                /* page got truncated out from underneath us */
-                unlock_page(page);
-                goto out;
+                /* We overload EFAULT to mean page got truncated */
+                ret = -EFAULT;
+                goto out_unlock;
         }
 
         /* page is wholly or partially inside EOF */
@@ -2373,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
         if (!ret)
                 ret = block_commit_write(page, 0, end);
 
-        if (unlikely(ret)) {
-                unlock_page(page);
-                if (ret == -ENOMEM)
-                        ret = VM_FAULT_OOM;
-                else /* -ENOSPC, -EIO, etc */
-                        ret = VM_FAULT_SIGBUS;
-        } else
-                ret = VM_FAULT_LOCKED;
-
-out:
+        if (unlikely(ret < 0))
+                goto out_unlock;
+        /*
+         * Freezing in progress? We check after the page is marked dirty and
+         * with page lock held so if the test here fails, we are sure freezing
+         * code will wait during syncing until the page fault is done - at that
+         * point page will be dirty and unlocked so freezing code will write it
+         * and writeprotect it again.
+         */
+        set_page_dirty(page);
+        if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+                ret = -EAGAIN;
+                goto out_unlock;
+        }
+        return 0;
+out_unlock:
+        unlock_page(page);
         return ret;
 }
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+                   get_block_t get_block)
+{
+        int ret;
+        struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+        /*
+         * This check is racy but catches the common case. The check in
+         * __block_page_mkwrite() is reliable.
+         */
+        vfs_check_frozen(sb, SB_FREEZE_WRITE);
+        ret = __block_page_mkwrite(vma, vmf, get_block);
+        return block_page_mkwrite_return(ret);
+}
 EXPORT_SYMBOL(block_page_mkwrite);
 
 /*
@@ -3138,17 +3156,6 @@ out:
 }
 EXPORT_SYMBOL(try_to_free_buffers);
 
-void block_sync_page(struct page *page)
-{
-        struct address_space *mapping;
-
-        smp_mb();
-        mapping = page_mapping(page);
-        if (mapping)
-                blk_run_backing_dev(mapping->backing_dev_info, page);
-}
-EXPORT_SYMBOL(block_sync_page);
-
 /*
  * There are no bdflush tunables left. But distributions are
  * still running obsolete flush daemons, so we terminate them here.
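block_sync_page() was the stock implementation of the ->sync_page address_space operation, which goes away with the same plugging rework; filesystems simply drop that initializer from their operations table. A sketch with hypothetical myfs_ callbacks:

    /* Sketch: an address_space_operations table after the ->sync_page removal.
     * All myfs_* callbacks are placeholders for a filesystem's own code. */
    static const struct address_space_operations myfs_aops = {
            .readpage       = myfs_readpage,
            .writepage      = myfs_writepage,
            .write_begin    = myfs_write_begin,
            .write_end      = myfs_write_end,
            /* .sync_page   = block_sync_page,      <- removed with this series */
    };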