author     Jan Kara <jack@suse.cz>                               2008-02-08 07:21:59 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-08 12:22:42 -0500
commit     535ee2fbf79ab52d26bce3d2e127c9007503581e
tree       9d7cdbd98e481ec2fb6d5ff6eebf65f4d422ec8c  /fs/buffer.c
parent     f6f21c81464ce52dbeec921bdc2e8b288c491920
buffer_head: fix private_list handling
There are two possible races in the handling of private_list in the buffer
cache.
1) When fsync_buffers_list() processes a private_list, it clears
b_assoc_map and moves the buffer onto its own private (tmp) list. Now
drop_buffers() comes along, sees that the buffer is on a list and so calls
__remove_assoc_queue(), which complains about b_assoc_map having been
cleared (as it can no longer propagate a possible IO error). This race has
actually been observed in the wild.
2) When fsync_buffers_list() processes a private_list,
mark_buffer_dirty_inode() can be called on a bh which is already on the
private (tmp) list of fsync_buffers_list(). Because the buffer is on some
list (note that this check is performed without private_lock), it is not
re-added to the mapping's private_list, and after fsync_buffers_list()
finishes we are left with a dirty buffer which should be on private_list
but isn't. This race has not been reported, probably because most (but not
all) callers of mark_buffer_dirty_inode() hold i_mutex and are thus
serialized with fsync().
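
To make race 2) concrete, here is a minimal userspace model of the
interleaving. All names below (buf, step_fsync_move_to_tmp(),
step_mark_dirty_inode(), step_fsync_finish()) are hypothetical stand-ins,
not the kernel API; the model only mimics the pre-patch checks:

/* Simplified model of race 2); all names are stand-ins, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct buf {
	bool dirty;		/* models the buffer's dirty bit                */
	bool on_some_list;	/* models !list_empty(&bh->b_assoc_buffers)     */
	bool on_private_list;	/* is the buffer on the mapping's private_list? */
};

/* fsync_buffers_list(): the buffer is moved from private_list to the local
 * tmp list, so it is still "on some list" but no longer tracked. */
static void step_fsync_move_to_tmp(struct buf *b)
{
	b->on_private_list = false;
	b->on_some_list = true;		/* now sits on fsync's tmp list */
}

/* mark_buffer_dirty_inode(), pre-patch: a lockless list-emptiness check;
 * the buffer looks "already queued", so it is not re-added. */
static void step_mark_dirty_inode(struct buf *b)
{
	b->dirty = true;
	if (!b->on_some_list)
		b->on_private_list = true;
}

/* fsync_buffers_list() finishes: pre-patch it dropped the buffer from tmp
 * after the write without re-checking the dirty bit. */
static void step_fsync_finish(struct buf *b)
{
	b->on_some_list = false;
}

int main(void)
{
	struct buf b = { .dirty = false, .on_some_list = true,
			 .on_private_list = true };

	step_fsync_move_to_tmp(&b);	/* CPU0 */
	step_mark_dirty_inode(&b);	/* CPU1, racing with CPU0 */
	step_fsync_finish(&b);		/* CPU0 */

	/* A dirty buffer that is on no private_list: a later fsync() of the
	 * inode would never find it. */
	printf("dirty=%d on_private_list=%d\n", b.dirty, b.on_private_list);
	return 0;
}

With the patch applied, fsync_buffers_list() re-checks the dirty bit after
the IO has been submitted and waited for, and puts such a buffer back on
private_list, as shown in the second fsync_buffers_list() hunk below.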
Fix these issues by not clearing b_assoc_map when fsync_buffers_list()
moves a buffer to its dedicated (tmp) list, and by reinserting the buffer
into private_list when it is found dirty after we have submitted it for IO.
We also change the tests for whether a buffer is on a private list from
!list_empty(&bh->b_assoc_buffers) to bh->b_assoc_map, so that they are
single-word reads and hence the lockless checks are safe.
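
As a rough sketch of the fixed wait-phase logic, under the same simplifying
assumptions as the model above: mapping, buf, remove_assoc_queue() and
fsync_wait_phase() are stand-ins for address_space, buffer_head,
__remove_assoc_queue() and the second loop of fsync_buffers_list(), not the
real kernel code:

/* Simplified model of the fix; all names are stand-ins, not kernel code. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct mapping {
	int nr_on_private_list;		/* models mapping->private_list */
};

struct buf {
	struct mapping *assoc_map;	/* models bh->b_assoc_map; with the fix
					 * it stays set while on fsync's tmp list */
	bool dirty;
	bool on_private_list;
};

/* Models __remove_assoc_queue(): unlink the buffer and clear b_assoc_map. */
static void remove_assoc_queue(struct buf *b)
{
	b->on_private_list = false;
	b->assoc_map = NULL;
}

/* Models the fixed second loop of fsync_buffers_list(): remember the mapping
 * before unlinking, then re-add the buffer if it was re-dirtied meanwhile. */
static void fsync_wait_phase(struct buf *b)
{
	struct mapping *mapping = b->assoc_map;	/* saved before the unlink */

	remove_assoc_queue(b);
	/* the kernel issues smp_mb() here, pairing with the lockless
	 * b_assoc_map check in mark_buffer_dirty_inode() */
	if (b->dirty) {
		/* list_add(&bh->b_assoc_buffers, &mapping->private_list) */
		b->on_private_list = true;
		b->assoc_map = mapping;
		mapping->nr_on_private_list++;
	}
	/* the real code then drops the lock and calls wait_on_buffer() */
}

int main(void)
{
	struct mapping m = { .nr_on_private_list = 0 };
	struct buf b = { .assoc_map = &m, .dirty = true,
			 .on_private_list = false };	/* re-dirtied, on tmp */

	fsync_wait_phase(&b);
	printf("on_private_list=%d nr_on_private_list=%d\n",
	       b.on_private_list, m.nr_on_private_list);
	return 0;
}

Keeping b_assoc_map set while buffers sit on the tmp list (the other half of
the fix) is what lets drop_buffers() call __remove_assoc_queue() without
tripping the warning from race 1), and it is also what tells the wait phase
which private_list a re-dirtied buffer belongs back on.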
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/buffer.c')
 fs/buffer.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 6f0bddddcf4a..3ebccf4aa7e3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -678,7 +678,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 	} else {
 		BUG_ON(mapping->assoc_mapping != buffer_mapping);
 	}
-	if (list_empty(&bh->b_assoc_buffers)) {
+	if (!bh->b_assoc_map) {
 		spin_lock(&buffer_mapping->private_lock);
 		list_move_tail(&bh->b_assoc_buffers,
 				&mapping->private_list);
@@ -794,6 +794,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
 	struct list_head tmp;
+	struct address_space *mapping;
 	int err = 0, err2;
 
 	INIT_LIST_HEAD(&tmp);
@@ -801,9 +802,14 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 	spin_lock(lock);
 	while (!list_empty(list)) {
 		bh = BH_ENTRY(list->next);
+		mapping = bh->b_assoc_map;
 		__remove_assoc_queue(bh);
+		/* Avoid race with mark_buffer_dirty_inode() which does
+		 * a lockless check and we rely on seeing the dirty bit */
+		smp_mb();
 		if (buffer_dirty(bh) || buffer_locked(bh)) {
 			list_add(&bh->b_assoc_buffers, &tmp);
+			bh->b_assoc_map = mapping;
 			if (buffer_dirty(bh)) {
 				get_bh(bh);
 				spin_unlock(lock);
@@ -822,8 +828,17 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 
 	while (!list_empty(&tmp)) {
 		bh = BH_ENTRY(tmp.prev);
-		list_del_init(&bh->b_assoc_buffers);
 		get_bh(bh);
+		mapping = bh->b_assoc_map;
+		__remove_assoc_queue(bh);
+		/* Avoid race with mark_buffer_dirty_inode() which does
+		 * a lockless check and we rely on seeing the dirty bit */
+		smp_mb();
+		if (buffer_dirty(bh)) {
+			list_add(&bh->b_assoc_buffers,
+				 &mapping->private_list);
+			bh->b_assoc_map = mapping;
+		}
 		spin_unlock(lock);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh))
@@ -1195,7 +1210,7 @@ void __brelse(struct buffer_head * buf)
 void __bforget(struct buffer_head *bh)
 {
 	clear_buffer_dirty(bh);
-	if (!list_empty(&bh->b_assoc_buffers)) {
+	if (bh->b_assoc_map) {
 		struct address_space *buffer_mapping = bh->b_page->mapping;
 
 		spin_lock(&buffer_mapping->private_lock);
@@ -3022,7 +3037,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
 	do {
 		struct buffer_head *next = bh->b_this_page;
 
-		if (!list_empty(&bh->b_assoc_buffers))
+		if (bh->b_assoc_map)
 			__remove_assoc_queue(bh);
 		bh = next;
 	} while (bh != head);