aboutsummaryrefslogtreecommitdiffstats
path: root/fs/reiserfs/inode.c
diff options
context:
space:
mode:
authorFrederic Weisbecker <fweisbec@gmail.com>2009-04-06 22:19:49 -0400
committerFrederic Weisbecker <fweisbec@gmail.com>2009-09-14 01:17:59 -0400
commit8ebc423238341b52912c7295b045a32477b33f09 (patch)
tree39677401de0df98c09ca888c87d85033b6fe93c9 /fs/reiserfs/inode.c
parent74fca6a42863ffacaf7ba6f1936a9f228950f657 (diff)
reiserfs: kill-the-BKL
This patch is an attempt to remove the BKL-based locking scheme from reiserfs. It is partly inspired by an old attempt by Peter Zijlstra: http://lkml.indiana.edu/hypermail/linux/kernel/0704.2/2174.html The BKL is heavily used in this filesystem to prevent concurrent write accesses to the filesystem. Reiserfs makes deep use of two specific properties of the BKL: - It can be acquired recursively by the same task - It is released on schedule() calls and reacquired when schedule() returns The two properties above are a roadmap for the reiserfs write locking, so it's very hard to simply replace it with a common mutex. - We need a recursive lock unless we want to restructure several blocks of the code. - We need to identify the sites where the BKL was implicitly relaxed (schedule, wait, sync, etc...) so that we can in turn release and reacquire our new lock explicitly. Such implicit releases of the lock are often required to let other resource producers/consumers do their job; otherwise we can suffer unexpected starvations or deadlocks. So the new lock that replaces the BKL here is a per-superblock mutex with a specific property: it can be acquired recursively by the same task, like the BKL. For this purpose, we add a lock owner field and a lock depth field to the superblock information structure. The first axis of this patch is to turn the reiserfs_write_(un)lock() functions into wrappers that manage this mutex. Also, some explicit calls to lock_kernel() have been converted to the reiserfs_write_lock() helpers. The second axis is to find the important blocking sites (schedule...(), wait_on_buffer(), sync_dirty_buffer(), etc...) and then explicitly release the write lock at these locations before blocking. Then we can safely wait for those who can give us resources or those who need some. Typically this is a fight between the current writer, the reiserfs workqueue (aka the async committer) and the pdflush threads.
The third axis is a consequence of the second. The write lock is usually on top of a lock dependency chain which can include the journal lock, the flush lock or the commit lock. So it's dangerous to release and then try to reacquire the write lock while we still hold other locks.

This is fine with the BKL:

      T1                       T2

lock_kernel()
    mutex_lock(A)
    unlock_kernel()
    // do something
                            lock_kernel()
                                mutex_lock(A) -> already locked by T1
                                schedule() (and then unlock_kernel())
    lock_kernel()
    mutex_unlock(A)
    ....

This is not fine with a mutex:

      T1                       T2

mutex_lock(write)
    mutex_lock(A)
    mutex_unlock(write)
    // do something
                            mutex_lock(write)
                                mutex_lock(A) -> already locked by T1
                                schedule()

    mutex_lock(write) -> already locked by T2
    deadlock

The solution in this patch is to provide a helper which releases the write lock and sleeps a bit if we can't lock a mutex that depends on it. It's another simulation of the BKL behaviour.

The last axis is to locate the fs callbacks that are called with the BKL held, according to Documentation/filesystems/Locking. Those are:

- reiserfs_remount
- reiserfs_fill_super
- reiserfs_put_super

Reiserfs didn't need to explicitly lock because of the context of these callbacks. But now we must take care of that with the new locking.

After this patch, reiserfs suffers from a slight performance regression (for now). On UP, a high volume write with dd reports an average of 27 MB/s instead of 30 MB/s without the patch applied.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Bron Gondwana <brong@fastmail.fm>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
LKML-Reference: <1239070789-13354-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs/reiserfs/inode.c')
-rw-r--r--fs/reiserfs/inode.c23
1 files changed, 18 insertions, 5 deletions
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd9eeda..1893c8198439 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -489,10 +489,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
489 disappeared */ 489 disappeared */
490 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { 490 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
491 int err; 491 int err;
492 lock_kernel(); 492
493 reiserfs_write_lock(inode->i_sb);
494
493 err = reiserfs_commit_for_inode(inode); 495 err = reiserfs_commit_for_inode(inode);
494 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 496 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
495 unlock_kernel(); 497
498 reiserfs_write_unlock(inode->i_sb);
499
496 if (err < 0) 500 if (err < 0)
497 ret = err; 501 ret = err;
498 } 502 }
@@ -616,7 +620,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
616 loff_t new_offset = 620 loff_t new_offset =
617 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; 621 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
618 622
619 /* bad.... */
620 reiserfs_write_lock(inode->i_sb); 623 reiserfs_write_lock(inode->i_sb);
621 version = get_inode_item_key_version(inode); 624 version = get_inode_item_key_version(inode);
622 625
@@ -997,10 +1000,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
997 if (retval) 1000 if (retval)
998 goto failure; 1001 goto failure;
999 } 1002 }
1000 /* inserting indirect pointers for a hole can take a 1003 /*
1001 ** long time. reschedule if needed 1004 * inserting indirect pointers for a hole can take a
1005 * long time. reschedule if needed and also release the write
1006 * lock for others.
1002 */ 1007 */
1008 reiserfs_write_unlock(inode->i_sb);
1003 cond_resched(); 1009 cond_resched();
1010 reiserfs_write_lock(inode->i_sb);
1004 1011
1005 retval = search_for_position_by_key(inode->i_sb, &key, &path); 1012 retval = search_for_position_by_key(inode->i_sb, &key, &path);
1006 if (retval == IO_ERROR) { 1013 if (retval == IO_ERROR) {
@@ -2608,7 +2615,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
2608 int ret; 2615 int ret;
2609 int old_ref = 0; 2616 int old_ref = 0;
2610 2617
2618 reiserfs_write_unlock(inode->i_sb);
2611 reiserfs_wait_on_write_block(inode->i_sb); 2619 reiserfs_wait_on_write_block(inode->i_sb);
2620 reiserfs_write_lock(inode->i_sb);
2621
2612 fix_tail_page_for_writing(page); 2622 fix_tail_page_for_writing(page);
2613 if (reiserfs_transaction_running(inode->i_sb)) { 2623 if (reiserfs_transaction_running(inode->i_sb)) {
2614 struct reiserfs_transaction_handle *th; 2624 struct reiserfs_transaction_handle *th;
@@ -2758,7 +2768,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2758 int update_sd = 0; 2768 int update_sd = 0;
2759 struct reiserfs_transaction_handle *th = NULL; 2769 struct reiserfs_transaction_handle *th = NULL;
2760 2770
2771 reiserfs_write_unlock(inode->i_sb);
2761 reiserfs_wait_on_write_block(inode->i_sb); 2772 reiserfs_wait_on_write_block(inode->i_sb);
2773 reiserfs_write_lock(inode->i_sb);
2774
2762 if (reiserfs_transaction_running(inode->i_sb)) { 2775 if (reiserfs_transaction_running(inode->i_sb)) {
2763 th = current->journal_info; 2776 th = current->journal_info;
2764 } 2777 }