aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorFrederic Weisbecker <fweisbec@gmail.com>2009-04-06 22:19:49 -0400
committerFrederic Weisbecker <fweisbec@gmail.com>2009-09-14 01:17:59 -0400
commit8ebc423238341b52912c7295b045a32477b33f09 (patch)
tree39677401de0df98c09ca888c87d85033b6fe93c9 /fs
parent74fca6a42863ffacaf7ba6f1936a9f228950f657 (diff)
reiserfs: kill-the-BKL
This patch is an attempt to remove the Bkl based locking scheme from reiserfs and is intended. It is a bit inspired from an old attempt by Peter Zijlstra: http://lkml.indiana.edu/hypermail/linux/kernel/0704.2/2174.html The bkl is heavily used in this filesystem to prevent concurrent write accesses on the filesystem. Reiserfs makes a deep use of the specific properties of the Bkl: - It can be acquired recursively by a same task - It is released on the schedule() calls and reacquired when schedule() returns The two properties above are a roadmap for the reiserfs write locking so it's very hard to simply replace it with a common mutex. - We need a recursive-able locking unless we want to restructure several blocks of the code. - We need to identify the sites where the bkl was implicitly relaxed (schedule, wait, sync, etc...) so that we can in turn release and reacquire our new lock explicitly. Such implicit releases of the lock are often required to let other resource producers/consumers do their job or we can suffer unexpected starvations or deadlocks. So the new lock that replaces the bkl here is a per superblock mutex with a specific property: it can be acquired recursively by a same task, like the bkl. For such purpose, we integrate a lock owner and a lock depth field on the superblock information structure. The first axis on this patch is to turn the reiserfs_write_(un)lock() functions into wrappers to manage this mutex. Also some explicit calls to lock_kernel() have been converted to reiserfs_write_lock() helpers. The second axis is to find the important blocking sites (schedule...(), wait_on_buffer(), sync_dirty_buffer(), etc...) and then apply an explicit release of the write lock on these locations before blocking. Then we can safely wait for those who can give us resources or those who need some. Typically this is a fight between the current writer, the reiserfs workqueue (aka the async committer) and the pdflush threads. 
The third axis is a consequence of the second. The write lock is usually on top of a lock dependency chain which can include the journal lock, the flush lock or the commit lock. So it's dangerous to release and try to reacquire the write lock while we still hold other locks. This is fine with the bkl: T1 T2 lock_kernel() mutex_lock(A) unlock_kernel() // do something lock_kernel() mutex_lock(A) -> already locked by T1 schedule() (and then unlock_kernel()) lock_kernel() mutex_unlock(A) .... This is not fine with a mutex: T1 T2 mutex_lock(write) mutex_lock(A) mutex_unlock(write) // do something mutex_lock(write) mutex_lock(A) -> already locked by T1 schedule() mutex_lock(write) -> already locked by T2 deadlock The solution in this patch is to provide a helper which releases the write lock and sleeps a bit if we can't lock a mutex that depends on it. It's another simulation of the bkl behaviour. The last axis is to locate the fs callbacks that are called with the bkl held, according to Documentation/filesystems/Locking. Those are: - reiserfs_remount - reiserfs_fill_super - reiserfs_put_super Reiserfs didn't need to explicitly lock because of the context of these callbacks. But now we must take care of that with the new locking. After this patch, reiserfs suffers from a slight performance regression (for now). On UP, a high volume write with dd reports an average of 27 MB/s instead of 30 MB/s without the patch applied. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Reviewed-by: Ingo Molnar <mingo@elte.hu> Cc: Jeff Mahoney <jeffm@suse.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Bron Gondwana <brong@fastmail.fm> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> LKML-Reference: <1239070789-13354-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--fs/reiserfs/Makefile2
-rw-r--r--fs/reiserfs/bitmap.c2
-rw-r--r--fs/reiserfs/dir.c8
-rw-r--r--fs/reiserfs/fix_node.c10
-rw-r--r--fs/reiserfs/inode.c23
-rw-r--r--fs/reiserfs/ioctl.c6
-rw-r--r--fs/reiserfs/journal.c134
-rw-r--r--fs/reiserfs/lock.c63
-rw-r--r--fs/reiserfs/resize.c2
-rw-r--r--fs/reiserfs/stree.c2
-rw-r--r--fs/reiserfs/super.c37
11 files changed, 245 insertions, 44 deletions
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 7c5ab6330dd6..6a9e30c041dd 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
7reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ 7reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
8 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ 8 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
9 hashes.o tail_conversion.o journal.o resize.o \ 9 hashes.o tail_conversion.o journal.o resize.o \
10 item_ops.o ioctl.o procfs.o xattr.o 10 item_ops.o ioctl.o procfs.o xattr.o lock.o
11 11
12ifeq ($(CONFIG_REISERFS_FS_XATTR),y) 12ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
13reiserfs-objs += xattr_user.o xattr_trusted.o 13reiserfs-objs += xattr_user.o xattr_trusted.o
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index e716161ab325..147033461b87 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1256,7 +1256,9 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
1256 else { 1256 else {
1257 if (buffer_locked(bh)) { 1257 if (buffer_locked(bh)) {
1258 PROC_INFO_INC(sb, scan_bitmap.wait); 1258 PROC_INFO_INC(sb, scan_bitmap.wait);
1259 reiserfs_write_unlock(sb);
1259 __wait_on_buffer(bh); 1260 __wait_on_buffer(bh);
1261 reiserfs_write_lock(sb);
1260 } 1262 }
1261 BUG_ON(!buffer_uptodate(bh)); 1263 BUG_ON(!buffer_uptodate(bh));
1262 BUG_ON(atomic_read(&bh->b_count) == 0); 1264 BUG_ON(atomic_read(&bh->b_count) == 0);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6d2668fdc384..17f31ad379c8 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -174,14 +174,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
174 // user space buffer is swapped out. At that time 174 // user space buffer is swapped out. At that time
175 // entry can move to somewhere else 175 // entry can move to somewhere else
176 memcpy(local_buf, d_name, d_reclen); 176 memcpy(local_buf, d_name, d_reclen);
177
178 /*
179 * Since filldir might sleep, we can release
180 * the write lock here for other waiters
181 */
182 reiserfs_write_unlock(inode->i_sb);
177 if (filldir 183 if (filldir
178 (dirent, local_buf, d_reclen, d_off, d_ino, 184 (dirent, local_buf, d_reclen, d_off, d_ino,
179 DT_UNKNOWN) < 0) { 185 DT_UNKNOWN) < 0) {
186 reiserfs_write_lock(inode->i_sb);
180 if (local_buf != small_buf) { 187 if (local_buf != small_buf) {
181 kfree(local_buf); 188 kfree(local_buf);
182 } 189 }
183 goto end; 190 goto end;
184 } 191 }
192 reiserfs_write_lock(inode->i_sb);
185 if (local_buf != small_buf) { 193 if (local_buf != small_buf) {
186 kfree(local_buf); 194 kfree(local_buf);
187 } 195 }
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 5e5a4e6fbaf8..bf5f2cbdb063 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -1022,7 +1022,11 @@ static int get_far_parent(struct tree_balance *tb,
1022 /* Check whether the common parent is locked. */ 1022 /* Check whether the common parent is locked. */
1023 1023
1024 if (buffer_locked(*pcom_father)) { 1024 if (buffer_locked(*pcom_father)) {
1025
1026 /* Release the write lock while the buffer is busy */
1027 reiserfs_write_unlock(tb->tb_sb);
1025 __wait_on_buffer(*pcom_father); 1028 __wait_on_buffer(*pcom_father);
1029 reiserfs_write_lock(tb->tb_sb);
1026 if (FILESYSTEM_CHANGED_TB(tb)) { 1030 if (FILESYSTEM_CHANGED_TB(tb)) {
1027 brelse(*pcom_father); 1031 brelse(*pcom_father);
1028 return REPEAT_SEARCH; 1032 return REPEAT_SEARCH;
@@ -1927,7 +1931,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
1927 return REPEAT_SEARCH; 1931 return REPEAT_SEARCH;
1928 1932
1929 if (buffer_locked(bh)) { 1933 if (buffer_locked(bh)) {
1934 reiserfs_write_unlock(tb->tb_sb);
1930 __wait_on_buffer(bh); 1935 __wait_on_buffer(bh);
1936 reiserfs_write_lock(tb->tb_sb);
1931 if (FILESYSTEM_CHANGED_TB(tb)) 1937 if (FILESYSTEM_CHANGED_TB(tb))
1932 return REPEAT_SEARCH; 1938 return REPEAT_SEARCH;
1933 } 1939 }
@@ -2278,7 +2284,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2278 REPEAT_SEARCH : CARRY_ON; 2284 REPEAT_SEARCH : CARRY_ON;
2279 } 2285 }
2280#endif 2286#endif
2287 reiserfs_write_unlock(tb->tb_sb);
2281 __wait_on_buffer(locked); 2288 __wait_on_buffer(locked);
2289 reiserfs_write_lock(tb->tb_sb);
2282 if (FILESYSTEM_CHANGED_TB(tb)) 2290 if (FILESYSTEM_CHANGED_TB(tb))
2283 return REPEAT_SEARCH; 2291 return REPEAT_SEARCH;
2284 } 2292 }
@@ -2349,7 +2357,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
2349 2357
2350 /* if it possible in indirect_to_direct conversion */ 2358 /* if it possible in indirect_to_direct conversion */
2351 if (buffer_locked(tbS0)) { 2359 if (buffer_locked(tbS0)) {
2360 reiserfs_write_unlock(tb->tb_sb);
2352 __wait_on_buffer(tbS0); 2361 __wait_on_buffer(tbS0);
2362 reiserfs_write_lock(tb->tb_sb);
2353 if (FILESYSTEM_CHANGED_TB(tb)) 2363 if (FILESYSTEM_CHANGED_TB(tb))
2354 return REPEAT_SEARCH; 2364 return REPEAT_SEARCH;
2355 } 2365 }
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd9eeda..1893c8198439 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -489,10 +489,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
489 disappeared */ 489 disappeared */
490 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { 490 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
491 int err; 491 int err;
492 lock_kernel(); 492
493 reiserfs_write_lock(inode->i_sb);
494
493 err = reiserfs_commit_for_inode(inode); 495 err = reiserfs_commit_for_inode(inode);
494 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 496 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
495 unlock_kernel(); 497
498 reiserfs_write_unlock(inode->i_sb);
499
496 if (err < 0) 500 if (err < 0)
497 ret = err; 501 ret = err;
498 } 502 }
@@ -616,7 +620,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
616 loff_t new_offset = 620 loff_t new_offset =
617 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; 621 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
618 622
619 /* bad.... */
620 reiserfs_write_lock(inode->i_sb); 623 reiserfs_write_lock(inode->i_sb);
621 version = get_inode_item_key_version(inode); 624 version = get_inode_item_key_version(inode);
622 625
@@ -997,10 +1000,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
997 if (retval) 1000 if (retval)
998 goto failure; 1001 goto failure;
999 } 1002 }
1000 /* inserting indirect pointers for a hole can take a 1003 /*
1001 ** long time. reschedule if needed 1004 * inserting indirect pointers for a hole can take a
1005 * long time. reschedule if needed and also release the write
1006 * lock for others.
1002 */ 1007 */
1008 reiserfs_write_unlock(inode->i_sb);
1003 cond_resched(); 1009 cond_resched();
1010 reiserfs_write_lock(inode->i_sb);
1004 1011
1005 retval = search_for_position_by_key(inode->i_sb, &key, &path); 1012 retval = search_for_position_by_key(inode->i_sb, &key, &path);
1006 if (retval == IO_ERROR) { 1013 if (retval == IO_ERROR) {
@@ -2608,7 +2615,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
2608 int ret; 2615 int ret;
2609 int old_ref = 0; 2616 int old_ref = 0;
2610 2617
2618 reiserfs_write_unlock(inode->i_sb);
2611 reiserfs_wait_on_write_block(inode->i_sb); 2619 reiserfs_wait_on_write_block(inode->i_sb);
2620 reiserfs_write_lock(inode->i_sb);
2621
2612 fix_tail_page_for_writing(page); 2622 fix_tail_page_for_writing(page);
2613 if (reiserfs_transaction_running(inode->i_sb)) { 2623 if (reiserfs_transaction_running(inode->i_sb)) {
2614 struct reiserfs_transaction_handle *th; 2624 struct reiserfs_transaction_handle *th;
@@ -2758,7 +2768,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2758 int update_sd = 0; 2768 int update_sd = 0;
2759 struct reiserfs_transaction_handle *th = NULL; 2769 struct reiserfs_transaction_handle *th = NULL;
2760 2770
2771 reiserfs_write_unlock(inode->i_sb);
2761 reiserfs_wait_on_write_block(inode->i_sb); 2772 reiserfs_wait_on_write_block(inode->i_sb);
2773 reiserfs_write_lock(inode->i_sb);
2774
2762 if (reiserfs_transaction_running(inode->i_sb)) { 2775 if (reiserfs_transaction_running(inode->i_sb)) {
2763 th = current->journal_info; 2776 th = current->journal_info;
2764 } 2777 }
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 0ccc3fdda7bf..5e40b0cd4c3d 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -141,9 +141,11 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
141 default: 141 default:
142 return -ENOIOCTLCMD; 142 return -ENOIOCTLCMD;
143 } 143 }
144 lock_kernel(); 144
145 reiserfs_write_lock(inode->i_sb);
145 ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); 146 ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
146 unlock_kernel(); 147 reiserfs_write_unlock(inode->i_sb);
148
147 return ret; 149 return ret;
148} 150}
149#endif 151#endif
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 90622200b39c..438c71f0bc91 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -429,21 +429,6 @@ static void clear_prepared_bits(struct buffer_head *bh)
429 clear_buffer_journal_restore_dirty(bh); 429 clear_buffer_journal_restore_dirty(bh);
430} 430}
431 431
432/* utility function to force a BUG if it is called without the big
433** kernel lock held. caller is the string printed just before calling BUG()
434*/
435void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
436{
437#ifdef CONFIG_SMP
438 if (current->lock_depth < 0) {
439 reiserfs_panic(sb, "journal-1", "%s called without kernel "
440 "lock held", caller);
441 }
442#else
443 ;
444#endif
445}
446
447/* return a cnode with same dev, block number and size in table, or null if not found */ 432/* return a cnode with same dev, block number and size in table, or null if not found */
448static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct 433static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
449 super_block 434 super_block
@@ -552,11 +537,48 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
552 journal_hash(table, cn->sb, cn->blocknr) = cn; 537 journal_hash(table, cn->sb, cn->blocknr) = cn;
553} 538}
554 539
540/*
541 * Several mutexes depend on the write lock.
542 * However sometimes we want to relax the write lock while we hold
543 * these mutexes, according to the release/reacquire on schedule()
544 * properties of the Bkl that were used.
545 * Reiserfs performances and locking were based on this scheme.
546 * Now that the write lock is a mutex and not the bkl anymore, doing so
547 * may result in a deadlock:
548 *
549 * A acquire write_lock
550 * A acquire j_commit_mutex
551 * A release write_lock and wait for something
552 * B acquire write_lock
553 * B can't acquire j_commit_mutex and sleep
554 * A can't acquire write lock anymore
555 * deadlock
556 *
557 * What we do here is avoiding such deadlock by playing the same game
558 * than the Bkl: if we can't acquire a mutex that depends on the write lock,
559 * we release the write lock, wait a bit and then retry.
560 *
561 * The mutexes concerned by this hack are:
562 * - The commit mutex of a journal list
563 * - The flush mutex
564 * - The journal lock
565 */
static inline void reiserfs_mutex_lock_safe(struct mutex *m,
					    struct super_block *s)
{
	for (;;) {
		/* Non-blocking attempt; stop as soon as the mutex is ours. */
		if (mutex_trylock(m))
			return;

		/*
		 * The mutex is contended: drop the superblock write lock
		 * before yielding the CPU, and take it back before the next
		 * attempt, so we never sleep on @m while holding it.
		 */
		reiserfs_write_unlock(s);
		schedule();
		reiserfs_write_lock(s);
	}
}
575
555/* lock the current transaction */ 576/* lock the current transaction */
556static inline void lock_journal(struct super_block *sb) 577static inline void lock_journal(struct super_block *sb)
557{ 578{
558 PROC_INFO_INC(sb, journal.lock_journal); 579 PROC_INFO_INC(sb, journal.lock_journal);
559 mutex_lock(&SB_JOURNAL(sb)->j_mutex); 580
581 reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
560} 582}
561 583
562/* unlock the current transaction */ 584/* unlock the current transaction */
@@ -708,7 +730,9 @@ static void check_barrier_completion(struct super_block *s,
708 disable_barrier(s); 730 disable_barrier(s);
709 set_buffer_uptodate(bh); 731 set_buffer_uptodate(bh);
710 set_buffer_dirty(bh); 732 set_buffer_dirty(bh);
733 reiserfs_write_unlock(s);
711 sync_dirty_buffer(bh); 734 sync_dirty_buffer(bh);
735 reiserfs_write_lock(s);
712 } 736 }
713} 737}
714 738
@@ -996,8 +1020,13 @@ static int reiserfs_async_progress_wait(struct super_block *s)
996{ 1020{
997 DEFINE_WAIT(wait); 1021 DEFINE_WAIT(wait);
998 struct reiserfs_journal *j = SB_JOURNAL(s); 1022 struct reiserfs_journal *j = SB_JOURNAL(s);
999 if (atomic_read(&j->j_async_throttle)) 1023
1024 if (atomic_read(&j->j_async_throttle)) {
1025 reiserfs_write_unlock(s);
1000 congestion_wait(BLK_RW_ASYNC, HZ / 10); 1026 congestion_wait(BLK_RW_ASYNC, HZ / 10);
1027 reiserfs_write_lock(s);
1028 }
1029
1001 return 0; 1030 return 0;
1002} 1031}
1003 1032
@@ -1043,7 +1072,8 @@ static int flush_commit_list(struct super_block *s,
1043 } 1072 }
1044 1073
1045 /* make sure nobody is trying to flush this one at the same time */ 1074 /* make sure nobody is trying to flush this one at the same time */
1046 mutex_lock(&jl->j_commit_mutex); 1075 reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
1076
1047 if (!journal_list_still_alive(s, trans_id)) { 1077 if (!journal_list_still_alive(s, trans_id)) {
1048 mutex_unlock(&jl->j_commit_mutex); 1078 mutex_unlock(&jl->j_commit_mutex);
1049 goto put_jl; 1079 goto put_jl;
@@ -1061,12 +1091,17 @@ static int flush_commit_list(struct super_block *s,
1061 1091
1062 if (!list_empty(&jl->j_bh_list)) { 1092 if (!list_empty(&jl->j_bh_list)) {
1063 int ret; 1093 int ret;
1064 unlock_kernel(); 1094
1095 /*
1096 * We might sleep in numerous places inside
1097 * write_ordered_buffers. Relax the write lock.
1098 */
1099 reiserfs_write_unlock(s);
1065 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, 1100 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
1066 journal, jl, &jl->j_bh_list); 1101 journal, jl, &jl->j_bh_list);
1067 if (ret < 0 && retval == 0) 1102 if (ret < 0 && retval == 0)
1068 retval = ret; 1103 retval = ret;
1069 lock_kernel(); 1104 reiserfs_write_lock(s);
1070 } 1105 }
1071 BUG_ON(!list_empty(&jl->j_bh_list)); 1106 BUG_ON(!list_empty(&jl->j_bh_list));
1072 /* 1107 /*
@@ -1114,12 +1149,19 @@ static int flush_commit_list(struct super_block *s,
1114 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + 1149 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1115 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); 1150 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
1116 tbh = journal_find_get_block(s, bn); 1151 tbh = journal_find_get_block(s, bn);
1152
1153 reiserfs_write_unlock(s);
1117 wait_on_buffer(tbh); 1154 wait_on_buffer(tbh);
1155 reiserfs_write_lock(s);
1118 // since we're using ll_rw_blk above, it might have skipped over 1156 // since we're using ll_rw_blk above, it might have skipped over
1119 // a locked buffer. Double check here 1157 // a locked buffer. Double check here
1120 // 1158 //
1121 if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ 1159 /* redundant, sync_dirty_buffer() checks */
1160 if (buffer_dirty(tbh)) {
1161 reiserfs_write_unlock(s);
1122 sync_dirty_buffer(tbh); 1162 sync_dirty_buffer(tbh);
1163 reiserfs_write_lock(s);
1164 }
1123 if (unlikely(!buffer_uptodate(tbh))) { 1165 if (unlikely(!buffer_uptodate(tbh))) {
1124#ifdef CONFIG_REISERFS_CHECK 1166#ifdef CONFIG_REISERFS_CHECK
1125 reiserfs_warning(s, "journal-601", 1167 reiserfs_warning(s, "journal-601",
@@ -1143,10 +1185,15 @@ static int flush_commit_list(struct super_block *s,
1143 if (buffer_dirty(jl->j_commit_bh)) 1185 if (buffer_dirty(jl->j_commit_bh))
1144 BUG(); 1186 BUG();
1145 mark_buffer_dirty(jl->j_commit_bh) ; 1187 mark_buffer_dirty(jl->j_commit_bh) ;
1188 reiserfs_write_unlock(s);
1146 sync_dirty_buffer(jl->j_commit_bh) ; 1189 sync_dirty_buffer(jl->j_commit_bh) ;
1190 reiserfs_write_lock(s);
1147 } 1191 }
1148 } else 1192 } else {
1193 reiserfs_write_unlock(s);
1149 wait_on_buffer(jl->j_commit_bh); 1194 wait_on_buffer(jl->j_commit_bh);
1195 reiserfs_write_lock(s);
1196 }
1150 1197
1151 check_barrier_completion(s, jl->j_commit_bh); 1198 check_barrier_completion(s, jl->j_commit_bh);
1152 1199
@@ -1286,7 +1333,9 @@ static int _update_journal_header_block(struct super_block *sb,
1286 1333
1287 if (trans_id >= journal->j_last_flush_trans_id) { 1334 if (trans_id >= journal->j_last_flush_trans_id) {
1288 if (buffer_locked((journal->j_header_bh))) { 1335 if (buffer_locked((journal->j_header_bh))) {
1336 reiserfs_write_unlock(sb);
1289 wait_on_buffer((journal->j_header_bh)); 1337 wait_on_buffer((journal->j_header_bh));
1338 reiserfs_write_lock(sb);
1290 if (unlikely(!buffer_uptodate(journal->j_header_bh))) { 1339 if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
1291#ifdef CONFIG_REISERFS_CHECK 1340#ifdef CONFIG_REISERFS_CHECK
1292 reiserfs_warning(sb, "journal-699", 1341 reiserfs_warning(sb, "journal-699",
@@ -1312,12 +1361,16 @@ static int _update_journal_header_block(struct super_block *sb,
1312 disable_barrier(sb); 1361 disable_barrier(sb);
1313 goto sync; 1362 goto sync;
1314 } 1363 }
1364 reiserfs_write_unlock(sb);
1315 wait_on_buffer(journal->j_header_bh); 1365 wait_on_buffer(journal->j_header_bh);
1366 reiserfs_write_lock(sb);
1316 check_barrier_completion(sb, journal->j_header_bh); 1367 check_barrier_completion(sb, journal->j_header_bh);
1317 } else { 1368 } else {
1318 sync: 1369 sync:
1319 set_buffer_dirty(journal->j_header_bh); 1370 set_buffer_dirty(journal->j_header_bh);
1371 reiserfs_write_unlock(sb);
1320 sync_dirty_buffer(journal->j_header_bh); 1372 sync_dirty_buffer(journal->j_header_bh);
1373 reiserfs_write_lock(sb);
1321 } 1374 }
1322 if (!buffer_uptodate(journal->j_header_bh)) { 1375 if (!buffer_uptodate(journal->j_header_bh)) {
1323 reiserfs_warning(sb, "journal-837", 1376 reiserfs_warning(sb, "journal-837",
@@ -1409,7 +1462,7 @@ static int flush_journal_list(struct super_block *s,
1409 1462
1410 /* if flushall == 0, the lock is already held */ 1463 /* if flushall == 0, the lock is already held */
1411 if (flushall) { 1464 if (flushall) {
1412 mutex_lock(&journal->j_flush_mutex); 1465 reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1413 } else if (mutex_trylock(&journal->j_flush_mutex)) { 1466 } else if (mutex_trylock(&journal->j_flush_mutex)) {
1414 BUG(); 1467 BUG();
1415 } 1468 }
@@ -1553,7 +1606,11 @@ static int flush_journal_list(struct super_block *s,
1553 reiserfs_panic(s, "journal-1011", 1606 reiserfs_panic(s, "journal-1011",
1554 "cn->bh is NULL"); 1607 "cn->bh is NULL");
1555 } 1608 }
1609
1610 reiserfs_write_unlock(s);
1556 wait_on_buffer(cn->bh); 1611 wait_on_buffer(cn->bh);
1612 reiserfs_write_lock(s);
1613
1557 if (!cn->bh) { 1614 if (!cn->bh) {
1558 reiserfs_panic(s, "journal-1012", 1615 reiserfs_panic(s, "journal-1012",
1559 "cn->bh is NULL"); 1616 "cn->bh is NULL");
@@ -1973,11 +2030,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1973 reiserfs_mounted_fs_count--; 2030 reiserfs_mounted_fs_count--;
1974 /* wait for all commits to finish */ 2031 /* wait for all commits to finish */
1975 cancel_delayed_work(&SB_JOURNAL(sb)->j_work); 2032 cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
2033
2034 /*
2035 * We must release the write lock here because
2036 * the workqueue job (flush_async_commit) needs this lock
2037 */
2038 reiserfs_write_unlock(sb);
1976 flush_workqueue(commit_wq); 2039 flush_workqueue(commit_wq);
2040
1977 if (!reiserfs_mounted_fs_count) { 2041 if (!reiserfs_mounted_fs_count) {
1978 destroy_workqueue(commit_wq); 2042 destroy_workqueue(commit_wq);
1979 commit_wq = NULL; 2043 commit_wq = NULL;
1980 } 2044 }
2045 reiserfs_write_lock(sb);
1981 2046
1982 free_journal_ram(sb); 2047 free_journal_ram(sb);
1983 2048
@@ -2243,7 +2308,11 @@ static int journal_read_transaction(struct super_block *sb,
2243 /* read in the log blocks, memcpy to the corresponding real block */ 2308 /* read in the log blocks, memcpy to the corresponding real block */
2244 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); 2309 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
2245 for (i = 0; i < get_desc_trans_len(desc); i++) { 2310 for (i = 0; i < get_desc_trans_len(desc); i++) {
2311
2312 reiserfs_write_unlock(sb);
2246 wait_on_buffer(log_blocks[i]); 2313 wait_on_buffer(log_blocks[i]);
2314 reiserfs_write_lock(sb);
2315
2247 if (!buffer_uptodate(log_blocks[i])) { 2316 if (!buffer_uptodate(log_blocks[i])) {
2248 reiserfs_warning(sb, "journal-1212", 2317 reiserfs_warning(sb, "journal-1212",
2249 "REPLAY FAILURE fsck required! " 2318 "REPLAY FAILURE fsck required! "
@@ -2964,8 +3033,11 @@ static void queue_log_writer(struct super_block *s)
2964 init_waitqueue_entry(&wait, current); 3033 init_waitqueue_entry(&wait, current);
2965 add_wait_queue(&journal->j_join_wait, &wait); 3034 add_wait_queue(&journal->j_join_wait, &wait);
2966 set_current_state(TASK_UNINTERRUPTIBLE); 3035 set_current_state(TASK_UNINTERRUPTIBLE);
2967 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) 3036 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
3037 reiserfs_write_unlock(s);
2968 schedule(); 3038 schedule();
3039 reiserfs_write_lock(s);
3040 }
2969 __set_current_state(TASK_RUNNING); 3041 __set_current_state(TASK_RUNNING);
2970 remove_wait_queue(&journal->j_join_wait, &wait); 3042 remove_wait_queue(&journal->j_join_wait, &wait);
2971} 3043}
@@ -2982,7 +3054,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
2982 struct reiserfs_journal *journal = SB_JOURNAL(sb); 3054 struct reiserfs_journal *journal = SB_JOURNAL(sb);
2983 unsigned long bcount = journal->j_bcount; 3055 unsigned long bcount = journal->j_bcount;
2984 while (1) { 3056 while (1) {
3057 reiserfs_write_unlock(sb);
2985 schedule_timeout_uninterruptible(1); 3058 schedule_timeout_uninterruptible(1);
3059 reiserfs_write_lock(sb);
2986 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; 3060 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
2987 while ((atomic_read(&journal->j_wcount) > 0 || 3061 while ((atomic_read(&journal->j_wcount) > 0 ||
2988 atomic_read(&journal->j_jlock)) && 3062 atomic_read(&journal->j_jlock)) &&
@@ -3033,7 +3107,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3033 3107
3034 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { 3108 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
3035 unlock_journal(sb); 3109 unlock_journal(sb);
3110 reiserfs_write_unlock(sb);
3036 reiserfs_wait_on_write_block(sb); 3111 reiserfs_wait_on_write_block(sb);
3112 reiserfs_write_lock(sb);
3037 PROC_INFO_INC(sb, journal.journal_relock_writers); 3113 PROC_INFO_INC(sb, journal.journal_relock_writers);
3038 goto relock; 3114 goto relock;
3039 } 3115 }
@@ -3506,14 +3582,14 @@ static void flush_async_commits(struct work_struct *work)
3506 struct reiserfs_journal_list *jl; 3582 struct reiserfs_journal_list *jl;
3507 struct list_head *entry; 3583 struct list_head *entry;
3508 3584
3509 lock_kernel(); 3585 reiserfs_write_lock(sb);
3510 if (!list_empty(&journal->j_journal_list)) { 3586 if (!list_empty(&journal->j_journal_list)) {
3511 /* last entry is the youngest, commit it and you get everything */ 3587 /* last entry is the youngest, commit it and you get everything */
3512 entry = journal->j_journal_list.prev; 3588 entry = journal->j_journal_list.prev;
3513 jl = JOURNAL_LIST_ENTRY(entry); 3589 jl = JOURNAL_LIST_ENTRY(entry);
3514 flush_commit_list(sb, jl, 1); 3590 flush_commit_list(sb, jl, 1);
3515 } 3591 }
3516 unlock_kernel(); 3592 reiserfs_write_unlock(sb);
3517} 3593}
3518 3594
3519/* 3595/*
@@ -4041,7 +4117,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4041 * the new transaction is fully setup, and we've already flushed the 4117 * the new transaction is fully setup, and we've already flushed the
4042 * ordered bh list 4118 * ordered bh list
4043 */ 4119 */
4044 mutex_lock(&jl->j_commit_mutex); 4120 reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
4045 4121
4046 /* save the transaction id in case we need to commit it later */ 4122 /* save the transaction id in case we need to commit it later */
4047 commit_trans_id = jl->j_trans_id; 4123 commit_trans_id = jl->j_trans_id;
@@ -4203,10 +4279,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4203 * is lost. 4279 * is lost.
4204 */ 4280 */
4205 if (!list_empty(&jl->j_tail_bh_list)) { 4281 if (!list_empty(&jl->j_tail_bh_list)) {
4206 unlock_kernel(); 4282 reiserfs_write_unlock(sb);
4207 write_ordered_buffers(&journal->j_dirty_buffers_lock, 4283 write_ordered_buffers(&journal->j_dirty_buffers_lock,
4208 journal, jl, &jl->j_tail_bh_list); 4284 journal, jl, &jl->j_tail_bh_list);
4209 lock_kernel(); 4285 reiserfs_write_lock(sb);
4210 } 4286 }
4211 BUG_ON(!list_empty(&jl->j_tail_bh_list)); 4287 BUG_ON(!list_empty(&jl->j_tail_bh_list));
4212 mutex_unlock(&jl->j_commit_mutex); 4288 mutex_unlock(&jl->j_commit_mutex);
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
new file mode 100644
index 000000000000..cdd8d9ef048e
--- /dev/null
+++ b/fs/reiserfs/lock.c
@@ -0,0 +1,63 @@
1#include <linux/reiserfs_fs.h>
2#include <linux/mutex.h>
3
4/*
5 * The previous reiserfs locking scheme was heavily based on
6 * the tricky properties of the Bkl:
7 *
8 * - it was acquired recursively by a same task
9 * - the performances relied on the release-while-schedule() property
10 *
11 * Now that we replace it by a mutex, we still want to keep the same
12 * recursive property to avoid big changes in the code structure.
13 * We use our own lock_owner here because the owner field on a mutex
14 * is only available in SMP or mutex debugging, also we only need this field
15 * for this mutex, no need for a system wide mutex facility.
16 *
17 * Also this lock is often released before a call that could block because
18 * reiserfs performances were partialy based on the release while schedule()
19 * property of the Bkl.
20 */
21void reiserfs_write_lock(struct super_block *s)
22{
23 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
24
25 if (sb_i->lock_owner != current) {
26 mutex_lock(&sb_i->lock);
27 sb_i->lock_owner = current;
28 }
29
30 /* No need to protect it, only the current task touches it */
31 sb_i->lock_depth++;
32}
33
34void reiserfs_write_unlock(struct super_block *s)
35{
36 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
37
38 /*
39 * Are we unlocking without even holding the lock?
40 * Such a situation could even raise a BUG() if we don't
41 * want the data become corrupted
42 */
43 WARN_ONCE(sb_i->lock_owner != current,
44 "Superblock write lock imbalance");
45
46 if (--sb_i->lock_depth == -1) {
47 sb_i->lock_owner = NULL;
48 mutex_unlock(&sb_i->lock);
49 }
50}
51
52/*
53 * Utility function to force a BUG if it is called without the superblock
54 * write lock held. caller is the string printed just before calling BUG()
55 */
56void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
57{
58 struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
59
60 if (sb_i->lock_depth < 0)
61 reiserfs_panic(sb, "%s called without kernel lock held %d",
62 caller);
63}
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 18b315d3d104..b3a94d20f0fc 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
141 141
142 set_buffer_uptodate(bh); 142 set_buffer_uptodate(bh);
143 mark_buffer_dirty(bh); 143 mark_buffer_dirty(bh);
144 reiserfs_write_unlock(s);
144 sync_dirty_buffer(bh); 145 sync_dirty_buffer(bh);
146 reiserfs_write_lock(s);
145 // update bitmap_info stuff 147 // update bitmap_info stuff
146 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; 148 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
147 brelse(bh); 149 brelse(bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index d036ee5b1c81..6bd99a99a652 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -629,7 +629,9 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
629 search_by_key_reada(sb, reada_bh, 629 search_by_key_reada(sb, reada_bh,
630 reada_blocks, reada_count); 630 reada_blocks, reada_count);
631 ll_rw_block(READ, 1, &bh); 631 ll_rw_block(READ, 1, &bh);
632 reiserfs_write_unlock(sb);
632 wait_on_buffer(bh); 633 wait_on_buffer(bh);
634 reiserfs_write_lock(sb);
633 if (!buffer_uptodate(bh)) 635 if (!buffer_uptodate(bh))
634 goto io_error; 636 goto io_error;
635 } else { 637 } else {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7adea74d6a8a..e1cfb80d0bf3 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -465,7 +465,7 @@ static void reiserfs_put_super(struct super_block *s)
465 struct reiserfs_transaction_handle th; 465 struct reiserfs_transaction_handle th;
466 th.t_trans_id = 0; 466 th.t_trans_id = 0;
467 467
468 lock_kernel(); 468 reiserfs_write_lock(s);
469 469
470 if (s->s_dirt) 470 if (s->s_dirt)
471 reiserfs_write_super(s); 471 reiserfs_write_super(s);
@@ -499,10 +499,10 @@ static void reiserfs_put_super(struct super_block *s)
499 499
500 reiserfs_proc_info_done(s); 500 reiserfs_proc_info_done(s);
501 501
502 reiserfs_write_unlock(s);
503 mutex_destroy(&REISERFS_SB(s)->lock);
502 kfree(s->s_fs_info); 504 kfree(s->s_fs_info);
503 s->s_fs_info = NULL; 505 s->s_fs_info = NULL;
504
505 unlock_kernel();
506} 506}
507 507
508static struct kmem_cache *reiserfs_inode_cachep; 508static struct kmem_cache *reiserfs_inode_cachep;
@@ -1168,11 +1168,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1168 unsigned int qfmt = 0; 1168 unsigned int qfmt = 0;
1169#ifdef CONFIG_QUOTA 1169#ifdef CONFIG_QUOTA
1170 int i; 1170 int i;
1171#endif
1172
1173 reiserfs_write_lock(s);
1171 1174
1175#ifdef CONFIG_QUOTA
1172 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); 1176 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
1173#endif 1177#endif
1174 1178
1175 lock_kernel();
1176 rs = SB_DISK_SUPER_BLOCK(s); 1179 rs = SB_DISK_SUPER_BLOCK(s);
1177 1180
1178 if (!reiserfs_parse_options 1181 if (!reiserfs_parse_options
@@ -1295,12 +1298,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1295 1298
1296out_ok: 1299out_ok:
1297 replace_mount_options(s, new_opts); 1300 replace_mount_options(s, new_opts);
1298 unlock_kernel(); 1301 reiserfs_write_unlock(s);
1299 return 0; 1302 return 0;
1300 1303
1301out_err: 1304out_err:
1302 kfree(new_opts); 1305 kfree(new_opts);
1303 unlock_kernel(); 1306 reiserfs_write_unlock(s);
1304 return err; 1307 return err;
1305} 1308}
1306 1309
@@ -1404,7 +1407,9 @@ static int read_super_block(struct super_block *s, int offset)
1404static int reread_meta_blocks(struct super_block *s) 1407static int reread_meta_blocks(struct super_block *s)
1405{ 1408{
1406 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); 1409 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
1410 reiserfs_write_unlock(s);
1407 wait_on_buffer(SB_BUFFER_WITH_SB(s)); 1411 wait_on_buffer(SB_BUFFER_WITH_SB(s));
1412 reiserfs_write_lock(s);
1408 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { 1413 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
1409 reiserfs_warning(s, "reiserfs-2504", "error reading the super"); 1414 reiserfs_warning(s, "reiserfs-2504", "error reading the super");
1410 return 1; 1415 return 1;
@@ -1613,7 +1618,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1613 sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); 1618 sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
1614 if (!sbi) { 1619 if (!sbi) {
1615 errval = -ENOMEM; 1620 errval = -ENOMEM;
1616 goto error; 1621 goto error_alloc;
1617 } 1622 }
1618 s->s_fs_info = sbi; 1623 s->s_fs_info = sbi;
1619 /* Set default values for options: non-aggressive tails, RO on errors */ 1624 /* Set default values for options: non-aggressive tails, RO on errors */
@@ -1627,6 +1632,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1627 /* setup default block allocator options */ 1632 /* setup default block allocator options */
1628 reiserfs_init_alloc_options(s); 1633 reiserfs_init_alloc_options(s);
1629 1634
1635 mutex_init(&REISERFS_SB(s)->lock);
1636 REISERFS_SB(s)->lock_depth = -1;
1637
1638 /*
1639 * This function is called with the bkl, which also was the old
1640 * locking used here.
1641 * do_journal_begin() will soon check that we hold the lock (ie: what
1642 * used to be the bkl). That check is likely there because
1643 * do_journal_begin() has several other callers; at this time, it
1644 * doesn't seem to be necessary to protect against anything here.
1645 * Anyway, let's be conservative and lock for now.
1646 */
1647 reiserfs_write_lock(s);
1648
1630 jdev_name = NULL; 1649 jdev_name = NULL;
1631 if (reiserfs_parse_options 1650 if (reiserfs_parse_options
1632 (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, 1651 (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
@@ -1852,9 +1871,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1852 init_waitqueue_head(&(sbi->s_wait)); 1871 init_waitqueue_head(&(sbi->s_wait));
1853 spin_lock_init(&sbi->bitmap_lock); 1872 spin_lock_init(&sbi->bitmap_lock);
1854 1873
1874 reiserfs_write_unlock(s);
1875
1855 return (0); 1876 return (0);
1856 1877
1857error: 1878error:
1879 reiserfs_write_unlock(s);
1880error_alloc:
1858 if (jinit_done) { /* kill the commit thread, free journal ram */ 1881 if (jinit_done) { /* kill the commit thread, free journal ram */
1859 journal_release_error(NULL, s); 1882 journal_release_error(NULL, s);
1860 } 1883 }