From 5e64b0d9e86ffff8b299556341d85319117539e9 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 7 Sep 2010 13:30:05 +0800 Subject: ocfs2/lockdep: Move ip_xattr_sem out of ocfs2_xattr_get_nolock. As the name shows, we shouldn't have any lock in ocfs2_xattr_get_nolock. so lift ip_xattr_sem to the caller. This should be safe for us since the only 2 callers are: 1. ocfs2_xattr_get which will lock the resources. 2. ocfs2_mknod which don't need this locking. And this also resolves the following lockdep warning. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.35+ #5 ------------------------------------------------------- reflink/30027 is trying to acquire lock: (&oi->ip_alloc_sem){+.+.+.}, at: [] ocfs2_reflink_ioctl+0x69a/0x1226 [ocfs2] but task is already holding lock: (&oi->ip_xattr_sem){++++..}, at: [] ocfs2_reflink_ioctl+0x68b/0x1226 [ocfs2] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&oi->ip_xattr_sem){++++..}: [] __lock_acquire+0x79a/0x7f1 [] lock_acquire+0xc6/0xed [] down_read+0x34/0x47 [] ocfs2_xattr_get_nolock+0xa0/0x4e6 [ocfs2] [] ocfs2_get_acl_nolock+0x5c/0x132 [ocfs2] [] ocfs2_init_acl+0x60/0x243 [ocfs2] [] ocfs2_mknod+0xae8/0xfea [ocfs2] [] ocfs2_create+0x9d/0x105 [ocfs2] [] vfs_create+0x9b/0xf4 [] do_last+0x2fd/0x5be [] do_filp_open+0x1fb/0x572 [] do_sys_open+0x5a/0xe7 [] sys_open+0x1b/0x1d [] system_call_fastpath+0x16/0x1b -> #2 (jbd2_handle){+.+...}: [] __lock_acquire+0x79a/0x7f1 [] lock_acquire+0xc6/0xed [] start_this_handle+0x4a3/0x4bc [jbd2] [] jbd2__journal_start+0xba/0xee [jbd2] [] jbd2_journal_start+0xe/0x10 [jbd2] [] ocfs2_start_trans+0xb7/0x19b [ocfs2] [] ocfs2_mknod+0x73e/0xfea [ocfs2] [] ocfs2_create+0x9d/0x105 [ocfs2] [] vfs_create+0x9b/0xf4 [] do_last+0x2fd/0x5be [] do_filp_open+0x1fb/0x572 [] do_sys_open+0x5a/0xe7 [] sys_open+0x1b/0x1d [] system_call_fastpath+0x16/0x1b -> #1 (&journal->j_trans_barrier){.+.+..}: [] __lock_acquire+0x79a/0x7f1 [] lock_release_non_nested+0x1e5/0x24b [] lock_release+0x158/0x17a [] __mutex_unlock_slowpath+0xbf/0x11b [] mutex_unlock+0x9/0xb [] ocfs2_free_ac_resource+0x31/0x67 [ocfs2] [] ocfs2_free_alloc_context+0x11/0x1d [ocfs2] [] ocfs2_write_begin_nolock+0x141e/0x159b [ocfs2] [] ocfs2_write_begin+0x11e/0x1e7 [ocfs2] [] generic_file_buffered_write+0x10c/0x210 [] ocfs2_file_aio_write+0x4cc/0x6d3 [ocfs2] [] do_sync_write+0xc2/0x106 [] vfs_write+0xae/0x131 [] sys_write+0x47/0x6f [] system_call_fastpath+0x16/0x1b -> #0 (&oi->ip_alloc_sem){+.+.+.}: [] validate_chain+0x727/0xd68 [] __lock_acquire+0x79a/0x7f1 [] lock_acquire+0xc6/0xed [] down_write+0x31/0x52 [] ocfs2_reflink_ioctl+0x69a/0x1226 [ocfs2] [] ocfs2_ioctl+0x61a/0x656 [ocfs2] [] vfs_ioctl+0x2a/0x9d [] do_vfs_ioctl+0x45d/0x4ae [] sys_ioctl+0x57/0x7a [] system_call_fastpath+0x16/0x1b Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d03469f61801..06fa5e77c40e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode, xis.inode_bh = xbs.inode_bh = di_bh; di = (struct ocfs2_dinode *)di_bh->b_data; - down_read(&oi->ip_xattr_sem); ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, buffer_size, &xis); if (ret == -ENODATA && di->i_xattr_loc) ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, buffer_size, &xbs); - up_read(&oi->ip_xattr_sem); return ret; } @@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode, mlog_errno(ret); return ret; } + down_read(&OCFS2_I(inode)->ip_xattr_sem); ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, name, buffer, buffer_size); + up_read(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 0); -- cgit v1.2.2 From 07eaac9438b13ec0b863111698b91ccec8f3b8d4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 7 Sep 2010 13:30:06 +0800 Subject: ocfs2: Fix lockdep warning in reflink. This patch change mutex_lock to a new subclass and add a new inode lock subclass for the target inode which caused this lockdep warning. ============================================= [ INFO: possible recursive locking detected ] 2.6.35+ #5 --------------------------------------------- reflink/11086 is trying to acquire lock: (Meta){+++++.}, at: [] ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] but task is already holding lock: (Meta){+++++.}, at: [] ocfs2_reflink_ioctl+0x5d3/0x1229 [ocfs2] other info that might help us debug this: 6 locks held by reflink/11086: #0: (&sb->s_type->i_mutex_key#15/1){+.+.+.}, at: [] lookup_create+0x26/0x97 #1: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [] ocfs2_reflink_ioctl+0x4d3/0x1229 [ocfs2] #2: (Meta){+++++.}, at: [] ocfs2_reflink_ioctl+0x5d3/0x1229 [ocfs2] #3: (&oi->ip_xattr_sem){+.+.+.}, at: [] ocfs2_reflink_ioctl+0x68b/0x1229 [ocfs2] #4: (&oi->ip_alloc_sem){+.+.+.}, at: [] ocfs2_reflink_ioctl+0x69a/0x1229 [ocfs2] #5: (&sb->s_type->i_mutex_key#15/2){+.+...}, at: [] ocfs2_reflink_ioctl+0x882/0x1229 [ocfs2] stack backtrace: Pid: 11086, comm: reflink Not tainted 2.6.35+ #5 Call Trace: [] validate_chain+0x56e/0xd68 [] ? mark_held_locks+0x49/0x69 [] __lock_acquire+0x79a/0x7f1 [] lock_acquire+0xc6/0xed [] ? ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [] __ocfs2_cluster_lock+0x975/0xa0d [ocfs2] [] ? ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [] ? ocfs2_wait_for_recovery+0x15/0x8a [ocfs2] [] ocfs2_inode_lock_full_nested+0x1ac/0xdc5 [ocfs2] [] ? ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [] ? trace_hardirqs_on_caller+0x10b/0x12f [] ? debug_mutex_free_waiter+0x4f/0x53 [] ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [] ? ocfs2_file_lock_res_init+0x66/0x78 [ocfs2] [] ? might_fault+0x40/0x8d [] ocfs2_ioctl+0x61a/0x656 [ocfs2] [] ? mntput_no_expire+0x1d/0xb0 [] ? path_put+0x2c/0x31 [] vfs_ioctl+0x2a/0x9d [] do_vfs_ioctl+0x45d/0x4ae [] ? _raw_spin_unlock+0x26/0x2a [] ? sysret_check+0x27/0x62 [] sys_ioctl+0x57/0x7a [] system_call_fastpath+0x16/0x1b Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/dlmglue.h | 1 + fs/ocfs2/refcounttree.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index d1ce48e1b3d6..1d596d8c4a4a 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -84,6 +84,7 @@ enum { OI_LS_PARENT, OI_LS_RENAME1, OI_LS_RENAME2, + OI_LS_REFLINK_TARGET, }; int ocfs2_dlm_init(struct ocfs2_super *osb); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 0afeda83120f..efdd75607406 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4201,8 +4201,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock(&new_inode->i_mutex); - ret = ocfs2_inode_lock(new_inode, &new_bh, 1); + mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, + OI_LS_REFLINK_TARGET); if (ret) { mlog_errno(ret); goto out_unlock; -- cgit v1.2.2 From 0f4da216b8c3c35c90ecd18e1899c6f125957c2b Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Wed, 8 Sep 2010 17:12:38 +0800 Subject: Ocfs2: Re-access the journal after ocfs2_insert_extent() in dxdir codes. In ocfs2_dx_dir_rebalance(), we need to rejournal_acess the blocks after calling ocfs2_insert_extent() since growing an extent tree may trigger ocfs2_extend_trans(), which makes previous journal_access meaningless. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/dir.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f04ebcfffc4a..c49f6de0e7ab 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, goto out_commit; } + cpos = split_hash; + ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, + data_ac, meta_ac, new_dx_leaves, + num_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + for (i = 0; i < num_dx_leaves; i++) { ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), orig_dx_leaves[i], @@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, mlog_errno(ret); goto out_commit; } - } - cpos = split_hash; - ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, - data_ac, meta_ac, new_dx_leaves, - num_dx_leaves); - if (ret) { - mlog_errno(ret); - goto out_commit; + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + new_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } } ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, -- cgit v1.2.2 From 228ac6357718df2d5c8d70210fa51b2225aab5ee Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Fri, 10 Sep 2010 10:16:33 +0800 Subject: Ocfs2: Handle empty list in lockres_seq_start() for dlmdebug.c This patch tries to handle the case in which list 'dlm->tracking_list' is empty, to avoid accessing an invalid pointer. It fixes the following oops: http://oss.oracle.com/bugzilla/show_bug.cgi?id=1287 Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmdebug.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 5efdd37dfe48..901ca52bf86b 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) spin_lock(&dlm->track_lock); if (oldres) track_list = &oldres->tracking; - else + else { track_list = &dlm->tracking_list; + if (list_empty(track_list)) { + dl = NULL; + spin_unlock(&dlm->track_lock); + goto bail; + } + } list_for_each_entry(res, track_list, tracking) { if (&res->tracking == &dlm->tracking_list) @@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) } else dl = NULL; +bail: /* passed to seq_show */ return dl; } -- cgit v1.2.2 From 50aff040363d31f87e94f38f1710973d99489951 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 21 Aug 2010 14:40:20 +0800 Subject: ocfs2/net: fix uninitialized ret in o2net_send_message_vec() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mmotm/fs/ocfs2/cluster/tcp.c: In function ‘o2net_send_message_vec’: mmotm/fs/ocfs2/cluster/tcp.c:980:6: warning: ‘ret’ may be used uninitialized in this function It seems a real bug introduced by commit 9af0b38ff3 (ocfs2/net: Use wait_event() in o2net_send_message_vec()). cc: Sunil Mushran Signed-off-by: Wu Fengguang Signed-off-by: Joel Becker --- fs/ocfs2/cluster/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1361997cf205..cbe2f057cc28 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, size_t caller_veclen, u8 target_node, int *status) { - int ret; + int ret = 0; struct o2net_msg *msg = NULL; size_t veclen, caller_bytes = 0; struct kvec *vec = NULL; -- cgit v1.2.2 From 767b68e96993e29e3480d7ecdd9c4b84667c5762 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Wed, 22 Sep 2010 14:32:56 -0400 Subject: Prevent freeing uninitialized pointer in compat_do_readv_writev In 32-bit compatibility mode, the error handling for compat_do_readv_writev() may free an uninitialized pointer, potentially leading to all sorts of ugly memory corruption. This is reliably triggerable by unprivileged users by invoking the readv()/writev() syscalls with an invalid iovec pointer. The below patch fixes this to emulate the non-compat version. Introduced by commit b83733639a49 ("compat: factor out compat_rw_copy_check_uvector from compat_do_readv_writev") Signed-off-by: Dan Rosenberg Cc: stable@kernel.org (2.6.35) Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 718c7062aec1..0644a154672b 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1153,7 +1153,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, { compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; + struct iovec *iov = iovstack; ssize_t ret; io_fn_t fn; iov_fn_t fnv; -- cgit v1.2.2 From c227e69028473c7c7994a9b0a2cc0034f3f7e0fe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Sep 2010 13:04:54 -0700 Subject: /proc/vmcore: fix seeking Commit 73296bc611 ("procfs: Use generic_file_llseek in /proc/vmcore") broke seeking on /proc/vmcore. This changes it back to use default_llseek in order to restore the original behaviour. The problem with generic_file_llseek is that it only allows seeks up to inode->i_sb->s_maxbytes, which is zero on procfs and some other virtual file systems. We should merge generic_file_llseek and default_llseek some day and clean this up in a proper way, but for 2.6.35/36, reverting vmcore is the safer solution. Signed-off-by: Arnd Bergmann Cc: Frederic Weisbecker Reported-by: CAI Qian Tested-by: CAI Qian Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 91c817ff02c3..2367fb3f70bc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, static const struct file_operations proc_vmcore_operations = { .read = read_vmcore, - .llseek = generic_file_llseek, + .llseek = default_llseek, }; static struct vmcore* __init get_new_element(void) -- cgit v1.2.2 From a0c42bac79731276c9b2f28d54f9e658fcf843a2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Sep 2010 13:05:03 -0700 Subject: aio: do not return ERESTARTSYS as a result of AIO OCFS2 can return ERESTARTSYS from its write function when the process is signalled while waiting for a cluster lock (and the filesystem is mounted with intr mount option). Generally, it seems reasonable to allow filesystems to return this error code from its IO functions. As we must not leak ERESTARTSYS (and similar error codes) to userspace as a result of an AIO operation, we have to properly convert it to EINTR inside AIO code (restarting the syscall isn't really an option because other AIO could have been already submitted by the same io_submit syscall). Signed-off-by: Jan Kara Reviewed-by: Jeff Moyer Cc: Christoph Hellwig Cc: Zach Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 1320b2a05fb2..250b0a73c8a8 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb) */ ret = retry(iocb); - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) + if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; aio_complete(iocb, ret, 0); + } out: spin_lock_irq(&ctx->ctx_lock); -- cgit v1.2.2 From 1c2499ae87f828eabddf6483b0dfc11da1100c07 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 22 Sep 2010 13:05:06 -0700 Subject: /proc/pid/smaps: fix dirty pages accounting Currently, /proc//smaps has wrong dirty pages accounting. Shared_Dirty and Private_Dirty output only pte dirty pages and ignore PG_dirty page flag. It is difference against documentation, but also inconsistent against Referenced field. (Referenced checks both pte and page flags) This patch fixes it. Test program: large-array.c --------------------------------------------------- #include #include #include #include char array[1*1024*1024*1024L]; int main(void) { memset(array, 1, sizeof(array)); pause(); return 0; } --------------------------------------------------- Test case: 1. run ./large-array 2. cat /proc/`pidof large-array`/smaps 3. swapoff -a 4. cat /proc/`pidof large-array`/smaps again Test result: 00601000-40601000 rw-p 00000000 00:00 0 Size: 1048576 kB Rss: 1048576 kB Pss: 1048576 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 218992 kB <-- showed pages as clean incorrectly Private_Dirty: 829584 kB Referenced: 388364 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB 00601000-40601000 rw-p 00000000 00:00 0 Size: 1048576 kB Rss: 1048576 kB Pss: 1048576 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 1048576 kB <-- fixed Referenced: 388480 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Signed-off-by: KOSAKI Motohiro Acked-by: Hugh Dickins Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 271afc48b9a5..1dbca4e8cc16 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -363,13 +363,13 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, mss->referenced += PAGE_SIZE; mapcount = page_mapcount(page); if (mapcount >= 2) { - if (pte_dirty(ptent)) + if (pte_dirty(ptent) || PageDirty(page)) mss->shared_dirty += PAGE_SIZE; else mss->shared_clean += PAGE_SIZE; mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; } else { - if (pte_dirty(ptent)) + if (pte_dirty(ptent) || PageDirty(page)) mss->private_dirty += PAGE_SIZE; else mss->private_clean += PAGE_SIZE; -- cgit v1.2.2 From 12828061cdacfb1db3eb03fd71952d5ebc555bbb Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 13 Sep 2010 14:00:23 +0800 Subject: ocfs2: update ctime when changing the file's permission by setfacl In commit 30e2bab, ext3 fixed it. So change it accordingly in ocfs2. Steps to reproduce: # touch aaa # stat -c %Z aaa 1283760364 # setfacl -m 'u::x,g::x,o::x' aaa # stat -c %Z aaa 1283760364 Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/acl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index a76e0aa5cd3f..391915093fe1 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, } inode->i_mode = new_mode; + inode->i_ctime = CURRENT_TIME; di->i_mode = cpu_to_le16(inode->i_mode); + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ocfs2_journal_dirty(handle, di_bh); -- cgit v1.2.2 From 47dea423799d98c53793237ab386a94976f305d5 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 13 Sep 2010 15:13:50 +0800 Subject: ocfs2: Use cpu_to_le16 for e_leaf_clusters in ocfs2_bg_discontig_add_extent. e_leaf_clusters is a le16, so use cpu_to_le16 instead of cpu_to_le32. What's more, we change 'clusters' to unsigned int to signify that the size of 'clusters' isn't important here. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/suballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8a286f54dca1..849c2f0e0a0e 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -357,7 +357,7 @@ out: static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, struct ocfs2_group_desc *bg, struct ocfs2_chain_list *cl, - u64 p_blkno, u32 clusters) + u64 p_blkno, unsigned int clusters) { struct ocfs2_extent_list *el = &bg->bg_list; struct ocfs2_extent_rec *rec; @@ -369,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, rec->e_blkno = cpu_to_le64(p_blkno); rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc)); - rec->e_leaf_clusters = cpu_to_le32(clusters); + rec->e_leaf_clusters = cpu_to_le16(clusters); le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); le16_add_cpu(&bg->bg_free_bits_count, clusters * le16_to_cpu(cl->cl_bpc)); -- cgit v1.2.2 From 4a452de4fdfe4dbb27e491904d8bfaf1262bdff4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 19 Sep 2010 13:42:28 +0800 Subject: ocfs2: Move 'wanted' into parens of ocfs2_resmap_resv_bits. The first time I read the function ocfs2_resmap_resv_bits, I consider about what 'wanted' will be used and consider about the comments. Then I find it is only used if the reservation is empty. ;) So we'd better move it to the parens so that it make the code more readable, what's more, ocfs2_resmap_resv_bits is used so frequently and we should save some cpus. Acked-by: Mark Fasheh Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/reservations.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index d8b6e4259b80..3e78db361bc7 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv, int *cstart, int *clen) { - unsigned int wanted = *clen; - if (resv == NULL || ocfs2_resmap_disabled(resmap)) return -ENOSPC; spin_lock(&resv_lock); - /* - * We don't want to over-allocate for temporary - * windows. Otherwise, we run the risk of fragmenting the - * allocation space. - */ - wanted = ocfs2_resv_window_bits(resmap, resv); - if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) - wanted = *clen; - if (ocfs2_resv_empty(resv)) { - mlog(0, "empty reservation, find new window\n"); + /* + * We don't want to over-allocate for temporary + * windows. Otherwise, we run the risk of fragmenting the + * allocation space. + */ + unsigned int wanted = ocfs2_resv_window_bits(resmap, resv); + if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) + wanted = *clen; + + mlog(0, "empty reservation, find new window\n"); /* * Try to get a window here. If it works, we must fall * through and test the bitmap . This avoids some -- cgit v1.2.2 From 0000b862027d624ac564609b87c1aa4d14dd1e46 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 19 Sep 2010 13:42:29 +0800 Subject: ocfs2: Sync inode flags with ext2. We sync our inode flags with ext2 and define them by hex values. But actually in commit 3669567(4 years ago), all these values are moved to include/linux/fs.h. So we'd better also use them as what ext2 did. So sync our inode flags with ext2 by using FS_*. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2_fs.h | 37 +++++++++++++++++++++++++------------ fs/ocfs2/ocfs2_ioctl.h | 8 ++++---- 2 files changed, 29 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 33f1c9a8258d..fa31d05e41b7 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -235,18 +235,31 @@ #define OCFS2_HAS_REFCOUNT_FL (0x0010) /* Inode attributes, keep in sync with EXT2 */ -#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ -#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ -#define OCFS2_COMPR_FL (0x00000004) /* Compress file */ -#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */ -#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */ -#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */ -#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */ -#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */ -#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */ - -#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ -#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ +#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */ +#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */ +#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */ +#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ +#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ +#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ +#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ +#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ +/* Reserved for compression usage... */ +#define OCFS2_DIRTY_FL FS_DIRTY_FL +#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ +#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ +#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ +/* End compression flags --- maybe not all used */ +#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */ +#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ +#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ +#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ +#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ +#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ +#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ +#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + +#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ +#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ /* * Extent record flags (e_node.leaf.flags) diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 2d3420af1a83..5d241505690b 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -23,10 +23,10 @@ /* * ioctl commands */ -#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) -#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) -#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) -#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) +#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS +#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS +#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS /* * Space reservation / allocation / free ioctls and argument structure -- cgit v1.2.2 From 5dad6c39d156fbbde0b0ef170d9173feffdeb546 Mon Sep 17 00:00:00 2001 From: Srinivas Eeda Date: Tue, 21 Sep 2010 16:27:26 -0700 Subject: o2dlm: force free mles during dlm exit While umounting, a block mle doesn't get freed if dlm is shutdown after master request is received but before assert master. This results in unclean shutdown of dlm domain. This patch frees all mles that lie around after other nodes were notified about exiting the dlm and marking dlm state as leaving. Only block mles are expected to be around, so we log ERROR for other mles but still free them. Signed-off-by: Srinivas Eeda Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmcommon.h | 1 + fs/ocfs2/dlm/dlmdomain.c | 1 + fs/ocfs2/dlm/dlmmaster.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 4b6ae2c13b47..765298908f1d 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1030,6 +1030,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node); +void dlm_force_free_mles(struct dlm_ctxt *dlm); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); int __dlm_lockres_has_locks(struct dlm_lock_resource *res); int __dlm_lockres_unused(struct dlm_lock_resource *res); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 153abb5abef0..11a5c87fd7f7 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -693,6 +693,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) dlm_mark_domain_leaving(dlm); dlm_leave_domain(dlm); + dlm_force_free_mles(dlm); dlm_complete_dlm_shutdown(dlm); } dlm_put(dlm); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ffb4c68dafa4..f564b0e5f80d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -3433,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm, wake_up(&res->wq); wake_up(&dlm->migration_wq); } + +void dlm_force_free_mles(struct dlm_ctxt *dlm) +{ + int i; + struct hlist_head *bucket; + struct dlm_master_list_entry *mle; + struct hlist_node *tmp, *list; + + /* + * We notified all other nodes that we are exiting the domain and + * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still + * around we force free them and wake any processes that are waiting + * on the mles + */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); + BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each_safe(list, tmp, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + if (mle->type != DLM_MLE_BLOCK) { + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + } + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + + __dlm_unlink_mle(dlm, mle); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} -- cgit v1.2.2 From 80168676ebfe4af51407d30f336d67f082d45201 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 24 Sep 2010 18:13:44 +1000 Subject: xfs: force background CIL push under sustained load I have been seeing occasional pauses in transaction throughput up to 30s long under heavy parallel workloads. The only notable thing was that the xfsaild was trying to be active during the pauses, but making no progress. It was running exactly 20 times a second (on the 50ms no-progress backoff), and the number of pushbuf events was constant across this time as well. IOWs, the xfsaild appeared to be stuck on buffers that it could not push out. Further investigation indicated that it was trying to push out inode buffers that were pinned and/or locked. The xfsbufd was also getting woken at the same frequency (by the xfsaild, no doubt) to push out delayed write buffers. The xfsbufd was not making any progress because all the buffers in the delwri queue were pinned. This scan- and-make-no-progress dance went one in the trace for some seconds, before the xfssyncd came along an issued a log force, and then things started going again. However, I noticed something strange about the log force - there were way too many IO's issued. 516 log buffers were written, to be exact. That added up to 129MB of log IO, which got me very interested because it's almost exactly 25% of the size of the log. He delayed logging code is suppose to aggregate the minimum of 25% of the log or 8MB worth of changes before flushing. That's what really puzzled me - why did a log force write 129MB instead of only 8MB? Essentially what has happened is that no CIL pushes had occurred since the previous tail push which cleared out 25% of the log space. That caused all the new transactions to block because there wasn't log space for them, but they kick the xfsaild to push the tail. However, the xfsaild was not making progress because there were buffers it could not lock and flush, and the xfsbufd could not flush them because they were pinned. As a result, both the xfsaild and the xfsbufd could not move the tail of the log forward without the CIL first committing. The cause of the problem was that the background CIL push, which should happen when 8MB of aggregated changes have been committed, is being held off by the concurrent transaction commit load. The background push does a down_write_trylock() which will fail if there is a concurrent transaction commit holding the push lock in read mode. With 8 CPUs all doing transactions as fast as they can, there was enough concurrent transaction commits to hold off the background push until tail-pushing could no longer free log space, and the halt would occur. It should be noted that there is no reason why it would halt at 25% of log space used by a single CIL checkpoint. This bug could definitely violate the "no transaction should be larger than half the log" requirement and hence result in corruption if the system crashed under heavy load. This sort of bug is exactly the reason why delayed logging was tagged as experimental.... The fix is to start blocking background pushes once the threshold has been exceeded. Rework the threshold calculations to keep the amount of log space a CIL checkpoint can use to below that of the AIL push threshold to avoid the problem completely. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log_cil.c | 12 +++++++++--- fs/xfs/xfs_log_priv.h | 37 +++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ed575fb4b495..7e206fc1fa36 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -405,9 +405,15 @@ xlog_cil_push( new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); - /* lock out transaction commit, but don't block on background push */ + /* + * Lock out transaction commit, but don't block for background pushes + * unless we are well over the CIL space limit. See the definition of + * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic + * used here. + */ if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_seq) + if (!push_seq && + cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) goto out_free_ticket; down_write(&cil->xc_ctx_lock); } @@ -422,7 +428,7 @@ xlog_cil_push( goto out_skip; /* check for a previously pushed seqeunce */ - if (push_seq < cil->xc_ctx->sequence) + if (push_seq && push_seq < cil->xc_ctx->sequence) goto out_skip; /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ced52b98b322..edcdfe01617f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -426,13 +426,13 @@ struct xfs_cil { }; /* - * The amount of log space we should the CIL to aggregate is difficult to size. - * Whatever we chose we have to make we can get a reservation for the log space - * effectively, that it is large enough to capture sufficient relogging to - * reduce log buffer IO significantly, but it is not too large for the log or - * induces too much latency when writing out through the iclogs. We track both - * space consumed and the number of vectors in the checkpoint context, so we - * need to decide which to use for limiting. + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. * * Every log buffer we write out during a push needs a header reserved, which * is at least one sector and more for v2 logs. Hence we need a reservation of @@ -459,16 +459,21 @@ struct xfs_cil { * checkpoint transaction ticket is specific to the checkpoint context, rather * than the CIL itself. * - * With dynamic reservations, we can basically make up arbitrary limits for the - * checkpoint size so long as they don't violate any other size rules. Hence - * the initial maximum size for the checkpoint transaction will be set to a - * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit - * right now based on the latency of writing out a large amount of data through - * the circular iclog buffers. + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. */ - -#define XLOG_CIL_SPACE_LIMIT(log) \ - (min((log->l_logsize >> 2), (8 * 1024 * 1024))) +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) /* * The reservation head lsn is not made up of a cycle number and block number. -- cgit v1.2.2 From 522440ed55d2cc8855ea5f82bc067e0483b2e1be Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 29 Sep 2010 09:49:54 -0400 Subject: cifs: set backing_dev_info on new S_ISREG inodes Testing on very recent kernel (2.6.36-rc6) made this warning pop: WARNING: at fs/fs-writeback.c:87 inode_to_bdi+0x65/0x70() Hardware name: Dirtiable inode bdi default != sb bdi cifs ...the following patch fixes it and seems to be the obviously correct thing to do for cifs. Cc: stable@kernel.org Acked-by: Dave Kleikamp Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 93f77d438d3c..53cce8cc2224 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -801,6 +801,8 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; + if (S_ISREG(inode->i_mode)) + inode->i_data.backing_dev_info = sb->s_bdi; #ifdef CONFIG_CIFS_FSCACHE /* initialize per-inode cache cookie pointer */ CIFS_I(inode)->fscache = NULL; -- cgit v1.2.2 From 1fc8a117865b54590acd773a55fbac9221b018f0 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 29 Sep 2010 17:33:05 -0700 Subject: ocfs2: Don't walk off the end of fast symlinks. ocfs2 fast symlinks are NUL terminated strings stored inline in the inode data area. However, disk corruption or a local attacker could, in theory, remove that NUL. Because we're using strlen() (my fault, introduced in a731d1 when removing vfs_follow_link()), we could walk off the end of that string. Signed-off-by: Joel Becker Cc: stable@kernel.org --- fs/ocfs2/symlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 32499d213fc4..9975457c981f 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry, } /* Fast symlinks can't be large */ - len = strlen(target); + len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb)); link = kzalloc(len + 1, GFP_NOFS); if (!link) { status = -ENOMEM; -- cgit v1.2.2 From f569599ae70f0899035f8d5876a7939f629c5976 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 29 Sep 2010 15:27:08 -0400 Subject: cifs: prevent infinite recursion in cifs_reconnect_tcon cifs_reconnect_tcon is called from smb_init. After a successful reconnect, cifs_reconnect_tcon will call reset_cifs_unix_caps. That function will, in turn call CIFSSMBQFSUnixInfo and CIFSSMBSetFSUnixInfo. Those functions also call smb_init. It's possible for the session and tcon reconnect to succeed, and then for another cifs_reconnect to occur before CIFSSMBQFSUnixInfo or CIFSSMBSetFSUnixInfo to be called. That'll cause those functions to call smb_init and cifs_reconnect_tcon again, ad infinitum... Break the infinite recursion by having those functions use a new smb_init variant that doesn't attempt to perform a reconnect. Reported-and-Tested-by: Michal Suchanek Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c65c3419dd37..7e83b356cc9e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -232,7 +232,7 @@ static int small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, void **request_buf) { - int rc = 0; + int rc; rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) @@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, if (tcon != NULL) cifs_stats_inc(&tcon->num_smbs_sent); - return rc; + return 0; } int @@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct, /* If the return code is zero, this function must fill in request_buf pointer */ static int -smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, - void **request_buf /* returned */ , - void **response_buf /* returned */ ) +__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf, void **response_buf) { - int rc = 0; - - rc = cifs_reconnect_tcon(tcon, smb_command); - if (rc) - return rc; - *request_buf = cifs_buf_get(); if (*request_buf == NULL) { /* BB should we add a retry in here if not a writepage? */ @@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, if (tcon != NULL) cifs_stats_inc(&tcon->num_smbs_sent); - return rc; + return 0; +} + +/* If the return code is zero, this function must fill in request_buf pointer */ +static int +smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf, void **response_buf) +{ + int rc; + + rc = cifs_reconnect_tcon(tcon, smb_command); + if (rc) + return rc; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); +} + +static int +smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf, void **response_buf) +{ + if (tcon->ses->need_reconnect || tcon->need_reconnect) + return -EHOSTDOWN; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); } static int validate_t2(struct smb_t2_rsp *pSMB) @@ -4534,8 +4551,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon) cFYI(1, "In QFSUnixInfo"); QFSUnixRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; @@ -4604,8 +4621,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap) cFYI(1, "In SETFSUnixInfo"); SETFSUnixRetry: /* BB switch to small buf init to save memory */ - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; -- cgit v1.2.2 From 3036e7b490bf7878c6dae952eec5fb87b1106589 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 30 Sep 2010 15:15:33 -0700 Subject: proc: make /proc/pid/limits world readable Having the limits file world readable will ease the task of system management on systems where root privileges might be restricted. Having admin restricted with root priviledges, he/she could not check other users process' limits. Also it'd align with most of the /proc stat files. Signed-off-by: Jiri Olsa Acked-by: Neil Horman Cc: Eugene Teo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index a1c43e7c8a7b..8e4addaa5424 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2675,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = { INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUSR, proc_pid_limits), + INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif @@ -3011,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = { INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUSR, proc_pid_limits), + INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif -- cgit v1.2.2 From 3f259d092c7a2fdf217823e8f1838530adb0cdb0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Sep 2010 15:15:37 -0700 Subject: reiserfs: fix dependency inversion between inode and reiserfs mutexes The reiserfs mutex already depends on the inode mutex, so we can't lock the inode mutex in reiserfs_unpack() without using the safe locking API, because reiserfs_unpack() is always called with the reiserfs mutex locked. This fixes: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.35c #13 ------------------------------------------------------- lilo/1606 is trying to acquire lock: (&sb->s_type->i_mutex_key#8){+.+.+.}, at: [] reiserfs_unpack+0x60/0x110 [reiserfs] but task is already holding lock: (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x28/0x40 [reiserfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&REISERFS_SB(s)->lock){+.+.+.}: [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] reiserfs_write_lock+0x28/0x40 [reiserfs] [] reiserfs_lookup_privroot+0x2a/0x90 [reiserfs] [] reiserfs_fill_super+0x941/0xe60 [reiserfs] [] get_sb_bdev+0x117/0x170 [] get_super_block+0x21/0x30 [reiserfs] [] vfs_kern_mount+0x6a/0x1b0 [] do_kern_mount+0x39/0xe0 [] do_mount+0x340/0x790 [] sys_mount+0x84/0xb0 [] syscall_call+0x7/0xb -> #0 (&sb->s_type->i_mutex_key#8){+.+.+.}: [] __lock_acquire+0x1026/0x1180 [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] reiserfs_unpack+0x60/0x110 [reiserfs] [] reiserfs_ioctl+0x272/0x320 [reiserfs] [] vfs_ioctl+0x28/0xa0 [] do_vfs_ioctl+0x32d/0x5c0 [] sys_ioctl+0x63/0x70 [] syscall_call+0x7/0xb other info that might help us debug this: 1 lock held by lilo/1606: #0: (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x28/0x40 [reiserfs] stack backtrace: Pid: 1606, comm: lilo Not tainted 2.6.35c #13 Call Trace: [] __lock_acquire+0x1026/0x1180 [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] reiserfs_unpack+0x60/0x110 [reiserfs] [] reiserfs_ioctl+0x272/0x320 [reiserfs] [] vfs_ioctl+0x28/0xa0 [] do_vfs_ioctl+0x32d/0x5c0 [] sys_ioctl+0x63/0x70 [] syscall_call+0x7/0xb Reported-by: Jarek Poplawski Tested-by: Jarek Poplawski Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: [2.6.32 and later] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index f53505de0712..679d5029f50f 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -188,7 +188,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) /* we need to make sure nobody is changing the file size beneath ** us */ - mutex_lock(&inode->i_mutex); + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); reiserfs_write_lock(inode->i_sb); write_from = inode->i_size & (blocksize - 1); -- cgit v1.2.2 From 9d8117e72bf453dd9d85e0cd322ce4a0f8bccbc0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Sep 2010 15:15:38 -0700 Subject: reiserfs: fix unwanted reiserfs lock recursion Prevent from recursively locking the reiserfs lock in reiserfs_unpack() because we may call journal_begin() that requires the lock to be taken only once, otherwise it won't be able to release the lock while taking other mutexes, ending up in inverted dependencies between the journal mutex and the reiserfs lock for example. This fixes: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.35.4.4a #3 ------------------------------------------------------- lilo/1620 is trying to acquire lock: (&journal->j_mutex){+.+...}, at: [] do_journal_begin_r+0x7f/0x340 [reiserfs] but task is already holding lock: (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x28/0x40 [reiserfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&REISERFS_SB(s)->lock){+.+.+.}: [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] reiserfs_write_lock+0x28/0x40 [reiserfs] [] do_journal_begin_r+0x86/0x340 [reiserfs] [] journal_begin+0x77/0x140 [reiserfs] [] reiserfs_remount+0x224/0x530 [reiserfs] [] do_remount_sb+0x60/0x110 [] do_mount+0x625/0x790 [] sys_mount+0x84/0xb0 [] syscall_call+0x7/0xb -> #0 (&journal->j_mutex){+.+...}: [] __lock_acquire+0x1026/0x1180 [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] do_journal_begin_r+0x7f/0x340 [reiserfs] [] journal_begin+0x77/0x140 [reiserfs] [] reiserfs_persistent_transaction+0x41/0x90 [reiserfs] [] reiserfs_get_block+0x22c/0x1530 [reiserfs] [] __block_prepare_write+0x1bb/0x3a0 [] block_prepare_write+0x26/0x40 [] reiserfs_prepare_write+0x88/0x170 [reiserfs] [] reiserfs_unpack+0xe6/0x120 [reiserfs] [] reiserfs_ioctl+0x272/0x320 [reiserfs] [] vfs_ioctl+0x28/0xa0 [] do_vfs_ioctl+0x32d/0x5c0 [] sys_ioctl+0x63/0x70 [] syscall_call+0x7/0xb other info that might help us debug this: 2 locks held by lilo/1620: #0: (&sb->s_type->i_mutex_key#8){+.+.+.}, at: [] reiserfs_unpack+0x6a/0x120 [reiserfs] #1: (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x28/0x40 [reiserfs] stack backtrace: Pid: 1620, comm: lilo Not tainted 2.6.35.4.4a #3 Call Trace: [] __lock_acquire+0x1026/0x1180 [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] do_journal_begin_r+0x7f/0x340 [reiserfs] [] journal_begin+0x77/0x140 [reiserfs] [] reiserfs_persistent_transaction+0x41/0x90 [reiserfs] [] reiserfs_get_block+0x22c/0x1530 [reiserfs] [] __block_prepare_write+0x1bb/0x3a0 [] block_prepare_write+0x26/0x40 [] reiserfs_prepare_write+0x88/0x170 [reiserfs] [] reiserfs_unpack+0xe6/0x120 [reiserfs] [] reiserfs_ioctl+0x272/0x320 [reiserfs] [] vfs_ioctl+0x28/0xa0 [] do_vfs_ioctl+0x32d/0x5c0 [] sys_ioctl+0x63/0x70 [] syscall_call+0x7/0xb Reported-by: Jarek Poplawski Tested-by: Jarek Poplawski Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: All since 2.6.32 Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 679d5029f50f..5cbb81e134ac 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, int reiserfs_unpack(struct inode *inode, struct file *filp) { int retval = 0; + int depth; int index; struct page *page; struct address_space *mapping; @@ -189,7 +190,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) ** us */ reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); - reiserfs_write_lock(inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ @@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) out: mutex_unlock(&inode->i_mutex); - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); return retval; } -- cgit v1.2.2 From 0157443c56bcc50be4933ebdff3ece723dfd535c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Sep 2010 22:06:21 +0200 Subject: fuse: Initialize total_len in fuse_retrieve() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/fuse/dev.c:1357: warning: ‘total_len’ may be used uninitialized in this function Initialize total_len to zero, else its value will be undefined. Signed-off-by: Geert Uytterhoeven Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d367af1514ef..cde755cca564 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1354,7 +1354,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, loff_t file_size; unsigned int num; unsigned int offset; - size_t total_len; + size_t total_len = 0; req = fuse_get_req(fc); if (IS_ERR(req)) -- cgit v1.2.2 From aaead25b954879e1a708ff2f3602f494c18d20b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Oct 2010 14:25:33 +0200 Subject: writeback: always use sb->s_bdi for writeback purposes We currently use struct backing_dev_info for various different purposes. Originally it was introduced to describe a backing device which includes an unplug and congestion function and various bits of readahead information and VM-relevant flags. We're also using for tracking dirty inodes for writeback. To make writeback properly find all inodes we need to only access the per-filesystem backing_device pointed to by the superblock in ->s_bdi inside the writeback code, and not the instances pointeded to by inode->i_mapping->backing_dev which can be overriden by special devices or might not be set at all by some filesystems. Long term we should split out the writeback-relevant bits of struct backing_device_info (which includes more than the current bdi_writeback) and only point to it from the superblock while leaving the traditional backing device as a separate structure that can be overriden by devices. The one exception for now is the block device filesystem which really wants different writeback contexts for it's different (internal) inodes to handle the writeout more efficiently. For now we do this with a hack in fs-writeback.c because we're so late in the cycle, but in the future I plan to replace this with a superblock method that allows for multiple writeback contexts per filesystem. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5581122bd2c0..ab38fef1c9a1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -72,22 +72,11 @@ int writeback_in_progress(struct backing_dev_info *bdi) static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb = inode->i_sb; - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; - /* - * For inodes on standard filesystems, we use superblock's bdi. For - * inodes on virtual filesystems, we want to use inode mapping's bdi - * because they can possibly point to something useful (think about - * block_dev filesystem). - */ - if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) { - /* Some device inodes could play dirty tricks. Catch them... */ - WARN(bdi != sb->s_bdi && bdi_cap_writeback_dirty(bdi), - "Dirtiable inode bdi %s != sb bdi %s\n", - bdi->name, sb->s_bdi->name); - return sb->s_bdi; - } - return bdi; + if (strcmp(sb->s_type->name, "bdev") == 0) + return inode->i_mapping->backing_dev_info; + + return sb->s_bdi; } static void bdi_queue_work(struct backing_dev_info *bdi, -- cgit v1.2.2