From 6146f0d5e47ca4047ffded0fb79b6c25359b386c Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:06:57 -0500 Subject: integrity: IMA hooks This patch replaces the generic integrity hooks, for which IMA registered itself, with IMA integrity hooks in the appropriate places directly in the fs directory. Signed-off-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: James Morris --- fs/exec.c | 10 ++++++++++ fs/file_table.c | 2 ++ fs/inode.c | 24 +++++++++++++++++------- fs/namei.c | 8 ++++++++ 4 files changed, 37 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 02d2e120542d..9c789a525cc4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -128,6 +129,9 @@ asmlinkage long sys_uselib(const char __user * library) goto exit; error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); + if (error) + goto exit; + error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; @@ -681,6 +685,9 @@ struct file *open_exec(const char *name) goto out_path_put; err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); + if (err) + goto out_path_put; + err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; @@ -1207,6 +1214,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } #endif retval = security_bprm_check(bprm); + if (retval) + return retval; + retval = ima_bprm_check(bprm); if (retval) return retval; diff --git a/fs/file_table.c b/fs/file_table.c index 0fbcacc3ea75..55895ccc08c6 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -276,6 +277,7 @@ void __fput(struct file *file) if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); + ima_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); 
fops_put(file->f_op); diff --git a/fs/inode.c b/fs/inode.c index 098a2443196f..ed22b14f2202 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -144,13 +145,13 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_cdev = NULL; inode->i_rdev = 0; inode->dirtied_when = 0; - if (security_inode_alloc(inode)) { - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; - } + + if (security_inode_alloc(inode)) + goto out_free_inode; + + /* allocate and initialize an i_integrity */ + if (ima_inode_alloc(inode)) + goto out_free_security; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); @@ -186,6 +187,15 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_mapping = mapping; return inode; + +out_free_security: + security_inode_free(inode); +out_free_inode: + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; } EXPORT_SYMBOL(inode_init_always); diff --git a/fs/namei.c b/fs/namei.c index af3783fff1de..734f2b5591bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -860,6 +861,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) err = exec_permission_lite(inode); if (err == -EAGAIN) err = vfs_permission(nd, MAY_EXEC); + if (!err) + err = ima_path_check(&nd->path, MAY_EXEC); if (err) break; @@ -1525,6 +1528,11 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) error = vfs_permission(nd, acc_mode); if (error) return error; + + error = ima_path_check(&nd->path, + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + if (error) + return error; /* * An append-only file must be opened in append mode for writing. 
*/ -- cgit v1.2.2 From f9ce1f1cda8b73a36f47e424975a9dfa78b7840c Mon Sep 17 00:00:00 2001 From: Kentaro Takeda Date: Thu, 5 Feb 2009 17:18:11 +0900 Subject: Add in_execve flag into task_struct. This patch allows LSM modules to determine whether current process is in an execve operation or not so that they can behave differently while an execve operation is in progress. This patch is needed by TOMOYO. Please see another patch titled "LSM adapter functions." for backgrounds. Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: James Morris --- fs/compat.c | 3 +++ fs/exec.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 65a070e705ab..25589f8322f2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1402,6 +1402,7 @@ int compat_do_execve(char * filename, retval = mutex_lock_interruptible(&current->cred_exec_mutex); if (retval < 0) goto out_free; + current->in_execve = 1; retval = -ENOMEM; bprm->cred = prepare_exec_creds(); @@ -1454,6 +1455,7 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ + current->in_execve = 0; mutex_unlock(&current->cred_exec_mutex); acct_update_integrals(current); free_bprm(bprm); @@ -1470,6 +1472,7 @@ out_file: } out_unlock: + current->in_execve = 0; mutex_unlock(&current->cred_exec_mutex); out_free: diff --git a/fs/exec.c b/fs/exec.c index febfd8ed6ad1..9881dc3bb488 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1278,6 +1278,7 @@ int do_execve(char * filename, retval = mutex_lock_interruptible(&current->cred_exec_mutex); if (retval < 0) goto out_free; + current->in_execve = 1; retval = -ENOMEM; bprm->cred = prepare_exec_creds(); @@ -1331,6 +1332,7 @@ int do_execve(char * filename, goto out; /* execve succeeded */ + current->in_execve = 0; mutex_unlock(&current->cred_exec_mutex); acct_update_integrals(current); free_bprm(bprm); @@ -1349,6 +1351,7 @@ out_file: } out_unlock: + current->in_execve = 0; mutex_unlock(&current->cred_exec_mutex); out_free: -- cgit v1.2.2 From 
7ce9d5d1f3c8736511daa413c64985a05b2feee3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 4 Mar 2009 18:38:18 -0500 Subject: ext4: fix ext4_free_inode() vs. ext4_claim_inode() race I was seeing fsck errors on inode bitmaps after a 4 thread dbench run on a 4 cpu machine: Inode bitmap differences: -50736 -(50752--50753) etc... I believe that this is because ext4_free_inode() uses atomic bitops, and although ext4_new_inode() *used* to also use atomic bitops for synchronization, commit 393418676a7602e1d7d3f6e560159c65c8cbd50e changed this to use the sb_bgl_lock, so that we could also synchronize against read_inode_bitmap and initialization of uninit inode tables. However, that change left ext4_free_inode using atomic bitops, which I think leaves no synchronization between setting & unsetting bits in the inode table. The below patch fixes it for me, although I wonder if we're getting at all heavy-handed with this spinlock... Signed-off-by: Eric Sandeen Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18a919be70b..627f8c3337a3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -188,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int fatal = 0, err, count; + int fatal = 0, err, count, cleared; ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { @@ -248,8 +248,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) goto error_return; /* Ok, now we can actually update the inode bitmaps.. 
*/ - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit, bitmap_bh->b_data)) + spin_lock(sb_bgl_lock(sbi, block_group)); + cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + spin_unlock(sb_bgl_lock(sbi, block_group)); + if (!cleared) ext4_error(sb, "ext4_free_inode", "bit already cleared for inode %lu", ino); else { -- cgit v1.2.2 From 118e1ef6fabfc023126e6075f6ac0fc729cb5285 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 5 Mar 2009 00:31:12 +0000 Subject: Squashfs: Fix oops when reading fsfuzzer corrupted filesystems This fixes a code regression caused by the recent mainlining changes. The recent code changes call zlib_inflate repeatedly, decompressing into separate 4K buffers, this code didn't check for the possibility that zlib_inflate might ask for too many buffers when decompressing corrupted data. Signed-off-by: Phillip Lougher --- fs/squashfs/block.c | 13 +++++++++++-- fs/squashfs/cache.c | 4 ++-- fs/squashfs/squashfs.h | 2 +- fs/squashfs/super.c | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index c837dfc2b3c6..321728f48f2d 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with zlib). */ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, - int length, u64 *next_index, int srclength) + int length, u64 *next_index, int srclength, int pages) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct buffer_head **bh; @@ -185,6 +185,14 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, } if (msblk->stream.avail_out == 0) { + if (page == pages) { + ERROR("zlib_inflate tried to " + "decompress too much data, " + "expected %d bytes. 
Zlib " + "data probably corrupt\n", + srclength); + goto release_mutex; + } msblk->stream.next_out = buffer[page++]; msblk->stream.avail_out = PAGE_CACHE_SIZE; } @@ -268,7 +276,8 @@ block_release: put_bh(bh[k]); read_failure: - ERROR("sb_bread failed reading block 0x%llx\n", cur_index); + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long) index); kfree(bh); return -EIO; } diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index f29eda16d25e..1c4739e33af6 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, entry->length = squashfs_read_data(sb, entry->data, block, length, &entry->next_index, - cache->block_size); + cache->block_size, cache->pages); spin_lock(&cache->lock); @@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block, for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) data[i] = buffer; res = squashfs_read_data(sb, data, block, length | - SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); kfree(data); return res; } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 6b2515d027d5..0e9feb6adf7e 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) /* block.c */ extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, - int); + int, int); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 071df5b5b491..681ec0d83799 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -389,7 +389,7 @@ static int __init init_squashfs_fs(void) return err; } - printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " + printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " "Phillip Lougher\n"); return 0; -- cgit v1.2.2 From 
f4f8056a862a9950320429dfda708c88b4ce6025 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 5 Mar 2009 00:55:31 +0000 Subject: Squashfs: frag_size should be signed, as it can hold an error result Signed-off-by: Roel Kluin Signed-off-by: Phillip Lougher --- fs/squashfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 7a63398bb855..9101dbde39ec 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) type = le16_to_cpu(sqshb_ino->inode_type); switch (type) { case SQUASHFS_REG_TYPE: { - unsigned int frag_offset, frag_size, frag; + unsigned int frag_offset, frag; + int frag_size; u64 frag_blk; struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; @@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) break; } case SQUASHFS_LREG_TYPE: { - unsigned int frag_offset, frag_size, frag; + unsigned int frag_offset, frag; + int frag_size; u64 frag_blk; struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; -- cgit v1.2.2 From ff392c497b43ddedbab5627b53928a654cc5486e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:36 -0500 Subject: xfs: prevent kernel crash due to corrupted inode log format Andras Korn reported an oops on log replay caused by a corrupted xfs_inode_log_format_t passing a 0 size to kmem_zalloc. This patch handles too small or too large numbers of log regions gracefully by rejecting the log replay with a useful error message. 
Signed-off-by: Christoph Hellwig Reported-by: Andras Korn Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/xfs_log_recover.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b1047de2fffd..61af610d79b3 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1455,10 +1455,19 @@ xlog_recover_add_to_trans( item = item->ri_prev; if (item->ri_total == 0) { /* first region to be added */ - item->ri_total = in_f->ilf_size; - ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); - item->ri_buf = kmem_zalloc((item->ri_total * - sizeof(xfs_log_iovec_t)), KM_SLEEP); + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xlog_warn( + "XFS: bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + return XFS_ERROR(EIO); + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); } ASSERT(item->ri_total > item->ri_cnt); /* Description region is ri_buf[0] */ -- cgit v1.2.2 From 7d46be4a25fdfb503c20bad60a618adebfe2ac5c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:35 -0500 Subject: xfs: prevent lockdep false positive in xfs_iget_cache_miss The inode can't be locked by anyone else as we just created it a few lines above and it's not been added to any lookup data structure yet. So use a trylock that must succeed to get around the lockdep warnings. 
Signed-off-by: Christoph Hellwig Reported-by: Alexander Beregalov Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_iget.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e2fb6210d4c5..478e587087fe 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -246,9 +246,6 @@ xfs_iget_cache_miss( goto out_destroy; } - if (lock_flags) - xfs_ilock(ip, lock_flags); - /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload @@ -256,7 +253,16 @@ xfs_iget_cache_miss( */ if (radix_tree_preload(GFP_KERNEL)) { error = EAGAIN; - goto out_unlock; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); } mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); @@ -284,7 +290,6 @@ xfs_iget_cache_miss( out_preload_end: write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); -out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: -- cgit v1.2.2 From c141b2928fe20396a9ecdec85526e4b66ae96c90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:37 -0500 Subject: xfs: only issues a cache flush on unmount if barriers are enabled Currently we unconditionally issue a flush from xfs_free_buftarg, but since 2.6.29-rc1 this gives a warning in the style of end_request: I/O error, dev vdb, sector 0 Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 12 ++++++++++-- fs/xfs/linux-2.6/xfs_buf.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 10 +++++----- 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c 
b/fs/xfs/linux-2.6/xfs_buf.c index cb329edc925b..aa1016bb9134 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -34,6 +34,12 @@ #include #include +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" + static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); @@ -1435,10 +1441,12 @@ xfs_unregister_buftarg( void xfs_free_buftarg( - xfs_buftarg_t *btp) + struct xfs_mount *mp, + struct xfs_buftarg *btp) { xfs_flush_buftarg(btp, 1); - xfs_blkdev_issue_flush(btp); + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); xfs_free_bufhash(btp); iput(btp->bt_mapping->host); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 288ae7c4c800..9b4d666ad31f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); -extern void xfs_free_buftarg(xfs_buftarg_t *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index c71e226da7f5..32ae5028e96b 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -734,15 +734,15 @@ xfs_close_devices( { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_logdev_targp); + xfs_free_buftarg(mp, mp->m_logdev_targp); xfs_blkdev_put(logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); xfs_blkdev_put(rtdev); } - 
xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); } /* @@ -811,9 +811,9 @@ xfs_open_devices( out_free_rtdev_targ: if (mp->m_rtdev_targp) - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); out_free_ddev_targ: - xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); out_close_rtdev: if (rtdev) xfs_blkdev_put(rtdev); -- cgit v1.2.2 From b9447ef80bd301b932ac4d85c9622e929de5fd62 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 9 Mar 2009 11:45:38 -0400 Subject: Btrfs: fix spinlock assertions on UP systems btrfs_tree_locked was being used to make sure a given extent_buffer was properly locked in a few places. But, it wasn't correct for UP compiled kernels. This switches it to using assert_spin_locked instead, and renames it to btrfs_assert_tree_locked to better reflect how it was really being used. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 10 +++++----- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/locking.c | 6 +++--- fs/btrfs/locking.h | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 42491d728e99..37f31b5529aa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -277,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (*cow_ret == buf) unlock_orig = 1; - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); if (parent) parent_start = parent->start; @@ -2365,7 +2365,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (slot >= btrfs_header_nritems(upper) - 1) return 1; - WARN_ON(!btrfs_tree_locked(path->nodes[1])); + btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); btrfs_tree_lock(right); @@ -2562,7 +2562,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (right_nritems == 0) return 1; - 
WARN_ON(!btrfs_tree_locked(path->nodes[1])); + btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); btrfs_tree_lock(left); @@ -4101,7 +4101,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) next = read_node_slot(root, c, slot); if (!path->skip_locking) { - WARN_ON(!btrfs_tree_locked(c)); + btrfs_assert_tree_locked(c); btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -4126,7 +4126,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, next, 0); if (!path->skip_locking) { - WARN_ON(!btrfs_tree_locked(path->nodes[level])); + btrfs_assert_tree_locked(path->nodes[level]); btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index adda739a0215..3e18175248e0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -857,7 +857,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); /* ugh, clear_extent_buffer_dirty can be expensive */ btrfs_set_lock_blocking(buf); @@ -2361,7 +2361,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) btrfs_set_lock_blocking(buf); - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) { printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6b5966aacf44..9abf81f71c46 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4418,13 +4418,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - BUG_ON(!btrfs_tree_locked(parent)); + btrfs_assert_tree_locked(parent); 
parent_level = btrfs_header_level(parent); extent_buffer_get(parent); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); - BUG_ON(!btrfs_tree_locked(node)); + btrfs_assert_tree_locked(node); level = btrfs_header_level(node); extent_buffer_get(node); path->nodes[level] = node; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 85506c4a3af7..47b0a88c12a2 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -220,8 +220,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb) return 0; } -int btrfs_tree_locked(struct extent_buffer *eb) +void btrfs_assert_tree_locked(struct extent_buffer *eb) { - return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || - spin_is_locked(&eb->lock); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + assert_spin_locked(&eb->lock); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 6bb0afbff928..6c4ce457168c 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -21,11 +21,11 @@ int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); -int btrfs_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_lock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_set_lock_blocking(struct extent_buffer *eb); void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_assert_tree_locked(struct extent_buffer *eb); #endif -- cgit v1.2.2 From 4184ea7f908d95f329febc3665cf66da8568b467 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 10 Mar 2009 12:39:20 -0400 Subject: Btrfs: Fix locking around adding new space_info Storage allocated to different raid levels in btrfs is tracked by a btrfs_space_info structure, and all of the current space_infos are collected into a list_head. Most filesystems have 3 or 4 of these structs total, and the list is only changed when new raid levels are added or at unmount time. 
This commit adds rcu locking on the list head, and properly frees things at unmount time. It also clears the space_info->full flag whenever new space is added to the FS. The locking for the space info list goes like this: reads: protected by rcu_read_lock() writes: protected by the chunk_mutex At unmount time we don't need special locking because all the readers are gone. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/extent-tree.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/volumes.c | 2 ++ 3 files changed, 53 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 82491ba8fa40..5e1d4e30e9d8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -784,7 +784,14 @@ struct btrfs_fs_info { struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; + + /* + * the space_info list is almost entirely read only. It only changes + * when we add a new raid type to the FS, and that happens + * very rarely. RCU is used to protect it. 
+ */ struct list_head space_info; + spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; @@ -1797,6 +1804,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + int btrfs_check_metadata_free_space(struct btrfs_root *root); int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9abf81f71c46..fefe83ad2059 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "compat.h" #include "hash.h" #include "crc32c.h" @@ -330,13 +331,33 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, { struct list_head *head = &info->space_info; struct btrfs_space_info *found; - list_for_each_entry(found, head, list) { - if (found->flags == flags) + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags == flags) { + rcu_read_unlock(); return found; + } } + rcu_read_unlock(); return NULL; } +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. 
+ */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -1903,7 +1924,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - list_add(&found->list, &info->space_info); INIT_LIST_HEAD(&found->block_groups); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); @@ -1917,6 +1937,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->full = 0; found->force_alloc = 0; *space_info = found; + list_add_rcu(&found->list, &info->space_info); return 0; } @@ -6320,6 +6341,7 @@ out: int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; struct rb_node *n; spin_lock(&info->block_group_cache_lock); @@ -6341,6 +6363,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) spin_lock(&info->block_group_cache_lock); } spin_unlock(&info->block_group_cache_lock); + + /* now that all the block groups are freed, go through and + * free all the space_info structs. This is only called during + * the final stages of unmount, and so we know nobody is + * using them. We call synchronize_rcu() once before we start, + * just to be on the safe side. 
+ */ + synchronize_rcu(); + + while(!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + + list_del(&space_info->list); + kfree(space_info); + } return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1316139bf9e8..7aa3810d7f69 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1459,6 +1459,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; + btrfs_clear_space_info_full(device->dev_root->fs_info); + return btrfs_update_device(trans, device); } -- cgit v1.2.2 From 913d952eb573c3d1f7487e83b5590e13e7cae2bd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 10 Mar 2009 13:17:18 -0400 Subject: Btrfs: Clear space_info full when adding new devices The full flag on the space info structs tells the allocator not to try and allocate more chunks because the devices in the FS are fully allocated. When more devices are added, we need to clear the full flag so the allocator knows it has more space available. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7aa3810d7f69..dd06e18e5aac 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1374,6 +1374,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_add_device(trans, root, device); } + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + unlock_chunks(root); btrfs_commit_transaction(trans, root); -- cgit v1.2.2 From 395a87bfefbc400011417e9eaae33169f9f036c0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 10 Mar 2009 18:18:47 -0400 Subject: ext4: fix header check in ext4_ext_search_right() for deep extent trees. 
The ext4_ext_search_right() function is confusing; it uses a "depth" variable which is 0 at the root and maximum at the leaves, but the on-disk metadata uses a "depth" (actually eh_depth) which is opposite: maximum at the root, and 0 at the leaves. The ext4_ext_check_header() function is given a depth and checks the header against that depth; it expects the on-disk semantics, but we are giving it the opposite in the while loop in this function. We should be giving it the on-disk notion of "depth" which we can get from (p_depth - depth) - and if you look, the last (more commonly hit) call to ext4_ext_check_header() does just this. Sending in the wrong depth results in (incorrect) messages about corruption: EXT4-fs error (device sdb1): ext4_ext_search_right: bad header in inode #2621457: unexpected eh_depth - magic f30a, entries 340, max 340(0), depth 1(2) http://bugzilla.kernel.org/show_bug.cgi?id=12821 Reported-by: David Dindorp Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e2eab196875f..e0aa4fe4f596 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1122,7 +1122,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent_idx *ix; struct ext4_extent *ex; ext4_fsblk_t block; - int depth, ee_len; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; BUG_ON(path == NULL); depth = path->p_depth; @@ -1179,7 +1180,8 @@ got_index: if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, depth)) { + /* subtract from p_depth to get proper eh_depth */ + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } -- cgit v1.2.2 From 260219cc48cfb22486e5d0d706c978228a080d63 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Mar 2009 12:55:51 -0700 Subject: devpts: remove 
graffiti Very annoying when working with containers. Signed-off-by: Alexey Dobriyan Cc: Alan Cox Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/devpts/inode.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 5f3231b9633f..bff4052b05e7 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; - - printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n", - inode->i_ino); out: mutex_unlock(&root->d_inode->i_mutex); return rc; @@ -369,8 +366,6 @@ static int new_pts_mount(struct file_system_type *fs_type, int flags, struct pts_fs_info *fsi; struct pts_mount_opts *opts; - printk(KERN_NOTICE "devpts: newinstance mount\n"); - err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt); if (err) return err; -- cgit v1.2.2 From ef95d31e6de6be9602ce950b85fb7ab8af46ae42 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:17 -0400 Subject: NFS: Fix misparsing of nfsv4 fs_locations attribute (take 2) The changeset ea31a4437c59219bf3ea946d58984b01a45a289c (nfs: Fix misparsing of nfsv4 fs_locations attribute) causes the mountpath that is calculated at the beginning of try_location() to be clobbered when we later strncpy a non-nul terminated hostname using an incorrect buffer length. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 30befc39b3c6..2a2a0a7143ad 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -21,7 +21,9 @@ #define NFSDBG_FACILITY NFSDBG_VFS /* - * Check if fs_root is valid + * Convert the NFSv4 pathname components into a standard posix path.
+ * + * Note that the resulting string will be placed at the end of the buffer */ static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, char *buffer, ssize_t buflen) @@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, { struct vfsmount *mnt = ERR_PTR(-ENOENT); char *mnt_path; - int page2len; + unsigned int maxbuflen; unsigned int s; mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) return mnt; mountdata->mnt_path = mnt_path; - page2 += strlen(mnt_path) + 1; - page2len = PAGE_SIZE - strlen(mnt_path) - 1; + maxbuflen = mnt_path - 1 - page2; for (s = 0; s < location->nservers; s++) { const struct nfs4_string *buf = &location->servers[s]; struct sockaddr_storage addr; - if (buf->len <= 0 || buf->len >= PAGE_SIZE) + if (buf->len <= 0 || buf->len >= maxbuflen) continue; mountdata->addr = (struct sockaddr *)&addr; @@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, continue; nfs_set_port(mountdata->addr, NFS_PORT); - strncpy(page2, buf->data, page2len); - page2[page2len] = '\0'; + memcpy(page2, buf->data, buf->len); + page2[buf->len] = '\0'; mountdata->hostname = page2; snprintf(page, PAGE_SIZE, "%s:%s", -- cgit v1.2.2 From ae46141ff08f1965b17c531b571953c39ce8b9e2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:18 -0400 Subject: NFSv3: Fix posix ACL code Fix a memory leak due to allocation in the XDR layer. In cases where the RPC call needs to be retransmitted, we end up allocating new pages without clearing the old ones. Fix this by moving the allocation into nfs3_proc_setacls(). Also fix an issue discovered by Kevin Rudd, whereby the amount of memory reserved for the acls in the xdr_buf->head was miscalculated, and causing corruption. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 27 +++++++++++++++++++++------ fs/nfs/nfs3xdr.c | 34 +++++++++++++--------------------- 2 files changed, 34 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index cef62557c87d..6bbf0e6daad2 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr fattr; - struct page *pages[NFSACL_MAXPAGES] = { }; + struct page *pages[NFSACL_MAXPAGES]; struct nfs3_setaclargs args = { .inode = inode, .mask = NFS_ACL, @@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status, count; + int status; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, if (S_ISDIR(inode->i_mode)) { args.mask |= NFS_DFACL; args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); } dprintk("NFS call setacl\n"); @@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); - /* pages may have been allocated at the xdr layer. 
*/ - for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) - __free_page(args.pages[count]); - switch (status) { case 0: status = nfs_refresh_inode(inode, &fattr); @@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, case -ENOTSUPP: status = -EOPNOTSUPP; } +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } out: return status; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 11cdddec1432..6cdeacffde46 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -82,8 +82,10 @@ #define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) #define ACL3_getaclargs_sz (NFS3_fh_sz+1) -#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) -#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) /* @@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_setaclargs *args) { struct xdr_buf *buf = &req->rq_snd_buf; - unsigned int base, len_in_head, len = nfsacl_size( - (args->mask & NFS_ACL) ? args->acl_access : NULL, - (args->mask & NFS_DFACL) ? args->acl_default : NULL); - int count, err; + unsigned int base; + int err; p = xdr_encode_fhandle(p, NFS_FH(args->inode)); *p++ = htonl(args->mask); - base = (char *)p - (char *)buf->head->iov_base; - /* put as much of the acls into head as possible. 
*/ - len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); - len -= len_in_head; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); - - for (count = 0; (count << PAGE_SHIFT) < len; count++) { - args->pages[count] = alloc_page(GFP_KERNEL); - if (!args->pages[count]) { - while (count) - __free_page(args->pages[--count]); - return -ENOMEM; - } - } - xdr_encode_pages(buf, args->pages, 0, len); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + base = req->rq_slen; + + if (args->npages != 0) + xdr_encode_pages(buf, args->pages, 0, args->len); + else + req->rq_slen += args->len; err = nfsacl_encode(buf, base, args->inode, (args->mask & NFS_ACL) ? -- cgit v1.2.2 From 57df675c60c5cf0748ddba9c7f85afde1530d74d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Mar 2009 20:33:20 -0400 Subject: NLM: Fix GRANT callback address comparison when IPv6 is enabled The NFS mount command may pass an AF_INET server address to lockd. If lockd happens to be using a PF_INET6 listener, the nlm_cmp_addr() in nlmclnt_grant() will fail to match requests from that host because they will all have a mapped IPv4 AF_INET6 address. Adopt the same solution used in nfs_sockaddr_match_ipaddr() for NFSv4 callbacks: if either address is AF_INET, map it to an AF_INET6 address before doing the comparison. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/clntlock.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 1f3b0fc0d351..aedc47a264c1 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) return 0; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap, + struct in6_addr *addr_mapped) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; + + switch (sap->sa_family) { + case AF_INET6: + return &((const struct sockaddr_in6 *)sap)->sin6_addr; + case AF_INET: + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped); + return addr_mapped; + } + + return NULL; +} + +/* + * If lockd is using a PF_INET6 listener, all incoming requests appear + * to come from AF_INET6 remotes. The address of AF_INET remotes are + * mapped to AF_INET6 automatically by the network layer. In case the + * user passed an AF_INET server address at mount time, ensure both + * addresses are AF_INET6 before comparing them. 
+ */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + const struct in6_addr *addr1; + const struct in6_addr *addr2; + struct in6_addr addr1_mapped; + struct in6_addr addr2_mapped; + + addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped); + if (likely(addr1 != NULL)) { + addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped); + if (likely(addr2 != NULL)) + return ipv6_addr_equal(addr1, addr2); + } + + return 0; +} +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + return nlm_cmp_addr(nlm_addr(host), sap); +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ + /* * The server lockd has called us back to tell us the lock was granted */ @@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) + if (!nlmclnt_cmp_addr(block->b_host, addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; -- cgit v1.2.2 From a71ee337b31271e701f689d544b6153b75609bc5 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Tue, 10 Mar 2009 20:33:21 -0400 Subject: NFS: Handle -ESTALE error in access() Hi Trond, I have been looking at a bugreport where trying to open applications on KDE on a NFS mounted home fails temporarily. There have been multiple reports on different kernel versions pointing to this common issue: http://bugzilla.kernel.org/show_bug.cgi?id=12557 https://bugs.launchpad.net/ubuntu/+source/linux/+bug/269954 http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=508866.html This issue can be reproducible consistently by doing this on a NFS mounted home (KDE): 1. Open 2 xterm sessions 2. From one of the xterm session, do "ssh -X " 3. "stat ~/.Xauthority" on the remote SSH session 4. Close the two xterm sessions 5. 
On the server do a "stat ~/.Xauthority" 6. Now on the client, try to open xterm This will fail. Even if the filehandle had become stale, the NFS client should invalidate the cache/inode and should repeat LOOKUP. Looking at the packet capture when the failure occurs shows that there were two subsequent ACCESS() calls with the same filehandle and both fail with -ESTALE error. I have tested the fix below. Now the client issues a LOOKUP after the ACCESS() call fails with -ESTALE. If all this makes sense to you, can you consider this for inclusion? Thanks, If the server returns an -ESTALE error due to stale filehandle in response to an ACCESS() call, we need to invalidate the cache and inode so that LOOKUP() can be retried. Without this change, the nfs client retries ACCESS() with the same filehandle, fails again and could lead to temporary failure of applications running on nfs mounted home. Signed-off-by: Suresh Jayaraman Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e35c8199f82f..672368f865ca 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1892,8 +1892,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) cache.cred = cred; cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); - if (status != 0) + if (status != 0) { + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } return status; + } nfs_access_add_cache(inode, &cache); out: if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) -- cgit v1.2.2 From d7371c41b0cda782256b1df759df4e8d4724584c Mon Sep 17 00:00:00 2001 From: Ian Dall Date: Tue, 10 Mar 2009 20:33:22 -0400 Subject: Bug 11061, NFS mounts dropped Addresses: http://bugzilla.kernel.org/show_bug.cgi?id=11061 sockaddr structures can't be reliably compared using memcmp() because there are
padding bytes in the structure which can't be guaranteed to be the same even when the sockaddr structures refer to the same socket. Instead compare all the relevant fields. In the case of IPv6 sin6_flowinfo is not compared because it only affects QoS and sin6_scope_id is only compared if the address is "link local" because "link local" addresses need only be unique to a specific link. Signed-off-by: Ian Dall Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 9b728f3565a1..06654b831d19 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -272,6 +272,65 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, } #endif +/* + * Test if two ip4 socket addresses refer to the same socket, by + * comparing relevant fields. The padding bytes specifically, are + * not compared. + * + * The caller should ensure both socket addresses are AF_INET. + */ +static int nfs_sockaddr_cmp_ip4(const struct sockaddr_in * saddr1, + const struct sockaddr_in * saddr2) +{ + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) + return 0; + return saddr1->sin_port == saddr2->sin_port; +} + +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. 
+ */ +static int nfs_sockaddr_cmp_ip6 (const struct sockaddr_in6 * saddr1, + const struct sockaddr_in6 * saddr2) +{ + if (!ipv6_addr_equal(&saddr1->sin6_addr, + &saddr1->sin6_addr)) + return 0; + if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + saddr1->sin6_scope_id != saddr2->sin6_scope_id) + return 0; + return saddr1->sin6_port == saddr2->sin6_port; +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields. + */ +static int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_cmp_ip4((const struct sockaddr_in *) sa1, + (const struct sockaddr_in *) sa2); + case AF_INET6: + return nfs_sockaddr_cmp_ip6((const struct sockaddr_in6 *) sa1, + (const struct sockaddr_in6 *) sa2); + } + return 0; +} + /* * Find a client by IP address and protocol version * - returns NULL if no such client @@ -344,8 +403,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp) static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; + const struct sockaddr *sap = data->addr; list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; @@ -358,7 +419,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Match the full socket address */ - if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0) + if (!nfs_sockaddr_cmp(sap, clap)) continue; atomic_inc(&clp->cl_count); -- cgit v1.2.2 From ad3bdefe877afb47480418fdb05ecd42842de65e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 11 Mar 2009 09:00:04 +0800 Subject: proc: fix kflags to uflags copying in /proc/kpageflags Fix 
kpf_copy_bit(src,dst) to be kpf_copy_bit(dst,src) to match the actual call patterns, e.g. kpf_copy_bit(kflags, KPF_LOCKED, PG_locked). This misplacement of src/dst only affected reporting of PG_writeback, PG_reclaim and PG_buddy. For others kflags==uflags so not affected. Signed-off-by: Wu Fengguang Reviewed-by: KOSAKI Motohiro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/proc/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 2d1345112a42..e9983837d08d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -80,7 +80,7 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_RECLAIM 9 #define KPF_BUDDY 10 -#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos) +#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos) static ssize_t kpageflags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -- cgit v1.2.2 From 3a95ea1155c5d44aa58dde2f64f0ddafe27fd1fb Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 12 Mar 2009 02:03:23 +0900 Subject: Fix _fat_bmap() locking On swapon() path, it has already i_mutex. So, this uses i_alloc_sem instead of it. Signed-off-by: OGAWA Hirofumi Reported-by: Laurent GUERBY Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 6b74d09adbe5..de0004fe6e00 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. 
*/ - mutex_lock(&mapping->host->i_mutex); + down_read(&mapping->host->i_alloc_sem); blocknr = generic_block_bmap(mapping, block, fat_get_block); - mutex_unlock(&mapping->host->i_mutex); + up_read(&mapping->host->i_alloc_sem); return blocknr; } -- cgit v1.2.2 From 363911d027d1de1c6df79eb3f487f5476b9619f4 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 12 Mar 2009 03:23:48 +0000 Subject: Squashfs: Valid filesystems are flagged as bad by the corrupted fs patch The corrupted filesystem patch added a check against zlib trying to output too much data in the presence of data corruption. This check triggered if zlib_inflate asked to be called again (Z_OK) with avail_out == 0 and no more output buffers available. This check proves to be rather dumb, as it incorrectly catches the case where zlib has generated all the output, but there are still input bytes to be processed. This patch does a number of things. It removes the original check and replaces it with code to not move to the next output buffer if there are no more output buffers available, relying on zlib to error if it wants an extra output buffer in the case of data corruption. It also replaces the Z_NO_FLUSH flag with the more correct Z_SYNC_FLUSH flag, and makes the error messages more understandable to non-technical users. Signed-off-by: Phillip Lougher Reported-by: Stefan Lippers-Hollmann --- fs/squashfs/block.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 321728f48f2d..2a7960310349 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -184,15 +184,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, offset = 0; } - if (msblk->stream.avail_out == 0) { - if (page == pages) { - ERROR("zlib_inflate tried to " - "decompress too much data, " - "expected %d bytes. 
Zlib " - "data probably corrupt\n", - srclength); - goto release_mutex; - } + if (msblk->stream.avail_out == 0 && page < pages) { msblk->stream.next_out = buffer[page++]; msblk->stream.avail_out = PAGE_CACHE_SIZE; } @@ -209,25 +201,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, zlib_init = 1; } - zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH); + zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH); if (msblk->stream.avail_in == 0 && k < b) put_bh(bh[k++]); } while (zlib_err == Z_OK); if (zlib_err != Z_STREAM_END) { - ERROR("zlib_inflate returned unexpected result" - " 0x%x, srclength %d, avail_in %d," - " avail_out %d\n", zlib_err, srclength, - msblk->stream.avail_in, - msblk->stream.avail_out); + ERROR("zlib_inflate error, data probably corrupt\n"); goto release_mutex; } zlib_err = zlib_inflateEnd(&msblk->stream); if (zlib_err != Z_OK) { - ERROR("zlib_inflateEnd returned unexpected result 0x%x," - " srclength %d\n", zlib_err, srclength); + ERROR("zlib_inflate error, data probably corrupt\n"); goto release_mutex; } length = msblk->stream.total_out; -- cgit v1.2.2 From 2842c3b5449f31470b61db716f1926b594fb6156 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 12 Mar 2009 12:20:01 -0400 Subject: ext4: Print the find_group_flex() warning only once This is a short-term warning, and even printk_ratelimit() can result in too much noise in system logs. So only print it once as a warning. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 627f8c3337a3..2d2b3585ee91 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -698,6 +698,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct inode *ret; ext4_group_t i; int free = 0; + static int once = 1; ext4_group_t flex_group; /* Cannot create files in a deleted directory */ @@ -719,7 +720,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { ret2 = find_group_other(sb, dir, &group); - if (ret2 == 0 && printk_ratelimit()) + if (ret2 == 0 && once) + once = 0; printk(KERN_NOTICE "ext4: find_group_flex " "failed, fallback succeeded dir %lu\n", dir->i_ino); -- cgit v1.2.2 From 9f4c899c0d90e1b51b6864834f3877b47c161a0e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 12 Mar 2009 14:51:32 -0400 Subject: NFS: Fix the fix to Bugzilla #11061, when IPv6 isn't defined... Stephen Rothwell reports: Today's linux-next build (powerpc ppc64_defconfig) failed like this: fs/built-in.o: In function `.nfs_get_client': client.c:(.text+0x115010): undefined reference to `.__ipv6_addr_type' Fix by moving the IPV6 specific parts of commit d7371c41b0cda782256b1df759df4e8d4724584c ("Bug 11061, NFS mounts dropped") into the '#ifdef IPV6..." section. Also fix up a couple of formatting issues. 
Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 68 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 06654b831d19..574158ae2398 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -255,6 +255,32 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, } return 0; } + +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2; + + if (!ipv6_addr_equal(&saddr1->sin6_addr, + &saddr1->sin6_addr)) + return 0; + if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + saddr1->sin6_scope_id != saddr2->sin6_scope_id) + return 0; + return saddr1->sin6_port == saddr2->sin6_port; +} #else static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, const struct sockaddr_in *sa2) @@ -270,6 +296,12 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, (const struct sockaddr_in *)sa2); } + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1, + const struct sockaddr * sa2) +{ + return 0; +} #endif /* @@ -279,37 +311,17 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, * * The caller should ensure both socket addresses are 
AF_INET. */ -static int nfs_sockaddr_cmp_ip4(const struct sockaddr_in * saddr1, - const struct sockaddr_in * saddr2) +static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, + const struct sockaddr *sa2) { + const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2; + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) return 0; return saddr1->sin_port == saddr2->sin_port; } -/* - * Test if two ip6 socket addresses refer to the same socket by - * comparing relevant fields. The padding bytes specifically, are not - * compared. sin6_flowinfo is not compared because it only affects QoS - * and sin6_scope_id is only compared if the address is "link local" - * because "link local" addresses need only be unique to a specific - * link. Conversely, ordinary unicast addresses might have different - * sin6_scope_id. - * - * The caller should ensure both socket addresses are AF_INET6. - */ -static int nfs_sockaddr_cmp_ip6 (const struct sockaddr_in6 * saddr1, - const struct sockaddr_in6 * saddr2) -{ - if (!ipv6_addr_equal(&saddr1->sin6_addr, - &saddr1->sin6_addr)) - return 0; - if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - saddr1->sin6_scope_id != saddr2->sin6_scope_id) - return 0; - return saddr1->sin6_port == saddr2->sin6_port; -} - /* * Test if two socket addresses represent the same actual socket, * by comparing (only) relevant fields. 
@@ -322,11 +334,9 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, switch (sa1->sa_family) { case AF_INET: - return nfs_sockaddr_cmp_ip4((const struct sockaddr_in *) sa1, - (const struct sockaddr_in *) sa2); + return nfs_sockaddr_cmp_ip4(sa1, sa2); case AF_INET6: - return nfs_sockaddr_cmp_ip6((const struct sockaddr_in6 *) sa1, - (const struct sockaddr_in6 *) sa2); + return nfs_sockaddr_cmp_ip6(sa1, sa2); } return 0; } -- cgit v1.2.2 From e5bc49ba7439b9726006d031d440cba96819f0f8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 12 Mar 2009 14:31:28 -0700 Subject: pipe_rdwr_fasync: fix the error handling to prevent the leak/crash If the second fasync_helper() fails, pipe_rdwr_fasync() returns the error but leaves the file on ->fasync_readers. This was always wrong, but since 233e70f4228e78eb2f80dc6650f65d3ae3dbf17c "saner FASYNC handling on file close" we have the new problem. Because in this case setfl() doesn't set FASYNC bit, __fput() will not do ->fasync(0), and we leak fasync_struct with ->fa_file pointing to the freed file. 
Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Andi Kleen Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 3a48ba5179d5..14f502b89cf5 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -699,12 +699,12 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on) int retval; mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); - - if (retval >= 0) + if (retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - + if (retval < 0) /* this can happen only if on == T */ + fasync_helper(-1, filp, 0, &pipe->fasync_readers); + } mutex_unlock(&inode->i_mutex); if (retval < 0) -- cgit v1.2.2 From a3cfbb53b1764a3d1f58ddc032737ab9edaa7d41 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 12 Mar 2009 14:31:29 -0700 Subject: vfs: add missing unlock in sget() In sget(), destroy_super(s) is called with s->s_umount held, which makes lockdep unhappy. Signed-off-by: Li Zefan Cc: Al Viro Acked-by: Peter Zijlstra Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 8349ed6b1412..6ce501447ada 100644 --- a/fs/super.c +++ b/fs/super.c @@ -371,8 +371,10 @@ retry: continue; if (!grab_super(old)) goto retry; - if (s) + if (s) { + up_write(&s->s_umount); destroy_super(s); + } return old; } } @@ -387,6 +389,7 @@ retry: err = set(s, data); if (err) { spin_unlock(&sb_lock); + up_write(&s->s_umount); destroy_super(s); return ERR_PTR(err); } -- cgit v1.2.2 From 7ef0d7377cb287e08f3ae94cebc919448e1f5dff Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 12 Mar 2009 14:31:38 -0700 Subject: fs: new inode i_state corruption fix There was a report of a data corruption http://lkml.org/lkml/2008/11/14/121. 
There is a script included to reproduce the problem. During testing, I encountered a number of strange things with ext3, so I tried ext2 to attempt to reduce complexity of the problem. I found that fsstress would quickly hang in wait_on_inode, waiting for I_LOCK to be cleared, even though instrumentation showed that unlock_new_inode had already been called for that inode. This points to a memory scribble or a synchronisation problem. i_state of I_NEW inodes is not protected by inode_lock because other processes are not supposed to touch them until I_LOCK (and I_NEW) is cleared. Adding WARN_ON(inode->i_state & I_NEW) to sites where we modify i_state revealed that generic_sync_sb_inodes is picking up new inodes from the inode lists and passing them to __writeback_single_inode without waiting for I_NEW. Subsequently modifying i_state causes corruption. In my case it would look like this: CPU0 CPU1 unlock_new_inode() __sync_single_inode() reg <- inode->i_state reg -> reg & ~(I_LOCK|I_NEW) reg <- inode->i_state reg -> inode->i_state reg -> reg | I_SYNC reg -> inode->i_state Non-atomic RMW on CPU1 overwrites CPU0 store and sets I_LOCK|I_NEW again. The fix for this is, rather than waiting for I_NEW inodes, to just skip over them: inodes concurrently being created are not subject to data integrity operations, and should not significantly contribute to dirty memory either. After this change, I'm unable to reproduce any of the added warnings or hangs after ~1 hour of running. Previously, the new warnings would start immediately and hang would happen in under 5 minutes. I'm also testing on ext3 now, and so far no problems there either. I don't know whether this fixes the problem reported above, but it fixes a real problem for me.
Cc: "Jorge Boncompte [DTI2]" Reported-by: Adrian Hunter Cc: Jan Kara Cc: Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 9 ++++++++- fs/inode.c | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e5eaa62fd17f..e3fe9918faaf 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -274,6 +274,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) int ret; BUG_ON(inode->i_state & I_SYNC); + WARN_ON(inode->i_state & I_NEW); /* Set I_SYNC, reset I_DIRTY */ dirty = inode->i_state & I_DIRTY; @@ -298,6 +299,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (!(inode->i_state & I_DIRTY) && @@ -470,6 +472,11 @@ void generic_sync_sb_inodes(struct super_block *sb, break; } + if (inode->i_state & I_NEW) { + requeue_io(inode); + continue; + } + if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; if (!sb_is_blkdev_sb(sb)) @@ -531,7 +538,7 @@ void generic_sync_sb_inodes(struct super_block *sb, list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) continue; mapping = inode->i_mapping; if (mapping->nrpages == 0) diff --git a/fs/inode.c b/fs/inode.c index 913ab2d9a5d1..826fb0b9d1c3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -359,6 +359,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { list_move(&inode->i_list, dispose); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; count++; continue; @@ -460,6 +461,7 @@ static void prune_icache(int nr_to_scan) continue; } list_move(&inode->i_list, 
&freeable); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; nr_pruned++; } @@ -656,6 +658,7 @@ void unlock_new_inode(struct inode *inode) * just created it (so there can be no old holders * that haven't tested I_LOCK). */ + WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); inode->i_state &= ~(I_LOCK|I_NEW); wake_up_inode(inode); } @@ -1145,6 +1148,7 @@ void generic_delete_inode(struct inode *inode) list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -1186,16 +1190,19 @@ static void generic_forget_inode(struct inode *inode) spin_unlock(&inode_lock); return; } + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; spin_unlock(&inode_lock); write_inode_now(inode, 1); spin_lock(&inode_lock); + WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; inodes_stat.nr_unused--; hlist_del_init(&inode->i_hash); } list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); -- cgit v1.2.2 From 6c9fd1dc0a597e575617a7de7086c8a3efa8f524 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Fri, 6 Mar 2009 10:19:30 +0800 Subject: ocfs2: reserve xattr block for new directory with inline data If this is a new directory with inline data, we choose to reserve the entire inline area for directory contents and force an external xattr block. 
Signed-off-by: Tiger Yang Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4ddd788add67..c63efb5ef136 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -547,8 +547,12 @@ int ocfs2_calc_xattr_init(struct inode *dir, * when blocksize = 512, may reserve one more cluser for * xattr bucket, otherwise reserve one metadata block * for them is ok. + * If this is a new directory with inline data, + * we choose to reserve the entire inline area for + * directory contents and force an external xattr block. */ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); if (ret) { -- cgit v1.2.2 From d9ae49d6e2b1ac9166e58ae3c9345135604beaa6 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Thu, 5 Mar 2009 11:06:15 +0800 Subject: ocfs2: tweak to get the maximum inline data size with xattr Replace max_inline_data with max_inline_data_with_xattr to ensure it correct when xattr inlined. 
Signed-off-by: Tiger Yang Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 7 +++++-- fs/ocfs2/namei.c | 3 ++- fs/ocfs2/ocfs2_fs.h | 6 ------ 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a067a6cffb01..8e1709a679b7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); if (size > PAGE_CACHE_SIZE || - size > ocfs2_max_inline_data(inode->i_sb)) { + size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, int ret, written = 0; loff_t end = pos + len; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = NULL; mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, @@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, /* * Check whether the write can fit. 
*/ - if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb)) + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + if (mmap_page || + end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) return 0; do_inline_write: diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 084aba86c3b2..4b11762f249e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -532,7 +532,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); - fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb)); + fe->id2.i_data.id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(osb->sb, fe)); } else { fel = &fe->id2.i_list; fel->l_tree_depth = 0; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c7ae45aaa36c..2332ef740f4f 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1070,12 +1070,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) offsetof(struct ocfs2_dinode, id2.i_symlink); } -static inline int ocfs2_max_inline_data(struct super_block *sb) -{ - return sb->s_blocksize - - offsetof(struct ocfs2_dinode, id2.i_data.id_data); -} - static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, struct ocfs2_dinode *di) { -- cgit v1.2.2 From 74e77eb30d0ecbb12964d005b439c8b84a505b84 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Mar 2009 06:24:23 +0800 Subject: ocfs2: Fix a bug found by sparse check. We need to use le32_to_cpu to test rec->e_cpos in ocfs2_dinode_insert_check. 
Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 3a9e5deed74d..19e3a96aa02c 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode, BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != rec->e_cpos), + (OCFS2_I(inode)->ip_clusters != + le32_to_cpu(rec->e_cpos)), "Device %s, asking for sparse allocation: inode %llu, " "cpos %u, clusters %u\n", osb->dev_str, -- cgit v1.2.2 From 712e53e46a1da35fcd88c05aa0c675b10f7c0e9d Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Mar 2009 08:37:34 +0800 Subject: ocfs2: Use xs->bucket to set xattr value outside A long time ago, xs->base is allocated a 4K size and all the contents in the bucket are copied to the it. Now we use ocfs2_xattr_bucket to abstract xattr bucket and xs->base is initialized to the start of the bu_bhs[0]. So xs->base + offset will overflow when the value root is stored outside the first block. Then why we can survive the xattr test by now? It is because we always read the bucket contiguously now and kernel mm allocate continguous memory for us. We are lucky, but we should fix it. So just get the right value root as other callers do. 
Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c63efb5ef136..2563df89fc2a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -4795,19 +4795,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, char *val, int value_len) { - int offset; + int ret, offset, block_off; struct ocfs2_xattr_value_root *xv; struct ocfs2_xattr_entry *xe = xs->here; + struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); + void *base; BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); - offset = le16_to_cpu(xe->xe_name_offset) + - OCFS2_XATTR_SIZE(xe->xe_name_len); + ret = ocfs2_xattr_bucket_get_name_value(inode, xh, + xe - xh->xh_entries, + &block_off, + &offset); + if (ret) { + mlog_errno(ret); + goto out; + } - xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); + base = bucket_block(xs->bucket, block_off); + xv = (struct ocfs2_xattr_value_root *)(base + offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); - return __ocfs2_xattr_set_value_outside(inode, handle, - xv, val, value_len); + ret = __ocfs2_xattr_set_value_outside(inode, handle, + xv, val, value_len); + if (ret) + mlog_errno(ret); +out: + return ret; } static int ocfs2_rm_xattr_cluster(struct inode *inode, -- cgit v1.2.2 From 8d03c7a0c550e7ab24cadcef5e66656bfadec8b9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 14 Mar 2009 11:51:46 -0400 Subject: ext4: fix bogus BUG_ONs in in mballoc code Thiemo Nagel reported that: # dd if=/dev/zero of=image.ext4 bs=1M count=2 # mkfs.ext4 -v -F -b 1024 -m 0 -g 512 -G 4 -I 128 -N 1 \ -O large_file,dir_index,flex_bg,extent,sparse_super image.ext4 # mount -o loop image.ext4 mnt/ # dd if=/dev/zero of=mnt/file oopsed, with a BUG_ON in ext4_mb_normalize_request because size == EXT4_BLOCKS_PER_GROUP It appears to me (esp. 
after talking to Andreas) that the BUG_ON is bogus; a request of exactly EXT4_BLOCKS_PER_GROUP should be allowed, though larger sizes do indicate a problem. Fix that and another (apparently rare) codepath with a similar check. Reported-by: Thiemo Nagel Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4415beeb0b62..41f4348b62f5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1447,7 +1447,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); @@ -3292,7 +3292,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ -- cgit v1.2.2 From 020fe22ff14320927f394de222cbb11708bcc7a8 Mon Sep 17 00:00:00 2001 From: Enrik Berkhan Date: Fri, 13 Mar 2009 13:51:56 -0700 Subject: nommu: ramfs: pages allocated to an inode's pagecache may get wrongly discarded The pages attached to a ramfs inode's pagecache by truncation from nothing - as done by SYSV SHM for example - may get discarded under memory pressure. The problem is that the pages are not marked dirty. Anything that creates data in an MMU-based ramfs will cause the set_page_dirty() aop to be called on the pages holding that data.
For the NOMMU-based mmap, set_page_dirty() may be called by write(), but it won't be called by page-writing faults on writable mmaps, and it isn't called by ramfs_nommu_expand_for_mapping() when a file is being truncated from nothing to allocate a contiguous run. The solution is to mark the pages dirty at the point of allocation by the truncation code. Signed-off-by: Enrik Berkhan Signed-off-by: David Howells Cc: Peter Zijlstra Cc: Nick Piggin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index b9b567a28376..90d72bead55b 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -114,6 +114,9 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) if (!pagevec_add(&lru_pvec, page)) __pagevec_lru_add_file(&lru_pvec); + /* prevent the page from being discarded on memory pressure */ + SetPageDirty(page); + unlock_page(page); } -- cgit v1.2.2 From 15e7b8767605dc0cb9bd4594caabfec392385210 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 13 Mar 2009 13:51:58 -0700 Subject: nommu: ramfs: don't leak pages when adding to page cache fails When a ramfs nommu mapping is expanded, contiguous pages are allocated and added to the pagecache. The caller's reference is then passed on by moving whole pagevecs to the file lru list. If the page cache adding fails, make sure that the error path also moves the pagevec contents which might still contain up to PAGEVEC_SIZE successfully added pages, of which we would leak references otherwise. 
Signed-off-by: Johannes Weiner Cc: David Howells Cc: Enrik Berkhan Cc: Nick Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 90d72bead55b..5d7c7ececa64 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -129,6 +129,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) return -EFBIG; add_error: + pagevec_lru_add_file(&lru_pvec); page_cache_release(pages + loop); for (loop++; loop < npages; loop++) __free_page(pages + loop); -- cgit v1.2.2 From 84814d642a4f1f294bd675ab11aae1ca54c6cedb Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 13 Mar 2009 13:51:59 -0700 Subject: eCryptfs: don't encrypt file key with filename key eCryptfs has file encryption keys (FEK), file encryption key encryption keys (FEKEK), and filename encryption keys (FNEK). The per-file FEK is encrypted with one or more FEKEKs and stored in the header of the encrypted file. I noticed that the FEK is also being encrypted by the FNEK. This is a problem if a user wants to use a different FNEK than their FEKEK, as their file contents will still be accessible with the FNEK. This is a minimalistic patch which prevents the FNEKs signatures from being copied to the inode signatures list. Ultimately, it keeps the FEK from being encrypted with a FNEK. 
Signed-off-by: Tyler Hicks Cc: Serge Hallyn Acked-by: Dustin Kirkland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 2 ++ fs/ecryptfs/ecryptfs_kernel.h | 3 ++- fs/ecryptfs/keystore.c | 3 ++- fs/ecryptfs/main.c | 5 +++-- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f6caeb1d1106..bdca1f4b3a3e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { + if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK) + continue; rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); if (rc) { printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index c11fc95714ab..eb2267eca1fe 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -328,6 +328,7 @@ struct ecryptfs_dentry_info { */ struct ecryptfs_global_auth_tok { #define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 +#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002 u32 flags; struct list_head mount_crypt_stat_list; struct key *global_auth_tok_key; @@ -696,7 +697,7 @@ ecryptfs_write_header_metadata(char *virt, int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); int ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, - char *sig); + char *sig, u32 global_auth_tok_flags); int ecryptfs_get_global_auth_tok_for_sig( struct ecryptfs_global_auth_tok **global_auth_tok, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index ff539420cc6f..e4a6223c3145 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -2375,7 +2375,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache; int 
ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, - char *sig) + char *sig, u32 global_auth_tok_flags) { struct ecryptfs_global_auth_tok *new_auth_tok; int rc = 0; @@ -2389,6 +2389,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, goto out; } memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_auth_tok->flags = global_auth_tok_flags; new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); list_add(&new_auth_tok->mount_crypt_stat_list, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 789cf2e1be1e..aed56c25539b 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) case ecryptfs_opt_ecryptfs_sig: sig_src = args[0].from; rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, - sig_src); + sig_src, 0); if (rc) { printk(KERN_ERR "Error attempting to register " "global sig; rc = [%d]\n", rc); @@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) ECRYPTFS_SIG_SIZE_HEX] = '\0'; rc = ecryptfs_add_global_auth_tok( mount_crypt_stat, - mount_crypt_stat->global_default_fnek_sig); + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_AUTH_TOK_FNEK); if (rc) { printk(KERN_ERR "Error attempting to register " "global fnek sig [%s]; rc = [%d]\n", -- cgit v1.2.2 From 87092698c665e0a358caf9825ae13114343027e8 Mon Sep 17 00:00:00 2001 From: un'ichi Nomura Date: Mon, 9 Mar 2009 10:40:52 +0100 Subject: block: Add gfp_mask parameter to bio_integrity_clone() Stricter gfp_mask might be required for clone allocation. For example, request-based dm may clone bio in interrupt context so it has to use GFP_ATOMIC. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Acked-by: Martin K. 
Petersen Cc: Alasdair G Kergon Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 5 +++-- fs/bio.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 549b0144da11..fe2b1aa2464e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -685,19 +685,20 @@ EXPORT_SYMBOL(bio_integrity_split); * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio + * @gfp_mask: Memory allocation mask * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - struct bio_set *bs) + gfp_t gfp_mask, struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); if (bip == NULL) return -EIO; diff --git a/fs/bio.c b/fs/bio.c index 124b95c4d582..cf747378b977 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -463,7 +463,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); if (ret < 0) return NULL; -- cgit v1.2.2 From 059ea3318c8ede71851a52b4359fbf1ab0cec301 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Mar 2009 10:42:45 +0100 Subject: block: fix memory leak in bio_clone() If bio_integrity_clone() fails, bio_clone() returns NULL without freeing the newly allocated bio. 
Signed-off-by: Li Zefan Signed-off-by: Jens Axboe --- fs/bio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index cf747378b977..d4f06327c810 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -465,8 +465,10 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); - if (ret < 0) + if (ret < 0) { + bio_put(b); return NULL; + } } return b; -- cgit v1.2.2 From ee6f779b9e0851e2f7da292a9f58e0095edf615a Mon Sep 17 00:00:00 2001 From: Zhang Le Date: Mon, 16 Mar 2009 14:44:31 +0800 Subject: filp->f_pos not correctly updated in proc_task_readdir filp->f_pos only gets updated at the end of the function. Thus d_off of those dirents that are in the middle will be 0, and this will cause a problem in glibc's readdir implementation, specifically an endless loop. Because when overflow occurs, f_pos will be set to next dirent to read, however it will be 0, unless the next one is the last one. So it will start over again and again. There is a sample program in man 2 getdents. This is the output of the program running on a multithread program's task dir before this patch is applied: $ ./a.out /proc/3807/task --------------- nread=128 --------------- i-node# file type d_reclen d_off d_name 506442 directory 16 1 . 506441 directory 16 0 .. 506443 directory 16 0 3807 506444 directory 16 0 3809 506445 directory 16 0 3812 506446 directory 16 0 3861 506447 directory 16 0 3862 506448 directory 16 8 3863 This is the output after this patch is applied $ ./a.out /proc/3807/task --------------- nread=128 --------------- i-node# file type d_reclen d_off d_name 506442 directory 16 1 . 506441 directory 16 2 ..
506443 directory 16 3 3807 506444 directory 16 4 3809 506445 directory 16 5 3812 506446 directory 16 6 3861 506447 directory 16 7 3862 506448 directory 16 8 3863 Signed-off-by: Zhang Le Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/proc/base.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0c9de19a1633..cc6ea2329e71 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3066,7 +3066,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi int retval = -ENOENT; ino_t ino; int tid; - unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ struct pid_namespace *ns; task = get_proc_task(inode); @@ -3083,18 +3082,18 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi goto out_no_task; retval = 0; - switch (pos) { + switch (filp->f_pos) { case 0: ino = inode->i_ino; - if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ case 1: ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ } @@ -3104,9 +3103,9 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi ns = filp->f_dentry->d_sb->s_fs_info; tid = (int)filp->f_version; filp->f_version = 0; - for (task = first_tid(leader, tid, pos - 2, ns); + for (task = first_tid(leader, tid, filp->f_pos - 2, ns); task; - task = next_tid(task), pos++) { + task = next_tid(task), filp->f_pos++) { tid = task_pid_nr_ns(task, ns); if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { /* returning this tgid failed, save it as the first @@ -3117,7 +3116,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi } } out: - filp->f_pos = pos; 
put_task_struct(leader); out_no_task: return retval; -- cgit v1.2.2 From d33a1976fbee1ee321d6f014333d8f03a39d526c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 16 Mar 2009 23:25:40 -0400 Subject: ext4: fix bb_prealloc_list corruption due to wrong group locking This is for Red Hat bug 490026: EXT4 panic, list corruption in ext4_mb_new_inode_pa ext4_lock_group(sb, group) is supposed to protect this list for each group, and a common code flow to remove an album is like this: ext4_get_group_no_and_offset(sb, pa->pa_pstart, &grp, NULL); ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); so it's critical that we get the right group number back for this prealloc context, to lock the right group (the one associated with this pa) and prevent concurrent list manipulation. however, ext4_mb_put_pa() passes in (pa->pa_pstart - 1) with a comment, "-1 is to protect from crossing allocation group". This makes sense for the group_pa, where pa_pstart is advanced by the length which has been used (in ext4_mb_release_context()), and when the entire length has been used, pa_pstart has been advanced to the first block of the next group. However, for inode_pa, pa_pstart is never advanced; it's just set once to the first block in the group and not moved after that. So in this case, if we subtract one in ext4_mb_put_pa(), we are actually locking the *previous* group, and opening the race with the other threads which do not subtract off the extra block. 
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 41f4348b62f5..9f61e62f435f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3589,6 +3589,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; + ext4_fsblk_t grp_blk; if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) return; @@ -3603,8 +3604,12 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); - /* -1 is to protect from crossing allocation group */ - ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); + grp_blk = pa->pa_pstart; + /* If linear, pa_pstart may be in the next group when pa is used up */ + if (pa->pa_linear) + grp_blk--; + + ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); /* * possible race: -- cgit v1.2.2 From ee568b25ee9e160b32d1aef73d8b2ee9c05d34db Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Mar 2009 10:02:35 -0700 Subject: Avoid 64-bit "switch()" statements on 32-bit architectures Commit ee6f779b9e0851e2f7da292a9f58e0095edf615a ("filp->f_pos not correctly updated in proc_task_readdir") changed the proc code to use filp->f_pos directly, rather than through a temporary variable. In the process, that caused the operations to be done on the full 64 bits, even though the offset is never that big. That's all fine and dandy per se, but for some unfathomable reason gcc generates absolutely horrid code when using 64-bit values in switch() statements. To the point of actually calling out to gcc helper functions like __cmpdi2 rather than just doing the trivial comparisons directly the way gcc does for normal compares. At which point we get link failures, because we really don't want to support that kind of crazy code. 
Fix this by just casting the f_pos value to "unsigned long", which is plenty big enough for /proc, and avoids the gcc code generation issue. Reported-by: Alexey Dobriyan Cc: Zhang Le Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index cc6ea2329e71..beaa0ce3b82e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3082,7 +3082,7 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi goto out_no_task; retval = 0; - switch (filp->f_pos) { + switch ((unsigned long)filp->f_pos) { case 0: ino = inode->i_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) -- cgit v1.2.2 From 84f09f46b4ee9e4e9b6381f8af31817516d2091b Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 4 Mar 2009 23:05:35 +0200 Subject: NFSD: provide encode routine for OP_OPENATTR Although this operation is unsupported by our implementation we still need to provide an encode routine for it to merely encode its (error) status back in the compound reply. Thanks to Bill Baker at sun.com for testing with the Sun OpenSolaris' client, finding, and reporting this bug at Connectathon 2009. This bug was introduced in 2.6.27. Signed-off-by: Benny Halevy Cc: stable@kernel.org Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4xdr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f65953be39c0..9250067943d8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2596,6 +2596,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, -- cgit v1.2.2 From a8e7d49aa7be728c4ae241a75a2a124cdcabc0c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 19 Mar 2009 11:32:05 -0700 Subject: Fix race in create_empty_buffers() vs __set_page_dirty_buffers() Nick Piggin noticed this (very unlikely) race between setting a page dirty and creating the buffers for it - we need to hold the mapping private_lock until we've set the page dirty bit in order to make sure that create_empty_buffers() might not build up a set of buffers without the dirty bits set when the page is dirty. I doubt anybody has ever hit this race (and it didn't solve the issue Nick was looking at), but as Nick says: "Still, it does appear to solve a real race, which we should close." Acked-by: Nick Piggin Signed-off-by: Linus Torvalds --- fs/buffer.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 9f697419ed8e..891e1c78e4f1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -760,15 +760,9 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. 
*/ -static int __set_page_dirty(struct page *page, +static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - if (unlikely(!mapping)) - return !TestSetPageDirty(page); - - if (TestSetPageDirty(page)) - return 0; - spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); @@ -785,8 +779,6 @@ static int __set_page_dirty(struct page *page, } spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - - return 1; } /* @@ -816,6 +808,7 @@ static int __set_page_dirty(struct page *page, */ int __set_page_dirty_buffers(struct page *page) { + int newly_dirty; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -831,9 +824,12 @@ int __set_page_dirty_buffers(struct page *page) bh = bh->b_this_page; } while (bh != head); } + newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); - return __set_page_dirty(page, mapping, 1); + if (newly_dirty) + __set_page_dirty(page, mapping, 1); + return newly_dirty; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1262,8 +1258,11 @@ void mark_buffer_dirty(struct buffer_head *bh) return; } - if (!test_set_buffer_dirty(bh)) - __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); + if (!test_set_buffer_dirty(bh)) { + struct page *page = bh->b_page; + if (!TestSetPageDirty(page)) + __set_page_dirty(page, page_mapping(page), 0); + } } /* -- cgit v1.2.2 From 87c3a86e1c220121d0ced59d1a71e78ed9abc6dd Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 18 Mar 2009 17:04:19 -0700 Subject: eventfd: remove fput() call from possible IRQ context Remove a source of fput() call from inside IRQ context. Myself, like Eric, wasn't able to reproduce an fput() call from IRQ context, but Jeff said he was able to, with the attached test program. Independently from this, the bug is conceptually there, so we might be better off fixing it. 
This patch adds an optimization similar to the one we already do on ->ki_filp, on ->ki_eventfd. Playing with ->f_count directly is not pretty in general, but the alternative here would be to add a brand new delayed fput() infrastructure, that I'm not sure is worth it. Signed-off-by: Davide Libenzi Cc: Benjamin LaHaise Cc: Trond Myklebust Cc: Eric Dumazet Signed-off-by: Jeff Moyer Cc: Zach Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 8fa77e233944..4a9d4d641fb9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -443,7 +443,7 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) req->private = NULL; req->ki_iovec = NULL; INIT_LIST_HEAD(&req->ki_run_list); - req->ki_eventfd = ERR_PTR(-EINVAL); + req->ki_eventfd = NULL; /* Check if the completion queue has enough free space to * accept an event from this io. @@ -485,8 +485,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) { assert_spin_locked(&ctx->ctx_lock); - if (!IS_ERR(req->ki_eventfd)) - fput(req->ki_eventfd); if (req->ki_dtor) req->ki_dtor(req); if (req->ki_iovec != &req->ki_inline_vec) @@ -508,8 +506,11 @@ static void aio_fput_routine(struct work_struct *data) list_del(&req->ki_list); spin_unlock_irq(&fput_lock); - /* Complete the fput */ - __fput(req->ki_filp); + /* Complete the fput(s) */ + if (req->ki_filp != NULL) + __fput(req->ki_filp); + if (req->ki_eventfd != NULL) + __fput(req->ki_eventfd); /* Link the iocb into the context's free list */ spin_lock_irq(&ctx->ctx_lock); @@ -527,12 +528,14 @@ static void aio_fput_routine(struct work_struct *data) */ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) { + int schedule_putreq = 0; + dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", req, atomic_long_read(&req->ki_filp->f_count)); assert_spin_locked(&ctx->ctx_lock); - req->ki_users --; + 
req->ki_users--; BUG_ON(req->ki_users < 0); if (likely(req->ki_users)) return 0; @@ -540,10 +543,23 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) req->ki_cancel = NULL; req->ki_retry = NULL; - /* Must be done under the lock to serialise against cancellation. - * Call this aio_fput as it duplicates fput via the fput_work. + /* + * Try to optimize the aio and eventfd file* puts, by avoiding to + * schedule work in case it is not __fput() time. In normal cases, + * we would not be holding the last reference to the file*, so + * this function will be executed w/out any aio kthread wakeup. */ - if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) + schedule_putreq++; + else + req->ki_filp = NULL; + if (req->ki_eventfd != NULL) { + if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count))) + schedule_putreq++; + else + req->ki_eventfd = NULL; + } + if (unlikely(schedule_putreq)) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); @@ -1009,7 +1025,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (!IS_ERR(iocb->ki_eventfd)) + if (iocb->ki_eventfd != NULL) eventfd_signal(iocb->ki_eventfd, 1); put_rq: @@ -1608,6 +1624,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); + req->ki_eventfd = NULL; goto out_put_req; } } -- cgit v1.2.2 From 65c24491b4fef017c64e39ec64384fde5e05e0a0 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 18 Mar 2009 17:04:21 -0700 Subject: aio: lookup_ioctx can return the wrong value when looking up a bogus context The libaio test harness turned up a problem whereby lookup_ioctx on a bogus io context was returning the 1 valid io context from the list (harness/cases/3.p). 
Because of that, an extra put_iocontext was done, and when the process exited, it hit a BUG_ON in the put_iocontext macro called from exit_aio (since we expect a users count of 1 and instead get 0). The problem was introduced by "aio: make the lookup_ioctx() lockless" (commit abf137dd7712132ee56d5b3143c2ff61a72a5faa). Thanks to Zach for pointing out that hlist_for_each_entry_rcu will not return with a NULL tpos at the end of the loop, even if the entry was not found. Signed-off-by: Jeff Moyer Acked-by: Zach Brown Acked-by: Jens Axboe Cc: Benjamin LaHaise Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 4a9d4d641fb9..76da12537956 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -587,7 +587,7 @@ int aio_put_req(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct mm_struct *mm = current->mm; - struct kioctx *ctx = NULL; + struct kioctx *ctx, *ret = NULL; struct hlist_node *n; rcu_read_lock(); @@ -595,12 +595,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { if (ctx->user_id == ctx_id && !ctx->dead) { get_ioctx(ctx); + ret = ctx; break; } } rcu_read_unlock(); - return ctx; + return ret; } /* -- cgit v1.2.2 From 8faece5f906725c10e7a1f6caf84452abadbdc7b Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Mar 2009 01:25:09 -0500 Subject: eCryptfs: Allocate a variable number of pages for file headers When allocating the memory used to store the eCryptfs header contents, a single, zeroed page was being allocated with get_zeroed_page(). However, the size of an eCryptfs header is either PAGE_CACHE_SIZE or ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE (8192), whichever is larger, and is stored in the file's private_data->crypt_stat->num_header_bytes_at_front field. 
ecryptfs_write_metadata_to_contents() was using num_header_bytes_at_front to decide how many bytes should be written to the lower filesystem for the file header. Unfortunately, at least 8K was being written from the page, despite the chance of the single, zeroed page being smaller than 8K. This resulted in random areas of kernel memory being written between the 0x1000 and 0x1FFF bytes offsets in the eCryptfs file headers if PAGE_SIZE was 4K. This patch allocates a variable number of pages, calculated with num_header_bytes_at_front, and passes the number of allocated pages along to ecryptfs_write_metadata_to_contents(). Thanks to Florian Streibelt for reporting the data leak and working with me to find the problem. 2.6.28 is the only kernel release with this vulnerability. Corresponds to CVE-2009-0787 Signed-off-by: Tyler Hicks Acked-by: Dustin Kirkland Reviewed-by: Eric Sandeen Reviewed-by: Eugene Teo Cc: Greg KH Cc: dann frazier Cc: Serge E. Hallyn Cc: Florian Streibelt Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index bdca1f4b3a3e..75bee99de0f6 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1324,14 +1324,13 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, } static int -ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, - struct dentry *ecryptfs_dentry, - char *virt) +ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, + char *virt, size_t virt_len) { int rc; rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, - 0, crypt_stat->num_header_bytes_at_front); + 0, virt_len); if (rc) printk(KERN_ERR "%s: Error attempting to write header " "information to lower file; rc = [%d]\n", __func__, @@ -1341,7 +1340,6 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, 
static int ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, - struct ecryptfs_crypt_stat *crypt_stat, char *page_virt, size_t size) { int rc; @@ -1351,6 +1349,17 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, return rc; } +static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (page) + return (unsigned long) page_address(page); + return 0; +} + /** * ecryptfs_write_metadata * @ecryptfs_dentry: The eCryptfs dentry @@ -1367,7 +1376,9 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + unsigned int order; char *virt; + size_t virt_len; size_t size = 0; int rc = 0; @@ -1383,33 +1394,35 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = -EINVAL; goto out; } + virt_len = crypt_stat->num_header_bytes_at_front; + order = get_order(virt_len); /* Released in this function */ - virt = (char *)get_zeroed_page(GFP_KERNEL); + virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); if (!virt) { printk(KERN_ERR "%s: Out of memory\n", __func__); rc = -ENOMEM; goto out; } - rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size, - crypt_stat, ecryptfs_dentry); + rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, + ecryptfs_dentry); if (unlikely(rc)) { printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", __func__, rc); goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, - crypt_stat, virt, size); + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, + size); else - rc = ecryptfs_write_metadata_to_contents(crypt_stat, - ecryptfs_dentry, virt); + rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + virt_len); if (rc) { printk(KERN_ERR "%s: Error writing metadata out 
to lower file; " "rc = [%d]\n", __func__, rc); goto out_free; } out_free: - free_page((unsigned long)virt); + free_pages((unsigned long)virt, order); out: return rc; } -- cgit v1.2.2 From 2aac0cf88681bfa092f731553bc7fbd23516be73 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Mar 2009 02:23:57 -0500 Subject: eCryptfs: NULL crypt_stat dereference during lookup If ecryptfs_encrypted_view or ecryptfs_xattr_metadata were being specified as mount options, a NULL pointer dereference of crypt_stat was possible during lookup. This patch moves the crypt_stat assignment into ecryptfs_lookup_and_interpose_lower(), ensuring that crypt_stat will not be NULL before we attempt to dereference it. Thanks to Dan Carpenter and his static analysis tool, smatch, for finding this bug. Signed-off-by: Tyler Hicks Acked-by: Dustin Kirkland Cc: Dan Carpenter Cc: Serge Hallyn Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 10 ++++++---- fs/ecryptfs/ecryptfs_kernel.h | 1 - fs/ecryptfs/inode.c | 32 ++++++++++++-------------------- 3 files changed, 18 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 75bee99de0f6..8b65f289ee00 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -2221,17 +2221,19 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, struct dentry *ecryptfs_dir_dentry, const char *name, size_t name_size) { + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; char *decoded_name; size_t decoded_name_size; size_t packet_size; int rc = 0; - if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, 
ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { - struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - &ecryptfs_superblock_to_private( - ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; const char *orig_name = name; size_t orig_name_size = name_size; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index eb2267eca1fe..ac749d4d644f 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -620,7 +620,6 @@ int ecryptfs_interpose(struct dentry *hidden_dentry, u32 flags); int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct ecryptfs_crypt_stat *crypt_stat, struct inode *ecryptfs_dir_inode, struct nameidata *ecryptfs_nd); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5697899a168d..55b3145b8072 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -246,7 +246,6 @@ out: */ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct ecryptfs_crypt_stat *crypt_stat, struct inode *ecryptfs_dir_inode, struct nameidata *ecryptfs_nd) { @@ -254,6 +253,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct vfsmount *lower_mnt; struct inode *lower_inode; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; char *page_virt = NULL; u64 file_size; int rc = 0; @@ -314,6 +314,11 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, goto out_free_kmem; } } + crypt_stat = &ecryptfs_inode_to_private( + ecryptfs_dentry->d_inode)->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); rc = ecryptfs_read_and_validate_header_region(page_virt, ecryptfs_dentry->d_inode); if (rc) { @@ -362,9 +367,7 @@ static struct dentry *ecryptfs_lookup(struct inode 
*ecryptfs_dir_inode, { char *encrypted_and_encoded_name = NULL; size_t encrypted_and_encoded_name_size; - struct ecryptfs_crypt_stat *crypt_stat = NULL; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; - struct ecryptfs_inode_info *inode_info; struct dentry *lower_dir_dentry, *lower_dentry; int rc = 0; @@ -388,26 +391,15 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, } if (lower_dentry->d_inode) goto lookup_and_interpose; - inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); - if (inode_info) { - crypt_stat = &inode_info->crypt_stat; - /* TODO: lock for crypt_stat comparison */ - if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) - ecryptfs_set_default_sizes(crypt_stat); - } - if (crypt_stat) - mount_crypt_stat = crypt_stat->mount_crypt_stat; - else - mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; - if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) - && !(mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(mount_crypt_stat + && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) goto lookup_and_interpose; dput(lower_dentry); rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, - crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, + NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, ecryptfs_dentry->d_name.len); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " @@ -426,7 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, } lookup_and_interpose: rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, - crypt_stat, ecryptfs_dir_inode, + ecryptfs_dir_inode, ecryptfs_nd); goto out; out_d_drop: -- cgit v1.2.2 From f762dd68218665bb87d4e4a0eeac86fde7530293 Mon Sep 17 
00:00:00 2001 From: Gertjan van Wingerde Date: Sat, 21 Mar 2009 23:18:57 +0100 Subject: Update my email address Update all previous incarnations of my email address to the correct one. Signed-off-by: Gertjan van Wingerde Signed-off-by: Linus Torvalds --- fs/minix/inode.c | 2 +- fs/ufs/super.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/minix/inode.c b/fs/minix/inode.c index d1d1eb84679d..618865b3128b 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -3,7 +3,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Copyright (C) 1996 Gertjan van Wingerde (gertjan@cs.vu.nl) + * Copyright (C) 1996 Gertjan van Wingerde * Minix V2 fs support. * * Modified for 680x0 by Andreas Schwab diff --git a/fs/ufs/super.c b/fs/ufs/super.c index e65212dfb60e..261a1c2f22dd 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -41,7 +41,7 @@ * Stefan Reinauer * * Module usage counts added on 96/04/29 by - * Gertjan van Wingerde + * Gertjan van Wingerde * * Clean swab support on 19970406 by * Francois-Rene Rideau -- cgit v1.2.2 From 6f04c1c7fe9566d777fb7961391690866839e722 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 6 Jan 2009 11:52:25 +0000 Subject: GFS2: Fix remount argument parsing The following patch fixes an issue relating to remount and argument parsing. After this fix is applied, remount becomes atomic in that it either succeeds changing the mount to the new state, or it fails and leaves it in the old state. Previously it was possible for the parsing of options to fail part way through and for the fs to be left in a state where some of the new arguments had been applied, but some had not.
Signed-off-by: Steven Whitehouse --- fs/gfs2/mount.c | 111 +++++++++------------------------------------------ fs/gfs2/mount.h | 17 -------- fs/gfs2/ops_fstype.c | 11 ++++- fs/gfs2/ops_super.c | 41 ++++++++++++++----- fs/gfs2/super.h | 26 ++++++------ 5 files changed, 73 insertions(+), 133 deletions(-) delete mode 100644 fs/gfs2/mount.h (limited to 'fs') diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index 3cb0a44ba023..3524ae81189b 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -17,7 +17,7 @@ #include "gfs2.h" #include "incore.h" -#include "mount.h" +#include "super.h" #include "sys.h" #include "util.h" @@ -77,101 +77,46 @@ static const match_table_t tokens = { * Return: errno */ -int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) +int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) { - struct gfs2_args *args = &sdp->sd_args; - char *data = data_arg; - char *options, *o, *v; - int error = 0; - - if (!remount) { - /* Set some defaults */ - args->ar_quota = GFS2_QUOTA_DEFAULT; - args->ar_data = GFS2_DATA_DEFAULT; - } + char *o; + int token; + substring_t tmp[MAX_OPT_ARGS]; /* Split the options into tokens with the "," character and process them */ - for (options = data; (o = strsep(&options, ",")); ) { - int token; - substring_t tmp[MAX_OPT_ARGS]; - - if (!*o) + while (1) { + o = strsep(&options, ","); + if (o == NULL) + break; + if (*o == '\0') continue; token = match_token(o, tokens, tmp); switch (token) { case Opt_lockproto: - v = match_strdup(&tmp[0]); - if (!v) { - fs_info(sdp, "no memory for lockproto\n"); - error = -ENOMEM; - goto out_error; - } - - if (remount && strcmp(v, args->ar_lockproto)) { - kfree(v); - goto cant_remount; - } - - strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN); - args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0; - kfree(v); + match_strlcpy(args->ar_lockproto, &tmp[0], + GFS2_LOCKNAME_LEN); break; case Opt_locktable: - v = match_strdup(&tmp[0]); - if (!v) { - fs_info(sdp, "no memory 
for locktable\n"); - error = -ENOMEM; - goto out_error; - } - - if (remount && strcmp(v, args->ar_locktable)) { - kfree(v); - goto cant_remount; - } - - strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN); - args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0; - kfree(v); + match_strlcpy(args->ar_locktable, &tmp[0], + GFS2_LOCKNAME_LEN); break; case Opt_hostdata: - v = match_strdup(&tmp[0]); - if (!v) { - fs_info(sdp, "no memory for hostdata\n"); - error = -ENOMEM; - goto out_error; - } - - if (remount && strcmp(v, args->ar_hostdata)) { - kfree(v); - goto cant_remount; - } - - strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN); - args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0; - kfree(v); + match_strlcpy(args->ar_hostdata, &tmp[0], + GFS2_LOCKNAME_LEN); break; case Opt_spectator: - if (remount && !args->ar_spectator) - goto cant_remount; args->ar_spectator = 1; - sdp->sd_vfs->s_flags |= MS_RDONLY; break; case Opt_ignore_local_fs: - if (remount && !args->ar_ignore_local_fs) - goto cant_remount; args->ar_ignore_local_fs = 1; break; case Opt_localflocks: - if (remount && !args->ar_localflocks) - goto cant_remount; args->ar_localflocks = 1; break; case Opt_localcaching: - if (remount && !args->ar_localcaching) - goto cant_remount; args->ar_localcaching = 1; break; case Opt_debug: @@ -181,17 +126,13 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) args->ar_debug = 0; break; case Opt_upgrade: - if (remount && !args->ar_upgrade) - goto cant_remount; args->ar_upgrade = 1; break; case Opt_acl: args->ar_posix_acl = 1; - sdp->sd_vfs->s_flags |= MS_POSIXACL; break; case Opt_noacl: args->ar_posix_acl = 0; - sdp->sd_vfs->s_flags &= ~MS_POSIXACL; break; case Opt_quota_off: args->ar_quota = GFS2_QUOTA_OFF; @@ -215,29 +156,15 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) args->ar_data = GFS2_DATA_ORDERED; break; case Opt_meta: - if (remount && args->ar_meta != 1) - goto cant_remount; args->ar_meta = 1; break; case Opt_err: default: - 
fs_info(sdp, "unknown option: %s\n", o); - error = -EINVAL; - goto out_error; + fs_info(sdp, "invalid mount option: %s\n", o); + return -EINVAL; } } -out_error: - if (error) - fs_info(sdp, "invalid mount option(s)\n"); - - if (data != data_arg) - kfree(data); - - return error; - -cant_remount: - fs_info(sdp, "can't remount with option %s\n", o); - return -EINVAL; + return 0; } diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h deleted file mode 100644 index 401288acfdf3..000000000000 --- a/fs/gfs2/mount.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __MOUNT_DOT_H__ -#define __MOUNT_DOT_H__ - -struct gfs2_sbd; - -int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount); - -#endif /* __MOUNT_DOT_H__ */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index f91eebdde581..3eb49edae542 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -25,7 +25,6 @@ #include "glock.h" #include "glops.h" #include "inode.h" -#include "mount.h" #include "recovery.h" #include "rgrp.h" #include "super.h" @@ -1116,12 +1115,20 @@ static int fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; } - error = gfs2_mount_args(sdp, (char *)data, 0); + sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; + sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; + + error = gfs2_mount_args(sdp, &sdp->sd_args, data); if (error) { printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); goto fail; } + if (sdp->sd_args.ar_spectator) + sb->s_flags |= MS_RDONLY; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_export_op = &gfs2_export_ops; diff --git 
a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 320323d03479..f0699ac453f7 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -27,7 +27,6 @@ #include "glock.h" #include "inode.h" #include "log.h" -#include "mount.h" #include "quota.h" #include "recovery.h" #include "rgrp.h" @@ -40,6 +39,8 @@ #include "bmap.h" #include "meta_io.h" +#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) + /** * gfs2_write_inode - Make sure the inode is stable on the disk * @inode: The inode @@ -435,25 +436,45 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) { struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args args = sdp->sd_args; /* Default to current settings */ int error; - error = gfs2_mount_args(sdp, data, 1); + error = gfs2_mount_args(sdp, &args, data); if (error) return error; + /* Not allowed to change locking details */ + if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || + strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || + strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) + return -EINVAL; + + /* Some flags must not be changed */ + if (args_neq(&args, &sdp->sd_args, spectator) || + args_neq(&args, &sdp->sd_args, ignore_local_fs) || + args_neq(&args, &sdp->sd_args, localflocks) || + args_neq(&args, &sdp->sd_args, localcaching) || + args_neq(&args, &sdp->sd_args, meta)) + return -EINVAL; + if (sdp->sd_args.ar_spectator) *flags |= MS_RDONLY; - else { - if (*flags & MS_RDONLY) { - if (!(sb->s_flags & MS_RDONLY)) - error = gfs2_make_fs_ro(sdp); - } else if (!(*flags & MS_RDONLY) && - (sb->s_flags & MS_RDONLY)) { + + if ((sb->s_flags ^ *flags) & MS_RDONLY) { + if (*flags & MS_RDONLY) + error = gfs2_make_fs_ro(sdp); + else error = gfs2_make_fs_rw(sdp); - } + if (error) + return error; } - return error; + sdp->sd_args = args; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + else + sb->s_flags &= ~MS_POSIXACL; + return 0; } /** diff 
--git a/fs/gfs2/super.h b/fs/gfs2/super.h index f6b8b00ad881..91abdbedcc86 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -14,7 +14,7 @@ #include #include "incore.h" -void gfs2_lm_unmount(struct gfs2_sbd *sdp); +extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) { @@ -27,21 +27,23 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) void gfs2_jindex_free(struct gfs2_sbd *sdp); -struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); -int gfs2_jdesc_check(struct gfs2_jdesc *jd); +extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); -int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, - struct gfs2_inode **ipp); +extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); -int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); -int gfs2_statfs_init(struct gfs2_sbd *sdp); -void gfs2_statfs_change(struct gfs2_sbd *sdp, - s64 total, s64 free, s64 dinodes); -int gfs2_statfs_sync(struct gfs2_sbd *sdp); +extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); -int gfs2_freeze_fs(struct gfs2_sbd *sdp); -void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); +extern int gfs2_statfs_init(struct gfs2_sbd *sdp); +extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes); +extern int gfs2_statfs_sync(struct gfs2_sbd *sdp); + +extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); +extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); extern struct file_system_type gfs2_fs_type; extern struct file_system_type gfs2meta_fs_type; -- cgit v1.2.2 From 2db2aac255c38e75ad17c0b24feb589ccfccc0ae Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Wed, 7 Jan 2009 10:21:34 -0600 Subject: GFS2: Bring back lvb-related stuff to lock_nolock to support quotas The quota 
code uses lvbs and this is currently not implemented in lock_nolock, thereby causing panics when quota is enabled with lock_nolock. This patch adds the relevant bits. Signed-off-by: Abhijith Das Signed-off-by: Steven Whitehouse --- fs/gfs2/locking.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c index 523243a13a21..d3657bc7938a 100644 --- a/fs/gfs2/locking.c +++ b/fs/gfs2/locking.c @@ -23,11 +23,74 @@ struct lmh_wrapper { const struct lm_lockops *lw_ops; }; +struct nolock_lockspace { + unsigned int nl_lvb_size; +}; + +/** + * nolock_get_lock - get a lm_lock_t given a descripton of the lock + * @lockspace: the lockspace the lock lives in + * @name: the name of the lock + * @lockp: return the lm_lock_t here + * + * Returns: 0 on success, -EXXX on failure + */ + +static int nolock_get_lock(void *lockspace, struct lm_lockname *name, + void **lockp) +{ + *lockp = lockspace; + return 0; +} + +/** + * nolock_put_lock - get rid of a lock structure + * @lock: the lock to throw away + * + */ + +static void nolock_put_lock(void *lock) +{ +} + +/** + * nolock_hold_lvb - hold on to a lock value block + * @lock: the lock the LVB is associated with + * @lvbp: return the lm_lvb_t here + * + * Returns: 0 on success, -EXXX on failure + */ + +static int nolock_hold_lvb(void *lock, char **lvbp) +{ + struct nolock_lockspace *nl = lock; + int error = 0; + + *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL); + if (!*lvbp) + error = -ENOMEM; + + return error; +} + +/** + * nolock_unhold_lvb - release a LVB + * @lock: the lock the LVB is associated with + * @lvb: the lock value block + * + */ + +static void nolock_unhold_lvb(void *lock, char *lvb) +{ + kfree(lvb); +} + static int nolock_mount(char *table_name, char *host_data, lm_callback_t cb, void *cb_data, unsigned int min_lvb_size, int flags, struct lm_lockstruct *lockstruct, struct kobject *fskobj); +static void 
nolock_unmount(void *lockspace); /* List of registered low-level locking protocols. A file system selects one of them by name at mount time, e.g. lock_nolock, lock_dlm. */ @@ -35,6 +98,11 @@ static int nolock_mount(char *table_name, char *host_data, static const struct lm_lockops nolock_ops = { .lm_proto_name = "lock_nolock", .lm_mount = nolock_mount, + .lm_unmount = nolock_unmount, + .lm_get_lock = nolock_get_lock, + .lm_put_lock = nolock_put_lock, + .lm_hold_lvb = nolock_hold_lvb, + .lm_unhold_lvb = nolock_unhold_lvb, }; static struct lmh_wrapper nolock_proto = { @@ -53,6 +121,7 @@ static int nolock_mount(char *table_name, char *host_data, { char *c; unsigned int jid; + struct nolock_lockspace *nl; c = strstr(host_data, "jid="); if (!c) @@ -62,15 +131,28 @@ static int nolock_mount(char *table_name, char *host_data, sscanf(c, "%u", &jid); } + nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); + if (!nl) + return -ENOMEM; + + nl->nl_lvb_size = min_lvb_size; + lockstruct->ls_jid = jid; lockstruct->ls_first = 1; lockstruct->ls_lvb_size = min_lvb_size; + lockstruct->ls_lockspace = nl; lockstruct->ls_ops = &nolock_ops; lockstruct->ls_flags = LM_LSFLAG_LOCAL; return 0; } +static void nolock_unmount(void *lockspace) +{ + struct nolock_lockspace *nl = lockspace; + kfree(nl); +} + /** * gfs2_register_lockproto - Register a low-level locking protocol * @proto: the protocol definition -- cgit v1.2.2 From 0a7ab79c5b5a16035e09b466c9013c8afc3b4bff Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Wed, 7 Jan 2009 16:03:37 -0600 Subject: GFS2: change gfs2_quota_scan into a shrinker Deallocation of gfs2_quota_data objects now happens on-demand through a shrinker instead of routinely deallocating through the quotad daemon. 
Signed-off-by: Abhijith Das Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 6 +- fs/gfs2/main.c | 10 +++ fs/gfs2/ops_address.c | 1 + fs/gfs2/ops_fstype.c | 1 - fs/gfs2/quota.c | 165 ++++++++++++++++++++++++++++++-------------------- fs/gfs2/quota.h | 2 + fs/gfs2/sys.c | 2 - 7 files changed, 114 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 608849d00021..592aa5040d29 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -283,7 +283,9 @@ enum { struct gfs2_quota_data { struct list_head qd_list; - unsigned int qd_count; + struct list_head qd_reclaim; + + atomic_t qd_count; u32 qd_id; unsigned long qd_flags; /* QDF_... */ @@ -303,7 +305,6 @@ struct gfs2_quota_data { u64 qd_sync_gen; unsigned long qd_last_warn; - unsigned long qd_last_touched; }; struct gfs2_trans { @@ -406,7 +407,6 @@ struct gfs2_tune { unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ unsigned int gt_quota_scale_num; /* Numerator */ unsigned int gt_quota_scale_den; /* Denominator */ - unsigned int gt_quota_cache_secs; unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ unsigned int gt_new_files_jdata; unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 7cacfde32194..86fe06798711 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -23,6 +23,12 @@ #include "sys.h" #include "util.h" #include "glock.h" +#include "quota.h" + +static struct shrinker qd_shrinker = { + .shrink = gfs2_shrink_qd_memory, + .seeks = DEFAULT_SEEKS, +}; static void gfs2_init_inode_once(void *foo) { @@ -100,6 +106,8 @@ static int __init init_gfs2_fs(void) if (!gfs2_quotad_cachep) goto fail; + register_shrinker(&qd_shrinker); + error = register_filesystem(&gfs2_fs_type); if (error) goto fail; @@ -117,6 +125,7 @@ static int __init init_gfs2_fs(void) fail_unregister: unregister_filesystem(&gfs2_fs_type); fail: + unregister_shrinker(&qd_shrinker); 
gfs2_glock_exit(); if (gfs2_quotad_cachep) @@ -145,6 +154,7 @@ fail: static void __exit exit_gfs2_fs(void) { + unregister_shrinker(&qd_shrinker); gfs2_glock_exit(); gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 4ddab67867eb..dde4ead2c3be 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -442,6 +442,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) */ if (unlikely(page->index)) { zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); return 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3eb49edae542..530d3f6f6ea8 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -63,7 +63,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_quota_warn_period = 10; gt->gt_quota_scale_num = 1; gt->gt_quota_scale_den = 1; - gt->gt_quota_cache_secs = 300; gt->gt_quota_quantum = 60; gt->gt_new_files_jdata = 0; gt->gt_max_readahead = 1 << 18; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index b08d09696b3e..2ada6e10d07b 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -80,6 +80,53 @@ struct gfs2_quota_change_host { u32 qc_id; }; +static LIST_HEAD(qd_lru_list); +static atomic_t qd_lru_count = ATOMIC_INIT(0); +static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED; + +int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) +{ + struct gfs2_quota_data *qd; + struct gfs2_sbd *sdp; + + if (nr == 0) + goto out; + + if (!(gfp_mask & __GFP_FS)) + return -1; + + spin_lock(&qd_lru_lock); + while (nr && !list_empty(&qd_lru_list)) { + qd = list_entry(qd_lru_list.next, + struct gfs2_quota_data, qd_reclaim); + sdp = qd->qd_gl->gl_sbd; + + /* Free from the filesystem-specific list */ + list_del(&qd->qd_list); + + spin_lock(&sdp->sd_quota_spin); + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_lvb_unhold(qd->qd_gl); + 
spin_unlock(&sdp->sd_quota_spin); + atomic_dec(&sdp->sd_quota_count); + + /* Delete it from the common reclaim list */ + list_del_init(&qd->qd_reclaim); + atomic_dec(&qd_lru_count); + spin_unlock(&qd_lru_lock); + kmem_cache_free(gfs2_quotad_cachep, qd); + spin_lock(&qd_lru_lock); + nr--; + } + spin_unlock(&qd_lru_lock); + +out: + return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100; +} + static u64 qd2offset(struct gfs2_quota_data *qd) { u64 offset; @@ -100,11 +147,12 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, if (!qd) return -ENOMEM; - qd->qd_count = 1; + atomic_set(&qd->qd_count, 1); qd->qd_id = id; if (user) set_bit(QDF_USER, &qd->qd_flags); qd->qd_slot = -1; + INIT_LIST_HEAD(&qd->qd_reclaim); error = gfs2_glock_get(sdp, 2 * (u64)id + !user, &gfs2_quota_glops, CREATE, &qd->qd_gl); @@ -135,11 +183,17 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, for (;;) { found = 0; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { if (qd->qd_id == id && !test_bit(QDF_USER, &qd->qd_flags) == !user) { - qd->qd_count++; + if (!atomic_read(&qd->qd_count) && + !list_empty(&qd->qd_reclaim)) { + /* Remove it from reclaim list */ + list_del_init(&qd->qd_reclaim); + atomic_dec(&qd_lru_count); + } + atomic_inc(&qd->qd_count); found = 1; break; } @@ -155,7 +209,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, new_qd = NULL; } - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); if (qd || !create) { if (new_qd) { @@ -175,21 +229,18 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, static void qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - - spin_lock(&sdp->sd_quota_spin); - gfs2_assert(sdp, qd->qd_count); - qd->qd_count++; - spin_unlock(&sdp->sd_quota_spin); + gfs2_assert(sdp, atomic_read(&qd->qd_count)); + atomic_inc(&qd->qd_count); } static void qd_put(struct 
gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - spin_lock(&sdp->sd_quota_spin); - gfs2_assert(sdp, qd->qd_count); - if (!--qd->qd_count) - qd->qd_last_touched = jiffies; - spin_unlock(&sdp->sd_quota_spin); + if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) { + /* Add to the reclaim list */ + list_add_tail(&qd->qd_reclaim, &qd_lru_list); + atomic_inc(&qd_lru_count); + spin_unlock(&qd_lru_lock); + } } static int slot_get(struct gfs2_quota_data *qd) @@ -330,6 +381,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) if (sdp->sd_vfs->s_flags & MS_RDONLY) return 0; + spin_lock(&qd_lru_lock); spin_lock(&sdp->sd_quota_spin); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { @@ -341,8 +393,8 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) list_move_tail(&qd->qd_list, &sdp->sd_quota_list); set_bit(QDF_LOCKED, &qd->qd_flags); - gfs2_assert_warn(sdp, qd->qd_count); - qd->qd_count++; + gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); + atomic_inc(&qd->qd_count); qd->qd_change_sync = qd->qd_change; gfs2_assert_warn(sdp, qd->qd_slot_count); qd->qd_slot_count++; @@ -355,6 +407,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) qd = NULL; spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); if (qd) { gfs2_assert_warn(sdp, qd->qd_change_sync); @@ -379,24 +432,27 @@ static int qd_trylock(struct gfs2_quota_data *qd) if (sdp->sd_vfs->s_flags & MS_RDONLY) return 0; + spin_lock(&qd_lru_lock); spin_lock(&sdp->sd_quota_spin); if (test_bit(QDF_LOCKED, &qd->qd_flags) || !test_bit(QDF_CHANGE, &qd->qd_flags)) { spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); return 0; } list_move_tail(&qd->qd_list, &sdp->sd_quota_list); set_bit(QDF_LOCKED, &qd->qd_flags); - gfs2_assert_warn(sdp, qd->qd_count); - qd->qd_count++; + gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); + atomic_inc(&qd->qd_count); qd->qd_change_sync = qd->qd_change; gfs2_assert_warn(sdp, 
qd->qd_slot_count); qd->qd_slot_count++; spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); gfs2_assert_warn(sdp, qd->qd_change_sync); if (bh_get(qd)) { @@ -802,8 +858,8 @@ restart: loff_t pos; gfs2_glock_dq_uninit(q_gh); error = gfs2_glock_nq_init(qd->qd_gl, - LM_ST_EXCLUSIVE, GL_NOCACHE, - q_gh); + LM_ST_EXCLUSIVE, GL_NOCACHE, + q_gh); if (error) return error; @@ -820,7 +876,6 @@ restart: gfs2_glock_dq_uninit(&i_gh); - gfs2_quota_in(&q, buf); qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); @@ -1171,13 +1226,14 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) qd->qd_change = qc.qc_change; qd->qd_slot = slot; qd->qd_slot_count = 1; - qd->qd_last_touched = jiffies; + spin_lock(&qd_lru_lock); spin_lock(&sdp->sd_quota_spin); gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + spin_unlock(&sdp->sd_quota_spin); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); found++; } @@ -1197,61 +1253,39 @@ fail: return error; } -static void gfs2_quota_scan(struct gfs2_sbd *sdp) -{ - struct gfs2_quota_data *qd, *safe; - LIST_HEAD(dead); - - spin_lock(&sdp->sd_quota_spin); - list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) { - if (!qd->qd_count && - time_after_eq(jiffies, qd->qd_last_touched + - gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) { - list_move(&qd->qd_list, &dead); - gfs2_assert_warn(sdp, - atomic_read(&sdp->sd_quota_count) > 0); - atomic_dec(&sdp->sd_quota_count); - } - } - spin_unlock(&sdp->sd_quota_spin); - - while (!list_empty(&dead)) { - qd = list_entry(dead.next, struct gfs2_quota_data, qd_list); - list_del(&qd->qd_list); - - gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); - gfs2_assert_warn(sdp, !qd->qd_bh_count); - - gfs2_lvb_unhold(qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, qd); - } -} - void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { struct list_head *head 
= &sdp->sd_quota_list; struct gfs2_quota_data *qd; unsigned int x; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); while (!list_empty(head)) { qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); - if (qd->qd_count > 1 || - (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { - list_move(&qd->qd_list, head); + spin_lock(&sdp->sd_quota_spin); + if (atomic_read(&qd->qd_count) > 1 || + (atomic_read(&qd->qd_count) && + !test_bit(QDF_CHANGE, &qd->qd_flags))) { spin_unlock(&sdp->sd_quota_spin); + list_move(&qd->qd_list, head); + spin_unlock(&qd_lru_lock); schedule(); - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); continue; } + spin_unlock(&sdp->sd_quota_spin); list_del(&qd->qd_list); + /* Also remove if this qd exists in the reclaim list */ + if (!list_empty(&qd->qd_reclaim)) { + list_del_init(&qd->qd_reclaim); + atomic_dec(&qd_lru_count); + } atomic_dec(&sdp->sd_quota_count); - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); - if (!qd->qd_count) { + if (!atomic_read(&qd->qd_count)) { gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); } else @@ -1261,9 +1295,9 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_lvb_unhold(qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, qd); - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); } - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); @@ -1341,9 +1375,6 @@ int gfs2_quotad(void *data) quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, "ad_timeo, &tune->gt_quota_quantum); - /* FIXME: This should be turned into a shrinker */ - gfs2_quota_scan(sdp); - /* Check for & recover partially truncated inodes */ quotad_check_trunc_list(sdp); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index cec9032be97d..0fa5fa63d0e8 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -49,4 +49,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } +extern 
int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); + #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 26c1fa777a95..a58a120dac92 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -373,7 +373,6 @@ TUNE_ATTR(complain_secs, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); TUNE_ATTR(quota_simul_sync, 1); -TUNE_ATTR(quota_cache_secs, 1); TUNE_ATTR(stall_secs, 1); TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); @@ -389,7 +388,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_complain_secs.attr, &tune_attr_statfs_slow.attr, &tune_attr_quota_simul_sync.attr, - &tune_attr_quota_cache_secs.attr, &tune_attr_stall_secs.attr, &tune_attr_statfs_quantum.attr, &tune_attr_recoverd_secs.attr, -- cgit v1.2.2 From 22077f57dec8fcbeb1112b35313961c0902ff038 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 8 Jan 2009 14:28:42 +0000 Subject: GFS2: Remove "double" locking in quota We only really need a single spin lock for the quota data, so lets just use the lru lock for now. 
Signed-off-by: Steven Whitehouse Cc: Abhijith Das --- fs/gfs2/incore.h | 1 - fs/gfs2/ops_fstype.c | 1 - fs/gfs2/quota.c | 40 ++++++++++++++-------------------------- 3 files changed, 14 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 592aa5040d29..a0117d6eb145 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -540,7 +540,6 @@ struct gfs2_sbd { struct list_head sd_quota_list; atomic_t sd_quota_count; - spinlock_t sd_quota_spin; struct mutex sd_quota_mutex; wait_queue_head_t sd_quota_wait; struct list_head sd_trunc_list; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 530d3f6f6ea8..402b6a2cd2c9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -98,7 +98,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mutex_init(&sdp->sd_jindex_mutex); INIT_LIST_HEAD(&sdp->sd_quota_list); - spin_lock_init(&sdp->sd_quota_spin); mutex_init(&sdp->sd_quota_mutex); init_waitqueue_head(&sdp->sd_quota_wait); INIT_LIST_HEAD(&sdp->sd_trunc_list); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 2ada6e10d07b..e8ef0f80fb11 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -104,13 +104,11 @@ int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) /* Free from the filesystem-specific list */ list_del(&qd->qd_list); - spin_lock(&sdp->sd_quota_spin); gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_lvb_unhold(qd->qd_gl); - spin_unlock(&sdp->sd_quota_spin); atomic_dec(&sdp->sd_quota_count); /* Delete it from the common reclaim list */ @@ -249,10 +247,10 @@ static int slot_get(struct gfs2_quota_data *qd) unsigned int c, o = 0, b; unsigned char byte = 0; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); if (qd->qd_slot_count++) { - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); return 0; } @@ -276,13 +274,13 @@ found: sdp->sd_quota_bitmap[c][o] |= 1 << b; - spin_unlock(&sdp->sd_quota_spin); + 
spin_unlock(&qd_lru_lock); return 0; fail: qd->qd_slot_count--; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); return -ENOSPC; } @@ -290,23 +288,23 @@ static void slot_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); gfs2_assert(sdp, qd->qd_slot_count); qd->qd_slot_count++; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); } static void slot_put(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); gfs2_assert(sdp, qd->qd_slot_count); if (!--qd->qd_slot_count) { gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); qd->qd_slot = -1; } - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); } static int bh_get(struct gfs2_quota_data *qd) @@ -382,7 +380,6 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) return 0; spin_lock(&qd_lru_lock); - spin_lock(&sdp->sd_quota_spin); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { if (test_bit(QDF_LOCKED, &qd->qd_flags) || @@ -406,7 +403,6 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) if (!found) qd = NULL; - spin_unlock(&sdp->sd_quota_spin); spin_unlock(&qd_lru_lock); if (qd) { @@ -433,11 +429,9 @@ static int qd_trylock(struct gfs2_quota_data *qd) return 0; spin_lock(&qd_lru_lock); - spin_lock(&sdp->sd_quota_spin); if (test_bit(QDF_LOCKED, &qd->qd_flags) || !test_bit(QDF_CHANGE, &qd->qd_flags)) { - spin_unlock(&sdp->sd_quota_spin); spin_unlock(&qd_lru_lock); return 0; } @@ -451,7 +445,6 @@ static int qd_trylock(struct gfs2_quota_data *qd) gfs2_assert_warn(sdp, qd->qd_slot_count); qd->qd_slot_count++; - spin_unlock(&sdp->sd_quota_spin); spin_unlock(&qd_lru_lock); gfs2_assert_warn(sdp, qd->qd_change_sync); @@ -612,9 +605,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) x = be64_to_cpu(qc->qc_change) + change; qc->qc_change = cpu_to_be64(x); - 
spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); qd->qd_change = x; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); if (!x) { gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); @@ -945,9 +938,9 @@ static int need_sync(struct gfs2_quota_data *qd) if (!qd->qd_qb.qb_limit) return 0; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); value = qd->qd_change; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); spin_lock(>->gt_spin); num = gt->gt_quota_scale_num; @@ -1040,9 +1033,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) continue; value = (s64)be64_to_cpu(qd->qd_qb.qb_value); - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lru_lock); value += qd->qd_change; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lru_lock); if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { print_message(qd, "exceeded"); @@ -1228,9 +1221,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) qd->qd_slot_count = 1; spin_lock(&qd_lru_lock); - spin_lock(&sdp->sd_quota_spin); gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); - spin_unlock(&sdp->sd_quota_spin); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); spin_unlock(&qd_lru_lock); @@ -1263,18 +1254,15 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) while (!list_empty(head)) { qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); - spin_lock(&sdp->sd_quota_spin); if (atomic_read(&qd->qd_count) > 1 || (atomic_read(&qd->qd_count) && !test_bit(QDF_CHANGE, &qd->qd_flags))) { - spin_unlock(&sdp->sd_quota_spin); list_move(&qd->qd_list, head); spin_unlock(&qd_lru_lock); schedule(); spin_lock(&qd_lru_lock); continue; } - spin_unlock(&sdp->sd_quota_spin); list_del(&qd->qd_list); /* Also remove if this qd exists in the reclaim list */ -- cgit v1.2.2 From f057f6cdf64175db1151b1f5d110e29904f119a1 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 12 Jan 2009 10:43:39 +0000 Subject: GFS2: 
Merge lock_dlm module into GFS2 This is the big patch that I've been working on for some time now. There are many reasons for wanting to make this change such as: o Reducing overhead by eliminating duplicated fields between structures o Simplification of the code (reduces the code size by a fair bit) o The locking interface is now the DLM interface itself as proposed some time ago. o Fewer lookups of glocks when processing replies from the DLM o Fewer memory allocations/deallocations for each glock o Scope to do further optimisations in the future (but this patch is more than big enough for now!) Please note that (a) this patch relates to the lock_dlm module and not the DLM itself, that is still a separate module; and (b) that we retain the ability to build GFS2 as a standalone single node filesystem without requiring the DLM. This patch needs a lot of testing, hence my keeping it back until I restarted my -git tree after the last merge window. That way, this has the maximum exposure before it's merged. This is (modulo a few minor bug fixes) the same patch that I've been posting on and off for the last three months and it has passed a number of different tests so far. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/Kconfig | 17 +- fs/gfs2/Makefile | 4 +- fs/gfs2/acl.c | 1 - fs/gfs2/bmap.c | 1 - fs/gfs2/dir.c | 1 - fs/gfs2/eaops.c | 1 - fs/gfs2/eattr.c | 1 - fs/gfs2/glock.c | 249 +++++---------- fs/gfs2/glock.h | 127 +++++++- fs/gfs2/glops.c | 14 - fs/gfs2/incore.h | 59 +++- fs/gfs2/inode.c | 13 +- fs/gfs2/inode.h | 22 +- fs/gfs2/lock_dlm.c | 240 ++++++++++++++ fs/gfs2/locking.c | 314 ------------------ fs/gfs2/locking/dlm/Makefile | 3 - fs/gfs2/locking/dlm/lock.c | 708 ----------------------------------------- fs/gfs2/locking/dlm/lock_dlm.h | 166 ---------- fs/gfs2/locking/dlm/main.c | 48 --- fs/gfs2/locking/dlm/mount.c | 276 ---------------- fs/gfs2/locking/dlm/sysfs.c | 226 ------------- fs/gfs2/locking/dlm/thread.c | 68 ---- fs/gfs2/log.c | 1 - fs/gfs2/lops.c | 1 - fs/gfs2/main.c | 3 - fs/gfs2/meta_io.c | 1 - fs/gfs2/mount.c | 1 - fs/gfs2/ops_address.c | 1 - fs/gfs2/ops_dentry.c | 1 - fs/gfs2/ops_export.c | 1 - fs/gfs2/ops_file.c | 74 ++--- fs/gfs2/ops_fstype.c | 134 +++++--- fs/gfs2/ops_inode.c | 1 - fs/gfs2/ops_super.c | 1 - fs/gfs2/quota.c | 12 +- fs/gfs2/recovery.c | 28 +- fs/gfs2/rgrp.c | 1 - fs/gfs2/super.c | 1 - fs/gfs2/sys.c | 154 ++++++++- fs/gfs2/trans.c | 3 +- fs/gfs2/util.c | 11 +- 41 files changed, 819 insertions(+), 2170 deletions(-) create mode 100644 fs/gfs2/lock_dlm.c delete mode 100644 fs/gfs2/locking.c delete mode 100644 fs/gfs2/locking/dlm/Makefile delete mode 100644 fs/gfs2/locking/dlm/lock.c delete mode 100644 fs/gfs2/locking/dlm/lock_dlm.h delete mode 100644 fs/gfs2/locking/dlm/main.c delete mode 100644 fs/gfs2/locking/dlm/mount.c delete mode 100644 fs/gfs2/locking/dlm/sysfs.c delete mode 100644 fs/gfs2/locking/dlm/thread.c (limited to 'fs') diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index e563a6449811..3a981b7f64ca 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,10 @@ config GFS2_FS tristate "GFS2 file system support" depends on EXPERIMENTAL && (64BIT || LBD) + select DLM if 
GFS2_FS_LOCKING_DLM + select CONFIGFS_FS if GFS2_FS_LOCKING_DLM + select SYSFS if GFS2_FS_LOCKING_DLM + select IP_SCTP if DLM_SCTP select FS_POSIX_ACL select CRC32 help @@ -18,17 +22,16 @@ config GFS2_FS the locking module below. Documentation and utilities for GFS2 can be found here: http://sources.redhat.com/cluster - The "nolock" lock module is now built in to GFS2 by default. + The "nolock" lock module is now built in to GFS2 by default. If + you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 + networking. config GFS2_FS_LOCKING_DLM - tristate "GFS2 DLM locking module" - depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n) - select IP_SCTP if DLM_SCTP - select CONFIGFS_FS - select DLM + bool "GFS2 DLM locking" + depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG help Multiple node locking module for GFS2 - Most users of GFS2 will require this module. It provides the locking + Most users of GFS2 will require this. It provides the locking interface between GFS2 and the DLM, which is required to use GFS2 in a cluster environment. 
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index c1b4ec6a9650..a851ea4bdf70 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,9 +1,9 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ - glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ + glops.o inode.o log.o lops.o main.o meta_io.o \ mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o -obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ +gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index e335dceb6a4f..43764f4fa763 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 11ffc56f1f81..3a5d3f883e10 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index b7c8e5c70791..aef4d0c06748 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -60,7 +60,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c index f114ba2b3557..dee9b03e5b37 100644 --- a/fs/gfs2/eaops.c +++ b/fs/gfs2/eaops.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index 0d1c76d906ae..899763aed217 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6b983aef785d..cd200a564c79 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -18,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -155,13 +153,10 @@ 
static void glock_free(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct inode *aspace = gl->gl_aspace; - if (sdp->sd_lockstruct.ls_ops->lm_put_lock) - sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); - if (aspace) gfs2_aspace_put(aspace); - kmem_cache_free(gfs2_glock_cachep, gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); } /** @@ -211,7 +206,6 @@ int gfs2_glock_put(struct gfs2_glock *gl) atomic_dec(&lru_count); } spin_unlock(&lru_lock); - GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru)); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); glock_free(gl); @@ -255,27 +249,6 @@ static struct gfs2_glock *search_bucket(unsigned int hash, return NULL; } -/** - * gfs2_glock_find() - Find glock by lock number - * @sdp: The GFS2 superblock - * @name: The lock name - * - * Returns: NULL, or the struct gfs2_glock with the requested number - */ - -static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp, - const struct lm_lockname *name) -{ - unsigned int hash = gl_hash(sdp, name); - struct gfs2_glock *gl; - - read_lock(gl_lock_addr(hash)); - gl = search_bucket(hash, sdp, name); - read_unlock(gl_lock_addr(hash)); - - return gl; -} - /** * may_grant - check if its ok to grant a new lock * @gl: The glock @@ -523,7 +496,7 @@ out_locked: } static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state, unsigned int req_state, + unsigned int req_state, unsigned int flags) { int ret = LM_OUT_ERROR; @@ -532,7 +505,7 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, return req_state == LM_ST_UNLOCKED ? 
0 : req_state; if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, + ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, req_state, flags); return ret; } @@ -575,7 +548,7 @@ __acquires(&gl->gl_spin) gl->gl_state == LM_ST_DEFERRED) && !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) lck_flags |= LM_FLAG_TRY_1CB; - ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags); + ret = gfs2_lm_lock(sdp, gl, target, lck_flags); if (!(ret & LM_OUT_ASYNC)) { finish_xmote(gl, ret); @@ -681,18 +654,6 @@ static void glock_work_func(struct work_struct *work) gfs2_glock_put(gl); } -static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, - void **lockp) -{ - int error = -EIO; - if (!sdp->sd_lockstruct.ls_ops->lm_get_lock) - return 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_get_lock( - sdp->sd_lockstruct.ls_lockspace, name, lockp); - return error; -} - /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock @@ -736,6 +697,9 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_hash = hash; gl->gl_ops = glops; + snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number); + memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + gl->gl_lksb.sb_lvbptr = gl->gl_lvb; gl->gl_stamp = jiffies; gl->gl_tchange = jiffies; gl->gl_object = NULL; @@ -753,10 +717,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, } } - error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock); - if (error) - goto fail_aspace; - write_lock(gl_lock_addr(hash)); tmp = search_bucket(hash, sdp, &name); if (tmp) { @@ -772,9 +732,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, return 0; -fail_aspace: - if (gl->gl_aspace) - gfs2_aspace_put(gl->gl_aspace); fail: kmem_cache_free(gfs2_glock_cachep, gl); return error; @@ -966,7 +923,7 @@ do_cancel: 
if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { spin_unlock(&gl->gl_spin); if (sdp->sd_lockstruct.ls_ops->lm_cancel) - sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); + sdp->sd_lockstruct.ls_ops->lm_cancel(gl); spin_lock(&gl->gl_spin); } return; @@ -1240,70 +1197,13 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) gfs2_glock_dq_uninit(&ghs[x]); } -static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { - int error = -EIO; - if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb) - return 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); - return error; -} - -/** - * gfs2_lvb_hold - attach a LVB from a glock - * @gl: The glock in question - * - */ - -int gfs2_lvb_hold(struct gfs2_glock *gl) -{ - int error; - - if (!atomic_read(&gl->gl_lvb_count)) { - error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); - if (error) - return error; - gfs2_glock_hold(gl); - } - atomic_inc(&gl->gl_lvb_count); - - return 0; -} - -/** - * gfs2_lvb_unhold - detach a LVB from a glock - * @gl: The glock in question - * - */ - -void gfs2_lvb_unhold(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - - gfs2_glock_hold(gl); - gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); - if (atomic_dec_and_test(&gl->gl_lvb_count)) { - if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb) - sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); - gl->gl_lvb = NULL; - gfs2_glock_put(gl); - } - gfs2_glock_put(gl); -} - -static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, - unsigned int state) -{ - struct gfs2_glock *gl; unsigned long delay = 0; unsigned long holdtime; unsigned long now = jiffies; - gl = gfs2_glock_find(sdp, name); - if (!gl) - return; - + gfs2_glock_hold(gl); holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; if (time_before(now, holdtime)) delay = holdtime - 
now; @@ -1317,74 +1217,37 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, gfs2_glock_put(gl); } -static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) -{ - struct gfs2_jdesc *jd; - - spin_lock(&sdp->sd_jindex_spin); - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (jd->jd_jid != jid) - continue; - jd->jd_dirty = 1; - break; - } - spin_unlock(&sdp->sd_jindex_spin); -} - /** - * gfs2_glock_cb - Callback used by locking module - * @sdp: Pointer to the superblock - * @type: Type of callback - * @data: Type dependent data pointer + * gfs2_glock_complete - Callback used by locking + * @gl: Pointer to the glock + * @ret: The return value from the dlm * - * Called by the locking module when it wants to tell us something. - * Either we need to drop a lock, one of our ASYNC requests completed, or - * a journal from another client needs to be recovered. */ -void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) +void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { - struct gfs2_sbd *sdp = cb_data; - - switch (type) { - case LM_CB_NEED_E: - blocking_cb(sdp, data, LM_ST_UNLOCKED); - return; - - case LM_CB_NEED_D: - blocking_cb(sdp, data, LM_ST_DEFERRED); - return; - - case LM_CB_NEED_S: - blocking_cb(sdp, data, LM_ST_SHARED); - return; - - case LM_CB_ASYNC: { - struct lm_async_cb *async = data; - struct gfs2_glock *gl; - - down_read(&gfs2_umount_flush_sem); - gl = gfs2_glock_find(sdp, &async->lc_name); - if (gfs2_assert_warn(sdp, gl)) + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + down_read(&gfs2_umount_flush_sem); + gl->gl_reply = ret; + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { + struct gfs2_holder *gh; + spin_lock(&gl->gl_spin); + gh = find_first_waiter(gl); + if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) && + (gl->gl_target != LM_ST_UNLOCKED)) || + ((ret & ~LM_OUT_ST_MASK) != 0)) + set_bit(GLF_FROZEN, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + if 
(test_bit(GLF_FROZEN, &gl->gl_flags)) { + up_read(&gfs2_umount_flush_sem); return; - gl->gl_reply = async->lc_ret; - set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); - up_read(&gfs2_umount_flush_sem); - return; - } - - case LM_CB_NEED_RECOVERY: - gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data); - if (sdp->sd_recoverd_process) - wake_up_process(sdp->sd_recoverd_process); - return; - - default: - gfs2_assert_warn(sdp, 0); - return; + } } + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + up_read(&gfs2_umount_flush_sem); } /** @@ -1515,6 +1378,27 @@ out: return has_entries; } + +/** + * thaw_glock - thaw out a glock which has an unprocessed reply waiting + * @gl: The glock to thaw + * + * N.B. When we freeze a glock, we leave a ref to the glock outstanding, + * so this has to result in the ref count being dropped by one. 
+ */ + +static void thaw_glock(struct gfs2_glock *gl) +{ + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + return; + down_read(&gfs2_umount_flush_sem); + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + up_read(&gfs2_umount_flush_sem); +} + /** * clear_glock - look at a glock and see if we can free it from glock cache * @gl: the glock to look at @@ -1539,6 +1423,20 @@ static void clear_glock(struct gfs2_glock *gl) gfs2_glock_put(gl); } +/** + * gfs2_glock_thaw - Thaw any frozen glocks + * @sdp: The super block + * + */ + +void gfs2_glock_thaw(struct gfs2_sbd *sdp) +{ + unsigned x; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(thaw_glock, sdp, x); +} + /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem @@ -1619,7 +1517,7 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) if (flags & LM_FLAG_NOEXP) *p++ = 'e'; if (flags & LM_FLAG_ANY) - *p++ = 'a'; + *p++ = 'A'; if (flags & LM_FLAG_PRIORITY) *p++ = 'p'; if (flags & GL_ASYNC) @@ -1683,6 +1581,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) *p++ = 'i'; if (test_bit(GLF_REPLY_PENDING, gflags)) *p++ = 'r'; + if (test_bit(GLF_INITIAL, gflags)) + *p++ = 'i'; + if (test_bit(GLF_FROZEN, gflags)) + *p++ = 'F'; *p = 0; return buf; } @@ -1717,14 +1619,13 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) dtime *= 1000000/HZ; /* demote time in uSec */ if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; - gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n", + gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n", state2str(gl->gl_state), gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, gflags2str(gflags_buf, &gl->gl_flags), state2str(gl->gl_target), state2str(gl->gl_demote_state), dtime, - atomic_read(&gl->gl_lvb_count), 
atomic_read(&gl->gl_ail_count), atomic_read(&gl->gl_ref)); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 543ec7ecfbda..a602a28f6f08 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -11,15 +11,130 @@ #define __GLOCK_DOT_H__ #include +#include #include "incore.h" -/* Flags for lock requests; used in gfs2_holder gh_flag field. - From lm_interface.h: +/* Options for hostdata parser */ + +enum { + Opt_jid, + Opt_id, + Opt_first, + Opt_nodir, + Opt_err, +}; + +/* + * lm_lockname types + */ + +#define LM_TYPE_RESERVED 0x00 +#define LM_TYPE_NONDISK 0x01 +#define LM_TYPE_INODE 0x02 +#define LM_TYPE_RGRP 0x03 +#define LM_TYPE_META 0x04 +#define LM_TYPE_IOPEN 0x05 +#define LM_TYPE_FLOCK 0x06 +#define LM_TYPE_PLOCK 0x07 +#define LM_TYPE_QUOTA 0x08 +#define LM_TYPE_JOURNAL 0x09 + +/* + * lm_lock() states + * + * SHARED is compatible with SHARED, not with DEFERRED or EX. + * DEFERRED is compatible with DEFERRED, not with SHARED or EX. + */ + +#define LM_ST_UNLOCKED 0 +#define LM_ST_EXCLUSIVE 1 +#define LM_ST_DEFERRED 2 +#define LM_ST_SHARED 3 + +/* + * lm_lock() flags + * + * LM_FLAG_TRY + * Don't wait to acquire the lock if it can't be granted immediately. + * + * LM_FLAG_TRY_1CB + * Send one blocking callback if TRY is set and the lock is not granted. + * + * LM_FLAG_NOEXP + * GFS sets this flag on lock requests it makes while doing journal recovery. + * These special requests should not be blocked due to the recovery like + * ordinary locks would be. + * + * LM_FLAG_ANY + * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may + * also be granted in SHARED. The preferred state is whichever is compatible + * with other granted locks, or the specified state if no other locks exist. + * + * LM_FLAG_PRIORITY + * Override fairness considerations. Suppose a lock is held in a shared state + * and there is a pending request for the deferred state. 
A shared lock + * request with the priority flag would be allowed to bypass the deferred + * request and directly join the other shared lock. A shared lock request + * without the priority flag might be forced to wait until the deferred + * request had acquired and released the lock. + */ + #define LM_FLAG_TRY 0x00000001 #define LM_FLAG_TRY_1CB 0x00000002 #define LM_FLAG_NOEXP 0x00000004 #define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 */ +#define LM_FLAG_PRIORITY 0x00000010 +#define GL_ASYNC 0x00000040 +#define GL_EXACT 0x00000080 +#define GL_SKIP 0x00000100 +#define GL_ATIME 0x00000200 +#define GL_NOCACHE 0x00000400 + +/* + * lm_lock() and lm_async_cb return flags + * + * LM_OUT_ST_MASK + * Masks the lower two bits of lock state in the returned value. + * + * LM_OUT_CANCELED + * The lock request was canceled. + * + * LM_OUT_ASYNC + * The result of the request will be returned in an LM_CB_ASYNC callback. + * + */ + +#define LM_OUT_ST_MASK 0x00000003 +#define LM_OUT_CANCELED 0x00000008 +#define LM_OUT_ASYNC 0x00000080 +#define LM_OUT_ERROR 0x00000100 + +/* + * lm_recovery_done() messages + */ + +#define LM_RD_GAVEUP 308 +#define LM_RD_SUCCESS 309 + +#define GLR_TRYFAILED 13 + +struct lm_lockops { + const char *lm_proto_name; + int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); + void (*lm_unmount) (struct gfs2_sbd *sdp); + void (*lm_withdraw) (struct gfs2_sbd *sdp); + void (*lm_put_lock) (struct kmem_cache *cachep, void *gl); + unsigned int (*lm_lock) (struct gfs2_glock *gl, + unsigned int req_state, unsigned int flags); + void (*lm_cancel) (struct gfs2_glock *gl); + const match_table_t *lm_tokens; +}; + +#define LM_FLAG_TRY 0x00000001 +#define LM_FLAG_TRY_1CB 0x00000002 +#define LM_FLAG_NOEXP 0x00000004 +#define LM_FLAG_ANY 0x00000008 +#define LM_FLAG_PRIORITY 0x00000010 #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 @@ -128,10 +243,12 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, int gfs2_lvb_hold(struct
gfs2_glock *gl); void gfs2_lvb_unhold(struct gfs2_glock *gl); -void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +void gfs2_glock_complete(struct gfs2_glock *gl, int ret); void gfs2_reclaim_glock(struct gfs2_sbd *sdp); void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); void gfs2_glock_finish_truncate(struct gfs2_inode *ip); +void gfs2_glock_thaw(struct gfs2_sbd *sdp); int __init gfs2_glock_init(void); void gfs2_glock_exit(void); @@ -141,4 +258,6 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); int gfs2_register_debugfs(void); void gfs2_unregister_debugfs(void); +extern const struct lm_lockops gfs2_dlm_ops; + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 8522d3aa64fc..f07ede8cb9ba 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include "gfs2.h" @@ -390,18 +389,6 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) return 0; } -/** - * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock - * @gl: the glock - * - * Returns: 1 if it's ok - */ - -static int quota_go_demote_ok(const struct gfs2_glock *gl) -{ - return !atomic_read(&gl->gl_lvb_count); -} - const struct gfs2_glock_operations gfs2_meta_glops = { .go_xmote_th = meta_go_sync, .go_type = LM_TYPE_META, @@ -448,7 +435,6 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = { }; const struct gfs2_glock_operations gfs2_quota_glops = { - .go_demote_ok = quota_go_demote_ok, .go_type = LM_TYPE_QUOTA, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a0117d6eb145..0af7c24de6a1 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -12,6 +12,8 @@ #include #include +#include +#include #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -26,6 +28,7 @@ struct gfs2_trans; struct gfs2_ail; struct gfs2_jdesc; struct gfs2_sbd; +struct lm_lockops; typedef void (*gfs2_glop_bh_t) (struct 
gfs2_glock *gl, unsigned int ret); @@ -121,6 +124,28 @@ struct gfs2_bufdata { struct list_head bd_ail_gl_list; }; +/* + * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a + * prefix of lock_dlm_ gets awkward. + */ + +#define GDLM_STRNAME_BYTES 25 +#define GDLM_LVB_SIZE 32 + +enum { + DFL_BLOCK_LOCKS = 0, +}; + +struct lm_lockname { + u64 ln_number; + unsigned int ln_type; +}; + +#define lm_name_equal(name1, name2) \ + (((name1)->ln_number == (name2)->ln_number) && \ + ((name1)->ln_type == (name2)->ln_type)) + + struct gfs2_glock_operations { void (*go_xmote_th) (struct gfs2_glock *gl); int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); @@ -162,6 +187,8 @@ enum { GLF_LFLUSH = 7, GLF_INVALIDATE_IN_PROGRESS = 8, GLF_REPLY_PENDING = 9, + GLF_INITIAL = 10, + GLF_FROZEN = 11, }; struct gfs2_glock { @@ -181,10 +208,9 @@ struct gfs2_glock { struct list_head gl_holders; const struct gfs2_glock_operations *gl_ops; - void *gl_lock; - char *gl_lvb; - atomic_t gl_lvb_count; - + char gl_strname[GDLM_STRNAME_BYTES]; + struct dlm_lksb gl_lksb; + char gl_lvb[32]; unsigned long gl_stamp; unsigned long gl_tchange; void *gl_object; @@ -447,6 +473,30 @@ struct gfs2_sb_host { char sb_locktable[GFS2_LOCKNAME_LEN]; }; +/* + * lm_mount() return values + * + * ls_jid - the journal ID this node should use + * ls_first - this node is the first to mount the file system + * ls_lockspace - lock module's context for this file system + * ls_ops - lock module's functions + */ + +struct lm_lockstruct { + u32 ls_id; + unsigned int ls_jid; + unsigned int ls_first; + unsigned int ls_first_done; + unsigned int ls_nodir; + const struct lm_lockops *ls_ops; + unsigned long ls_flags; + dlm_lockspace_t *ls_dlm; + + int ls_recover_jid; + int ls_recover_jid_done; + int ls_recover_jid_status; +}; + struct gfs2_sbd { struct super_block *sd_vfs; struct kobject sd_kobj; @@ -520,7 +570,6 @@ struct gfs2_sbd { spinlock_t sd_jindex_spin; struct mutex sd_jindex_mutex; unsigned 
int sd_journals; - unsigned long sd_jindex_refresh_time; struct gfs2_jdesc *sd_jdesc; struct gfs2_holder sd_journal_gh; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3b87c188da41..7b277d449155 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -137,16 +136,16 @@ void gfs2_set_iop(struct inode *inode) if (S_ISREG(mode)) { inode->i_op = &gfs2_file_iops; - if (sdp->sd_args.ar_localflocks) - inode->i_fop = &gfs2_file_fops_nolock; + if (gfs2_localflocks(sdp)) + inode->i_fop = gfs2_file_fops_nolock; else - inode->i_fop = &gfs2_file_fops; + inode->i_fop = gfs2_file_fops; } else if (S_ISDIR(mode)) { inode->i_op = &gfs2_dir_iops; - if (sdp->sd_args.ar_localflocks) - inode->i_fop = &gfs2_dir_fops_nolock; + if (gfs2_localflocks(sdp)) + inode->i_fop = gfs2_dir_fops_nolock; else - inode->i_fop = &gfs2_dir_fops; + inode->i_fop = gfs2_dir_fops; } else if (S_ISLNK(mode)) { inode->i_op = &gfs2_symlink_iops; } else { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index d5329364cdff..dca4fee3078b 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -101,12 +101,26 @@ void gfs2_dinode_print(const struct gfs2_inode *ip); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; extern const struct inode_operations gfs2_symlink_iops; -extern const struct file_operations gfs2_file_fops; -extern const struct file_operations gfs2_dir_fops; -extern const struct file_operations gfs2_file_fops_nolock; -extern const struct file_operations gfs2_dir_fops_nolock; +extern const struct file_operations *gfs2_file_fops_nolock; +extern const struct file_operations *gfs2_dir_fops_nolock; extern void gfs2_set_inode_flags(struct inode *inode); + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM +extern const struct file_operations *gfs2_file_fops; +extern const struct file_operations *gfs2_dir_fops; +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return 
sdp->sd_args.ar_localflocks; +} +#else /* Single node only */ +#define gfs2_file_fops NULL +#define gfs2_dir_fops NULL +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return 1; +} +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ #endif /* __INODE_DOT_H__ */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c new file mode 100644 index 000000000000..a0bb7d2251a0 --- /dev/null +++ b/fs/gfs2/lock_dlm.c @@ -0,0 +1,240 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include + +#include "incore.h" +#include "glock.h" +#include "util.h" + + +static void gdlm_ast(void *arg) +{ + struct gfs2_glock *gl = arg; + unsigned ret = gl->gl_state; + + BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); + + if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) + memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); + + switch (gl->gl_lksb.sb_status) { + case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ + kmem_cache_free(gfs2_glock_cachep, gl); + return; + case -DLM_ECANCEL: /* Cancel while getting lock */ + ret |= LM_OUT_CANCELED; + goto out; + case -EAGAIN: /* Try lock fails */ + goto out; + case -EINVAL: /* Invalid */ + case -ENOMEM: /* Out of memory */ + ret |= LM_OUT_ERROR; + goto out; + case 0: /* Success */ + break; + default: /* Something unexpected */ + BUG(); + } + + ret = gl->gl_target; + if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { + if (gl->gl_target == LM_ST_SHARED) + ret = LM_ST_DEFERRED; + else if (gl->gl_target == LM_ST_DEFERRED) + ret = LM_ST_SHARED; + else + BUG(); + } + + set_bit(GLF_INITIAL, &gl->gl_flags); + gfs2_glock_complete(gl, ret); + return; +out: + if (!test_bit(GLF_INITIAL, &gl->gl_flags)) + gl->gl_lksb.sb_lkid = 0; + 
gfs2_glock_complete(gl, ret); +} + +static void gdlm_bast(void *arg, int mode) +{ + struct gfs2_glock *gl = arg; + + switch (mode) { + case DLM_LOCK_EX: + gfs2_glock_cb(gl, LM_ST_UNLOCKED); + break; + case DLM_LOCK_CW: + gfs2_glock_cb(gl, LM_ST_DEFERRED); + break; + case DLM_LOCK_PR: + gfs2_glock_cb(gl, LM_ST_SHARED); + break; + default: + printk(KERN_ERR "unknown bast mode %d", mode); + BUG(); + } +} + +/* convert gfs lock-state to dlm lock-mode */ + +static int make_mode(const unsigned int lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + } + printk(KERN_ERR "unknown LM state %d", lmstate); + BUG(); + return -1; +} + +static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, + const int req) +{ + u32 lkf = 0; + + if (gfs_flags & LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags & LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if (gfs_flags & LM_FLAG_PRIORITY) { + lkf |= DLM_LKF_NOORDER; + lkf |= DLM_LKF_HEADQUE; + } + + if (gfs_flags & LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf |= DLM_LKF_ALTCW; + else if (req == DLM_LOCK_CW) + lkf |= DLM_LKF_ALTPR; + else + BUG(); + } + + if (lkid != 0) + lkf |= DLM_LKF_CONVERT; + + lkf |= DLM_LKF_VALBLK; + + return lkf; +} + +static unsigned int gdlm_lock(struct gfs2_glock *gl, + unsigned int req_state, unsigned int flags) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + int error; + int req; + u32 lkf; + + req = make_mode(req_state); + lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); + + /* + * Submit the actual lock request. 
+ */ + + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + if (error == -EAGAIN) + return 0; + if (error) + return LM_OUT_ERROR; + return LM_OUT_ASYNC; +} + +static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) +{ + struct gfs2_glock *gl = ptr; + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + int error; + + if (gl->gl_lksb.sb_lkid == 0) { + kmem_cache_free(cachep, gl); + return; + } + + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, + NULL, gl); + if (error) { + printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, error); + return; + } +} + +static void gdlm_cancel(struct gfs2_glock *gl) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); +} + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (fsname == NULL) { + fs_info(sdp, "no fsname found\n"); + return -EINVAL; + } + + error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm, + DLM_LSFL_FS | DLM_LSFL_NEWEXCL | + (ls->ls_nodir ? 
DLM_LSFL_NODIR : 0), + GDLM_LVB_SIZE); + if (error) + printk(KERN_ERR "dlm_new_lockspace error %d", error); + + return error; +} + +static void gdlm_unmount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (ls->ls_dlm) { + dlm_release_lockspace(ls->ls_dlm, 2); + ls->ls_dlm = NULL; + } +} + +static const match_table_t dlm_tokens = { + { Opt_jid, "jid=%d"}, + { Opt_id, "id=%d"}, + { Opt_first, "first=%d"}, + { Opt_nodir, "nodir=%d"}, + { Opt_err, NULL }, +}; + +const struct lm_lockops gfs2_dlm_ops = { + .lm_proto_name = "lock_dlm", + .lm_mount = gdlm_mount, + .lm_unmount = gdlm_unmount, + .lm_put_lock = gdlm_put_lock, + .lm_lock = gdlm_lock, + .lm_cancel = gdlm_cancel, + .lm_tokens = &dlm_tokens, +}; + diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c deleted file mode 100644 index d3657bc7938a..000000000000 --- a/fs/gfs2/locking.c +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct lmh_wrapper { - struct list_head lw_list; - const struct lm_lockops *lw_ops; -}; - -struct nolock_lockspace { - unsigned int nl_lvb_size; -}; - -/** - * nolock_get_lock - get a lm_lock_t given a descripton of the lock - * @lockspace: the lockspace the lock lives in - * @name: the name of the lock - * @lockp: return the lm_lock_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_get_lock(void *lockspace, struct lm_lockname *name, - void **lockp) -{ - *lockp = lockspace; - return 0; -} - -/** - * nolock_put_lock - get rid of a lock structure - * @lock: the lock to throw away - * - */ - -static void nolock_put_lock(void *lock) -{ -} - -/** - * nolock_hold_lvb - hold on to a lock value block - * @lock: the lock the LVB is associated with - * @lvbp: return the lm_lvb_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_hold_lvb(void *lock, char **lvbp) -{ - struct nolock_lockspace *nl = lock; - int error = 0; - - *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL); - if (!*lvbp) - error = -ENOMEM; - - return error; -} - -/** - * nolock_unhold_lvb - release a LVB - * @lock: the lock the LVB is associated with - * @lvb: the lock value block - * - */ - -static void nolock_unhold_lvb(void *lock, char *lvb) -{ - kfree(lvb); -} - -static int nolock_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj); -static void nolock_unmount(void *lockspace); - -/* List of registered low-level locking protocols. A file system selects one - of them by name at mount time, e.g. lock_nolock, lock_dlm. 
*/ - -static const struct lm_lockops nolock_ops = { - .lm_proto_name = "lock_nolock", - .lm_mount = nolock_mount, - .lm_unmount = nolock_unmount, - .lm_get_lock = nolock_get_lock, - .lm_put_lock = nolock_put_lock, - .lm_hold_lvb = nolock_hold_lvb, - .lm_unhold_lvb = nolock_unhold_lvb, -}; - -static struct lmh_wrapper nolock_proto = { - .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list), - .lw_ops = &nolock_ops, -}; - -static LIST_HEAD(lmh_list); -static DEFINE_MUTEX(lmh_lock); - -static int nolock_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - char *c; - unsigned int jid; - struct nolock_lockspace *nl; - - c = strstr(host_data, "jid="); - if (!c) - jid = 0; - else { - c += 4; - sscanf(c, "%u", &jid); - } - - nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); - if (!nl) - return -ENOMEM; - - nl->nl_lvb_size = min_lvb_size; - - lockstruct->ls_jid = jid; - lockstruct->ls_first = 1; - lockstruct->ls_lvb_size = min_lvb_size; - lockstruct->ls_lockspace = nl; - lockstruct->ls_ops = &nolock_ops; - lockstruct->ls_flags = LM_LSFLAG_LOCAL; - - return 0; -} - -static void nolock_unmount(void *lockspace) -{ - struct nolock_lockspace *nl = lockspace; - kfree(nl); -} - -/** - * gfs2_register_lockproto - Register a low-level locking protocol - * @proto: the protocol definition - * - * Returns: 0 on success, -EXXX on failure - */ - -int gfs2_register_lockproto(const struct lm_lockops *proto) -{ - struct lmh_wrapper *lw; - - mutex_lock(&lmh_lock); - - list_for_each_entry(lw, &lmh_list, lw_list) { - if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { - mutex_unlock(&lmh_lock); - printk(KERN_INFO "GFS2: protocol %s already exists\n", - proto->lm_proto_name); - return -EEXIST; - } - } - - lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL); - if (!lw) { - mutex_unlock(&lmh_lock); - return -ENOMEM; - } - - lw->lw_ops = proto; - 
list_add(&lw->lw_list, &lmh_list); - - mutex_unlock(&lmh_lock); - - return 0; -} - -/** - * gfs2_unregister_lockproto - Unregister a low-level locking protocol - * @proto: the protocol definition - * - */ - -void gfs2_unregister_lockproto(const struct lm_lockops *proto) -{ - struct lmh_wrapper *lw; - - mutex_lock(&lmh_lock); - - list_for_each_entry(lw, &lmh_list, lw_list) { - if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { - list_del(&lw->lw_list); - mutex_unlock(&lmh_lock); - kfree(lw); - return; - } - } - - mutex_unlock(&lmh_lock); - - printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n", - proto->lm_proto_name); -} - -/** - * gfs2_mount_lockproto - Mount a lock protocol - * @proto_name - the name of the protocol - * @table_name - the name of the lock space - * @host_data - data specific to this host - * @cb - the callback to the code using the lock module - * @sdp - The GFS2 superblock - * @min_lvb_size - the mininum LVB size that the caller can deal with - * @flags - LM_MFLAG_* - * @lockstruct - a structure returned describing the mount - * - * Returns: 0 on success, -EXXX on failure - */ - -int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - struct lmh_wrapper *lw = NULL; - int try = 0; - int error, found; - - -retry: - mutex_lock(&lmh_lock); - - if (list_empty(&nolock_proto.lw_list)) - list_add(&nolock_proto.lw_list, &lmh_list); - - found = 0; - list_for_each_entry(lw, &lmh_list, lw_list) { - if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { - found = 1; - break; - } - } - - if (!found) { - if (!try && capable(CAP_SYS_MODULE)) { - try = 1; - mutex_unlock(&lmh_lock); - request_module(proto_name); - goto retry; - } - printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name); - error = -ENOENT; - goto out; - } - - if (lw->lw_ops->lm_owner && - 
!try_module_get(lw->lw_ops->lm_owner)) { - try = 0; - mutex_unlock(&lmh_lock); - msleep(1000); - goto retry; - } - - error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data, - min_lvb_size, flags, lockstruct, fskobj); - if (error) - module_put(lw->lw_ops->lm_owner); -out: - mutex_unlock(&lmh_lock); - return error; -} - -void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) -{ - mutex_lock(&lmh_lock); - if (lockstruct->ls_ops->lm_unmount) - lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); - if (lockstruct->ls_ops->lm_owner) - module_put(lockstruct->ls_ops->lm_owner); - mutex_unlock(&lmh_lock); -} - -/** - * gfs2_withdraw_lockproto - abnormally unmount a lock module - * @lockstruct: the lockstruct passed into mount - * - */ - -void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct) -{ - mutex_lock(&lmh_lock); - lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace); - if (lockstruct->ls_ops->lm_owner) - module_put(lockstruct->ls_ops->lm_owner); - mutex_unlock(&lmh_lock); -} - -EXPORT_SYMBOL_GPL(gfs2_register_lockproto); -EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto); - diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile deleted file mode 100644 index 2609bb6cd013..000000000000 --- a/fs/gfs2/locking/dlm/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o -lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o - diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c deleted file mode 100644 index 2482c9047505..000000000000 --- a/fs/gfs2/locking/dlm/lock.c +++ /dev/null @@ -1,708 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
- */ - -#include "lock_dlm.h" - -static char junk_lvb[GDLM_LVB_SIZE]; - - -/* convert dlm lock-mode to gfs lock-state */ - -static s16 gdlm_make_lmstate(s16 dlmmode) -{ - switch (dlmmode) { - case DLM_LOCK_IV: - case DLM_LOCK_NL: - return LM_ST_UNLOCKED; - case DLM_LOCK_EX: - return LM_ST_EXCLUSIVE; - case DLM_LOCK_CW: - return LM_ST_DEFERRED; - case DLM_LOCK_PR: - return LM_ST_SHARED; - } - gdlm_assert(0, "unknown DLM mode %d", dlmmode); - return -1; -} - -/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm - thread gets to it. */ - -static void queue_submit(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - list_add_tail(&lp->delay_list, &ls->submit); - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - -static void wake_up_ast(struct gdlm_lock *lp) -{ - clear_bit(LFL_AST_WAIT, &lp->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&lp->flags, LFL_AST_WAIT); -} - -static void gdlm_delete_lp(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - if (!list_empty(&lp->delay_list)) - list_del_init(&lp->delay_list); - ls->all_locks_count--; - spin_unlock(&ls->async_lock); - - kfree(lp); -} - -static void gdlm_queue_delayed(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - list_add_tail(&lp->delay_list, &ls->delayed); - spin_unlock(&ls->async_lock); -} - -static void process_complete(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - struct lm_async_cb acb; - - memset(&acb, 0, sizeof(acb)); - - if (lp->lksb.sb_status == -DLM_ECANCEL) { - log_info("complete dlm cancel %x,%llx flags %lx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { - if (lp->lksb.sb_status != -DLM_EUNLOCK) { - log_info("unlock 
sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - return; - } - - lp->cur = DLM_LOCK_IV; - lp->req = DLM_LOCK_IV; - lp->lksb.sb_lkid = 0; - - if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { - gdlm_delete_lp(lp); - return; - } - goto out; - } - - if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) - memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); - - if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { - if (lp->req == DLM_LOCK_PR) - lp->req = DLM_LOCK_CW; - else if (lp->req == DLM_LOCK_CW) - lp->req = DLM_LOCK_PR; - } - - /* - * A canceled lock request. The lock was just taken off the delayed - * list and was never even submitted to dlm. - */ - - if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { - log_info("complete internal cancel %x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - goto out; - } - - /* - * An error occured. - */ - - if (lp->lksb.sb_status) { - /* a "normal" error */ - if ((lp->lksb.sb_status == -EAGAIN) && - (lp->lkf & DLM_LKF_NOQUEUE)) { - lp->req = lp->cur; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - /* this could only happen with cancels I think */ - log_info("ast sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - return; - } - - /* - * This is an AST for an EX->EX conversion for sync_lvb from GFS. - */ - - if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { - wake_up_ast(lp); - return; - } - - /* - * A lock has been demoted to NL because it initially completed during - * BLOCK_LOCKS. Now it must be requested in the originally requested - * mode. 
- */ - - if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { - gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - - lp->cur = DLM_LOCK_NL; - lp->req = lp->prev_req; - lp->prev_req = DLM_LOCK_IV; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - set_bit(LFL_NOCACHE, &lp->flags); - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags)) - gdlm_queue_delayed(lp); - else - queue_submit(lp); - return; - } - - /* - * A request is granted during dlm recovery. It may be granted - * because the locks of a failed node were cleared. In that case, - * there may be inconsistent data beneath this lock and we must wait - * for recovery to complete to use it. When gfs recovery is done this - * granted lock will be converted to NL and then reacquired in this - * granted state. - */ - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags) && - lp->req != DLM_LOCK_NL) { - - lp->cur = lp->req; - lp->prev_req = lp->req; - lp->req = DLM_LOCK_NL; - lp->lkf |= DLM_LKF_CONVERT; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - log_debug("rereq %x,%llx id %x %d,%d", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->lksb.sb_lkid, lp->cur, lp->req); - - set_bit(LFL_REREQUEST, &lp->flags); - queue_submit(lp); - return; - } - - /* - * DLM demoted the lock to NL before it was granted so GFS must be - * told it cannot cache data for this lock. - */ - - if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) - set_bit(LFL_NOCACHE, &lp->flags); - -out: - /* - * This is an internal lock_dlm lock - */ - - if (test_bit(LFL_INLOCK, &lp->flags)) { - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - wake_up_ast(lp); - return; - } - - /* - * Normal completion of a lock request. Tell GFS it now has the lock. 
- */ - - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - - acb.lc_name = lp->lockname; - acb.lc_ret |= gdlm_make_lmstate(lp->cur); - - ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); -} - -static void gdlm_ast(void *astarg) -{ - struct gdlm_lock *lp = astarg; - clear_bit(LFL_ACTIVE, &lp->flags); - process_complete(lp); -} - -static void process_blocking(struct gdlm_lock *lp, int bast_mode) -{ - struct gdlm_ls *ls = lp->ls; - unsigned int cb = 0; - - switch (gdlm_make_lmstate(bast_mode)) { - case LM_ST_EXCLUSIVE: - cb = LM_CB_NEED_E; - break; - case LM_ST_DEFERRED: - cb = LM_CB_NEED_D; - break; - case LM_ST_SHARED: - cb = LM_CB_NEED_S; - break; - default: - gdlm_assert(0, "unknown bast mode %u", bast_mode); - } - - ls->fscb(ls->sdp, cb, &lp->lockname); -} - - -static void gdlm_bast(void *astarg, int mode) -{ - struct gdlm_lock *lp = astarg; - - if (!mode) { - printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - return; - } - - process_blocking(lp, mode); -} - -/* convert gfs lock-state to dlm lock-mode */ - -static s16 make_mode(s16 lmstate) -{ - switch (lmstate) { - case LM_ST_UNLOCKED: - return DLM_LOCK_NL; - case LM_ST_EXCLUSIVE: - return DLM_LOCK_EX; - case LM_ST_DEFERRED: - return DLM_LOCK_CW; - case LM_ST_SHARED: - return DLM_LOCK_PR; - } - gdlm_assert(0, "unknown LM state %d", lmstate); - return -1; -} - - -/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and - DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. 
*/ - -static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state) -{ - s16 cur = make_mode(cur_state); - if (lp->cur != DLM_LOCK_IV) - gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur); -} - -static inline unsigned int make_flags(struct gdlm_lock *lp, - unsigned int gfs_flags, - s16 cur, s16 req) -{ - unsigned int lkf = 0; - - if (gfs_flags & LM_FLAG_TRY) - lkf |= DLM_LKF_NOQUEUE; - - if (gfs_flags & LM_FLAG_TRY_1CB) { - lkf |= DLM_LKF_NOQUEUE; - lkf |= DLM_LKF_NOQUEUEBAST; - } - - if (gfs_flags & LM_FLAG_PRIORITY) { - lkf |= DLM_LKF_NOORDER; - lkf |= DLM_LKF_HEADQUE; - } - - if (gfs_flags & LM_FLAG_ANY) { - if (req == DLM_LOCK_PR) - lkf |= DLM_LKF_ALTCW; - else if (req == DLM_LOCK_CW) - lkf |= DLM_LKF_ALTPR; - } - - if (lp->lksb.sb_lkid != 0) { - lkf |= DLM_LKF_CONVERT; - } - - if (lp->lvb) - lkf |= DLM_LKF_VALBLK; - - return lkf; -} - -/* make_strname - convert GFS lock numbers to a string */ - -static inline void make_strname(const struct lm_lockname *lockname, - struct gdlm_strname *str) -{ - sprintf(str->name, "%8x%16llx", lockname->ln_type, - (unsigned long long)lockname->ln_number); - str->namelen = GDLM_STRNAME_BYTES; -} - -static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, - struct gdlm_lock **lpp) -{ - struct gdlm_lock *lp; - - lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS); - if (!lp) - return -ENOMEM; - - lp->lockname = *name; - make_strname(name, &lp->strname); - lp->ls = ls; - lp->cur = DLM_LOCK_IV; - INIT_LIST_HEAD(&lp->delay_list); - - spin_lock(&ls->async_lock); - ls->all_locks_count++; - spin_unlock(&ls->async_lock); - - *lpp = lp; - return 0; -} - -int gdlm_get_lock(void *lockspace, struct lm_lockname *name, - void **lockp) -{ - struct gdlm_lock *lp; - int error; - - error = gdlm_create_lp(lockspace, name, &lp); - - *lockp = lp; - return error; -} - -void gdlm_put_lock(void *lock) -{ - gdlm_delete_lp(lock); -} - -unsigned int gdlm_do_lock(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - int error, 
bast = 1; - - /* - * When recovery is in progress, delay lock requests for submission - * once recovery is done. Requests for recovery (NOEXP) and unlocks - * can pass. - */ - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) { - gdlm_queue_delayed(lp); - return LM_OUT_ASYNC; - } - - /* - * Submit the actual lock request. - */ - - if (test_bit(LFL_NOBAST, &lp->flags)) - bast = 0; - - set_bit(LFL_ACTIVE, &lp->flags); - - log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid, - lp->cur, lp->req, lp->lkf); - - error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf, - lp->strname.name, lp->strname.namelen, 0, gdlm_ast, - lp, bast ? gdlm_bast : NULL); - - if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { - lp->lksb.sb_status = -EAGAIN; - gdlm_ast(lp); - error = 0; - } - - if (error) { - log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x " - "flags=%lx", ls->fsname, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, error, - lp->cur, lp->req, lp->lkf, lp->flags); - return LM_OUT_ERROR; - } - return LM_OUT_ASYNC; -} - -static unsigned int gdlm_do_unlock(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - unsigned int lkf = 0; - int error; - - set_bit(LFL_DLM_UNLOCK, &lp->flags); - set_bit(LFL_ACTIVE, &lp->flags); - - if (lp->lvb) - lkf = DLM_LKF_VALBLK; - - log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->lksb.sb_lkid, lp->cur, lkf); - - error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp); - - if (error) { - log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x " - "flags=%lx", ls->fsname, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, error, - lp->cur, lp->req, lp->lkf, lp->flags); - return LM_OUT_ERROR; - } - return LM_OUT_ASYNC; -} - -unsigned int gdlm_lock(void *lock, unsigned int 
cur_state, - unsigned int req_state, unsigned int flags) -{ - struct gdlm_lock *lp = lock; - - if (req_state == LM_ST_UNLOCKED) - return gdlm_unlock(lock, cur_state); - - if (req_state == LM_ST_UNLOCKED) - return gdlm_unlock(lock, cur_state); - - clear_bit(LFL_DLM_CANCEL, &lp->flags); - if (flags & LM_FLAG_NOEXP) - set_bit(LFL_NOBLOCK, &lp->flags); - - check_cur_state(lp, cur_state); - lp->req = make_mode(req_state); - lp->lkf = make_flags(lp, flags, lp->cur, lp->req); - - return gdlm_do_lock(lp); -} - -unsigned int gdlm_unlock(void *lock, unsigned int cur_state) -{ - struct gdlm_lock *lp = lock; - - clear_bit(LFL_DLM_CANCEL, &lp->flags); - if (lp->cur == DLM_LOCK_IV) - return 0; - return gdlm_do_unlock(lp); -} - -void gdlm_cancel(void *lock) -{ - struct gdlm_lock *lp = lock; - struct gdlm_ls *ls = lp->ls; - int error, delay_list = 0; - - if (test_bit(LFL_DLM_CANCEL, &lp->flags)) - return; - - log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->flags); - - spin_lock(&ls->async_lock); - if (!list_empty(&lp->delay_list)) { - list_del_init(&lp->delay_list); - delay_list = 1; - } - spin_unlock(&ls->async_lock); - - if (delay_list) { - set_bit(LFL_CANCEL, &lp->flags); - set_bit(LFL_ACTIVE, &lp->flags); - gdlm_ast(lp); - return; - } - - if (!test_bit(LFL_ACTIVE, &lp->flags) || - test_bit(LFL_DLM_UNLOCK, &lp->flags)) { - log_info("gdlm_cancel skip %x,%llx flags %lx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->flags); - return; - } - - /* the lock is blocked in the dlm */ - - set_bit(LFL_DLM_CANCEL, &lp->flags); - set_bit(LFL_ACTIVE, &lp->flags); - - error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL, - NULL, lp); - - log_info("gdlm_cancel rv %d %x,%llx flags %lx", error, - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->flags); - - if (error == -EBUSY) - clear_bit(LFL_DLM_CANCEL, &lp->flags); -} - -static int gdlm_add_lvb(struct 
gdlm_lock *lp) -{ - char *lvb; - - lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); - if (!lvb) - return -ENOMEM; - - lp->lksb.sb_lvbptr = lvb; - lp->lvb = lvb; - return 0; -} - -static void gdlm_del_lvb(struct gdlm_lock *lp) -{ - kfree(lp->lvb); - lp->lvb = NULL; - lp->lksb.sb_lvbptr = NULL; -} - -static int gdlm_ast_wait(void *word) -{ - schedule(); - return 0; -} - -/* This can do a synchronous dlm request (requiring a lock_dlm thread to get - the completion) because gfs won't call hold_lvb() during a callback (from - the context of a lock_dlm thread). */ - -static int hold_null_lock(struct gdlm_lock *lp) -{ - struct gdlm_lock *lpn = NULL; - int error; - - if (lp->hold_null) { - printk(KERN_INFO "lock_dlm: lvb already held\n"); - return 0; - } - - error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn); - if (error) - goto out; - - lpn->lksb.sb_lvbptr = junk_lvb; - lpn->lvb = junk_lvb; - - lpn->req = DLM_LOCK_NL; - lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE; - set_bit(LFL_NOBAST, &lpn->flags); - set_bit(LFL_INLOCK, &lpn->flags); - set_bit(LFL_AST_WAIT, &lpn->flags); - - gdlm_do_lock(lpn); - wait_on_bit(&lpn->flags, LFL_AST_WAIT, gdlm_ast_wait, TASK_UNINTERRUPTIBLE); - error = lpn->lksb.sb_status; - if (error) { - printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n", - error); - gdlm_delete_lp(lpn); - lpn = NULL; - } -out: - lp->hold_null = lpn; - return error; -} - -/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get - the completion) because gfs may call unhold_lvb() during a callback (from - the context of a lock_dlm thread) which could cause a deadlock since the - other lock_dlm thread could be engaged in recovery. 
*/ - -static void unhold_null_lock(struct gdlm_lock *lp) -{ - struct gdlm_lock *lpn = lp->hold_null; - - gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - lpn->lksb.sb_lvbptr = NULL; - lpn->lvb = NULL; - set_bit(LFL_UNLOCK_DELETE, &lpn->flags); - gdlm_do_unlock(lpn); - lp->hold_null = NULL; -} - -/* Acquire a NL lock because gfs requires the value block to remain - intact on the resource while the lvb is "held" even if it's holding no locks - on the resource. */ - -int gdlm_hold_lvb(void *lock, char **lvbp) -{ - struct gdlm_lock *lp = lock; - int error; - - error = gdlm_add_lvb(lp); - if (error) - return error; - - *lvbp = lp->lvb; - - error = hold_null_lock(lp); - if (error) - gdlm_del_lvb(lp); - - return error; -} - -void gdlm_unhold_lvb(void *lock, char *lvb) -{ - struct gdlm_lock *lp = lock; - - unhold_null_lock(lp); - gdlm_del_lvb(lp); -} - -void gdlm_submit_delayed(struct gdlm_ls *ls) -{ - struct gdlm_lock *lp, *safe; - - spin_lock(&ls->async_lock); - list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) { - list_del_init(&lp->delay_list); - list_add_tail(&lp->delay_list, &ls->submit); - } - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h deleted file mode 100644 index 3c98e7c6f93b..000000000000 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
- */ - -#ifndef LOCK_DLM_DOT_H -#define LOCK_DLM_DOT_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a - * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module - * as "lock_dlm". - */ - -#define GDLM_STRNAME_BYTES 24 -#define GDLM_LVB_SIZE 32 -#define GDLM_DROP_COUNT 0 -#define GDLM_DROP_PERIOD 60 -#define GDLM_NAME_LEN 128 - -/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number). - We sprintf these numbers into a 24 byte string of hex values to make them - human-readable (to make debugging simpler.) */ - -struct gdlm_strname { - unsigned char name[GDLM_STRNAME_BYTES]; - unsigned short namelen; -}; - -enum { - DFL_BLOCK_LOCKS = 0, - DFL_SPECTATOR = 1, - DFL_WITHDRAW = 2, -}; - -struct gdlm_ls { - u32 id; - int jid; - int first; - int first_done; - unsigned long flags; - struct kobject kobj; - char clustername[GDLM_NAME_LEN]; - char fsname[GDLM_NAME_LEN]; - int fsflags; - dlm_lockspace_t *dlm_lockspace; - lm_callback_t fscb; - struct gfs2_sbd *sdp; - int recover_jid; - int recover_jid_done; - int recover_jid_status; - spinlock_t async_lock; - struct list_head delayed; - struct list_head submit; - u32 all_locks_count; - wait_queue_head_t wait_control; - struct task_struct *thread; - wait_queue_head_t thread_wait; -}; - -enum { - LFL_NOBLOCK = 0, - LFL_NOCACHE = 1, - LFL_DLM_UNLOCK = 2, - LFL_DLM_CANCEL = 3, - LFL_SYNC_LVB = 4, - LFL_FORCE_PROMOTE = 5, - LFL_REREQUEST = 6, - LFL_ACTIVE = 7, - LFL_INLOCK = 8, - LFL_CANCEL = 9, - LFL_NOBAST = 10, - LFL_HEADQUE = 11, - LFL_UNLOCK_DELETE = 12, - LFL_AST_WAIT = 13, -}; - -struct gdlm_lock { - struct gdlm_ls *ls; - struct lm_lockname lockname; - struct gdlm_strname strname; - char *lvb; - struct dlm_lksb lksb; - - s16 cur; - s16 req; - s16 prev_req; - u32 lkf; /* dlm flags DLM_LKF_ */ - 
unsigned long flags; /* lock_dlm flags LFL_ */ - - struct list_head delay_list; /* delayed */ - struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ -}; - -#define gdlm_assert(assertion, fmt, args...) \ -do { \ - if (unlikely(!(assertion))) { \ - printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \ - "lock_dlm: " fmt "\n", \ - #assertion, ##args); \ - BUG(); \ - } \ -} while (0) - -#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg) -#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg) -#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg) -#ifdef LOCK_DLM_LOG_DEBUG -#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg) -#else -#define log_debug(fmt, arg...) -#endif - -/* sysfs.c */ - -int gdlm_sysfs_init(void); -void gdlm_sysfs_exit(void); -int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *); -void gdlm_kobject_release(struct gdlm_ls *); - -/* thread.c */ - -int gdlm_init_threads(struct gdlm_ls *); -void gdlm_release_threads(struct gdlm_ls *); - -/* lock.c */ - -void gdlm_submit_delayed(struct gdlm_ls *); -unsigned int gdlm_do_lock(struct gdlm_lock *); - -int gdlm_get_lock(void *, struct lm_lockname *, void **); -void gdlm_put_lock(void *); -unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int); -unsigned int gdlm_unlock(void *, unsigned int); -void gdlm_cancel(void *); -int gdlm_hold_lvb(void *, char **); -void gdlm_unhold_lvb(void *, char *); - -/* mount.c */ - -extern const struct lm_lockops gdlm_ops; - -#endif - diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c deleted file mode 100644 index b9a03a7ff801..000000000000 --- a/fs/gfs2/locking/dlm/main.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 
- * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include - -#include "lock_dlm.h" - -static int __init init_lock_dlm(void) -{ - int error; - - error = gfs2_register_lockproto(&gdlm_ops); - if (error) { - printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n", - error); - return error; - } - - error = gdlm_sysfs_init(); - if (error) { - gfs2_unregister_lockproto(&gdlm_ops); - return error; - } - - printk(KERN_INFO - "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); - return 0; -} - -static void __exit exit_lock_dlm(void) -{ - gdlm_sysfs_exit(); - gfs2_unregister_lockproto(&gdlm_ops); -} - -module_init(init_lock_dlm); -module_exit(exit_lock_dlm); - -MODULE_DESCRIPTION("GFS DLM Locking Module"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c deleted file mode 100644 index 1aa7eb6a0226..000000000000 --- a/fs/gfs2/locking/dlm/mount.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
- */ - -#include "lock_dlm.h" - -const struct lm_lockops gdlm_ops; - - -static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, - int flags, char *table_name) -{ - struct gdlm_ls *ls; - char buf[256], *p; - - ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL); - if (!ls) - return NULL; - - ls->fscb = cb; - ls->sdp = sdp; - ls->fsflags = flags; - spin_lock_init(&ls->async_lock); - INIT_LIST_HEAD(&ls->delayed); - INIT_LIST_HEAD(&ls->submit); - init_waitqueue_head(&ls->thread_wait); - init_waitqueue_head(&ls->wait_control); - ls->jid = -1; - - strncpy(buf, table_name, 256); - buf[255] = '\0'; - - p = strchr(buf, ':'); - if (!p) { - log_info("invalid table_name \"%s\"", table_name); - kfree(ls); - return NULL; - } - *p = '\0'; - p++; - - strncpy(ls->clustername, buf, GDLM_NAME_LEN); - strncpy(ls->fsname, p, GDLM_NAME_LEN); - - return ls; -} - -static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir) -{ - char data[256]; - char *options, *x, *y; - int error = 0; - - memset(data, 0, 256); - strncpy(data, data_arg, 255); - - if (!strlen(data)) { - log_error("no mount options, (u)mount helpers not installed"); - return -EINVAL; - } - - for (options = data; (x = strsep(&options, ":")); ) { - if (!*x) - continue; - - y = strchr(x, '='); - if (y) - *y++ = 0; - - if (!strcmp(x, "jid")) { - if (!y) { - log_error("need argument to jid"); - error = -EINVAL; - break; - } - sscanf(y, "%u", &ls->jid); - - } else if (!strcmp(x, "first")) { - if (!y) { - log_error("need argument to first"); - error = -EINVAL; - break; - } - sscanf(y, "%u", &ls->first); - - } else if (!strcmp(x, "id")) { - if (!y) { - log_error("need argument to id"); - error = -EINVAL; - break; - } - sscanf(y, "%u", &ls->id); - - } else if (!strcmp(x, "nodir")) { - if (!y) { - log_error("need argument to nodir"); - error = -EINVAL; - break; - } - sscanf(y, "%u", nodir); - - } else { - log_error("unkonwn option: %s", x); - error = -EINVAL; - break; - } - } - - return error; -} - -static int 
gdlm_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - struct gdlm_ls *ls; - int error = -ENOMEM, nodir = 0; - - if (min_lvb_size > GDLM_LVB_SIZE) - goto out; - - ls = init_gdlm(cb, cb_data, flags, table_name); - if (!ls) - goto out; - - error = make_args(ls, host_data, &nodir); - if (error) - goto out; - - error = gdlm_init_threads(ls); - if (error) - goto out_free; - - error = gdlm_kobject_setup(ls, fskobj); - if (error) - goto out_thread; - - error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), - &ls->dlm_lockspace, - DLM_LSFL_FS | DLM_LSFL_NEWEXCL | - (nodir ? DLM_LSFL_NODIR : 0), - GDLM_LVB_SIZE); - if (error) { - log_error("dlm_new_lockspace error %d", error); - goto out_kobj; - } - - lockstruct->ls_jid = ls->jid; - lockstruct->ls_first = ls->first; - lockstruct->ls_lockspace = ls; - lockstruct->ls_ops = &gdlm_ops; - lockstruct->ls_flags = 0; - lockstruct->ls_lvb_size = GDLM_LVB_SIZE; - return 0; - -out_kobj: - gdlm_kobject_release(ls); -out_thread: - gdlm_release_threads(ls); -out_free: - kfree(ls); -out: - return error; -} - -static void gdlm_unmount(void *lockspace) -{ - struct gdlm_ls *ls = lockspace; - - log_debug("unmount flags %lx", ls->flags); - - /* FIXME: serialize unmount and withdraw in case they - happen at once. Also, if unmount follows withdraw, - wait for withdraw to finish. 
*/ - - if (test_bit(DFL_WITHDRAW, &ls->flags)) - goto out; - - gdlm_kobject_release(ls); - dlm_release_lockspace(ls->dlm_lockspace, 2); - gdlm_release_threads(ls); - BUG_ON(ls->all_locks_count); -out: - kfree(ls); -} - -static void gdlm_recovery_done(void *lockspace, unsigned int jid, - unsigned int message) -{ - char env_jid[20]; - char env_status[20]; - char *envp[] = { env_jid, env_status, NULL }; - struct gdlm_ls *ls = lockspace; - ls->recover_jid_done = jid; - ls->recover_jid_status = message; - sprintf(env_jid, "JID=%d", jid); - sprintf(env_status, "RECOVERY=%s", - message == LM_RD_SUCCESS ? "Done" : "Failed"); - kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); -} - -static void gdlm_others_may_mount(void *lockspace) -{ - char *message = "FIRSTMOUNT=Done"; - char *envp[] = { message, NULL }; - struct gdlm_ls *ls = lockspace; - ls->first_done = 1; - kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); -} - -/* Userspace gets the offline uevent, blocks new gfs locks on - other mounters, and lets us know (sets WITHDRAW flag). Then, - userspace leaves the mount group while we leave the lockspace. 
*/ - -static void gdlm_withdraw(void *lockspace) -{ - struct gdlm_ls *ls = lockspace; - - kobject_uevent(&ls->kobj, KOBJ_OFFLINE); - - wait_event_interruptible(ls->wait_control, - test_bit(DFL_WITHDRAW, &ls->flags)); - - dlm_release_lockspace(ls->dlm_lockspace, 2); - gdlm_release_threads(ls); - gdlm_kobject_release(ls); -} - -static int gdlm_plock(void *lockspace, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - struct gdlm_ls *ls = lockspace; - return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl); -} - -static int gdlm_punlock(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - struct gdlm_ls *ls = lockspace; - return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl); -} - -static int gdlm_plock_get(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - struct gdlm_ls *ls = lockspace; - return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl); -} - -const struct lm_lockops gdlm_ops = { - .lm_proto_name = "lock_dlm", - .lm_mount = gdlm_mount, - .lm_others_may_mount = gdlm_others_may_mount, - .lm_unmount = gdlm_unmount, - .lm_withdraw = gdlm_withdraw, - .lm_get_lock = gdlm_get_lock, - .lm_put_lock = gdlm_put_lock, - .lm_lock = gdlm_lock, - .lm_unlock = gdlm_unlock, - .lm_plock = gdlm_plock, - .lm_punlock = gdlm_punlock, - .lm_plock_get = gdlm_plock_get, - .lm_cancel = gdlm_cancel, - .lm_hold_lvb = gdlm_hold_lvb, - .lm_unhold_lvb = gdlm_unhold_lvb, - .lm_recovery_done = gdlm_recovery_done, - .lm_owner = THIS_MODULE, -}; - diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c deleted file mode 100644 index 9b7edcf7bd49..000000000000 --- a/fs/gfs2/locking/dlm/sysfs.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 
- * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include - -#include "lock_dlm.h" - -static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name); -} - -static ssize_t block_show(struct gdlm_ls *ls, char *buf) -{ - ssize_t ret; - int val = 0; - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags)) - val = 1; - ret = sprintf(buf, "%d\n", val); - return ret; -} - -static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ssize_t ret = len; - int val; - - val = simple_strtol(buf, NULL, 0); - - if (val == 1) - set_bit(DFL_BLOCK_LOCKS, &ls->flags); - else if (val == 0) { - clear_bit(DFL_BLOCK_LOCKS, &ls->flags); - gdlm_submit_delayed(ls); - } else { - ret = -EINVAL; - } - return ret; -} - -static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf) -{ - ssize_t ret; - int val = 0; - - if (test_bit(DFL_WITHDRAW, &ls->flags)) - val = 1; - ret = sprintf(buf, "%d\n", val); - return ret; -} - -static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ssize_t ret = len; - int val; - - val = simple_strtol(buf, NULL, 0); - - if (val == 1) - set_bit(DFL_WITHDRAW, &ls->flags); - else - ret = -EINVAL; - wake_up(&ls->wait_control); - return ret; -} - -static ssize_t id_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%u\n", ls->id); -} - -static ssize_t jid_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->jid); -} - -static ssize_t first_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->first); -} - -static ssize_t first_done_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->first_done); -} - -static ssize_t recover_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->recover_jid); -} - -static ssize_t 
recover_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ls->recover_jid = simple_strtol(buf, NULL, 0); - ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid); - return len; -} - -static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->recover_jid_done); -} - -static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->recover_jid_status); -} - -struct gdlm_attr { - struct attribute attr; - ssize_t (*show)(struct gdlm_ls *, char *); - ssize_t (*store)(struct gdlm_ls *, const char *, size_t); -}; - -#define GDLM_ATTR(_name,_mode,_show,_store) \ -static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) - -GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); -GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, id_show, NULL); -GDLM_ATTR(jid, 0444, jid_show, NULL); -GDLM_ATTR(first, 0444, first_show, NULL); -GDLM_ATTR(first_done, 0444, first_done_show, NULL); -GDLM_ATTR(recover, 0644, recover_show, recover_store); -GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); -GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); - -static struct attribute *gdlm_attrs[] = { - &gdlm_attr_proto_name.attr, - &gdlm_attr_block.attr, - &gdlm_attr_withdraw.attr, - &gdlm_attr_id.attr, - &gdlm_attr_jid.attr, - &gdlm_attr_first.attr, - &gdlm_attr_first_done.attr, - &gdlm_attr_recover.attr, - &gdlm_attr_recover_done.attr, - &gdlm_attr_recover_status.attr, - NULL, -}; - -static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); - struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr); - return a->show ? 
a->show(ls, buf) : 0; -} - -static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); - struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr); - return a->store ? a->store(ls, buf, len) : len; -} - -static struct sysfs_ops gdlm_attr_ops = { - .show = gdlm_attr_show, - .store = gdlm_attr_store, -}; - -static struct kobj_type gdlm_ktype = { - .default_attrs = gdlm_attrs, - .sysfs_ops = &gdlm_attr_ops, -}; - -static struct kset *gdlm_kset; - -int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj) -{ - int error; - - ls->kobj.kset = gdlm_kset; - error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj, - "lock_module"); - if (error) - log_error("can't register kobj %d", error); - kobject_uevent(&ls->kobj, KOBJ_ADD); - - return error; -} - -void gdlm_kobject_release(struct gdlm_ls *ls) -{ - kobject_put(&ls->kobj); -} - -static int gdlm_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) -{ - struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); - add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname); - add_uevent_var(env, "LOCKPROTO=lock_dlm"); - return 0; -} - -static struct kset_uevent_ops gdlm_uevent_ops = { - .uevent = gdlm_uevent, -}; - - -int gdlm_sysfs_init(void) -{ - gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj); - if (!gdlm_kset) { - printk(KERN_WARNING "%s: can not create kset\n", __func__); - return -ENOMEM; - } - return 0; -} - -void gdlm_sysfs_exit(void) -{ - kset_unregister(gdlm_kset); -} - diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c deleted file mode 100644 index 38823efd698c..000000000000 --- a/fs/gfs2/locking/dlm/thread.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 
- * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include "lock_dlm.h" - -static inline int no_work(struct gdlm_ls *ls) -{ - int ret; - - spin_lock(&ls->async_lock); - ret = list_empty(&ls->submit); - spin_unlock(&ls->async_lock); - - return ret; -} - -static int gdlm_thread(void *data) -{ - struct gdlm_ls *ls = (struct gdlm_ls *) data; - struct gdlm_lock *lp = NULL; - - while (!kthread_should_stop()) { - wait_event_interruptible(ls->thread_wait, - !no_work(ls) || kthread_should_stop()); - - spin_lock(&ls->async_lock); - - if (!list_empty(&ls->submit)) { - lp = list_entry(ls->submit.next, struct gdlm_lock, - delay_list); - list_del_init(&lp->delay_list); - spin_unlock(&ls->async_lock); - gdlm_do_lock(lp); - spin_lock(&ls->async_lock); - } - spin_unlock(&ls->async_lock); - } - - return 0; -} - -int gdlm_init_threads(struct gdlm_ls *ls) -{ - struct task_struct *p; - int error; - - p = kthread_run(gdlm_thread, ls, "lock_dlm"); - error = IS_ERR(p); - if (error) { - log_error("can't start lock_dlm thread %d", error); - return error; - } - ls->thread = p; - - return 0; -} - -void gdlm_release_threads(struct gdlm_ls *ls) -{ - kthread_stop(ls->thread); -} - diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index ad305854bdc6..98918a756410 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4390f6f4047d..80e4f5f898bb 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 86fe06798711..a6892ed0840a 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "gfs2.h" @@ -47,8 +46,6 @@ static void 
gfs2_init_glock_once(void *foo) INIT_HLIST_NODE(&gl->gl_list); spin_lock_init(&gl->gl_spin); INIT_LIST_HEAD(&gl->gl_holders); - gl->gl_lvb = NULL; - atomic_set(&gl->gl_lvb_count, 0); INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); atomic_set(&gl->gl_ail_count, 0); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 09853620c951..870d65ae7ae2 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -19,7 +19,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index 3524ae81189b..fba502aa8b2d 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index dde4ead2c3be..a6d00e8ffe10 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index c2ad36330ca3..5eb57b044382 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 7fdeb14ddd1a..9200ef221716 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 93fe41b67f97..99d726f1c7a6 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -20,9 +20,10 @@ #include #include #include -#include #include #include +#include +#include #include "gfs2.h" #include "incore.h" @@ -560,57 +561,24 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) return ret; } +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + /** * gfs2_setlease - acquire/release a file lease * @file: the file pointer * @arg: lease type * @fl: file lock * + * We 
don't currently have a way to enforce a lease across the whole + * cluster; until we do, disable leases (by just returning -EINVAL), + * unless the administrator has requested purely local locking. + * * Returns: errno */ static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) { - struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); - - /* - * We don't currently have a way to enforce a lease across the whole - * cluster; until we do, disable leases (by just returning -EINVAL), - * unless the administrator has requested purely local locking. - */ - if (!sdp->sd_args.ar_localflocks) - return -EINVAL; - return generic_setlease(file, arg, fl); -} - -static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_plock_get( - sdp->sd_lockstruct.ls_lockspace, name, file, fl); - return error; -} - -static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_plock( - sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl); - return error; -} - -static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_punlock( - sdp->sd_lockstruct.ls_lockspace, name, file, fl); - return error; + return -EINVAL; } /** @@ -626,9 +594,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); - struct lm_lockname name = - { .ln_number = ip->i_no_addr, - .ln_type = LM_TYPE_PLOCK }; + struct lm_lockstruct *ls = 
&sdp->sd_lockstruct; if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; @@ -640,12 +606,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) cmd = F_SETLK; fl->fl_type = F_UNLCK; } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; if (IS_GETLK(cmd)) - return gfs2_lm_plock_get(sdp, &name, file, fl); + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); else if (fl->fl_type == F_UNLCK) - return gfs2_lm_punlock(sdp, &name, file, fl); + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); else - return gfs2_lm_plock(sdp, &name, file, cmd, fl); + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); } static int do_flock(struct file *file, int cmd, struct file_lock *fl) @@ -732,7 +700,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) } } -const struct file_operations gfs2_file_fops = { +const struct file_operations *gfs2_file_fops = &(const struct file_operations){ .llseek = gfs2_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, @@ -750,7 +718,7 @@ const struct file_operations gfs2_file_fops = { .setlease = gfs2_setlease, }; -const struct file_operations gfs2_dir_fops = { +const struct file_operations *gfs2_dir_fops = &(const struct file_operations){ .readdir = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, @@ -760,7 +728,9 @@ const struct file_operations gfs2_dir_fops = { .flock = gfs2_flock, }; -const struct file_operations gfs2_file_fops_nolock = { +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){ .llseek = gfs2_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, @@ -773,10 +743,10 @@ const struct file_operations gfs2_file_fops_nolock = { .fsync = gfs2_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, - .setlease = gfs2_setlease, + .setlease = generic_setlease, }; -const struct file_operations 
gfs2_dir_fops_nolock = { +const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){ .readdir = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 402b6a2cd2c9..95bb33e41a76 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -17,7 +17,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" @@ -627,13 +626,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp) return rc; } -static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) +static void gfs2_others_may_mount(struct gfs2_sbd *sdp) { - if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount) - return; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_others_may_mount( - sdp->sd_lockstruct.ls_lockspace); + char *message = "FIRSTMOUNT=Done"; + char *envp[] = { message, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_first_done = 1; + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } /** @@ -793,7 +792,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) } } - gfs2_lm_others_may_mount(sdp); + gfs2_others_may_mount(sdp); } else if (!sdp->sd_args.ar_spectator) { error = gfs2_recover_journal(sdp->sd_jdesc); if (error) { @@ -1002,7 +1001,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) goto fail_quotad; sdp->sd_log_flush_time = jiffies; - sdp->sd_jindex_refresh_time = jiffies; p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); error = IS_ERR(p); @@ -1030,6 +1028,17 @@ fail: return error; } +static const match_table_t nolock_tokens = { + { Opt_jid, "jid=%d\n", }, + { Opt_err, NULL }, +}; + +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_put_lock = kmem_cache_free, + .lm_tokens = &nolock_tokens, +}; + /** * gfs2_lm_mount - mount a locking protocol * @sdp: the filesystem @@ -1041,31 +1050,73 @@ fail: static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) { - char 
*proto = sdp->sd_proto_name; - char *table = sdp->sd_table_name; - int flags = LM_MFLAG_CONV_NODROP; - int error; + const struct lm_lockops *lm; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + struct gfs2_args *args = &sdp->sd_args; + const char *proto = sdp->sd_proto_name; + const char *table = sdp->sd_table_name; + const char *fsname; + char *o, *options; + int ret; - if (sdp->sd_args.ar_spectator) - flags |= LM_MFLAG_SPECTATOR; + if (!strcmp("lock_nolock", proto)) { + lm = &nolock_ops; + sdp->sd_args.ar_localflocks = 1; + sdp->sd_args.ar_localcaching = 1; +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + } else if (!strcmp("lock_dlm", proto)) { + lm = &gfs2_dlm_ops; +#endif + } else { + printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); + return -ENOENT; + } fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); - error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, - gfs2_glock_cb, sdp, - GFS2_MIN_LVB_SIZE, flags, - &sdp->sd_lockstruct, &sdp->sd_kobj); - if (error) { - fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n", - proto, table, sdp->sd_args.ar_hostdata); - goto out; - } + ls->ls_ops = lm; + ls->ls_first = 1; + ls->ls_id = 0; - if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || - gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= - GFS2_MIN_LVB_SIZE)) { - gfs2_unmount_lockproto(&sdp->sd_lockstruct); - goto out; + for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { + substring_t tmp[MAX_OPT_ARGS]; + int token, option; + + if (!o || !*o) + continue; + + token = match_token(o, *lm->lm_tokens, tmp); + switch (token) { + case Opt_jid: + ret = match_int(&tmp[0], &option); + if (ret || option < 0) + goto hostdata_error; + ls->ls_jid = option; + break; + case Opt_id: + ret = match_int(&tmp[0], &option); + if (ret) + goto hostdata_error; + ls->ls_id = option; + break; + case Opt_first: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + 
ls->ls_first = option; + break; + case Opt_nodir: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_nodir = option; + break; + case Opt_err: + default: +hostdata_error: + fs_info(sdp, "unknown hostdata (%s)\n", o); + return -EINVAL; + } } if (sdp->sd_args.ar_spectator) @@ -1074,22 +1125,25 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, sdp->sd_lockstruct.ls_jid); - fs_info(sdp, "Joined cluster. Now mounting FS...\n"); - - if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && - !sdp->sd_args.ar_ignore_local_fs) { - sdp->sd_args.ar_localflocks = 1; - sdp->sd_args.ar_localcaching = 1; + fsname = strchr(table, ':'); + if (fsname) + fsname++; + if (lm->lm_mount == NULL) { + fs_info(sdp, "Now mounting FS...\n"); + return 0; } - -out: - return error; + ret = lm->lm_mount(sdp, fsname); + if (ret == 0) + fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + return ret; } void gfs2_lm_unmount(struct gfs2_sbd *sdp) { - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - gfs2_unmount_lockproto(&sdp->sd_lockstruct); + const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + lm->lm_unmount) + lm->lm_unmount(sdp); } /** diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 49877546beb9..abd5429ae285 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index f0699ac453f7..4ecdad026eaf 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e8ef0f80fb11..8d53f66b5bcc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include @@ -108,7 +107,7 @@ int 
gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); - gfs2_lvb_unhold(qd->qd_gl); + gfs2_glock_put(qd->qd_gl); atomic_dec(&sdp->sd_quota_count); /* Delete it from the common reclaim list */ @@ -157,11 +156,6 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, if (error) goto fail; - error = gfs2_lvb_hold(qd->qd_gl); - gfs2_glock_put(qd->qd_gl); - if (error) - goto fail; - *qdp = qd; return 0; @@ -211,7 +205,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, if (qd || !create) { if (new_qd) { - gfs2_lvb_unhold(new_qd->qd_gl); + gfs2_glock_put(new_qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, new_qd); } *qdp = qd; @@ -1280,7 +1274,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, qd->qd_slot_count == 1); gfs2_assert_warn(sdp, !qd->qd_bh_count); - gfs2_lvb_unhold(qd->qd_gl); + gfs2_glock_put(qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, qd); spin_lock(&qd_lru_lock); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index efd09c3d2b26..247e8f7d6b3d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -427,20 +426,23 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea } -static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, - unsigned int message) +static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message) { - if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done) - return; - - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_recovery_done( - sdp->sd_lockstruct.ls_lockspace, jid, message); + char env_jid[20]; + char env_status[20]; + char *envp[] = { env_jid, env_status, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid_done = jid; + ls->ls_recover_jid_status = message; + sprintf(env_jid, "JID=%d", jid); + 
sprintf(env_status, "RECOVERY=%s", + message == LM_RD_SUCCESS ? "Done" : "Failed"); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } - /** - * gfs2_recover_journal - recovery a given journal + * gfs2_recover_journal - recover a given journal * @jd: the struct gfs2_jdesc describing the journal * * Acquire the journal's lock, check to see if the journal is clean, and @@ -561,7 +563,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) gfs2_glock_dq_uninit(&ji_gh); - gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) gfs2_glock_dq_uninit(&j_gh); @@ -581,7 +583,7 @@ fail_gunlock_j: fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); fail: - gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); return error; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8b01c635d925..ba5a021b1c57 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 141b781f2fcc..7cf302b135ce 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a58a120dac92..a78997ea5037 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -14,9 +14,8 @@ #include #include #include -#include -#include #include +#include #include "gfs2.h" #include "incore.h" @@ -224,14 +223,145 @@ static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name) LOCKSTRUCT_ATTR(jid, "%u\n"); LOCKSTRUCT_ATTR(first, "%u\n"); -LOCKSTRUCT_ATTR(lvb_size, "%u\n"); -LOCKSTRUCT_ATTR(flags, "%d\n"); static struct attribute *lockstruct_attrs[] = { &lockstruct_attr_jid.attr, &lockstruct_attr_first.attr, - &lockstruct_attr_lvb_size.attr, - 
&lockstruct_attr_flags.attr, + NULL, +}; + +/* + * lock_module. Originally from lock_dlm + */ + +static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf) +{ + const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops; + return sprintf(buf, "%s\n", ops->lm_proto_name); +} + +static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret; + int val = 0; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if (val == 1) + set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + else if (val == 0) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + smp_mb__after_clear_bit(); + gfs2_glock_thaw(sdp); + } else { + ret = -EINVAL; + } + return ret; +} + +static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%u\n", ls->ls_id); +} + +static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first); +} + +static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first_done); +} + +static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid); +} + +static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid != jid) + continue; + jd->jd_dirty = 1; + break; + } + spin_unlock(&sdp->sd_jindex_spin); +} + +static ssize_t recover_store(struct gfs2_sbd 
*sdp, const char *buf, size_t len) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid = simple_strtol(buf, NULL, 0); + gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid); + if (sdp->sd_recoverd_process) + wake_up_process(sdp->sd_recoverd_process); + return len; +} + +static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_done); +} + +static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_status); +} + +struct gdlm_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *sdp, char *); + ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t); +}; + +#define GDLM_ATTR(_name,_mode,_show,_store) \ +static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) + +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(id, 0444, lkid_show, NULL); +GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0644, recover_show, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); + +static struct attribute *lock_module_attrs[] = { + &gdlm_attr_proto_name.attr, + &gdlm_attr_block.attr, + &gdlm_attr_withdraw.attr, + &gdlm_attr_id.attr, + &lockstruct_attr_jid.attr, + &gdlm_attr_first.attr, + &gdlm_attr_first_done.attr, + &gdlm_attr_recover.attr, + &gdlm_attr_recover_done.attr, + &gdlm_attr_recover_status.attr, NULL, }; @@ -412,6 +542,11 @@ static struct attribute_group tune_group = { .attrs = tune_attrs, }; +static struct attribute_group lock_module_group = { + .name = "lock_module", + .attrs = lock_module_attrs, +}; + int gfs2_sys_fs_add(struct gfs2_sbd 
*sdp) { int error; @@ -434,9 +569,15 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail_args; + error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); + if (error) + goto fail_tune; + kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); return 0; +fail_tune: + sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_args: sysfs_remove_group(&sdp->sd_kobj, &args_group); fail_lockstruct: @@ -453,6 +594,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &args_group); sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index f677b8a83f0c..33cd523ec97e 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -12,9 +12,8 @@ #include #include #include -#include #include -#include +#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 374f50e95496..9d12b1118ba0 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "gfs2.h" @@ -35,6 +34,8 @@ void gfs2_assert_i(struct gfs2_sbd *sdp) int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) { + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + const struct lm_lockops *lm = ls->ls_ops; va_list args; if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) @@ -47,8 +48,12 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) 
fs_err(sdp, "about to withdraw this file system\n"); BUG_ON(sdp->sd_args.ar_debug); - fs_err(sdp, "telling LM to withdraw\n"); - gfs2_withdraw_lockproto(&sdp->sd_lockstruct); + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + + if (lm->lm_unmount) { + fs_err(sdp, "telling LM to unmount\n"); + lm->lm_unmount(sdp); + } fs_err(sdp, "withdrawn\n"); dump_stack(); -- cgit v1.2.2 From ac2425e7d319dec0523e52ee120a158ce6668cbd Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 13 Jan 2009 09:53:43 +0000 Subject: GFS2: Remove unused field from glock The time stamp field is unused in the glock now that we are using a shrinker, so that we can remove it and save sizeof(unsigned long) bytes in each glock. Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 2 -- fs/gfs2/incore.h | 1 - 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index cd200a564c79..173e59ce9ad3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -700,7 +700,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number); memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); gl->gl_lksb.sb_lvbptr = gl->gl_lvb; - gl->gl_stamp = jiffies; gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_sbd = sdp; @@ -1008,7 +1007,6 @@ void gfs2_glock_dq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); clear_bit(GLF_LOCK, &gl->gl_flags); } - gl->gl_stamp = jiffies; if (list_empty(&gl->gl_holders) && !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && !test_bit(GLF_DEMOTE, &gl->gl_flags)) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0af7c24de6a1..8fe0675120ac 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -211,7 +211,6 @@ struct gfs2_glock { char gl_strname[GDLM_STRNAME_BYTES]; struct dlm_lksb gl_lksb; char gl_lvb[32]; - unsigned long gl_stamp; unsigned long gl_tchange; void *gl_object; -- cgit v1.2.2 From e7c8707ea2b9106f0f78c43348ff5d5e82ba7961 Mon Sep 17 
00:00:00 2001 From: Steven Whitehouse Date: Tue, 20 Jan 2009 16:39:23 +0000 Subject: GFS2: Fix error path ref counting for root inode We were keeping hold of an extra ref to the root inode in one of the error paths, that resulted in a hang. Reported-by: Nate Straz Signed-off-by: Steven Whitehouse Tested-by: Robert Peterson --- fs/gfs2/ops_fstype.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 95bb33e41a76..e502b379a4da 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1258,6 +1258,8 @@ fail_sb: dput(sdp->sd_root_dir); if (sdp->sd_master_dir) dput(sdp->sd_master_dir); + if (sb->s_root) + dput(sb->s_root); sb->s_root = NULL; fail_locking: init_locking(sdp, &mount_gh, UNDO); -- cgit v1.2.2 From d8348de06f704fc34d24ec068546ecb1045fc11a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 5 Feb 2009 10:12:38 +0000 Subject: GFS2: Fix deadlock on journal flush This patch fixes a deadlock when the journal is flushed and there are dirty inodes other than the one which caused the journal flush. Originally the journal flushing code was trying to obtain the transaction glock while running the flush code for an inode glock. We no longer require the transaction glock at this point in time since we know that any attempt to get the transaction glock from another node will result in a journal flush. So if we are flushing the journal, we can be sure that the transaction lock is still cached from when the transaction was started. By inlining a version of gfs2_trans_begin() (minus the bit which gets the transaction glock) we can avoid the deadlock problems caused if there is a demote request queued up on the transaction glock. In addition I've also moved the umount rwsem so that it covers the glock workqueue, since all demotions are done by this workqueue now. That fixes a bug on umount which I came across while fixing the original problem.
Reported-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 26 ++++++++++++-------------- fs/gfs2/glops.c | 19 ++++++++++++------- fs/gfs2/trans.c | 16 ++++++++++------ 3 files changed, 34 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 173e59ce9ad3..ad8e121427c0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -167,6 +167,7 @@ static void glock_free(struct gfs2_glock *gl) static void gfs2_glock_hold(struct gfs2_glock *gl) { + GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); atomic_inc(&gl->gl_ref); } @@ -206,16 +207,15 @@ int gfs2_glock_put(struct gfs2_glock *gl) atomic_dec(&lru_count); } spin_unlock(&lru_lock); - GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru)); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); glock_free(gl); rv = 1; goto out; } - write_unlock(gl_lock_addr(gl->gl_hash)); /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ if (atomic_read(&gl->gl_ref) == 2) gfs2_glock_schedule_for_reclaim(gl); + write_unlock(gl_lock_addr(gl->gl_hash)); out: return rv; } @@ -597,10 +597,11 @@ __acquires(&gl->gl_spin) GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); + down_read(&gfs2_umount_flush_sem); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && gl->gl_demote_state != gl->gl_state) { if (find_first_holder(gl)) - goto out; + goto out_unlock; if (nonblock) goto out_sched; set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); @@ -611,23 +612,26 @@ __acquires(&gl->gl_spin) gfs2_demote_wake(gl); ret = do_promote(gl); if (ret == 0) - goto out; + goto out_unlock; if (ret == 2) - return; + goto out_sem; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ } do_xmote(gl, gh, gl->gl_target); +out_sem: + up_read(&gfs2_umount_flush_sem); return; out_sched: gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); -out: +out_unlock: 
clear_bit(GLF_LOCK, &gl->gl_flags); + goto out_sem; } static void glock_work_func(struct work_struct *work) @@ -1225,7 +1229,6 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; - down_read(&gfs2_umount_flush_sem); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { struct gfs2_holder *gh; @@ -1236,16 +1239,13 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) ((ret & ~LM_OUT_ST_MASK) != 0)) set_bit(GLF_FROZEN, &gl->gl_flags); spin_unlock(&gl->gl_spin); - if (test_bit(GLF_FROZEN, &gl->gl_flags)) { - up_read(&gfs2_umount_flush_sem); + if (test_bit(GLF_FROZEN, &gl->gl_flags)) return; - } } set_bit(GLF_REPLY_PENDING, &gl->gl_flags); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); - up_read(&gfs2_umount_flush_sem); } /** @@ -1389,12 +1389,10 @@ static void thaw_glock(struct gfs2_glock *gl) { if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) return; - down_read(&gfs2_umount_flush_sem); set_bit(GLF_REPLY_PENDING, &gl->gl_flags); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); - up_read(&gfs2_umount_flush_sem); } /** @@ -1580,7 +1578,7 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) if (test_bit(GLF_REPLY_PENDING, gflags)) *p++ = 'r'; if (test_bit(GLF_INITIAL, gflags)) - *p++ = 'i'; + *p++ = 'I'; if (test_bit(GLF_FROZEN, gflags)) *p++ = 'F'; *p = 0; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f07ede8cb9ba..a9b7d3a60081 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -37,20 +37,25 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; - unsigned int blocks; struct list_head *head = &gl->gl_ail_list; struct gfs2_bufdata *bd; struct buffer_head *bh; - int error; + struct gfs2_trans tr; - blocks = 
atomic_read(&gl->gl_ail_count); - if (!blocks) - return; + memset(&tr, 0, sizeof(tr)); + tr.tr_revokes = atomic_read(&gl->gl_ail_count); - error = gfs2_trans_begin(sdp, 0, blocks); - if (gfs2_assert_withdraw(sdp, !error)) + if (!tr.tr_revokes) return; + /* A shortened, inline version of gfs2_trans_begin() */ + tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); + tr.tr_ip = (unsigned long)__builtin_return_address(0); + INIT_LIST_HEAD(&tr.tr_list_buf); + gfs2_log_reserve(sdp, tr.tr_reserved); + BUG_ON(current->journal_info); + current->journal_info = &tr; + gfs2_log_lock(sdp); while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 33cd523ec97e..053752d4b27f 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -87,9 +87,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (!tr->tr_touched) { gfs2_log_release(sdp, tr->tr_reserved); - gfs2_glock_dq(&tr->tr_t_gh); - gfs2_holder_uninit(&tr->tr_t_gh); - kfree(tr); + if (tr->tr_t_gh.gh_gl) { + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + } return; } @@ -105,9 +107,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) } gfs2_log_commit(sdp, tr); - gfs2_glock_dq(&tr->tr_t_gh); - gfs2_holder_uninit(&tr->tr_t_gh); - kfree(tr); + if (tr->tr_t_gh.gh_gl) { + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + } if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) gfs2_log_flush(sdp, NULL); -- cgit v1.2.2 From f15ab5619d8068a321094f4705147764d689e88e Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 9 Feb 2009 09:25:01 +0000 Subject: GFS2: Support generation of discard requests This patch allows GFS2 to generate discard requests for blocks which are no longer useful to the filesystem (i.e. those which have been freed as the result of an unlink operation). The requests are generated at the time which those blocks become available for reuse in the filesystem. 
In order to use this new feature, you have to specify the "discard" mount option. The code coalesces adjacent blocks into a single extent when generating the discard requests, thus generating the minimum number. If an error occurs when the request has been sent to the block device, then it will print a message and turn off the requests for that filesystem. If the problem is temporary, then you can use remount to turn the option back on again. There is also a nodiscard mount option so that you can use remount to turn discard requests off, if required. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 2 +- fs/gfs2/mount.c | 10 ++++++++++ fs/gfs2/ops_super.c | 2 ++ fs/gfs2/rgrp.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 8fe0675120ac..3f29bd224ba1 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -416,7 +416,7 @@ struct gfs2_args { unsigned int ar_suiddir:1; /* suiddir support */ unsigned int ar_data:2; /* ordered/writeback */ unsigned int ar_meta:1; /* mount metafs */ - unsigned int ar_num_glockd; /* Number of glockd threads */ + unsigned int ar_discard:1; /* discard requests */ }; struct gfs2_tune { diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index fba502aa8b2d..ee69701a7777 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -41,6 +41,8 @@ enum { Opt_data_writeback, Opt_data_ordered, Opt_meta, + Opt_discard, + Opt_nodiscard, Opt_err, }; @@ -65,6 +67,8 @@ static const match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_data_ordered, "data=ordered"}, {Opt_meta, "meta"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL} }; @@ -157,6 +161,12 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) case Opt_meta: args->ar_meta = 1; break; + case Opt_discard: + args->ar_discard = 1; + break; + case Opt_nodiscard: + args->ar_discard = 0; + break; 
case Opt_err: default: fs_info(sdp, "invalid mount option: %s\n", o); diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 4ecdad026eaf..458019569dcb 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -608,6 +608,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } seq_printf(s, ",data=%s", state); } + if (args->ar_discard) + seq_printf(s, ",discard"); return 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ba5a021b1c57..789953a2b6a8 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -830,6 +831,58 @@ void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) spin_unlock(&sdp->sd_rindex_spin); } +static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + const struct gfs2_bitmap *bi) +{ + struct super_block *sb = sdp->sd_vfs; + struct block_device *bdev = sb->s_bdev; + const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / + bdev_hardsect_size(sb->s_bdev); + u64 blk; + sector_t start; + sector_t nr_sects = 0; + int rv; + unsigned int x; + + for (x = 0; x < bi->bi_len; x++) { + const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x; + const u8 *clone = bi->bi_clone + bi->bi_offset + x; + u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); + diff &= 0x55; + if (diff == 0) + continue; + blk = offset + ((bi->bi_start + x) * GFS2_NBBY); + blk *= sects_per_blk; /* convert to sectors */ + while(diff) { + if (diff & 1) { + if (nr_sects == 0) + goto start_new_extent; + if ((start + nr_sects) != blk) { + rv = blkdev_issue_discard(bdev, start, + nr_sects, GFP_NOFS); + if (rv) + goto fail; + nr_sects = 0; +start_new_extent: + start = blk; + } + nr_sects += sects_per_blk; + } + diff >>= 2; + blk += sects_per_blk; + } + } + if (nr_sects) { + rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS); + if (rv) + goto fail; + } + return; +fail: + fs_warn(sdp, "error %d on discard request, turning discards off for this 
filesystem", rv); + sdp->sd_args.ar_discard = 0; +} + void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; @@ -840,6 +893,8 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) struct gfs2_bitmap *bi = rgd->rd_bits + x; if (!bi->bi_clone) continue; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); memcpy(bi->bi_clone + bi->bi_offset, bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); } -- cgit v1.2.2 From 02e3cc70ecbd4352ae4d26459929f43ab1547251 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 10 Feb 2009 13:48:30 +0000 Subject: GFS2: Expose UUID via sysfs/uevent Since we have a UUID, we ought to expose it to the user via sysfs and uevents. We already have the fs name in both of these places (a combination of the lock proto and lock table name) so if we add the UUID as well, we have a full set. For older filesystems (i.e. those created before mkfs.gfs2 was writing UUIDs by default) the sysfs file will appear zero length, and no UUID env var will be added to the uevents. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/ops_fstype.c | 1 + fs/gfs2/sys.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3f29bd224ba1..980a0864ca6c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -470,6 +470,7 @@ struct gfs2_sb_host { char sb_lockproto[GFS2_LOCKNAME_LEN]; char sb_locktable[GFS2_LOCKNAME_LEN]; + u8 sb_uuid[16]; }; /* diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e502b379a4da..804ca7273a49 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -234,6 +234,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_uuid, str->sb_uuid, 16); } /** diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a78997ea5037..4d284d14980b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -36,6 +36,30 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); } +static int gfs2_uuid_valid(const u8 *uuid) +{ + int i; + + for (i = 0; i < 16; i++) { + if (uuid[i]) + return 1; + } + return 0; +} + +static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) +{ + const u8 *uuid = sdp->sd_sb.sb_uuid; + buf[0] = '\0'; + if (!gfs2_uuid_valid(uuid)) + return 0; + return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-" + "%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", + uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], + uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11], + uuid[12], uuid[13], uuid[14], uuid[15]); +} + static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) { unsigned int count; @@ -158,6 +182,7 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) GFS2_ATTR(id, 0444, id_show, NULL); GFS2_ATTR(fsname, 0444, fsname_show, NULL); +GFS2_ATTR(uuid, 0444, 
uuid_show, NULL); GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); @@ -168,6 +193,7 @@ GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); static struct attribute *gfs2_attrs[] = { &gfs2_attr_id.attr, &gfs2_attr_fsname.attr, + &gfs2_attr_uuid.attr, &gfs2_attr_freeze.attr, &gfs2_attr_withdraw.attr, &gfs2_attr_statfs_sync.attr, @@ -598,12 +624,23 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) kobject_put(&sdp->sd_kobj); } + static int gfs2_uevent(struct kset *kset, struct kobject *kobj, struct kobj_uevent_env *env) { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + const u8 *uuid = sdp->sd_sb.sb_uuid; + add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + if (gfs2_uuid_valid(uuid)) { + add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" + "%02X%02X-%02X%02X%02X%02X%02X%02X", + uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], + uuid[5], uuid[6], uuid[7], uuid[8], uuid[9], + uuid[10], uuid[11], uuid[12], uuid[13], + uuid[14], uuid[15]); + } return 0; } -- cgit v1.2.2 From 64d576ba23bfd9b770cbb0279200f479272eb859 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 12 Feb 2009 13:31:58 +0000 Subject: GFS2: Add a "demote a glock" interface to sysfs This adds a sysfs file called demote_rq to GFS2's per filesystem directory. It's possible to use this file to demote arbitrary glocks in exactly the same way as if a request had come in from a remote node. This is intended for testing issues relating to caching of data under glocks. Despite that, the interface is generic enough to send requests to any type of glock, but be careful as it's not always safe to send an arbitrary message to an arbitrary glock. For that reason and to prevent DoS, this interface is restricted to root only.
The messages look like this: <type>:<glnumber> <mode> Example: echo -n "2:13324 EX" >/sys/fs/gfs2/unity:myfs/demote_rq Which means "please demote inode glock (type 2) number 13324 so that I can get an EX (exclusive) lock". The lock modes are those which would normally be sent by a remote node in its callback so if you want to unlock a glock, you use EX, to demote to shared, use SH or PR (depending on whether you like GFS2 or DLM lock modes better!). If the glock doesn't exist, you'll get -ENOENT returned. If the arguments don't make sense, you'll get -EINVAL returned. The plan is that this interface will be used in combination with the blktrace patch which I recently posted for comments although it is, of course, still useful in its own right. Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 7 ++++--- fs/gfs2/glops.c | 12 ++++++++++++ fs/gfs2/glops.h | 1 + fs/gfs2/rgrp.c | 2 +- fs/gfs2/sys.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 61 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ad8e121427c0..3984e47d1d33 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -684,10 +684,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl = search_bucket(hash, sdp, &name); read_unlock(gl_lock_addr(hash)); - if (gl || !create) { - *glp = gl; + *glp = gl; + if (gl) return 0; - } + if (!create) + return -ENOENT; gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); if (!gl) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index a9b7d3a60081..f34bc7093dd1 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -447,3 +447,15 @@ const struct gfs2_glock_operations gfs2_journal_glops = { .go_type = LM_TYPE_JOURNAL, }; +const struct gfs2_glock_operations *gfs2_glops_list[] = { + [LM_TYPE_META] = &gfs2_meta_glops, + [LM_TYPE_INODE] = &gfs2_inode_glops, + [LM_TYPE_RGRP] = &gfs2_rgrp_glops, + [LM_TYPE_NONDISK] = &gfs2_trans_glops, + [LM_TYPE_IOPEN] = &gfs2_iopen_glops, + [LM_TYPE_FLOCK] = &gfs2_flock_glops, +
[LM_TYPE_NONDISK] = &gfs2_nondisk_glops, + [LM_TYPE_QUOTA] = &gfs2_quota_glops, + [LM_TYPE_JOURNAL] = &gfs2_journal_glops, +}; + diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index a1d9b5b024e6..b3aa2e3210fd 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -21,5 +21,6 @@ extern const struct gfs2_glock_operations gfs2_flock_glops; extern const struct gfs2_glock_operations gfs2_nondisk_glops; extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; +extern const struct gfs2_glock_operations *gfs2_glops_list[]; #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 789953a2b6a8..a068ac940de1 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -839,7 +839,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / bdev_hardsect_size(sb->s_bdev); u64 blk; - sector_t start; + sector_t start = 0; sector_t nr_sects = 0; int rv; unsigned int x; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 4d284d14980b..7655f5025fec 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -24,6 +24,7 @@ #include "glock.h" #include "quota.h" #include "util.h" +#include "glops.h" static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) { @@ -171,6 +172,46 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, return len; } +static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct gfs2_glock *gl; + const struct gfs2_glock_operations *glops; + unsigned int glmode; + unsigned int gltype; + unsigned long long glnum; + char mode[16]; + int rv; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum, + mode); + if (rv != 3) + return -EINVAL; + + if (strcmp(mode, "EX") == 0) + glmode = LM_ST_UNLOCKED; + else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0)) + glmode = LM_ST_DEFERRED; + else if ((strcmp(mode, "PR") == 
0) || (strcmp(mode, "SH") == 0)) + glmode = LM_ST_SHARED; + else + return -EINVAL; + + if (gltype > LM_TYPE_JOURNAL) + return -EINVAL; + glops = gfs2_glops_list[gltype]; + if (glops == NULL) + return -EINVAL; + rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); + if (rv) + return rv; + gfs2_glock_cb(gl, glmode); + gfs2_glock_put(gl); + return len; +} + struct gfs2_attr { struct attribute attr; ssize_t (*show)(struct gfs2_sbd *, char *); @@ -189,6 +230,7 @@ GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); +GFS2_ATTR(demote_rq, 0200, NULL, demote_rq_store); static struct attribute *gfs2_attrs[] = { &gfs2_attr_id.attr, @@ -200,6 +242,7 @@ static struct attribute *gfs2_attrs[] = { &gfs2_attr_quota_sync.attr, &gfs2_attr_quota_refresh_user.attr, &gfs2_attr_quota_refresh_group.attr, + &gfs2_attr_demote_rq.attr, NULL, }; -- cgit v1.2.2 From 223b2b889f379dcea9cef722336a57e8b398bc95 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 17 Feb 2009 14:13:35 +0000 Subject: GFS2: Fix alignment issue and tidy gfs2_bitfit An alignment issue with the existing bitfit algorithm was reported on IA64. This patch attempts to fix that, and also to tidy up the code a bit. There is now more documentation about how this works and it has survived a number of different tests. Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 132 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 70 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index a068ac940de1..c0abe698af82 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -131,82 +131,90 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, return cur_state; } +/** + * gfs2_bit_search + * @ptr: Pointer to bitmap data + * @mask: Mask to use (normally 0x55555.... 
but adjusted for search start) + * @state: The state we are searching for + * + * We xor the bitmap data with a patter which is the bitwise opposite + * of what we are looking for, this gives rise to a pattern of ones + * wherever there is a match. Since we have two bits per entry, we + * take this pattern, shift it down by one place and then and it with + * the original. All the even bit positions (0,2,4, etc) then represent + * successful matches, so we mask with 0x55555..... to remove the unwanted + * odd bit positions. + * + * This allows searching of a whole u64 at once (32 blocks) with a + * single test (on 64 bit arches). + */ + +static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) +{ + u64 tmp; + static const u64 search[] = { + [0] = 0xffffffffffffffff, + [1] = 0xaaaaaaaaaaaaaaaa, + [2] = 0x5555555555555555, + [3] = 0x0000000000000000, + }; + tmp = le64_to_cpu(*ptr) ^ search[state]; + tmp &= (tmp >> 1); + tmp &= mask; + return tmp; +} + /** * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing * a block in a given allocation state. * @buffer: the buffer that holds the bitmaps - * @buflen: the length (in bytes) of the buffer + * @len: the length (in bytes) of the buffer * @goal: start search at this block's bit-pair (within @buffer) - * @old_state: GFS2_BLKST_XXX the state of the block we're looking for. + * @state: GFS2_BLKST_XXX the state of the block we're looking for. * * Scope of @goal and returned block number is only within this bitmap buffer, * not entire rgrp or filesystem. @buffer will be offset from the actual - * beginning of a bitmap block buffer, skipping any header structures. + * beginning of a bitmap block buffer, skipping any header structures, but + * headers are always a multiple of 64 bits long so that the buffer is + * always aligned to a 64 bit boundary. 
+ * + * The size of the buffer is in bytes, but is it assumed that it is + * always ok to to read a complete multiple of 64 bits at the end + * of the block in case the end is no aligned to a natural boundary. * * Return: the block number (bitmap buffer scope) that was found */ -static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal, - u8 old_state) +u32 gfs2_bitfit(const u8 *buf, const unsigned int len, u32 goal, u8 state) { - const u8 *byte, *start, *end; - int bit, startbit; - u32 g1, g2, misaligned; - unsigned long *plong; - unsigned long lskipval; - - lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55; - g1 = (goal / GFS2_NBBY); - start = buffer + g1; - byte = start; - end = buffer + buflen; - g2 = ALIGN(g1, sizeof(unsigned long)); - plong = (unsigned long *)(buffer + g2); - startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; - misaligned = g2 - g1; - if (!misaligned) - goto ulong_aligned; -/* parse the bitmap a byte at a time */ -misaligned: - while (byte < end) { - if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) { - return goal + - (((byte - start) * GFS2_NBBY) + - ((bit - startbit) >> 1)); - } - bit += GFS2_BIT_SIZE; - if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) { - bit = 0; - byte++; - misaligned--; - if (!misaligned) { - plong = (unsigned long *)byte; - goto ulong_aligned; - } - } - } - return BFITNOENT; - -/* parse the bitmap a unsigned long at a time */ -ulong_aligned: - /* Stop at "end - 1" or else prefetch can go past the end and segfault. - We could "if" it but we'd lose some of the performance gained. - This way will only slow down searching the very last 4/8 bytes - depending on architecture. I've experimented with several ways - of writing this section such as using an else before the goto - but this one seems to be the fastest. 
*/ - while ((unsigned char *)plong < end - sizeof(unsigned long)) { - prefetch(plong + 1); - if (((*plong) & LBITMASK) != lskipval) - break; - plong++; - } - if ((unsigned char *)plong < end) { - byte = (const u8 *)plong; - misaligned += sizeof(unsigned long) - 1; - goto misaligned; + u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1); + const __le64 *ptr = ((__le64 *)buf) + (goal >> 5); + const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64))); + u64 tmp; + u64 mask = 0x5555555555555555; + u32 bit; + + BUG_ON(state > 3); + + /* Mask off bits we don't care about at the start of the search */ + mask <<= spoint; + tmp = gfs2_bit_search(ptr, mask, state); + ptr++; + while(tmp == 0 && ptr < end) { + tmp = gfs2_bit_search(ptr, 0x5555555555555555, state); + ptr++; } - return BFITNOENT; + /* Mask off any bits which are more than len bytes from the start */ + if (ptr == end && (len & (sizeof(u64) - 1))) + tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1)))); + /* Didn't find anything, so return */ + if (tmp == 0) + return BFITNOENT; + ptr--; + bit = fls64(tmp); + bit--; /* fls64 always adds one to the bit count */ + bit /= 2; /* two bits per entry in the bitmap */ + return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit; } /** -- cgit v1.2.2 From b9a9694570756e689068f0450cf3c570f74b2b01 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 19 Feb 2009 10:32:35 +0000 Subject: GFS2: Support quota/noquota mount arguments This adds support for "quota" and "noquota" mount options in addition to the existing "quota=on/off/account" so that we are compatible with the names by which these options are more generally known. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/mount.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index ee69701a7777..f7e8527a21e0 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -36,6 +36,8 @@ enum { Opt_quota_off, Opt_quota_account, Opt_quota_on, + Opt_quota, + Opt_noquota, Opt_suiddir, Opt_nosuiddir, Opt_data_writeback, @@ -62,6 +64,8 @@ static const match_table_t tokens = { {Opt_quota_off, "quota=off"}, {Opt_quota_account, "quota=account"}, {Opt_quota_on, "quota=on"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, {Opt_suiddir, "suiddir"}, {Opt_nosuiddir, "nosuiddir"}, {Opt_data_writeback, "data=writeback"}, @@ -138,12 +142,14 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) args->ar_posix_acl = 0; break; case Opt_quota_off: + case Opt_noquota: args->ar_quota = GFS2_QUOTA_OFF; break; case Opt_quota_account: args->ar_quota = GFS2_QUOTA_ACCOUNT; break; case Opt_quota_on: + case Opt_quota: args->ar_quota = GFS2_QUOTA_ON; break; case Opt_suiddir: -- cgit v1.2.2 From 075ac44875323941210335b3b0abc1895356d919 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sat, 21 Feb 2009 02:11:42 +0100 Subject: GFS2: fix sparse warnings: constant is so big it is ... 
Fix these sparse warnings: fs/gfs2/rgrp.c:156:23: warning: constant 0xffffffffffffffff is so big it is unsigned long long fs/gfs2/rgrp.c:157:23: warning: constant 0xaaaaaaaaaaaaaaaa is so big it is unsigned long long fs/gfs2/rgrp.c:158:23: warning: constant 0x5555555555555555 is so big it is long long fs/gfs2/rgrp.c:194:20: warning: constant 0x5555555555555555 is so big it is long long fs/gfs2/rgrp.c:204:44: warning: constant 0x5555555555555555 is so big it is long long Signed-off-by: Hannes Eder Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c0abe698af82..34691d75819a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -153,10 +153,10 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) { u64 tmp; static const u64 search[] = { - [0] = 0xffffffffffffffff, - [1] = 0xaaaaaaaaaaaaaaaa, - [2] = 0x5555555555555555, - [3] = 0x0000000000000000, + [0] = 0xffffffffffffffffULL, + [1] = 0xaaaaaaaaaaaaaaaaULL, + [2] = 0x5555555555555555ULL, + [3] = 0x0000000000000000ULL, }; tmp = le64_to_cpu(*ptr) ^ search[state]; tmp &= (tmp >> 1); @@ -191,7 +191,7 @@ u32 gfs2_bitfit(const u8 *buf, const unsigned int len, u32 goal, u8 state) const __le64 *ptr = ((__le64 *)buf) + (goal >> 5); const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64))); u64 tmp; - u64 mask = 0x5555555555555555; + u64 mask = 0x5555555555555555ULL; u32 bit; BUG_ON(state > 3); @@ -201,7 +201,7 @@ u32 gfs2_bitfit(const u8 *buf, const unsigned int len, u32 goal, u8 state) tmp = gfs2_bit_search(ptr, mask, state); ptr++; while(tmp == 0 && ptr < end) { - tmp = gfs2_bit_search(ptr, 0x5555555555555555, state); + tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state); ptr++; } /* Mask off any bits which are more than len bytes from the start */ -- cgit v1.2.2 From 02ab1721591f7ac1f632fc74b301513bd6f5849f Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sat,
21 Feb 2009 02:12:05 +0100 Subject: GFS2: fix sparse warning: Should it be static? Impact: Make symbol static. Fix this sparse warning: fs/gfs2/rgrp.c:188:5: warning: symbol 'gfs2_bitfit' was not declared. Should it be static? Signed-off-by: Hannes Eder Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 34691d75819a..f03d024038ea 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -185,7 +185,8 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) * Return: the block number (bitmap buffer scope) that was found */ -u32 gfs2_bitfit(const u8 *buf, const unsigned int len, u32 goal, u8 state) +static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, + u32 goal, u8 state) { u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1); const __le64 *ptr = ((__le64 *)buf) + (goal >> 5); -- cgit v1.2.2 From 229615def3f573fc448d20f62c6ec1bc9340cefb Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Tue, 3 Mar 2009 11:45:20 +0900 Subject: GFS2: Pagecache usage optimization on GFS2 I introduced "is_partially_uptodate" aops for GFS2. A page can have multiple buffers and even if a page is not uptodate, some buffers can be uptodate on pagesize != blocksize environment. This aops checks that all buffers which correspond to a part of a file that we want to read are uptodate. If so, we do not have to issue actual read IO to HDD even if a page is not uptodate because the portion we want to read are uptodate. "block_is_partially_uptodate" function is already used by ext2/3/4. With the following patch random read/write mixed workloads or random read after random write workloads can be optimized and we can get performance improvement. I did a performance test using the sysbench. 
#sysbench --num-threads=16 --max-requests=200000 --test=fileio --file-num=1 --file-block-size=8K --file-total-size=2G --file-test-mode=rndrw --file-fsync-freq=0 --file-rw-ratio=1 run -2.6.29-rc6 Test execution summary: total time: 202.6389s total number of events: 200000 total time taken by event execution: 2580.0480 per-request statistics: min: 0.0000s avg: 0.0129s max: 49.5852s approx. 95 percentile: 0.0462s -2.6.29-rc6-patched Test execution summary: total time: 177.8639s total number of events: 200000 total time taken by event execution: 2419.0199 per-request statistics: min: 0.0000s avg: 0.0121s max: 52.4306s approx. 95 percentile: 0.0444s arch: ia64 pagesize: 16k blocksize: 4k Signed-off-by: Hisashi Hifumi Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_address.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index a6d00e8ffe10..a6dde1751e17 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -1096,6 +1096,7 @@ static const struct address_space_operations gfs2_writeback_aops = { .releasepage = gfs2_releasepage, .direct_IO = gfs2_direct_IO, .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations gfs2_ordered_aops = { @@ -1111,6 +1112,7 @@ static const struct address_space_operations gfs2_ordered_aops = { .releasepage = gfs2_releasepage, .direct_IO = gfs2_direct_IO, .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations gfs2_jdata_aops = { @@ -1125,6 +1127,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, }; void gfs2_set_aops(struct inode *inode) -- cgit v1.2.2 From 02ffad08e838997fad3de05c85560a57e5fd92de Mon Sep 17 00:00:00 2001 From: Benjamin 
Marzinski Date: Fri, 6 Mar 2009 10:03:20 -0600 Subject: GFS2: Fix locking bug in failed shared to exclusive conversion After calling out to the dlm, GFS2 sets the new state of a glock to gl_target in gdlm_ast(). However, gl_target is not always the lock state that was requested. If a conversion from shared to exclusive fails, finish_xmote() will call do_xmote() with LM_ST_UNLOCKED, instead of gl->gl_target, so that it can reacquire the lock in exclusive mode the next time around. In this case, setting the lock to gl_target in gdlm_ast() will make GFS2 think that it has the glock in exclusive mode, when really, it doesn't have the glock locked at all. This patch adds a new field to the gfs2_glock structure, gl_req, to track the mode that was requested. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/lock_dlm.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 980a0864ca6c..399d1b978049 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -203,6 +203,7 @@ struct gfs2_glock { unsigned int gl_target; unsigned int gl_reply; unsigned int gl_hash; + unsigned int gl_req; unsigned int gl_demote_state; /* state requested by remote node */ unsigned long gl_demote_time; /* time of first demote request */ struct list_head gl_holders; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index a0bb7d2251a0..46df988323bc 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -46,11 +46,11 @@ static void gdlm_ast(void *arg) BUG(); } - ret = gl->gl_target; + ret = gl->gl_req; if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { - if (gl->gl_target == LM_ST_SHARED) + if (gl->gl_req == LM_ST_SHARED) ret = LM_ST_DEFERRED; - else if (gl->gl_target == LM_ST_DEFERRED) + else if (gl->gl_req == LM_ST_DEFERRED) ret = LM_ST_SHARED; else BUG(); @@ -147,6 +147,7 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl, int req; u32 lkf; + gl->gl_req = req_state; req
= make_mode(req_state); lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); -- cgit v1.2.2 From 6bac243f0793499782267342eba852a8a6cc7ac4 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 9 Mar 2009 09:03:51 +0000 Subject: GFS2: Clean up of glops.c This cleans up a number of bits of code mostly based in glops.c. A couple of simple functions have been merged into the callers to make it more obvious what is going on, the mysterious raising of i_writecount around the truncate_inode_pages() call has been removed. The meta_go_* operations have been renamed rgrp_go_* since that is the only lock type that they are used with. The unused argument of gfs2_read_sb has been removed. Also a bug has been fixed where a check for the rindex inode was in the wrong callback. More comments are added, and the debugging code is improved too. Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 115 +++++++++++++++++++++++---------------------------- fs/gfs2/meta_io.c | 21 ---------- fs/gfs2/meta_io.h | 1 - fs/gfs2/ops_file.c | 3 +- fs/gfs2/ops_fstype.c | 6 +-- 5 files changed, 57 insertions(+), 89 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f34bc7093dd1..bf23a62aa925 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -76,29 +76,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) } /** - * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock - * @gl: the glock - * - */ - -static void gfs2_pte_inval(struct gfs2_glock *gl) -{ - struct gfs2_inode *ip; - struct inode *inode; - - ip = gl->gl_object; - inode = &ip->i_inode; - if (!ip || !S_ISREG(inode->i_mode)) - return; - - unmap_shared_mapping_range(inode->i_mapping, 0, 0); - if (test_bit(GIF_SW_PAGED, &ip->i_flags)) - set_bit(GLF_DIRTY, &gl->gl_flags); - -} - -/** - * meta_go_sync - sync out the metadata for this glock + * rgrp_go_sync - sync out the metadata for this glock * @gl: the glock * * Called when demoting or unlocking an EX glock. 
We must flush @@ -106,36 +84,42 @@ static void gfs2_pte_inval(struct gfs2_glock *gl) * not return to caller to demote/unlock the glock until I/O is complete. */ -static void meta_go_sync(struct gfs2_glock *gl) +static void rgrp_go_sync(struct gfs2_glock *gl) { - if (gl->gl_state != LM_ST_EXCLUSIVE) + struct address_space *metamapping = gl->gl_aspace->i_mapping; + int error; + + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; + BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); - if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) { - gfs2_log_flush(gl->gl_sbd, gl); - gfs2_meta_sync(gl); - gfs2_ail_empty_gl(gl); - } + gfs2_log_flush(gl->gl_sbd, gl); + filemap_fdatawrite(metamapping); + error = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, error); + gfs2_ail_empty_gl(gl); } /** - * meta_go_inval - invalidate the metadata for this glock + * rgrp_go_inval - invalidate the metadata for this glock * @gl: the glock * @flags: * + * We never used LM_ST_DEFERRED with resource groups, so that we + * should always see the metadata flag set here. 
+ * */ -static void meta_go_inval(struct gfs2_glock *gl, int flags) +static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { - if (!(flags & DIO_METADATA)) - return; + struct address_space *mapping = gl->gl_aspace->i_mapping; - gfs2_meta_inval(gl); - if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex)) - gl->gl_sbd->sd_rindex_uptodate = 0; - else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) { - struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; + BUG_ON(!(flags & DIO_METADATA)); + gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + truncate_inode_pages(mapping, 0); + if (gl->gl_object) { + struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; } } @@ -152,48 +136,54 @@ static void inode_go_sync(struct gfs2_glock *gl) struct address_space *metamapping = gl->gl_aspace->i_mapping; int error; - if (gl->gl_state != LM_ST_UNLOCKED) - gfs2_pte_inval(gl); - if (gl->gl_state != LM_ST_EXCLUSIVE) - return; - if (ip && !S_ISREG(ip->i_inode.i_mode)) ip = NULL; + if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return; - if (test_bit(GLF_DIRTY, &gl->gl_flags)) { - gfs2_log_flush(gl->gl_sbd, gl); - filemap_fdatawrite(metamapping); - if (ip) { - struct address_space *mapping = ip->i_inode.i_mapping; - filemap_fdatawrite(mapping); - error = filemap_fdatawait(mapping); - mapping_set_error(mapping, error); - } - error = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, error); - clear_bit(GLF_DIRTY, &gl->gl_flags); - gfs2_ail_empty_gl(gl); + BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(gl->gl_sbd, gl); + filemap_fdatawrite(metamapping); + if (ip) { + struct address_space *mapping = ip->i_inode.i_mapping; + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + mapping_set_error(mapping, error); } + error = filemap_fdatawait(metamapping); 
+ mapping_set_error(metamapping, error); + gfs2_ail_empty_gl(gl); } /** * inode_go_inval - prepare a inode glock to be released * @gl: the glock * @flags: + * + * Normally we invlidate everything, but if we are moving into + * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we + * can keep hold of the metadata, since it won't have changed. * */ static void inode_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_inode *ip = gl->gl_object; - int meta = (flags & DIO_METADATA); - if (meta) { - gfs2_meta_inval(gl); + gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + + if (flags & DIO_METADATA) { + struct address_space *mapping = gl->gl_aspace->i_mapping; + truncate_inode_pages(mapping, 0); if (ip) set_bit(GIF_INVALID, &ip->i_flags); } + if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) + gl->gl_sbd->sd_rindex_uptodate = 0; if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); } @@ -395,7 +385,6 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) } const struct gfs2_glock_operations gfs2_meta_glops = { - .go_xmote_th = meta_go_sync, .go_type = LM_TYPE_META, }; @@ -410,8 +399,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = { }; const struct gfs2_glock_operations gfs2_rgrp_glops = { - .go_xmote_th = meta_go_sync, - .go_inval = meta_go_inval, + .go_xmote_th = rgrp_go_sync, + .go_inval = rgrp_go_inval, .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, .go_unlock = rgrp_go_unlock, diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 870d65ae7ae2..8d6f13256b26 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -88,27 +88,6 @@ void gfs2_aspace_put(struct inode *aspace) iput(aspace); } -/** - * gfs2_meta_inval - Invalidate all buffers associated with a glock - * @gl: the glock - * - */ - -void gfs2_meta_inval(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - struct inode *aspace = gl->gl_aspace; - struct address_space *mapping = gl->gl_aspace->i_mapping; - - 
gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); - - atomic_inc(&aspace->i_writecount); - truncate_inode_pages(mapping, 0); - atomic_dec(&aspace->i_writecount); - - gfs2_assert_withdraw(sdp, !mapping->nrpages); -} - /** * gfs2_meta_sync - Sync all buffers associated with a glock * @gl: The glock diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index b1a5f3674d43..de270c2f9b63 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -40,7 +40,6 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); void gfs2_aspace_put(struct inode *aspace); -void gfs2_meta_inval(struct gfs2_glock *gl); void gfs2_meta_sync(struct gfs2_glock *gl); struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 99d726f1c7a6..48ec3d5e29eb 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -355,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (ret) goto out; - set_bit(GIF_SW_PAGED, &ip->i_flags); ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); if (ret || !alloc_required) goto out_unlock; @@ -396,6 +395,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out_unlock_page; } ret = gfs2_allocate_page_backing(page); + if (!ret) + set_bit(GIF_SW_PAGED, &ip->i_flags); out_unlock_page: unlock_page(page); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 804ca7273a49..51883b3ad89c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -296,15 +296,15 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) __free_page(page); return 0; } + /** * gfs2_read_sb - Read super block * @sdp: The GFS2 superblock - * @gl: the glock for the superblock (assumed to be held) * @silent: Don't print message if mount fails * */ -static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) +static int 
gfs2_read_sb(struct gfs2_sbd *sdp, int silent) { u32 hash_blocks, ind_blocks, leaf_blocks; u32 tmp_blocks; @@ -524,7 +524,7 @@ static int init_sb(struct gfs2_sbd *sdp, int silent) return ret; } - ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); + ret = gfs2_read_sb(sdp, silent); if (ret) { fs_err(sdp, "can't read superblock: %d\n", ret); goto out; -- cgit v1.2.2 From 9c538837d844574787c95bd5665f684559fb7065 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 19 Mar 2009 13:15:44 +0000 Subject: Fix a minor bug in the previous patch The logic requires that we mark the glock dirty in page_mkwrite otherwise we might not flush correctly in the case that no allocation was required in the process of dirtying the page. Also we need to set the shared write flag early for the same reason. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 48ec3d5e29eb..3b9e8de3500b 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -355,6 +355,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (ret) goto out; + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); if (ret || !alloc_required) goto out_unlock; @@ -395,8 +398,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out_unlock_page; } ret = gfs2_allocate_page_backing(page); - if (!ret) - set_bit(GIF_SW_PAGED, &ip->i_flags); out_unlock_page: unlock_page(page); -- cgit v1.2.2 From df3647b24510e23523f81a77bb179cd9ae3d613b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 23 Mar 2009 11:38:55 +0000 Subject: GFS2: Fix freeze issue This removes some old code that was causing issues during filesystem freeze.
Reported-by: Andrew Price Tested-by: Andrew Price Signed-off-by: Steven Whitehouse --- fs/gfs2/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 7cf302b135ce..601913e0a482 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -338,7 +338,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, struct gfs2_holder *t_gh) { struct gfs2_inode *ip; - struct gfs2_holder ji_gh; struct gfs2_jdesc *jd; struct lfcc *lfcc; LIST_HEAD(list); @@ -386,7 +385,6 @@ out: gfs2_glock_dq_uninit(&lfcc->gh); kfree(lfcc); } - gfs2_glock_dq_uninit(&ji_gh); return error; } -- cgit v1.2.2 From 8231f2f99a5e5fc45a25e8de09fd1ab9711babf1 Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Wed, 14 Jan 2009 15:45:13 +0800 Subject: SYSFS: use standard magic.h for sysfs SYSFS_MAGIC has been added into magic.h, so only use that definition in magic.h to avoid potential consistency problem. Signed-off-by: Qinghuang Feng Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index ab343e371d64..8133ca36ee0e 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -17,11 +17,10 @@ #include #include #include +#include #include "sysfs.h" -/* Random magic number */ -#define SYSFS_MAGIC 0x62656572 static struct vfsmount *sysfs_mount; struct super_block * sysfs_sb = NULL; -- cgit v1.2.2 From 4a67a1bc0b3a0db017b560cee27370d141c58e25 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 21 Jan 2009 11:55:11 -0800 Subject: sysfs: Take sysfs_mutex when fetching the root inode. sysfs_get_inode ultimately calls sysfs_count_nlink when a directory inode is fetched. sysfs_count_nlink needs to be called under the sysfs_mutex to guard against the unlikely but possible scenario that the root directory is changing as we are counting the number of entries in it, and just in general to be consistent. Signed-off-by: Eric W.
Biederman Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8133ca36ee0e..84ef378673a8 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -52,7 +52,9 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) sysfs_sb = sb; /* get root inode, initialize and unlock it */ + mutex_lock(&sysfs_mutex); inode = sysfs_get_inode(&sysfs_root); + mutex_unlock(&sysfs_mutex); if (!inode) { pr_debug("sysfs: could not get root inode\n"); return -ENOMEM; -- cgit v1.2.2 From 425cb02912d1095febfeaf8d379af7b2ac9e4a89 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Thu, 12 Feb 2009 10:56:59 -0700 Subject: sysfs: sysfs_add_one WARNs with full path to duplicate filename sysfs: sysfs_add_one WARNs with full path to duplicate filename As a debugging aid, it can be useful to know the full path to a duplicate file being created in sysfs. We now will display warnings such as: sysfs: cannot create duplicate filename '/foo' when attempting to create multiple files named 'foo' in the sysfs root, or: sysfs: cannot create duplicate filename '/bus/pci/slots/5/foo' when attempting to create multiple files named 'foo' under a given directory in sysfs. The path displayed is always a relative path to sysfs_root. The leading '/' in the path name refers to the sysfs_root mount point, and should not be confused with the "real" '/'. Thanks to Alex Williamson for essentially writing sysfs_pathname. 
Cc: Alex Williamson Signed-off-by: Alex Chiang Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 82d3b79d0e08..f13d852ab3c1 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -433,6 +433,26 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) return 0; } +/** + * sysfs_pathname - return full path to sysfs dirent + * @sd: sysfs_dirent whose path we want + * @path: caller allocated buffer + * + * Gives the name "/" to the sysfs_root entry; any path returned + * is relative to wherever sysfs is mounted. + * + * XXX: does no error checking on @path size + */ +static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) +{ + if (sd->s_parent) { + sysfs_pathname(sd->s_parent, path); + strcat(path, "/"); + } + strcat(path, sd->s_name); + return path; +} + /** * sysfs_add_one - add sysfs_dirent to parent * @acxt: addrm context to use @@ -458,8 +478,16 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) int ret; ret = __sysfs_add_one(acxt, sd); - WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' " - "can not be created\n", sd->s_name); + if (ret == -EEXIST) { + char *path = kzalloc(PATH_MAX, GFP_KERNEL); + WARN(1, KERN_WARNING + "sysfs: cannot create duplicate filename '%s'\n", + (path == NULL) ? sd->s_name : + strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"), + sd->s_name)); + kfree(path); + } + return ret; } -- cgit v1.2.2 From 04256b4a8fc73f54cd14f20867882c299728a446 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 11 Feb 2009 13:20:23 -0800 Subject: sysfs: reference sysfs_dirent from sysfs inodes The sysfs_dirent serves as both an inode and a directory entry for sysfs. To prevent the sysfs inode numbers from being freed prematurely hold a reference to sysfs_dirent from the sysfs inode. 
[akpm@linux-foundation.org: add comment] Signed-off-by: Eric W. Biederman Cc: Tejun Heo Cc: Al Viro Cc: Cornelia Huck Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/inode.c | 17 +++++++++++++++++ fs/sysfs/mount.c | 1 + fs/sysfs/sysfs.h | 1 + 3 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index dfa3d94cfc74..555f0ff988df 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -147,6 +147,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; + inode->i_private = sysfs_get(sd); inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; @@ -214,6 +215,22 @@ struct inode * sysfs_get_inode(struct sysfs_dirent *sd) return inode; } +/* + * The sysfs_dirent serves as both an inode and a directory entry for sysfs. + * To prevent the sysfs inode numbers from being freed prematurely we take a + * reference to sysfs_dirent from the sysfs inode. A + * super_operations.delete_inode() implementation is needed to drop that + * reference upon inode destruction. 
+ */ +void sysfs_delete_inode(struct inode *inode) +{ + struct sysfs_dirent *sd = inode->i_private; + + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + sysfs_put(sd); +} + int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) { struct sysfs_addrm_cxt acxt; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 84ef378673a8..49749955ccaf 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -29,6 +29,7 @@ struct kmem_cache *sysfs_dir_cachep; static const struct super_operations sysfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, + .delete_inode = sysfs_delete_inode, }; struct sysfs_dirent sysfs_root = { diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 93c6d6b27c4d..9055d04e4ab0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -145,6 +145,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) * inode.c */ struct inode *sysfs_get_inode(struct sysfs_dirent *sd); +void sysfs_delete_inode(struct inode *inode); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); int sysfs_inode_init(void); -- cgit v1.2.2 From e0edd3c65aa5b53e20280565a7ce11675eb7ed6b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Mar 2009 11:57:20 -0800 Subject: sysfs: don't block indefinitely for unmapped files. Modify sysfs bin files so that we can remove the bin file while they are still mapped. When the kobject is removed we unmap the bin file and arrange for future accesses to the mapping to receive SIGBUS. Implementing this prevents a nasty DOS when pci devices are hot plugged and unplugged. Where if any of their resources were mmaped the kernel could not free up their pci resources or release their pci data structures. [akpm@linux-foundation.org: remove unused var] Signed-off-by: Eric W. 
Biederman Cc: Jesse Barnes Acked-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/bin.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/sysfs/dir.c | 1 + fs/sysfs/sysfs.h | 2 + 3 files changed, 174 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index f2c478c3424e..96cc2bf6a84e 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -21,15 +21,28 @@ #include #include #include +#include #include #include "sysfs.h" +/* + * There's one bin_buffer for each open file. + * + * filp->private_data points to bin_buffer and + * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s + * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock + */ +static DEFINE_MUTEX(sysfs_bin_lock); + struct bin_buffer { - struct mutex mutex; - void *buffer; - int mmapped; + struct mutex mutex; + void *buffer; + int mmapped; + struct vm_operations_struct *vm_ops; + struct file *file; + struct hlist_node list; }; static int @@ -168,29 +181,148 @@ out_free: return count; } +static void bin_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + + if (!bb->vm_ops || !bb->vm_ops->open) + return; + + if (!sysfs_get_active_two(attr_sd)) + return; + + bb->vm_ops->open(vma); + + sysfs_put_active_two(attr_sd); +} + +static void bin_vma_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + + if (!bb->vm_ops || !bb->vm_ops->close) + return; + + if (!sysfs_get_active_two(attr_sd)) + return; + + bb->vm_ops->close(vma); + + sysfs_put_active_two(attr_sd); +} + +static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + 
struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops || !bb->vm_ops->fault) + return VM_FAULT_SIGBUS; + + if (!sysfs_get_active_two(attr_sd)) + return VM_FAULT_SIGBUS; + + ret = bb->vm_ops->fault(vma, vmf); + + sysfs_put_active_two(attr_sd); + return ret; +} + +static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops || !bb->vm_ops->page_mkwrite) + return -EINVAL; + + if (!sysfs_get_active_two(attr_sd)) + return -EINVAL; + + ret = bb->vm_ops->page_mkwrite(vma, page); + + sysfs_put_active_two(attr_sd); + return ret; +} + +static int bin_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops || !bb->vm_ops->access) + return -EINVAL; + + if (!sysfs_get_active_two(attr_sd)) + return -EINVAL; + + ret = bb->vm_ops->access(vma, addr, buf, len, write); + + sysfs_put_active_two(attr_sd); + return ret; +} + +static struct vm_operations_struct bin_vm_ops = { + .open = bin_vma_open, + .close = bin_vma_close, + .fault = bin_fault, + .page_mkwrite = bin_page_mkwrite, + .access = bin_access, +}; + static int mmap(struct file *file, struct vm_area_struct *vma) { struct bin_buffer *bb = file->private_data; struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + struct vm_operations_struct *vm_ops; int rc; mutex_lock(&bb->mutex); /* need attr_sd for attr, its parent for kobj */ + rc = -ENODEV; if (!sysfs_get_active_two(attr_sd)) - return -ENODEV; + goto out_unlock; rc = -EINVAL; - if (attr->mmap) - rc = attr->mmap(kobj, 
attr, vma); + if (!attr->mmap) + goto out_put; - if (rc == 0 && !bb->mmapped) - bb->mmapped = 1; - else - sysfs_put_active_two(attr_sd); + rc = attr->mmap(kobj, attr, vma); + vm_ops = vma->vm_ops; + vma->vm_ops = &bin_vm_ops; + if (rc) + goto out_put; + rc = -EINVAL; + if (bb->mmapped && bb->vm_ops != vma->vm_ops) + goto out_put; + +#ifdef CONFIG_NUMA + rc = -EINVAL; + if (vm_ops && ((vm_ops->set_policy || vm_ops->get_policy || vm_ops->migrate))) + goto out_put; +#endif + + rc = 0; + bb->mmapped = 1; + bb->vm_ops = vm_ops; +out_put: + sysfs_put_active_two(attr_sd); +out_unlock: mutex_unlock(&bb->mutex); return rc; @@ -223,8 +355,13 @@ static int open(struct inode * inode, struct file * file) goto err_out; mutex_init(&bb->mutex); + bb->file = file; file->private_data = bb; + mutex_lock(&sysfs_bin_lock); + hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers); + mutex_unlock(&sysfs_bin_lock); + /* open succeeded, put active references */ sysfs_put_active_two(attr_sd); return 0; @@ -237,11 +374,12 @@ static int open(struct inode * inode, struct file * file) static int release(struct inode * inode, struct file * file) { - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_buffer *bb = file->private_data; - if (bb->mmapped) - sysfs_put_active_two(attr_sd); + mutex_lock(&sysfs_bin_lock); + hlist_del(&bb->list); + mutex_unlock(&sysfs_bin_lock); + kfree(bb->buffer); kfree(bb); return 0; @@ -256,6 +394,26 @@ const struct file_operations bin_fops = { .release = release, }; + +void unmap_bin_file(struct sysfs_dirent *attr_sd) +{ + struct bin_buffer *bb; + struct hlist_node *tmp; + + if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR) + return; + + mutex_lock(&sysfs_bin_lock); + + hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) { + struct inode *inode = bb->file->f_path.dentry->d_inode; + + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + + mutex_unlock(&sysfs_bin_lock); +} + /** * sysfs_create_bin_file - create binary file for 
object. * @kobj: object. diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f13d852ab3c1..66aeb4fff0c3 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -609,6 +609,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) sysfs_drop_dentry(sd); sysfs_deactivate(sd); + unmap_bin_file(sd); sysfs_put(sd); } } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 9055d04e4ab0..3fa0d98481e2 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -28,6 +28,7 @@ struct sysfs_elem_attr { struct sysfs_elem_bin_attr { struct bin_attribute *bin_attr; + struct hlist_head buffers; }; /* @@ -164,6 +165,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, * bin.c */ extern const struct file_operations bin_fops; +void unmap_bin_file(struct sysfs_dirent *attr_sd); /* * symlink.c -- cgit v1.2.2 From f67f129e519fa87f8ebd236b6336fe43f31ee141 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Mar 2009 21:10:49 +0800 Subject: Driver core: implement uevent suppress in kobject This patch implements uevent suppress in kobject and removes it from struct device, based on the following ideas: 1,Uevent sending should be one attribute of kobject, so suppressing it in kobject layer is more natural than in device layer. By this way, we can do it for other objects embedded with kobject. 2,It may save several bytes for each instance of struct device.(On my omap3(32bit ARM) based box, can save 8bytes per device object) This patch also introduces dev_set|get_uevent_suppress() helpers to set and query uevent_suppress attribute in case to help kobject as private part of struct device in future. [This version is against the latest driver-core patch set of Greg,please ignore the last version.] 
Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/partitions/check.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6d720243f5f4..38e337d51ced 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -400,7 +400,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ - pdev->uevent_suppress = 1; + dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); if (err) goto out_put; @@ -410,7 +410,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p->holder_dir) goto out_del; - pdev->uevent_suppress = 0; + dev_set_uevent_suppress(pdev, 0); if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(pdev, &dev_attr_whole_disk); if (err) @@ -422,7 +422,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk supresses it */ - if (!ddev->uevent_suppress) + if (!dev_get_uevent_suppress(pdev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); return p; @@ -455,7 +455,7 @@ void register_disk(struct gendisk *disk) dev_set_name(ddev, disk->disk_name); /* delay uevents, until we scanned partition table */ - ddev->uevent_suppress = 1; + dev_set_uevent_suppress(ddev, 1); if (device_add(ddev)) return; @@ -490,7 +490,7 @@ void register_disk(struct gendisk *disk) exit: /* announce disk after possible partitions are created */ - ddev->uevent_suppress = 0; + dev_set_uevent_suppress(ddev, 0); kobject_uevent(&ddev->kobj, KOBJ_ADD); /* announce possible partitions */ -- cgit v1.2.2 From 669420644c79c207f83fdf9105ae782867e2991f Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 13 Mar 2009 12:07:36 -0600 Subject: sysfs: only allow one scheduled removal callback per kobj The only way for a sysfs attribute to remove itself (without deadlock) is to use the 
sysfs_schedule_callback() interface. Vegard Nossum discovered that a poorly written sysfs ->store callback can repeatedly schedule remove callbacks on the same device over and over, e.g. $ while true ; do echo 1 > /sys/devices/.../remove ; done If the 'remove' attribute uses the sysfs_schedule_callback API and also does not protect itself from concurrent accesses, its callback handler will be called multiple times, and will eventually attempt to perform operations on a freed kobject, leading to many problems. Instead of requiring all callers of sysfs_schedule_callback to implement their own synchronization, provide the protection in the infrastructure. Now, sysfs_schedule_callback will only allow one scheduled callback per kobject. On subsequent calls with the same kobject, return -EAGAIN. This is a short term fix. The long term fix is to allow sysfs attributes to remove themselves directly, without any of this callback hokey pokey. [cornelia.huck@de.ibm.com: s390 ccwgroup bits] Reported-by: vegard.nossum@gmail.com Signed-off-by: Alex Chiang Acked-by: Cornelia Huck Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1f4a3f877262..289c43a47263 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -659,13 +659,16 @@ void sysfs_remove_file_from_group(struct kobject *kobj, EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); struct sysfs_schedule_callback_struct { - struct kobject *kobj; + struct list_head workq_list; + struct kobject *kobj; void (*func)(void *); void *data; struct module *owner; struct work_struct work; }; +static DEFINE_MUTEX(sysfs_workq_mutex); +static LIST_HEAD(sysfs_workq); static void sysfs_schedule_callback_work(struct work_struct *work) { struct sysfs_schedule_callback_struct *ss = container_of(work, @@ -674,6 +677,9 @@ static void sysfs_schedule_callback_work(struct work_struct *work) 
(ss->func)(ss->data); kobject_put(ss->kobj); module_put(ss->owner); + mutex_lock(&sysfs_workq_mutex); + list_del(&ss->workq_list); + mutex_unlock(&sysfs_workq_mutex); kfree(ss); } @@ -695,15 +701,25 @@ static void sysfs_schedule_callback_work(struct work_struct *work) * until @func returns. * * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available. + * be allocated, -ENODEV if a reference to @owner isn't available, + * -EAGAIN if a callback has already been scheduled for @kobj. */ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), void *data, struct module *owner) { - struct sysfs_schedule_callback_struct *ss; + struct sysfs_schedule_callback_struct *ss, *tmp; if (!try_module_get(owner)) return -ENODEV; + + mutex_lock(&sysfs_workq_mutex); + list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) + if (ss->kobj == kobj) { + mutex_unlock(&sysfs_workq_mutex); + return -EAGAIN; + } + mutex_unlock(&sysfs_workq_mutex); + ss = kmalloc(sizeof(*ss), GFP_KERNEL); if (!ss) { module_put(owner); @@ -715,6 +731,10 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), ss->data = data; ss->owner = owner; INIT_WORK(&ss->work, sysfs_schedule_callback_work); + INIT_LIST_HEAD(&ss->workq_list); + mutex_lock(&sysfs_workq_mutex); + list_add_tail(&ss->workq_list, &sysfs_workq); + mutex_unlock(&sysfs_workq_mutex); schedule_work(&ss->work); return 0; } -- cgit v1.2.2 From 095160aee954688a9bad225952c4bee546541e19 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Mar 2009 01:41:27 +0000 Subject: sysfs: fix some bin_vm_ops errors Commit 86c9508eb1c0ce5aa07b5cf1d36b60c54efc3d7a "sysfs: don't block indefinitely for unmapped files" in linux-next crashes the PowerMac G5 when X starts up. 
It's caught out by the way powerpc's pci_mmap of legacy_mem uses shmem_zero_setup(), substituting a new vma->vm_file whose private_data no longer points to the bin_buffer (substitution done because some versions of X crash if that mmap fails). The fix to this is straightforward: the original vm_file is fput() in that case, so this mmap won't block sysfs at all, so just don't switch over to bin_vm_ops if vm_file has changed. But more fixes made before realizing that was the problem:- It should not be an error if bin_page_mkwrite() finds no underlying page_mkwrite(). Check that a file already mmap'ed has the same underlying vm_ops _before_ pointing vma->vm_ops at bin_vm_ops. If the file being mmap'ed is a shmem/tmpfs file, don't fail the mmap on CONFIG_NUMA=y, just because that has a set_policy and get_policy: provide bin_set_policy, bin_get_policy and bin_migrate. Signed-off-by: Hugh Dickins Acked-by: Eric Biederman Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/bin.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 79 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 96cc2bf6a84e..07703d3ff4a1 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -241,9 +241,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page) struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; int ret; - if (!bb->vm_ops || !bb->vm_ops->page_mkwrite) + if (!bb->vm_ops) return -EINVAL; + if (!bb->vm_ops->page_mkwrite) + return 0; + if (!sysfs_get_active_two(attr_sd)) return -EINVAL; @@ -273,12 +276,78 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr, return ret; } +#ifdef CONFIG_NUMA +static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops || 
!bb->vm_ops->set_policy) + return 0; + + if (!sysfs_get_active_two(attr_sd)) + return -EINVAL; + + ret = bb->vm_ops->set_policy(vma, new); + + sysfs_put_active_two(attr_sd); + return ret; +} + +static struct mempolicy *bin_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct mempolicy *pol; + + if (!bb->vm_ops || !bb->vm_ops->get_policy) + return vma->vm_policy; + + if (!sysfs_get_active_two(attr_sd)) + return vma->vm_policy; + + pol = bb->vm_ops->get_policy(vma, addr); + + sysfs_put_active_two(attr_sd); + return pol; +} + +static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, + const nodemask_t *to, unsigned long flags) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops || !bb->vm_ops->migrate) + return 0; + + if (!sysfs_get_active_two(attr_sd)) + return 0; + + ret = bb->vm_ops->migrate(vma, from, to, flags); + + sysfs_put_active_two(attr_sd); + return ret; +} +#endif + static struct vm_operations_struct bin_vm_ops = { .open = bin_vma_open, .close = bin_vma_close, .fault = bin_fault, .page_mkwrite = bin_page_mkwrite, .access = bin_access, +#ifdef CONFIG_NUMA + .set_policy = bin_set_policy, + .get_policy = bin_get_policy, + .migrate = bin_migrate, +#endif }; static int mmap(struct file *file, struct vm_area_struct *vma) @@ -287,7 +356,6 @@ static int mmap(struct file *file, struct vm_area_struct *vma) struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct vm_operations_struct *vm_ops; int rc; mutex_lock(&bb->mutex); @@ -302,24 +370,25 @@ static int mmap(struct file *file, struct vm_area_struct *vma) goto out_put; rc = 
attr->mmap(kobj, attr, vma); - vm_ops = vma->vm_ops; - vma->vm_ops = &bin_vm_ops; if (rc) goto out_put; - rc = -EINVAL; - if (bb->mmapped && bb->vm_ops != vma->vm_ops) + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) goto out_put; -#ifdef CONFIG_NUMA rc = -EINVAL; - if (vm_ops && ((vm_ops->set_policy || vm_ops->get_policy || vm_ops->migrate))) + if (bb->mmapped && bb->vm_ops != vma->vm_ops) goto out_put; -#endif rc = 0; bb->mmapped = 1; - bb->vm_ops = vm_ops; + bb->vm_ops = vma->vm_ops; + vma->vm_ops = &bin_vm_ops; out_put: sysfs_put_active_two(attr_sd); out_unlock: -- cgit v1.2.2 From 11ff6f05f1e836a6a02369a4c4b64757e484adc1 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Mar 2009 17:32:14 +0000 Subject: Allow relatime to update atime once a day Allow atime to be updated once per day even with relatime. This lets utilities like tmpreaper (which delete files based on last access time) continue working, making relatime a plausible default for distributions. Signed-off-by: Matthew Garrett Reviewed-by: Matthew Wilcox Acked-by: Valerie Aurora Henson Acked-by: Alan Cox Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- fs/inode.c | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 826fb0b9d1c3..6ac0cef6c5f5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1290,6 +1290,40 @@ sector_t bmap(struct inode * inode, sector_t block) } EXPORT_SYMBOL(bmap); +/* + * With relative atime, only update atime if the previous atime is + * earlier than either the ctime or mtime or if at least a day has + * passed since the last atime update. 
+ */ +static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, + struct timespec now) +{ + + if (!(mnt->mnt_flags & MNT_RELATIME)) + return 1; + /* + * Is mtime younger than atime? If yes, update atime: + */ + if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) + return 1; + /* + * Is ctime younger than atime? If yes, update atime: + */ + if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) + return 1; + + /* + * Is the previous atime value older than a day? If yes, + * update atime: + */ + if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) + return 1; + /* + * Good, we can skip the atime update: + */ + return 0; +} + /** * touch_atime - update the access time * @mnt: mount the inode is accessed on @@ -1317,17 +1351,12 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry) goto out; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) goto out; - if (mnt->mnt_flags & MNT_RELATIME) { - /* - * With relative atime, only update atime if the previous - * atime is earlier than either the ctime or mtime. - */ - if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 && - timespec_compare(&inode->i_ctime, &inode->i_atime) < 0) - goto out; - } now = current_fs_time(inode->i_sb); + + if (!relatime_need_update(mnt, inode, now)) + goto out; + if (timespec_equal(&inode->i_atime, &now)) goto out; -- cgit v1.2.2 From d0adde574b8487ef30f69e2d08bba769e4be513f Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Mar 2009 17:49:56 +0000 Subject: Add a strictatime mount option Add support for explicitly requesting full atime updates. This makes it possible for kernels to default to relatime but still allow userspace to override it. 
Signed-off-by: Matthew Garrett Signed-off-by: Linus Torvalds --- fs/namespace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 06f8e63f6cb1..d0659ec291c9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -780,6 +780,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, + { MNT_STRICTATIME, ",strictatime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; @@ -1932,11 +1933,14 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NODIRATIME; if (flags & MS_RELATIME) mnt_flags |= MNT_RELATIME; + if (flags & MS_STRICTATIME) + mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); /* ... and get the mountpoint */ retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); -- cgit v1.2.2 From 0a1c01c9477602ee8b44548a9405b2c1d587b5a2 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Mar 2009 17:53:14 +0000 Subject: Make relatime default Change the default behaviour of the kernel to use relatime for all filesystems. This can be overridden with the "strictatime" mount option. 
Signed-off-by: Matthew Garrett Signed-off-by: Linus Torvalds --- fs/namespace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index d0659ec291c9..f0e753097353 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1920,6 +1920,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; + /* Default to relatime */ + mnt_flags |= MNT_RELATIME; + /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; @@ -1931,8 +1934,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; - if (flags & MS_RELATIME) - mnt_flags |= MNT_RELATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) -- cgit v1.2.2