From 3c6af7fa787f21f8873a050568ed892312899eb5 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Thu, 24 Nov 2005 13:41:33 +0000 Subject: NTFS: Fix a potential overflow by casting (index + 1) to s64 before doing a left shift using PAGE_CACHE_SHIFT in fs/ntfs/file.c. Thanks to Andrew Morton pointing this out to. Signed-off-by: Anton Altaparmakov --- fs/ntfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 7275338918..c73c8864cb 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -248,7 +248,7 @@ do_non_resident_extend: * enough to make ntfs_writepage() work. */ write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT; + ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT; if (ni->initialized_size > new_init_size) ni->initialized_size = new_init_size; write_unlock_irqrestore(&ni->size_lock, flags); -- cgit v1.2.2 From 0820e15a35b3cf37caadf550ddb7c75a7a77afd0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 23 Jan 2006 12:50:04 -0800 Subject: [CIFS] Do not zero non-existent iovec in SendReceive response processing. Could cause memory leak in some readpaths depending on what junk followed it in the stack. Signed-off-by: Steve French --- fs/cifs/transport.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 7b98792150..b12cb8a7da 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -498,7 +498,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, else *pRespBufType = CIFS_SMALL_BUFFER; iov[0].iov_len = receive_len + 4; - iov[1].iov_len = 0; dump_smb(midQ->resp_buf, 80); /* convert the length into a more usable form */ -- cgit v1.2.2 From 17cbbafe8e82bde4258e407ce043b61f4f9a350f Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 24 Jan 2006 20:26:48 -0800 Subject: [CIFS] Make cifs default wsize match what we actually want to send (52K typically - header + 13 pages). Forgetting to set wsize on the mount command costs more than 10% on large write (can be much more) so this makes a saner default. We still shrink this default smaller if server can not support it. Signed-off-by: Steve French --- fs/cifs/connect.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 88f60aa520..eae306fa24 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1785,7 +1785,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } else if(volume_info.wsize) cifs_sb->wsize = volume_info.wsize; else - cifs_sb->wsize = CIFSMaxBufSize; /* default */ + cifs_sb->wsize = + min(PAGEVEC_SIZE * PAGE_CACHE_SIZE, 127*1024); + /* old default of CIFSMaxBufSize was too small now + that SMB Write2 can send multiple pages in kvec. + RFC1001 does not describe what happens when frame + bigger than 128K is sent so use that as max in + conjunction with 52K kvec constraint on arch with 4K + page size */ + if(cifs_sb->rsize < PAGE_CACHE_SIZE) { cifs_sb->rsize = PAGE_CACHE_SIZE; /* Windows ME does this */ -- cgit v1.2.2 From eb9bdaa3f3b9d30d09bcad47037216aa39639b8e Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 27 Jan 2006 15:11:47 -0800 Subject: Signed-off-by: Steve French --- fs/cifs/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 77c990f0cb..d17c97d07c 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1190,7 +1190,6 @@ retry: /* BB what if continued retry is requested via mount flags? */ set_bit(AS_EIO, &mapping->flags); - SetPageError(page); } else { cifs_stats_bytes_written(cifs_sb->tcon, bytes_written); @@ -1198,6 +1197,13 @@ retry: } for (i = 0; i < n_iov; i++) { page = pvec.pages[first + i]; + /* Should we also set page error on + success rc but too little data written? */ + /* BB investigate retry logic on temporary + server crash cases and how recovery works + when page marked as error */ + if(rc) + SetPageError(page); kunmap(page); unlock_page(page); page_cache_release(page); -- cgit v1.2.2 From 1877c9ea66a29563987f22d0a86c66f438a87ce2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 27 Jan 2006 18:36:11 -0800 Subject: [CIFS] Remove compiler warning Signed-off-by: Steve French --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index eae306fa24..e488603fb1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1786,7 +1786,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, cifs_sb->wsize = volume_info.wsize; else cifs_sb->wsize = - min(PAGEVEC_SIZE * PAGE_CACHE_SIZE, 127*1024); + min_t(const int, PAGEVEC_SIZE * PAGE_CACHE_SIZE, + 127*1024); /* old default of CIFSMaxBufSize was too small now that SMB Write2 can send multiple pages in kvec. RFC1001 does not describe what happens when frame -- cgit v1.2.2 From fddfdeafa8396f85c666bfc5e1e920eb535514cf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 31 Jan 2006 15:24:34 +0100 Subject: [BLOCK] A few kerneldoc fixups Signed-off-by: Jens Axboe --- fs/bio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index bbc442b8c8..1f3bb501c2 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -411,6 +411,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page /** * bio_add_pc_page - attempt to add page to bio + * @q: the target queue * @bio: destination bio * @page: page to add * @len: vec entry length -- cgit v1.2.2 From 3a69c7dc6f3d58aeb9ce5051fc7060d55e05c003 Mon Sep 17 00:00:00 2001 From: Yingping Lu Date: Wed, 1 Feb 2006 12:14:34 +1100 Subject: [XFS] Interim solution for attribute insertion failure during file creation due to ENOSPC. The current solution removes the inode when the attribute insertion fails. Long term solution would be to make the inode creation and attribute insertion atomic. SGI-PV: 947610 SGI-Modid: xfs-linux-melb:xfs-kern:205193a Signed-off-by: Yingping Lu Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 50 +++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 76c6df34d0..eda7919b70 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -262,6 +262,31 @@ has_fs_struct(struct task_struct *task) return (task->fs != init_task.fs); } +STATIC inline void +cleanup_inode( + vnode_t *dvp, + vnode_t *vp, + struct dentry *dentry, + int mode) +{ + struct dentry teardown = {}; + int err2; + + /* Oh, the horror. + * If we can't add the ACL or we fail in + * linvfs_init_security we must back out. + * ENOSPC can hit here, among other things. + */ + teardown.d_inode = LINVFS_GET_IP(vp); + teardown.d_name = dentry->d_name; + + if (S_ISDIR(mode)) + VOP_RMDIR(dvp, &teardown, NULL, err2); + else + VOP_REMOVE(dvp, &teardown, NULL, err2); + VN_RELE(vp); +} + STATIC int linvfs_mknod( struct inode *dir, @@ -316,30 +341,19 @@ linvfs_mknod( } if (!error) + { error = linvfs_init_security(vp, dir); + if (error) + cleanup_inode(dvp, vp, dentry, mode); + } if (default_acl) { if (!error) { error = _ACL_INHERIT(vp, &va, default_acl); - if (!error) { + if (!error) VMODIFY(vp); - } else { - struct dentry teardown = {}; - int err2; - - /* Oh, the horror. - * If we can't add the ACL we must back out. - * ENOSPC can hit here, among other things. - */ - teardown.d_inode = ip = LINVFS_GET_IP(vp); - teardown.d_name = dentry->d_name; - - if (S_ISDIR(mode)) - VOP_RMDIR(dvp, &teardown, NULL, err2); - else - VOP_REMOVE(dvp, &teardown, NULL, err2); - VN_RELE(vp); - } + else + cleanup_inode(dvp, vp, dentry, mode); } _ACL_FREE(default_acl); } -- cgit v1.2.2 From fad3aa1e8e2e4123a19b926fefd91ec63dd56497 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 1 Feb 2006 12:14:52 +1100 Subject: [XFS] Fix regression in xfs_buf_rele dealing with non-hashed buffers, as occur during log replay. Novell bug 145204, Fedora bug 177848. SGI-PV: 948860 SGI-Modid: xfs-linux-melb:xfs-kern:25064a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_buf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e44b7c1a3a..a36a8e3b70 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -822,6 +822,13 @@ xfs_buf_rele( XB_TRACE(bp, "rele", bp->b_relse); + if (unlikely(!hash)) { + ASSERT(!bp->b_relse); + if (atomic_dec_and_test(&bp->b_hold)) + xfs_buf_free(bp); + return; + } + if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { if (bp->b_relse) { atomic_inc(&bp->b_hold); -- cgit v1.2.2 From 3fb803a990cd17546bd89c38e0e29a891f71ce7d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 1 Feb 2006 03:04:34 -0800 Subject: [PATCH] knfsd: Restore recently broken ACL functionality to NFS server A recent patch to Allow run-time selection of NFS versions to export meant that NO nfsacl service versions were exported. This patch restored that functionality. Signed-off-by: Andreas Gruenbacher Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfssvc.c | 76 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 89ed046968..1d163b6169 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -64,6 +64,32 @@ struct nfsd_list { }; static struct list_head nfsd_list = LIST_HEAD_INIT(nfsd_list); +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static struct svc_stat nfsd_acl_svcstats; +static struct svc_version * nfsd_acl_version[] = { + [2] = &nfsd_acl_version2, + [3] = &nfsd_acl_version3, +}; + +#define NFSD_ACL_MINVERS 2 +#define NFSD_ACL_NRVERS (sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0])) +static struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS]; + +static struct svc_program nfsd_acl_program = { + .pg_prog = NFS_ACL_PROGRAM, + .pg_nvers = NFSD_ACL_NRVERS, + .pg_vers = nfsd_acl_versions, + .pg_name = "nfsd", + .pg_class = "nfsd", + .pg_stats = &nfsd_acl_svcstats, + .pg_authenticate = &svc_set_client, +}; + +static struct svc_stat nfsd_acl_svcstats = { + .program = &nfsd_acl_program, +}; +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ + static struct svc_version * nfsd_version[] = { [2] = &nfsd_version2, #if defined(CONFIG_NFSD_V3) @@ -79,6 +105,9 @@ static struct svc_version * nfsd_version[] = { static struct svc_version *nfsd_versions[NFSD_NRVERS]; struct svc_program nfsd_program = { +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + .pg_next = &nfsd_acl_program, +#endif .pg_prog = NFS_PROGRAM, /* program number */ .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ .pg_vers = nfsd_versions, /* version table */ @@ -147,6 +176,26 @@ nfsd_svc(unsigned short port, int nrservs) nfsd_program.pg_vers[i] = nfsd_version[i]; } + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + found_one = 0; + + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { + if (NFSCTL_VERISSET(nfsd_versbits, i)) { + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; + found_one = 1; + } else + nfsd_acl_program.pg_vers[i] = NULL; + } + + if (!found_one) { + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; + } +#endif + atomic_set(&nfsd_busy, 0); error = -ENOMEM; nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE); @@ -411,30 +460,3 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp) nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); return 1; } - -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -static struct svc_stat nfsd_acl_svcstats; -static struct svc_version * nfsd_acl_version[] = { - [2] = &nfsd_acl_version2, - [3] = &nfsd_acl_version3, -}; - -#define NFSD_ACL_NRVERS (sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0])) -static struct svc_program nfsd_acl_program = { - .pg_prog = NFS_ACL_PROGRAM, - .pg_nvers = NFSD_ACL_NRVERS, - .pg_vers = nfsd_acl_version, - .pg_name = "nfsd", - .pg_class = "nfsd", - .pg_stats = &nfsd_acl_svcstats, - .pg_authenticate = &svc_set_client, -}; - -static struct svc_stat nfsd_acl_svcstats = { - .program = &nfsd_acl_program, -}; - -#define nfsd_acl_program_p &nfsd_acl_program -#else -#define nfsd_acl_program_p NULL -#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ -- cgit v1.2.2 From caf736085f2f0d22a992a855d9caae14973f7ea4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 1 Feb 2006 03:04:39 -0800 Subject: [PATCH] smbfs readdir vs signal fix An old patch designed to fix http://bugme.osdl.org/show_bug.cgi?id=4497, "getdents gives empty/random result upon signal". If smbfs's readdir() is interupted by a signal, smb_readdir() failed to noticed that and proceeded to treat the unread-into page as valid directory contents. Fix that up by handling the -ERESTARTSYS. Thanks to Stian Skjelstad for reporting and testing. Cc: Stian Skjelstad Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/smbfs/dir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index c6c33e1514..0424d06b14 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -209,6 +209,8 @@ init_cache: ctl.valid = 1; read_really: result = server->ops->readdir(filp, dirent, filldir, &ctl); + if (result == -ERESTARTSYS && page) + ClearPageUptodate(page); if (ctl.idx == -1) goto invalid_cache; /* retry */ ctl.head.end = ctl.fpos - 1; @@ -217,7 +219,8 @@ finished: if (page) { cache->head = ctl.head; kunmap(page); - SetPageUptodate(page); + if (result != -ERESTARTSYS) + SetPageUptodate(page); unlock_page(page); page_cache_release(page); } -- cgit v1.2.2 From 9cd684551124e71630ab96d238747051463f5b56 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 1 Feb 2006 03:04:40 -0800 Subject: [PATCH] fuse: fix async read for legacy filesystems While asynchronous reads mean a performance improvement in most cases, if the filesystem assumed that reads are synchronous, then async reads may degrade performance (filesystem may receive reads out of order, which can confuse it's own readahead logic). With sshfs a 1.5 to 4 times slowdown can be measured. There's also a need for userspace filesystems to know whether asynchronous reads are supported by the kernel or not. To achive these, negotiate in the INIT request whether async reads will be used and the maximum readahead value. Update interface version to 7.6 If userspace uses a version earlier than 7.6, then disable async reads, and set maximum readahead value to the maximum read size, as done in previous versions. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 9 +++++++-- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a7ef5e716f..296351615b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -335,9 +335,14 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, loff_t pos = page_offset(req->pages[0]); size_t count = req->num_pages << PAGE_CACHE_SHIFT; req->out.page_zeroing = 1; - req->end = fuse_readpages_end; fuse_read_fill(req, file, inode, pos, count, FUSE_READ); - request_send_background(fc, req); + if (fc->async_read) { + req->end = fuse_readpages_end; + request_send_background(fc, req); + } else { + request_send(fc, req); + fuse_readpages_end(fc, req); + } } struct fuse_readpages_data { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 46cf933aa3..4a83adfec9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -272,6 +272,9 @@ struct fuse_conn { reply, before any other request, and never cleared */ unsigned conn_error : 1; + /** Do readpages asynchronously? Only set in INIT */ + unsigned async_read : 1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c755a0440a..879e6fba94 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -473,6 +473,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) fc->conn_error = 1; else { + unsigned long ra_pages; + + if (arg->minor >= 6) { + ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; + if (arg->flags & FUSE_ASYNC_READ) + fc->async_read = 1; + } else + ra_pages = fc->max_read / PAGE_CACHE_SIZE; + + fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } @@ -496,6 +506,8 @@ static void fuse_send_init(struct fuse_conn *fc) arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; + arg->flags |= FUSE_ASYNC_READ; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -552,8 +564,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = d.max_read; - if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages) - fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE; /* Used by get_root_inode() */ sb->s_fs_info = fc; -- cgit v1.2.2 From cb82a6cdf994d6656ad0a25ed28395af3416a27c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 1 Feb 2006 03:04:49 -0800 Subject: [PATCH] compat_sys_pselect7() fix fs/compat.c: In function `compat_sys_pselect7': fs/compat.c:1820: warning: passing arg 5 of `compat_core_sys_select' from incompatible pointer type Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index ff0bafcff7..cc58a20df5 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1781,7 +1781,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - long timeout = MAX_SCHEDULE_TIMEOUT; + s64 timeout = MAX_SCHEDULE_TIMEOUT; struct compat_timespec ts; int ret; -- cgit v1.2.2 From 537421be79b94bcf620467f50dd9e38b739c2a00 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 1 Feb 2006 03:05:24 -0800 Subject: [PATCH] Mark CONFIG_UFS_FS_WRITE as BROKEN OpenBSD doesn't see "." correctly in directories created by Linux. Copying files over several KB will buy you infinite loop in __getblk_slow(). Copying files smaller than 1 KB seems to be OK. Sometimes files will be filled with zeros. Sometimes incorrectly copied file will reappear after next file with truncated size. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index ef78e3a42d..93b5dc4082 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1327,7 +1327,7 @@ config UFS_FS config UFS_FS_WRITE bool "UFS file system write support (DANGEROUS)" - depends on UFS_FS && EXPERIMENTAL + depends on UFS_FS && EXPERIMENTAL && BROKEN help Say Y here if you want to try writing to UFS partitions. This is experimental, so you should back up your UFS partitions beforehand. -- cgit v1.2.2 From 4e6a510a74145585f4111d60d1b5fd450d795dd8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 1 Feb 2006 03:05:31 -0800 Subject: [PATCH] mm: hugepage accounting fix 2.6.15's hugepage faulting introduced huge_pages_needed accounting into hugetlbfs: to count how many pages are already in cache, for spot check on how far a new mapping may be allowed to extend the file. But it's muddled: each hugepage found covers HPAGE_SIZE, not PAGE_SIZE. Once pages were already in cache, it would overshoot, wrap its hugepages count backwards, and so fail a harmless repeat mapping with -ENOMEM. Fixes the problem found by Don Dupuis. Signed-off-by: Hugh Dickins Acked-By: Adam Litke Acked-by: William Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f568102da1..b351952899 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -72,8 +72,8 @@ huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) unsigned long start = vma->vm_start; unsigned long end = vma->vm_end; unsigned long hugepages = (end - start) >> HPAGE_SHIFT; - pgoff_t next = vma->vm_pgoff; - pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT); + pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); + pgoff_t endpg = next + hugepages; pagevec_init(&pvec, 0); while (next < endpg) { -- cgit v1.2.2 From e965f9630c651fa4249039fd4b80c9392d07a856 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:41 -0800 Subject: [PATCH] Direct Migration V9: Avoid writeback / page_migrate() method Migrate a page with buffers without requiring writeback This introduces a new address space operation migratepage() that may be used by a filesystem to implement its own version of page migration. A version is provided that migrates buffers attached to pages. Some filesystems (ext2, ext3, xfs) are modified to utilize this feature. The swapper address space operation are modified so that a regular migrate_page() will occur for anonymous pages without writeback (migrate_pages forces every anonymous page to have a swap entry). Signed-off-by: Mike Kravetz Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext2/inode.c | 2 ++ fs/ext3/inode.c | 2 ++ fs/xfs/linux-2.6/xfs_aops.c | 1 + fs/xfs/linux-2.6/xfs_buf.c | 1 + 5 files changed, 66 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 3dc712f29d..8bcbac87a2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3049,6 +3049,66 @@ asmlinkage long sys_bdflush(int func, long data) return 0; } +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +#ifdef CONFIG_MIGRATION +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + if (migrate_page_remove_references(newpage, page, 3)) + return -EAGAIN; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); +#endif + /* * Buffer-head allocation */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e7d3f0522d..a717837f27 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -706,6 +706,7 @@ struct address_space_operations ext2_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, }; struct address_space_operations ext2_aops_xip = { @@ -723,6 +724,7 @@ struct address_space_operations ext2_nobh_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, }; /* diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 8824e84f8a..3fc4238e97 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1559,6 +1559,7 @@ static struct address_space_operations ext3_ordered_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; static struct address_space_operations ext3_writeback_aops = { @@ -1572,6 +1573,7 @@ static struct address_space_operations ext3_writeback_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; static struct address_space_operations ext3_journalled_aops = { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 1206267894..9892268e30 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1462,4 +1462,5 @@ struct address_space_operations linvfs_aops = { .commit_write = generic_commit_write, .bmap = linvfs_bmap, .direct_IO = linvfs_direct_IO, + .migratepage = buffer_migrate_page, }; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index a36a8e3b70..bfb4f2917b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1521,6 +1521,7 @@ xfs_mapping_buftarg( struct address_space *mapping; static struct address_space_operations mapping_aops = { .sync_page = block_sync_page, + .migratepage = fail_migrate_page, }; inode = new_inode(bdev->bd_inode->i_sb); -- cgit v1.2.2 From d739b42b82773206297db1fc0d96ef895a5d9688 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:06:43 -0800 Subject: [PATCH] reiserfs: remove kmalloc wrapper Remove kmalloc() wrapper from fs/reiserfs/. Please note that a reiserfs /proc entry format is changed because kmalloc statistics is removed. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/dir.c | 16 +++------- fs/reiserfs/fix_node.c | 50 +++--------------------------- fs/reiserfs/journal.c | 84 ++++++++++++++++---------------------------------- fs/reiserfs/namei.c | 16 +++++----- fs/reiserfs/procfs.c | 3 +- fs/reiserfs/super.c | 6 ---- fs/reiserfs/xattr.c | 11 +++---- 7 files changed, 49 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 9dd71e8070..d71ac65792 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -150,18 +150,15 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (d_reclen <= 32) { local_buf = small_buf; } else { - local_buf = - reiserfs_kmalloc(d_reclen, GFP_NOFS, - inode->i_sb); + local_buf = kmalloc(d_reclen, + GFP_NOFS); if (!local_buf) { pathrelse(&path_to_entry); ret = -ENOMEM; goto out; } if (item_moved(&tmp_ih, &path_to_entry)) { - reiserfs_kfree(local_buf, - d_reclen, - inode->i_sb); + kfree(local_buf); goto research; } } @@ -174,15 +171,12 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) (dirent, local_buf, d_reclen, d_off, d_ino, DT_UNKNOWN) < 0) { if (local_buf != small_buf) { - reiserfs_kfree(local_buf, - d_reclen, - inode->i_sb); + kfree(local_buf); } goto end; } if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, - inode->i_sb); + kfree(local_buf); } // next entry should be looked for with such offset next_pos = deh_offset(deh) + 1; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 45829889dc..aa22588019 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -2021,38 +2021,6 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) return CARRY_ON; } -#ifdef CONFIG_REISERFS_CHECK -void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s) -{ - void *vp; - static size_t malloced; - - vp = kmalloc(size, flags); - if (vp) { - REISERFS_SB(s)->s_kmallocs += size; - if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) { - reiserfs_warning(s, - "vs-8301: reiserfs_kmalloc: allocated memory %d", - REISERFS_SB(s)->s_kmallocs); - malloced = REISERFS_SB(s)->s_kmallocs; - } - } - return vp; -} - -void reiserfs_kfree(const void *vp, size_t size, struct super_block *s) -{ - kfree(vp); - - REISERFS_SB(s)->s_kmallocs -= size; - if (REISERFS_SB(s)->s_kmallocs < 0) - reiserfs_warning(s, - "vs-8302: reiserfs_kfree: allocated memory %d", - REISERFS_SB(s)->s_kmallocs); - -} -#endif - static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) { int max_num_of_items; @@ -2086,7 +2054,7 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) /* we have to allocate more memory for virtual node */ if (tb->vn_buf) { /* free memory allocated before */ - reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + kfree(tb->vn_buf); /* this is not needed if kfree is atomic */ check_fs = 1; } @@ -2095,24 +2063,15 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) tb->vn_buf_size = size; /* get memory for virtual item */ - buf = - reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, - tb->tb_sb); + buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); if (!buf) { /* getting memory with GFP_KERNEL priority may involve balancing now (due to indirect_to_direct conversion on dcache shrinking). So, release path and collected resources here */ free_buffers_in_tb(tb); - buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb); + buf = kmalloc(size, GFP_NOFS); if (!buf) { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(tb->tb_sb, - "vs-8345: get_mem_for_virtual_node: " - "kmalloc failed. reiserfs kmalloced %d bytes", - REISERFS_SB(tb->tb_sb)-> - s_kmallocs); -#endif tb->vn_buf_size = 0; } tb->vn_buf = buf; @@ -2619,7 +2578,6 @@ void unfix_nodes(struct tree_balance *tb) } } - if (tb->vn_buf) - reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + kfree(tb->vn_buf); } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4491fcf2a0..16b526fd20 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -152,18 +152,16 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block struct reiserfs_bitmap_node *bn; static int id; - bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, - p_s_sb); + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); if (!bn) { return NULL; } - bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb); + bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS); if (!bn->data) { - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + kfree(bn); return NULL; } bn->id = id++; - memset(bn->data, 0, p_s_sb->s_blocksize); INIT_LIST_HEAD(&bn->list); return bn; } @@ -197,8 +195,8 @@ static inline void free_bitmap_node(struct super_block *p_s_sb, struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); journal->j_used_bitmap_nodes--; if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { - reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + kfree(bn->data); + kfree(bn); } else { list_add(&bn->list, &journal->j_bitmap_nodes); journal->j_free_bitmap_nodes++; @@ -276,8 +274,8 @@ static int free_bitmap_nodes(struct super_block *p_s_sb) while (next != &journal->j_bitmap_nodes) { bn = list_entry(next, struct reiserfs_bitmap_node, list); list_del(next); - reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + kfree(bn->data); + kfree(bn); next = journal->j_bitmap_nodes.next; journal->j_free_bitmap_nodes--; } @@ -581,7 +579,7 @@ static inline void put_journal_list(struct super_block *s, jl->j_trans_id, jl->j_refcount); } if (--jl->j_refcount == 0) - reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); + kfree(jl); } /* @@ -1818,8 +1816,7 @@ void remove_journal_hash(struct super_block *sb, static void free_journal_ram(struct super_block *p_s_sb) { struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - reiserfs_kfree(journal->j_current_jl, - sizeof(struct reiserfs_journal_list), p_s_sb); + kfree(journal->j_current_jl); journal->j_num_lists--; vfree(journal->j_cnode_free_orig); @@ -2093,21 +2090,15 @@ static int journal_read_transaction(struct super_block *p_s_sb, } trans_id = get_desc_trans_id(desc); /* now we know we've got a good transaction, and it was inside the valid time ranges */ - log_blocks = - reiserfs_kmalloc(get_desc_trans_len(desc) * - sizeof(struct buffer_head *), GFP_NOFS, p_s_sb); - real_blocks = - reiserfs_kmalloc(get_desc_trans_len(desc) * - sizeof(struct buffer_head *), GFP_NOFS, p_s_sb); + log_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); + real_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); if (!log_blocks || !real_blocks) { brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); reiserfs_warning(p_s_sb, "journal-1169: kmalloc failed, unable to mount FS"); return -1; @@ -2145,12 +2136,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, brelse_array(real_blocks, i); brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return -1; } } @@ -2166,12 +2153,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, brelse_array(real_blocks, get_desc_trans_len(desc)); brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return -1; } memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, @@ -2193,12 +2176,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, get_desc_trans_len(desc) - i); brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return -1; } brelse(real_blocks[i]); @@ -2217,12 +2196,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, journal->j_trans_id = trans_id + 1; brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), - p_s_sb); - reiserfs_kfree(real_blocks, - le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), - p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return 0; } @@ -2472,13 +2447,11 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { struct reiserfs_journal_list *jl; retry: - jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, - s); + jl = kzalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS); if (!jl) { yield(); goto retry; } - memset(jl, 0, sizeof(*jl)); INIT_LIST_HEAD(&jl->j_list); INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); @@ -3042,14 +3015,12 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct } return th; } - th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), - GFP_NOFS, s); + th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); if (!th) return NULL; ret = journal_begin(th, s, nblocks); if (ret) { - reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), - s); + kfree(th); return NULL; } @@ -3067,8 +3038,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) ret = -EIO; if (th->t_refcount == 0) { SB_JOURNAL(s)->j_persistent_trans--; - reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), - s); + kfree(th); } return ret; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 8f8d8d0110..c8123308e0 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -456,7 +456,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, /* get memory for composing the entry */ buflen = DEH_SIZE + ROUND_UP(namelen); if (buflen > sizeof(small_buf)) { - buffer = reiserfs_kmalloc(buflen, GFP_NOFS, dir->i_sb); + buffer = kmalloc(buflen, GFP_NOFS); if (buffer == 0) return -ENOMEM; } else @@ -490,7 +490,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, retval = reiserfs_find_entry(dir, name, namelen, &path, &de); if (retval != NAME_NOT_FOUND) { if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); pathrelse(&path); if (retval == IO_ERROR) { @@ -515,7 +515,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, reiserfs_warning(dir->i_sb, "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); pathrelse(&path); return -EBUSY; } @@ -535,7 +535,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, &entry_key); if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); pathrelse(&path); return -EBUSY; } @@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, paste_size); if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); if (retval) { reiserfs_check_path(&path); return retval; @@ -1065,7 +1065,7 @@ static int reiserfs_symlink(struct inode *parent_dir, goto out_failed; } - name = reiserfs_kmalloc(item_len, GFP_NOFS, parent_dir->i_sb); + name = kmalloc(item_len, GFP_NOFS); if (!name) { drop_new_inode(inode); retval = -ENOMEM; @@ -1079,14 +1079,14 @@ static int reiserfs_symlink(struct inode *parent_dir, retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); if (retval) { drop_new_inode(inode); - reiserfs_kfree(name, item_len, parent_dir->i_sb); + kfree(name); goto out_failed; } retval = reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), dentry, inode); - reiserfs_kfree(name, item_len, parent_dir->i_sb); + kfree(name); if (retval) { /* reiserfs_new_inode iputs for us */ goto out_failed; } diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index fc2f43c75d..ef6caed933 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -88,7 +88,6 @@ static int show_super(struct seq_file *m, struct super_block *sb) seq_printf(m, "state: \t%s\n" "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" "gen. counter: \t%i\n" - "s_kmallocs: \t%i\n" "s_disk_reads: \t%i\n" "s_disk_writes: \t%i\n" "s_fix_nodes: \t%i\n" @@ -128,7 +127,7 @@ static int show_super(struct seq_file *m, struct super_block *sb) "SMALL_TAILS " : "NO_TAILS ", replay_only(sb) ? "REPLAY_ONLY " : "", convert_reiserfs(sb) ? "CONV " : "", - atomic_read(&r->s_generation_counter), SF(s_kmallocs), + atomic_read(&r->s_generation_counter), SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), SF(s_do_balance), SF(s_unneeded_left_neighbor), SF(s_good_search_by_key_reada), SF(s_bmaps), diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 397d9590c8..77891de0e0 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -472,12 +472,6 @@ static void reiserfs_put_super(struct super_block *s) print_statistics(s); - if (REISERFS_SB(s)->s_kmallocs != 0) { - reiserfs_warning(s, - "vs-2004: reiserfs_put_super: allocated memory left %d", - REISERFS_SB(s)->s_kmallocs); - } - if (REISERFS_SB(s)->reserved_blocks != 0) { reiserfs_warning(s, "green-2005: reiserfs_put_super: reserved blocks left %d", diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index cc061bfd43..4f0db4e545 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -368,15 +368,13 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) if (d_reclen <= 32) { local_buf = small_buf; } else { - local_buf = - reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb); + local_buf = kmalloc(d_reclen, GFP_NOFS); if (!local_buf) { pathrelse(&path_to_entry); return -ENOMEM; } if (item_moved(&tmp_ih, &path_to_entry)) { - reiserfs_kfree(local_buf, d_reclen, - inode->i_sb); + kfree(local_buf); /* sigh, must retry. Do this same offset again */ next_pos = d_off; @@ -399,13 +397,12 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filldir(dirent, local_buf, d_reclen, d_off, d_ino, DT_UNKNOWN) < 0) { if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, - inode->i_sb); + kfree(local_buf); } goto end; } if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, inode->i_sb); + kfree(local_buf); } } /* while */ -- cgit v1.2.2 From 8c777cc4be1390862d053cbc002246e87572147b Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:06:43 -0800 Subject: [PATCH] reiserfs: use __GFP_NOFAIL instead of yield and retry loop for allocation This patch replaces yield and retry loop with __GFP_NOFAIL in alloc_journal_list(). Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 16b526fd20..2d04efb22e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2446,12 +2446,8 @@ static int journal_read(struct super_block *p_s_sb) static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { struct reiserfs_journal_list *jl; - retry: - jl = kzalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS); - if (!jl) { - yield(); - goto retry; - } + jl = kzalloc(sizeof(struct reiserfs_journal_list), + GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&jl->j_list); INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); -- cgit v1.2.2 From e5dd259f78ba0fd0c7bfc5c52179dbbff3eb48aa Mon Sep 17 00:00:00 2001 From: Diego Calleja Date: Wed, 1 Feb 2006 03:06:44 -0800 Subject: [PATCH] reiserfs: missing kmalloc failure check According to http://bugzilla.kernel.org/show_bug.cgi?id=5778 fs/reiserfs/file.c is missing this check. Signed-off-by: Diego Calleja Cc: Jeff Mahoney Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index ad6fa964b0..1cad5d066a 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -192,6 +192,8 @@ static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handl allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) * sizeof(b_blocknr_t), GFP_NOFS); + if (!allocated_blocks) + return -ENOMEM; /* First we compose a key to point at the writing position, we want to do that outside of any locking region. */ -- cgit v1.2.2 From c87d0c07ea198db1ce451421904edd60b7d385ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Feb 2006 03:06:45 -0800 Subject: [PATCH] reiserfs: remove reiserfs_permission_locked This function is completely unused since the xattr permission checking changes. Remove it and fold __reiserfs_permission into reiserfs_permission. Signed-off-by: Christoph Hellwig Cc: Jeff Mahoney Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4f0db4e545..2f085845f6 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1319,9 +1319,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) return err; } -static int -__reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, - int need_lock) +int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) { umode_t mode = inode->i_mode; @@ -1357,15 +1355,14 @@ __reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, if (!(mode & S_IRWXG)) goto check_groups; - if (need_lock) { - reiserfs_read_lock_xattr_i(inode); - reiserfs_read_lock_xattrs(inode->i_sb); - } + reiserfs_read_lock_xattr_i(inode); + reiserfs_read_lock_xattrs(inode->i_sb); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (need_lock) { - reiserfs_read_unlock_xattrs(inode->i_sb); - reiserfs_read_unlock_xattr_i(inode); - } + + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_read_unlock_xattr_i(inode); + if (IS_ERR(acl)) { if (PTR_ERR(acl) == -ENODATA) goto check_groups; @@ -1414,14 +1411,3 @@ __reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, return -EACCES; } - -int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return __reiserfs_permission(inode, mask, nd, 1); -} - -int -reiserfs_permission_locked(struct inode *inode, int mask, struct nameidata *nd) -{ - return __reiserfs_permission(inode, mask, nd, 0); -} -- cgit v1.2.2 From ec191574b9c3cb7bfb95e4f803b63f7c8dc52690 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Feb 2006 03:06:46 -0800 Subject: [PATCH] reiserfs: use generic_permission Use the generic_permission code with a proper wrapper and callback instead of having a local copy. Signed-off-by: Christoph Hellwig Cc: Jeff Mahoney Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 103 +++++++++++++--------------------------------------- 1 file changed, 26 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 2f085845f6..ffb79c48c5 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1319,95 +1319,44 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) return err; } -int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) +static int reiserfs_check_acl(struct inode *inode, int mask) { - umode_t mode = inode->i_mode; - - if (mask & MAY_WRITE) { - /* - * Nobody gets write access to a read-only fs. - */ - if (IS_RDONLY(inode) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) - return -EROFS; - - /* - * Nobody gets write access to an immutable file. - */ - if (IS_IMMUTABLE(inode)) - return -EACCES; - } - - /* We don't do permission checks on the internal objects. - * Permissions are determined by the "owning" object. */ - if (is_reiserfs_priv_object(inode)) - return 0; - - if (current->fsuid == inode->i_uid) { - mode >>= 6; -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - } else if (reiserfs_posixacl(inode->i_sb) && - get_inode_sd_version(inode) != STAT_DATA_V1) { - struct posix_acl *acl; - - /* ACL can't contain additional permissions if - the ACL_MASK entry is 0 */ - if (!(mode & S_IRWXG)) - goto check_groups; + struct posix_acl *acl; + int error = -EAGAIN; /* do regular unix permission checks by default */ - reiserfs_read_lock_xattr_i(inode); - reiserfs_read_lock_xattrs(inode->i_sb); - - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_read_lock_xattr_i(inode); + reiserfs_read_lock_xattrs(inode->i_sb); - reiserfs_read_unlock_xattrs(inode->i_sb); - reiserfs_read_unlock_xattr_i(inode); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) { - if (PTR_ERR(acl) == -ENODATA) - goto check_groups; - return PTR_ERR(acl); - } + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_read_unlock_xattr_i(inode); - if (acl) { - int err = posix_acl_permission(inode, acl, mask); + if (acl) { + if (!IS_ERR(acl)) { + error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); - if (err == -EACCES) { - goto check_capabilities; - } - return err; - } else { - goto check_groups; - } -#endif - } else { - check_groups: - if (in_group_p(inode->i_gid)) - mode >>= 3; + } else if (PTR_ERR(acl) != -ENODATA) + error = PTR_ERR(acl); } - /* - * If the DACs are ok we don't need any capability check. - */ - if (((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask)) - return 0; + return error; +} - check_capabilities: +int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) +{ /* - * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. */ - if (!(mask & MAY_EXEC) || - (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) - if (capable(CAP_DAC_OVERRIDE)) - return 0; + if (is_reiserfs_priv_object(inode)) + return 0; /* - * Searching includes executable on directories, else just read. + * Stat data v1 doesn't support ACLs. */ - if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) - if (capable(CAP_DAC_READ_SEARCH)) - return 0; - - return -EACCES; + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return generic_permission(inode, mask, NULL); + else + return generic_permission(inode, mask, reiserfs_check_acl); } -- cgit v1.2.2 From d62b1b87a7d1c3a21dddabed4251763090be3182 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:47 -0800 Subject: [PATCH] resierfs: fix reiserfs_invalidatepage race against data=ordered After a transaction has closed but before it has finished commit, there is a window where data=ordered mode requires invalidatepage to pin pages instead of freeing them. This patch fixes a race between the invalidatepage checks and data=ordered writeback, and it also adds a check to the reiserfs write_ordered_buffers routines to write any anonymous buffers that were dirtied after its first writeback loop. That bug works like this: proc1: transaction closes and a new one starts proc1: write_ordered_buffers starts processing data=ordered list proc1: buffer A is cleaned and written proc2: buffer A is dirtied by another process proc2: File is truncated to zero, page A goes through invalidatepage proc2: reiserfs_invalidatepage sees dirty buffer A with reiserfs journal head, pins it proc1: write_ordered_buffers frees the journal head on buffer A At this point, buffer A stays dirty forever Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 4 +++- fs/reiserfs/journal.c | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ffa34b861b..60e2f23447 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2743,6 +2743,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) int ret = 1; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + lock_buffer(bh); spin_lock(&j->j_dirty_buffers_lock); if (!buffer_mapped(bh)) { goto free_jh; @@ -2758,7 +2759,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { ret = 0; } - } else if (buffer_dirty(bh) || buffer_locked(bh)) { + } else if (buffer_dirty(bh)) { struct reiserfs_journal_list *jl; struct reiserfs_jh *jh = bh->b_private; @@ -2784,6 +2785,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) reiserfs_free_jh(bh); } spin_unlock(&j->j_dirty_buffers_lock); + unlock_buffer(bh); return ret; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 2d04efb22e..bc8fe963b3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -877,6 +877,19 @@ static int write_ordered_buffers(spinlock_t * lock, if (!buffer_uptodate(bh)) { ret = -EIO; } + /* ugly interaction with invalidatepage here. + * reiserfs_invalidate_page will pin any buffer that has a valid + * journal head from an older transaction. If someone else sets + * our buffer dirty after we write it in the first loop, and + * then someone truncates the page away, nobody will ever write + * the buffer. We're safe if we write the page one last time + * after freeing the journal header. + */ + if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { + spin_unlock(lock); + ll_rw_block(WRITE, 1, &bh); + spin_lock(lock); + } put_bh(bh); cond_resched_lock(lock); } -- cgit v1.2.2 From fc5cd582e9c934ddaf6f310179488932cd154794 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:48 -0800 Subject: [PATCH] reiserfs: zero b_private when allocating buffer heads The b_private field in buffer heads needs to be zero filled when the buffers are allocated. Thanks to Nathan Scott for finding this. It was causing problems on systems with both XFS and reiserfs. Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 8bcbac87a2..5e4a90ee10 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1022,6 +1022,7 @@ try_again: bh->b_state = 0; atomic_set(&bh->b_count, 0); + bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ -- cgit v1.2.2 From e0e851cf30f1a9bd2e2a7624e9810378d6a2b072 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:49 -0800 Subject: [PATCH] reiserfs: reiserfs hang and performance fix for data=journal mode In data=journal mode, reiserfs writepage needs to make sure not to trigger transactions while being run under PF_MEMALLOC. This patch makes sure to redirty the page instead of forcing a transaction start in this case. Also, calling filemap_fdata* in order to trigger io on the block device can cause lock inversions on the page lock. Instead, do simple batching from flush_commit_list. Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 7 +++++++ fs/reiserfs/journal.c | 19 ++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 60e2f23447..b33d67bba2 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2363,6 +2363,13 @@ static int reiserfs_write_full_page(struct page *page, int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; th.t_trans_id = 0; + /* no logging allowed when nonblocking or from PF_MEMALLOC */ + if (checked && (current->flags & PF_MEMALLOC)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + /* The page dirty bit is cleared before writepage is called, which * means we have to tell create_empty_buffers to make dirty buffers * The page really should be up to date at this point, so tossing diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index bc8fe963b3..1b2402a9a8 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -988,6 +988,7 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); int barrier = 0; int retval = 0; + int write_len; reiserfs_check_lock_depth(s, "flush_commit_list"); @@ -1037,16 +1038,24 @@ static int flush_commit_list(struct super_block *s, BUG_ON(!list_empty(&jl->j_bh_list)); /* * for the description block and all the log blocks, submit any buffers - * that haven't already reached the disk + * that haven't already reached the disk. Try to write at least 256 + * log blocks. later on, we will only wait on blocks that correspond + * to this transaction, but while we're unplugging we might as well + * get a chunk of data on there. */ atomic_inc(&journal->j_async_throttle); - for (i = 0; i < (jl->j_len + 1); i++) { + write_len = jl->j_len + 1; + if (write_len < 256) + write_len = 256; + for (i = 0 ; i < write_len ; i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); - if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */ - ll_rw_block(SWRITE, 1, &tbh); - put_bh(tbh); + if (tbh) { + if (buffer_dirty(tbh)) + ll_rw_block(WRITE, 1, &tbh) ; + put_bh(tbh) ; + } } atomic_dec(&journal->j_async_throttle); -- cgit v1.2.2 From 3d4492f81dd7b486f1be0616a1ce7f73760f406e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:49 -0800 Subject: [PATCH] reiserfs: reiserfs write_ordered_buffers should not oops on dirty non-uptodate bh write_ordered_buffers should handle dirty non-uptodate buffers without a BUG() Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 1b2402a9a8..47c9f432de 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -846,6 +846,14 @@ static int write_ordered_buffers(spinlock_t * lock, spin_lock(lock); goto loop_next; } + /* in theory, dirty non-uptodate buffers should never get here, + * but the upper layer io error paths still have a few quirks. + * Handle them here as gracefully as we can + */ + if (!buffer_uptodate(bh) && buffer_dirty(bh)) { + clear_buffer_dirty(bh); + ret = -EIO; + } if (buffer_dirty(bh)) { list_del_init(&jh->list); list_add(&jh->list, &tmp); @@ -1030,9 +1038,12 @@ static int flush_commit_list(struct super_block *s, } if (!list_empty(&jl->j_bh_list)) { + int ret; unlock_kernel(); - write_ordered_buffers(&journal->j_dirty_buffers_lock, - journal, jl, &jl->j_bh_list); + ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_bh_list); + if (ret < 0 && retval == 0) + retval = ret; lock_kernel(); } BUG_ON(!list_empty(&jl->j_bh_list)); -- cgit v1.2.2 From 6ae1ea447d21c4fecf5df8d0e1022461274fb4e8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:50 -0800 Subject: [PATCH] reiserfs: reiserfs fix journal accounting in journal_transaction_should_end reiserfs: journal_transaction_should_end should increase the count of blocks allocated so the transaction subsystem can keep new writers from creating a transaction that is too large. Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 47c9f432de..b7a179560a 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2823,6 +2823,9 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; } + /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; + th->t_blocks_allocated += new_alloc ; return 0; } -- cgit v1.2.2 From fa385bef256077f3b820b241e8f3755ef3905b74 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 1 Feb 2006 03:06:51 -0800 Subject: [PATCH] reiserfs: reiserfs: check for files > 2GB on 3.5.x disks When a filesystem has been converted from 3.5.x to 3.6.x, we need an extra check during file write to make sure we are not trying to make a 3.5.x file > 2GB. Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1cad5d066a..f3473176c8 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1287,6 +1287,23 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t struct reiserfs_transaction_handle th; th.t_trans_id = 0; + /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items + * lying around (most of the disk, in fact). Despite the filesystem + * now being a v3.6 format, the old items still can't support large + * file sizes. Catch this case here, as the rest of the VFS layer is + * oblivious to the different limitations between old and new items. + * reiserfs_setattr catches this for truncates. This chunk is lifted + * from generic_write_checks. */ + if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && + *ppos + count > MAX_NON_LFS) { + if (*ppos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + if (count > MAX_NON_LFS - (unsigned long)*ppos) + count = MAX_NON_LFS - (unsigned long)*ppos; + } + if (file->f_flags & O_DIRECT) { // Direct IO needs treatment ssize_t result, after_file_end = 0; if ((*ppos + count >= inode->i_size) -- cgit v1.2.2 From 7045f37b17ffa6e85435ca980122b46a74caa7e4 Mon Sep 17 00:00:00 2001 From: Martin Waitz Date: Wed, 1 Feb 2006 03:06:57 -0800 Subject: [PATCH] DocBook: fix some kernel-doc comments in fs and block Update some parameter descriptions to actually match the code. Signed-off-by: Martin Waitz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 2 +- fs/namei.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 108138d4e9..d0be6159eb 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1179,7 +1179,7 @@ EXPORT_SYMBOL(bmap); /** * touch_atime - update the access time * @mnt: mount the inode is accessed on - * @inode: inode accessed + * @dentry: dentry accessed * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, diff --git a/fs/namei.c b/fs/namei.c index 4acdac043b..7ac9fb4acb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1161,6 +1161,7 @@ static int __path_lookup_intent_open(int dfd, const char *name, /** * path_lookup_open - lookup a file path with open intent + * @dfd: the directory to use as base, or AT_FDCWD * @name: pointer to file name * @lookup_flags: lookup intent flags * @nd: pointer to nameidata @@ -1175,6 +1176,7 @@ int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, /** * path_lookup_create - lookup a file path with open + create intent + * @dfd: the directory to use as base, or AT_FDCWD * @name: pointer to file name * @lookup_flags: lookup intent flags * @nd: pointer to nameidata -- cgit v1.2.2 From 16fb24252a8170799e7adf14d8fc31b817fcaf53 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Feb 2006 12:18:22 -0500 Subject: NLM: Fix arguments to NLM_CANCEL call The OpenGroup docs state that the arguments "block", "exclusive" and "alock" must exactly match the arguments for the lock call that we are trying to cancel. Currently, "block" is always set to false, which is wrong. See bug# 5956 on bugzilla.kernel.org. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 1455240395..b8ecfa1168 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -28,6 +28,7 @@ static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); static int nlm_stat_to_errno(u32 stat); static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); +static int nlmclnt_cancel(struct nlm_host *, int , struct file_lock *); static const struct rpc_call_ops nlmclnt_unlock_ops; static const struct rpc_call_ops nlmclnt_cancel_ops; @@ -598,7 +599,7 @@ out_unblock: nlmclnt_finish_block(req); /* Cancel the blocked request if it is still pending */ if (resp->status == NLM_LCK_BLOCKED) - nlmclnt_cancel(host, fl); + nlmclnt_cancel(host, req->a_args.block, fl); out: nlmclnt_release_lockargs(req); return status; @@ -728,8 +729,7 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = { * We always use an async RPC call for this in order not to hang a * process that has been Ctrl-C'ed. */ -int -nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) +static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl) { struct nlm_rqst *req; unsigned long flags; @@ -750,6 +750,7 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) req->a_flags = RPC_TASK_ASYNC; nlmclnt_setlockargs(req, fl); + req->a_args.block = block; status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status < 0) { -- cgit v1.2.2 From aaaa99423b4b1f9cfd33ea5643d9274c25f62491 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Feb 2006 12:18:25 -0500 Subject: NLM: Ensure that nlmclnt_cancel_callback() doesn't loop forever If the server returns NLM_LCK_DENIED_NOLOCKS, we currently retry the entire NLM_CANCEL request. This may end up looping forever unless the server changes its mind (why would it do that, though?). Ensure that we limit the number of retries (to 3). See bug# 5957 in bugzilla.kernel.org. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index b8ecfa1168..220058d861 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -22,6 +22,7 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT #define NLMCLNT_GRACE_WAIT (5*HZ) #define NLMCLNT_POLL_TIMEOUT (30*HZ) +#define NLMCLNT_MAX_RETRIES 3 static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); @@ -802,6 +803,9 @@ die: return; retry_cancel: + /* Don't ever retry more than 3 times */ + if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) + goto die; nlm_rebind_host(req->a_host); rpc_restart_call(task); rpc_delay(task, 30 * HZ); -- cgit v1.2.2 From 1935245655996ca4d14e687c3a100d2e2bbdc78d Mon Sep 17 00:00:00 2001 From: Dirk Mueller Date: Wed, 1 Feb 2006 12:19:47 -0500 Subject: NFSv3: fix sync_retry in direct i/o NFS Only do a sync_retry if the memcmp failed. Signed-off-by: Dirk Mueller Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 10ae377e68..04ab2fc360 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -481,7 +481,7 @@ retry: if (wdata->verf.committed != NFS_FILE_SYNC) { need_commit = 1; if (memcmp(&first_verf.verifier, &wdata->verf.verifier, - sizeof(first_verf.verifier))); + sizeof(first_verf.verifier))) goto sync_retry; } -- cgit v1.2.2 From 9ad11ab48b1ad618bf47076e9e579f267f5306c2 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 2 Feb 2006 16:11:51 +1100 Subject: [PATCH] compat: fix compat_sys_openat and friends Most of the 64 bit architectures will zero extend the first argument to compat_sys_{openat,newfstatat,futimesat} which will fail if the 32 bit syscall was passed AT_FDCWD (which is a small negative number). Declare the first argument to be an unsigned int which will force the correct sign extension when the internal functions are called in each case. Also, do some small white space cleanups in fs/compat.c. Signed-off-by: Stephen Rothwell Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- fs/compat.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index cc58a20df5..70c5af4cc2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -73,17 +73,17 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __ return do_utimes(AT_FDCWD, filename, t ? tv : NULL); } -asmlinkage long compat_sys_futimesat(int dfd, char __user *filename, struct compat_timeval __user *t) +asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t) { struct timeval tv[2]; - if (t) { + if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || get_user(tv[0].tv_usec, &t[0].tv_usec) || get_user(tv[1].tv_sec, &t[1].tv_sec) || get_user(tv[1].tv_usec, &t[1].tv_usec)) - return -EFAULT; - } + return -EFAULT; + } return do_utimes(dfd, filename, t ? tv : NULL); } @@ -114,7 +114,7 @@ asmlinkage long compat_sys_newlstat(char __user * filename, return error; } -asmlinkage long compat_sys_newfstatat(int dfd, char __user *filename, +asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename, struct compat_stat __user *statbuf, int flag) { struct kstat stat; @@ -1326,7 +1326,7 @@ compat_sys_open(const char __user *filename, int flags, int mode) * O_LARGEFILE flag. */ asmlinkage long -compat_sys_openat(int dfd, const char __user *filename, int flags, int mode) +compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode) { return do_sys_open(dfd, filename, flags, mode); } -- cgit v1.2.2 From d35c602870ece3166cff3d25fbc687a7f707acf3 Mon Sep 17 00:00:00 2001 From: Vitaly Fertman Date: Fri, 3 Feb 2006 03:04:01 -0800 Subject: [PATCH] someone broke reiserfs V3 mount options, this fixes it Signed-off-by: Hans Reiser Signed-off-by: Vitaly Fertman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 77891de0e0..ef5e5414e7 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1125,7 +1125,7 @@ static void handle_attrs(struct super_block *s) REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); } } else if (le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared) { - REISERFS_SB(s)->s_mount_opt |= REISERFS_ATTRS; + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ATTRS); } } -- cgit v1.2.2 From 8c17e1eb05977283bc7ad94d16ace3a0d586921a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 3 Feb 2006 03:04:03 -0800 Subject: [PATCH] quota_v2: printk warning fixes fs/quota_v2.c: In function `v2_check_quota_file': fs/quota_v2.c:39: warning: int format, different type arg (arg 2) fs/quota_v2.c:39: warning: int format, different type arg (arg 3) Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/quota_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/quota_v2.c b/fs/quota_v2.c index a4ef91bb4f..b4199ec3ec 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -35,7 +35,7 @@ static int v2_check_quota_file(struct super_block *sb, int type) size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); if (size != sizeof(struct v2_disk_dqheader)) { - printk("quota_v2: failed read expected=%d got=%d\n", + printk("quota_v2: failed read expected=%zd got=%zd\n", sizeof(struct v2_disk_dqheader), size); return 0; } -- cgit v1.2.2 From e295cfcb2907ae4c5df57f5d4ada1ce6f3ae4657 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Fri, 3 Feb 2006 03:04:04 -0800 Subject: [PATCH] ufs: fix oops with `ufs1' type "rm" command, on file system with "ufs1" type cause system hang up. This is, in fact, not so bad as it seems to be, because of after that in "kernel control path" there are 3-4 places which may cause "oops". So the first patch fix oopses, and the second patch fix "kernel hang up". "oops" appears because of reading of group's summary info partly wrong, and access to not first group's summary info cause "oops". Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ufs/super.c b/fs/ufs/super.c index d4aacee593..e9055ef7f5 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -388,7 +388,8 @@ static int ufs_parse_options (char * options, unsigned * mount_options) /* * Read on-disk structures associated with cylinder groups */ -static int ufs_read_cylinder_structures (struct super_block *sb) { +static int ufs_read_cylinder_structures (struct super_block *sb) +{ struct ufs_sb_info * sbi = UFS_SB(sb); struct ufs_sb_private_info * uspi; struct ufs_super_block *usb; @@ -415,6 +416,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb) { base = space = kmalloc(size, GFP_KERNEL); if (!base) goto failed; + sbi->s_csp = (struct ufs_csum *)space; for (i = 0; i < blks; i += uspi->s_fpb) { size = uspi->s_bsize; if (i + uspi->s_fpb > blks) @@ -430,7 +432,6 @@ static int ufs_read_cylinder_structures (struct super_block *sb) { goto failed; ubh_ubhcpymem (space, ubh, size); - sbi->s_csp[ufs_fragstoblks(i)]=(struct ufs_csum *)space; space += size; ubh_brelse (ubh); @@ -486,7 +487,8 @@ failed: * Put on-disk structures associated with cylinder groups and * write them back to disk */ -static void ufs_put_cylinder_structures (struct super_block *sb) { +static void ufs_put_cylinder_structures (struct super_block *sb) +{ struct ufs_sb_info * sbi = UFS_SB(sb); struct ufs_sb_private_info * uspi; struct ufs_buffer_head * ubh; @@ -499,7 +501,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb) { size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; - base = space = (char*) sbi->s_csp[0]; + base = space = (char*) sbi->s_csp; for (i = 0; i < blks; i += uspi->s_fpb) { size = uspi->s_bsize; if (i + uspi->s_fpb > blks) -- cgit v1.2.2 From 09114eb8c53d2d3b2ff9523e011cb68b2e245dce Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Fri, 3 Feb 2006 03:04:06 -0800 Subject: [PATCH] ufs: fix hang during `rm' This fixes the code like this: bh = sb_find_get_block (sb, tmp + j); if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { retry = 1; brelse (bh); goto next1; } bforget (bh); sb_find_get_block() ordinarily returns a buffer_head with b_count>=2, and this code assume that in case if "b_count>1" buffer is used, so this caused infinite loop. (akpm: that is-the-buffer-busy code is incomprehensible. Good riddance. Use of block_truncate_page() seems sane). Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/inode.c | 2 +- fs/ufs/truncate.c | 72 +++++++++++++------------------------------------------ 2 files changed, 17 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e0c04e36a0..3c3f62ce2a 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -376,7 +376,7 @@ out: * This function gets the block which contains the fragment. */ -static int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { struct super_block * sb = inode->i_sb; struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 61d2e35012..02e86291ef 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -29,6 +29,11 @@ * Idea from Pierre del Perugia */ +/* + * Modified to avoid infinite loop on 2006 by + * Evgeniy Dushistov + */ + #include #include #include @@ -65,19 +70,16 @@ #define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) -#define DATA_BUFFER_USED(bh) \ - (atomic_read(&bh->b_count)>1 || buffer_locked(bh)) static int ufs_trunc_direct (struct inode * inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block * sb; struct ufs_sb_private_info * uspi; - struct buffer_head * bh; __fs32 * p; unsigned frag1, frag2, frag3, frag4, block1, block2; unsigned frag_to_free, free_count; - unsigned i, j, tmp; + unsigned i, tmp; int retry; UFSD(("ENTER\n")) @@ -117,15 +119,7 @@ static int ufs_trunc_direct (struct inode * inode) ufs_panic (sb, "ufs_trunc_direct", "internal error"); frag1 = ufs_fragnum (frag1); frag2 = ufs_fragnum (frag2); - for (j = frag1; j < frag2; j++) { - bh = sb_find_get_block (sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { - retry = 1; - brelse (bh); - goto next1; - } - bforget (bh); - } + inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift; mark_inode_dirty(inode); ufs_free_fragments (inode, tmp + frag1, frag2 - frag1); @@ -140,15 +134,7 @@ next1: tmp = fs32_to_cpu(sb, *p); if (!tmp) continue; - for (j = 0; j < uspi->s_fpb; j++) { - bh = sb_find_get_block(sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { - retry = 1; - brelse (bh); - goto next2; - } - bforget (bh); - } + *p = 0; inode->i_blocks -= uspi->s_nspb; mark_inode_dirty(inode); @@ -162,7 +148,6 @@ next1: frag_to_free = tmp; free_count = uspi->s_fpb; } -next2:; } if (free_count > 0) @@ -179,15 +164,7 @@ next2:; if (!tmp ) ufs_panic(sb, "ufs_truncate_direct", "internal error"); frag4 = ufs_fragnum (frag4); - for (j = 0; j < frag4; j++) { - bh = sb_find_get_block (sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { - retry = 1; - brelse (bh); - goto next1; - } - bforget (bh); - } + *p = 0; inode->i_blocks -= frag4 << uspi->s_nspfshift; mark_inode_dirty(inode); @@ -204,9 +181,8 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_buffer_head * ind_ubh; - struct buffer_head * bh; __fs32 * ind; - unsigned indirect_block, i, j, tmp; + unsigned indirect_block, i, tmp; unsigned frag_to_free, free_count; int retry; @@ -238,15 +214,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) tmp = fs32_to_cpu(sb, *ind); if (!tmp) continue; - for (j = 0; j < uspi->s_fpb; j++) { - bh = sb_find_get_block(sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *ind)) { - retry = 1; - brelse (bh); - goto next; - } - bforget (bh); - } + *ind = 0; ubh_mark_buffer_dirty(ind_ubh); if (free_count == 0) { @@ -261,7 +229,6 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) } inode->i_blocks -= uspi->s_nspb; mark_inode_dirty(inode); -next:; } if (free_count > 0) { @@ -430,9 +397,7 @@ void ufs_truncate (struct inode * inode) struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block * sb; struct ufs_sb_private_info * uspi; - struct buffer_head * bh; - unsigned offset; - int err, retry; + int retry; UFSD(("ENTER\n")) sb = inode->i_sb; @@ -442,6 +407,9 @@ void ufs_truncate (struct inode * inode) return; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; + + block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); + lock_kernel(); while (1) { retry = ufs_trunc_direct(inode); @@ -457,15 +425,7 @@ void ufs_truncate (struct inode * inode) blk_run_address_space(inode->i_mapping); yield(); } - offset = inode->i_size & uspi->s_fshift; - if (offset) { - bh = ufs_bread (inode, inode->i_size >> uspi->s_fshift, 0, &err); - if (bh) { - memset (bh->b_data + offset, 0, uspi->s_fsize - offset); - mark_buffer_dirty (bh); - brelse (bh); - } - } + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; ufsi->i_lastfrag = DIRECT_FRAGMENT; unlock_kernel(); -- cgit v1.2.2 From 47ba87e0b1269698801310bfd1716b0538282405 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Feb 2006 03:04:06 -0800 Subject: [PATCH] make "struct d_cookie" depend on CONFIG_PROFILING Shrinks "struct dentry" from 128 bytes to 124 on x86, allowing 31 objects per slab instead of 30. Cc: John Levon Cc: Philippe Elie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 86bdb93789..a173bba326 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -743,7 +743,9 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_op = NULL; dentry->d_fsdata = NULL; dentry->d_mounted = 0; +#ifdef CONFIG_PROFILING dentry->d_cookie = NULL; +#endif INIT_HLIST_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); -- cgit v1.2.2 From dfa08592ca0440d793ecc8dfc6277dd2aa4b8dda Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 3 Feb 2006 03:04:13 -0800 Subject: [PATCH] Fix two ext[23] uninitialized warnings There is a code path that passed size to ext2_xattr_set (ext3_xattr_set_handle) before initializing it. The callees don't use the value in that case, but gcc cannot tell. Always initialize size to get rid of the warnings. Signed-off-by: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/acl.c | 2 +- fs/ext3/acl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 35acc43b89..da52b4a5db 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -220,7 +220,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) struct ext2_inode_info *ei = EXT2_I(inode); int name_index; void *value = NULL; - size_t size; + size_t size = 0; int error; if (S_ISLNK(inode->i_mode)) diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 47a9da2dfb..0d21d558b8 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -226,7 +226,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct ext3_inode_info *ei = EXT3_I(inode); int name_index; void *value = NULL; - size_t size; + size_t size = 0; int error; if (S_ISLNK(inode->i_mode)) -- cgit v1.2.2 From bd3bfeb58aeddb660dc600ded2fa9243e0c2d12b Mon Sep 17 00:00:00 2001 From: Felix Oxley Date: Fri, 3 Feb 2006 03:04:15 -0800 Subject: [PATCH] fs/jffs/intrep.c: 255 is unsigned char Signed-off-by: Felix Oxley Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs/intrep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c index b2e95421d9..ce7b54b0b2 100644 --- a/fs/jffs/intrep.c +++ b/fs/jffs/intrep.c @@ -1965,7 +1965,7 @@ retry: iovec_cnt++; if (JFFS_GET_PAD_BYTES(raw_inode->nsize)) { - static char allff[3]={255,255,255}; + static unsigned char allff[3]={255,255,255}; /* Add some extra padding if necessary */ node_iovec[iovec_cnt].iov_base = allff; node_iovec[iovec_cnt].iov_len = -- cgit v1.2.2 From 93c615feffbcea4f09ecee154f46062f6041776e Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Fri, 3 Feb 2006 03:04:17 -0800 Subject: [PATCH] v9fs: symlink support fixes Two symlink fixes, v9fs_readlink didn't copy the last character of the symlink name, v9fs_vfs_follow_link incorrectly called strlen of newly allocated buffer instead of PATH_MAX. Signed-off-by: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 91f552454c..63e5b0398e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -886,8 +886,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) } /* copy extension buffer into buffer */ - if (fcall->params.rstat.stat.extension.len < buflen) - buflen = fcall->params.rstat.stat.extension.len; + if (fcall->params.rstat.stat.extension.len+1 < buflen) + buflen = fcall->params.rstat.stat.extension.len + 1; memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1); buffer[buflen-1] = 0; @@ -951,7 +951,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) if (!link) link = ERR_PTR(-ENOMEM); else { - len = v9fs_readlink(dentry, link, strlen(link)); + len = v9fs_readlink(dentry, link, PATH_MAX); if (len < 0) { __putname(link); -- cgit v1.2.2 From 05818a004a84951fd383694f3b35d89eb49fa308 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Fri, 3 Feb 2006 03:04:18 -0800 Subject: [PATCH] v9fs: v9fs_put_str fix v9fs_put_str used to store pointer to the source string, instead of the cbuf copy. This patch corrects it. Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/conv.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 32a9f99154..bf1f100679 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -116,13 +116,19 @@ static void buf_put_int64(struct cbuf *buf, u64 val) } } -static void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen) +static char *buf_put_stringn(struct cbuf *buf, const char *s, u16 slen) { + char *ret; + + ret = NULL; if (buf_check_size(buf, slen + 2)) { buf_put_int16(buf, slen); + ret = buf->p; memcpy(buf->p, s, slen); buf->p += slen; } + + return ret; } static inline void buf_put_string(struct cbuf *buf, const char *s) @@ -430,15 +436,19 @@ static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p) static void v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str) { - if (data) { - str->len = strlen(data); - str->str = bufp->p; - } else { - str->len = 0; - str->str = NULL; - } + int len; + char *s; + + if (data) + len = strlen(data); + else + len = 0; - buf_put_stringn(bufp, data, str->len); + s = buf_put_stringn(bufp, data, len); + if (str) { + str->len = len; + str->str = s; + } } static int -- cgit v1.2.2 From 034b91a3b66cf9d2983ac45f73162395c0936c36 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Fri, 3 Feb 2006 03:04:20 -0800 Subject: [PATCH] v9fs: fix corner cases when flushing request When v9fs_mux_rpc sends a 9P message, it may be put in the queue of unsent request. If the user process receives a signal, v9fs_mux_rpc sets the request error to ERREQFLUSH and assigns NULL to request's send message. If the message was still in the unsent queue, v9fs_write_work would produce an oops while processing it. The patch makes sure that requests that are being flushed are moved to the pending requests queue safely. If a request is being flushed, don't remove it from the list of pending requests even if it receives a reply before the flush is acknoledged. The request will be removed during from the Rflush handler (v9fs_mux_flush_cb). Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/mux.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 945cb368d4..ea1134eb47 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -471,10 +471,13 @@ static void v9fs_write_work(void *a) } spin_lock(&m->lock); - req = - list_entry(m->unsent_req_list.next, struct v9fs_req, +again: + req = list_entry(m->unsent_req_list.next, struct v9fs_req, req_list); list_move_tail(&req->req_list, &m->req_list); + if (req->err == ERREQFLUSH) + goto again; + m->wbuf = req->tcall->sdata; m->wsize = req->tcall->size; m->wpos = 0; @@ -525,7 +528,7 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) struct v9fs_str *ename; tag = req->tag; - if (req->rcall->id == RERROR && !req->err) { + if (!req->err && req->rcall->id == RERROR) { ecode = req->rcall->params.rerror.errno; ename = &req->rcall->params.rerror.error; @@ -551,7 +554,10 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) req->err = -EIO; } - if (req->cb && req->err != ERREQFLUSH) { + if (req->err == ERREQFLUSH) + return; + + if (req->cb) { dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n", req->tcall, req->rcall); @@ -812,6 +818,7 @@ v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) struct v9fs_mux_rpc *r; if (err == ERREQFLUSH) { + kfree(rc); dprintk(DEBUG_MUX, "err req flush\n"); return; } -- cgit v1.2.2 From a18546110ed6bec483d55bfffccb2487dfbd77af Mon Sep 17 00:00:00 2001 From: "schwab@suse.de" Date: Fri, 3 Feb 2006 03:04:24 -0800 Subject: [PATCH] disable per cpu intr in /proc/stat Don't compute and display the per-irq sums on ia64 either, too much overhead for mostly useless figures. Cc: Olaf Hering Acked-by: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 8f8014285a..1d24fead51 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -548,7 +548,7 @@ static int show_stat(struct seq_file *p, void *v) } seq_printf(p, "intr %llu", (unsigned long long)sum); -#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) +#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64) for (i = 0; i < NR_IRQS; i++) seq_printf(p, " %u", kstat_irqs(i)); #endif -- cgit v1.2.2 From 835417967c10b6dfaffdffddba59196196e5d431 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Fri, 3 Feb 2006 03:04:25 -0800 Subject: [PATCH] ext2: print xip mount option in ext2_show_options In case we have CONFIG_FS_XIP, ext2_show_options shows "xip" if EXT2_MOUNT_XIP mount flag is set. Signed-off-by: Carsten Otte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 8d6819846f..cb6f9bd658 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -221,6 +221,11 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",grpquota"); #endif +#if defined(CONFIG_EXT2_FS_XIP) + if (sbi->s_mount_opt & EXT2_MOUNT_XIP) + seq_puts(seq, ",xip"); +#endif + return 0; } -- cgit v1.2.2 From 35dc8161d0a6fa5e654bcb3d6240acc9ecb0a259 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 3 Feb 2006 03:04:27 -0800 Subject: [PATCH] fix O_DIRECT read of last block in a sparse file Currently, if you open a file O_DIRECT, truncate it to a size that is not a multiple of the disk block size, and then try to read the last block in the file, the read will return 0. The problem is in do_direct_IO, here: /* Handle holes */ if (!buffer_mapped(map_bh)) { char *kaddr; ... if (dio->block_in_file >= i_size_read(dio->inode)>>blkbits) { /* We hit eof */ page_cache_release(page); goto out; } We shift off any remaining bytes in the final block of the I/O, resulting in a 0-sized read. I've attached a patch that fixes this. I'm not happy about how ugly the math is getting, so suggestions are more than welcome. I've tested this with a simple program that performs the steps outlined for reproducing the problem above. Without the patch, we get a 0-sized result from read. With the patch, we get the correct return value from the short read. Signed-off-by: Jeff Moyer Cc: Badari Pulavarty Cc: Suparna Bhattacharya Cc: Mingming Cao Cc: Joel Becker Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 30dbbd1df5..848044af7e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -857,6 +857,7 @@ do_holes: /* Handle holes */ if (!buffer_mapped(map_bh)) { char *kaddr; + loff_t i_size_aligned; /* AKPM: eargh, -ENOTBLK is a hack */ if (dio->rw == WRITE) { @@ -864,8 +865,14 @@ do_holes: return -ENOTBLK; } + /* + * Be sure to account for a partial block as the + * last block in the file + */ + i_size_aligned = ALIGN(i_size_read(dio->inode), + 1 << blkbits); if (dio->block_in_file >= - i_size_read(dio->inode)>>blkbits) { + i_size_aligned >> blkbits) { /* We hit eof */ page_cache_release(page); goto out; -- cgit v1.2.2 From 7d95c8f27d9be65bf160f1edaf653d33dfceb58c Mon Sep 17 00:00:00 2001 From: dean gaudet Date: Fri, 3 Feb 2006 03:04:30 -0800 Subject: [PATCH] fcntl F_SETFL and read-only IS_APPEND files There is code in setfl() which attempts to preserve the O_APPEND flag on IS_APPEND files... however IS_APPEND files could also be opened O_RDONLY and in that case setfl() should not require O_APPEND... coreutils 5.93 tail -f attempts to set O_NONBLOCK even on regular files... unfortunately if you try this on an append-only log file the result is this: fcntl64(3, F_GETFL) = 0x8000 (flags O_RDONLY|O_LARGEFILE) fcntl64(3, F_SETFL, O_RDONLY|O_NONBLOCK|O_LARGEFILE) = -1 EPERM (Operation not permitted) I offer up the patch below as one way of fixing the problem... i've tested it fixes the problem with tail -f but haven't really tested beyond that. (I also reported the coreutils bug upstream... it shouldn't fail imho... ) Signed-off-by: dean gaudet Cc: Al Viro Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 5f96786d1c..dc4a7007f4 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -208,8 +208,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg) struct inode * inode = filp->f_dentry->d_inode; int error = 0; - /* O_APPEND cannot be cleared if the file is marked as append-only */ - if (!(arg & O_APPEND) && IS_APPEND(inode)) + /* + * O_APPEND cannot be cleared if the file is marked as append-only + * and the file is open for write. + */ + if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; /* O_NOATIME can only be set by the owner or superuser */ -- cgit v1.2.2 From 9d9c0531c91755a90b646b27bb722d59ee3eb46d Mon Sep 17 00:00:00 2001 From: Herbert Poetzl Date: Fri, 3 Feb 2006 03:04:37 -0800 Subject: [PATCH] quota: fix error code for ext2_new_inode() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The quota check in ext2_new_inode() returns ENOSPC where it should return EDQUOT instead. Signed-off-by: Herbert Pötzl Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 74714af4ae..e52765219e 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -605,7 +605,7 @@ got: insert_inode_hash(inode); if (DQUOT_ALLOC_INODE(inode)) { - err = -ENOSPC; + err = -EDQUOT; goto fail_drop; } -- cgit v1.2.2 From 5b00226d4d3aa7969d84e16f857ea100465d9c98 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 3 Feb 2006 03:04:42 -0800 Subject: [PATCH] fat: Replace an own implementation with ll_rw_block(SWRITE,) This patch replaces an own implementation with LL_RW_BLOCK(SWRITE,) which was newly added. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/misc.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 32fb0a3f1d..944652e9dd 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -196,19 +196,9 @@ EXPORT_SYMBOL_GPL(fat_date_unix2dos); int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { - int i, e, err = 0; + int i, err = 0; - for (i = 0; i < nr_bhs; i++) { - lock_buffer(bhs[i]); - if (test_clear_buffer_dirty(bhs[i])) { - get_bh(bhs[i]); - bhs[i]->b_end_io = end_buffer_write_sync; - e = submit_bh(WRITE, bhs[i]); - if (!err && e) - err = e; - } else - unlock_buffer(bhs[i]); - } + ll_rw_block(SWRITE, nr_bhs, bhs); for (i = 0; i < nr_bhs; i++) { wait_on_buffer(bhs[i]); if (buffer_eopnotsupp(bhs[i])) { -- cgit v1.2.2 From e60e5c50aa5389db86e96fc52d02bc7db3d23f4a Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 3 Feb 2006 03:04:43 -0800 Subject: [PATCH] Trivial optimization of ll_rw_block() The ll_rw_block() needs to get ref-count only if it submits a buffer(). This patch avoids the needless get/put of ref-count. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 5e4a90ee10..62cfd17dc5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2867,22 +2867,22 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) else if (test_set_buffer_locked(bh)) continue; - get_bh(bh); if (rw == WRITE || rw == SWRITE) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; + get_bh(bh); submit_bh(WRITE, bh); continue; } } else { if (!buffer_uptodate(bh)) { bh->b_end_io = end_buffer_read_sync; + get_bh(bh); submit_bh(rw, bh); continue; } } unlock_buffer(bh); - put_bh(bh); } } -- cgit v1.2.2 From 3b641407a1447759ac8159180e90ed2e4387a0b6 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 3 Feb 2006 03:04:44 -0800 Subject: [PATCH] fat: Fix truncate() write ordering The truncate() should write the file size before writing the new EOF entry. This patch fixes it. This bug was pointed out by Machida Hiroyuki. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/file.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index e99c5a73b3..88aa1ae13f 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -210,10 +210,30 @@ static int fat_free(struct inode *inode, int skip) if (MSDOS_I(inode)->i_start == 0) return 0; - /* - * Write a new EOF, and get the remaining cluster chain for freeing. - */ + fat_cache_inval_inode(inode); + wait = IS_DIRSYNC(inode); + i_start = free_start = MSDOS_I(inode)->i_start; + i_logstart = MSDOS_I(inode)->i_logstart; + + /* First, we write the new file size. */ + if (!skip) { + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + } + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + if (wait) { + err = fat_sync_inode(inode); + if (err) { + MSDOS_I(inode)->i_start = i_start; + MSDOS_I(inode)->i_logstart = i_logstart; + return err; + } + } else + mark_inode_dirty(inode); + + /* Write a new EOF, and get the remaining cluster chain for freeing. */ if (skip) { struct fat_entry fatent; int ret, fclus, dclus; @@ -244,35 +264,11 @@ static int fat_free(struct inode *inode, int skip) return ret; free_start = ret; - i_start = i_logstart = 0; - fat_cache_inval_inode(inode); - } else { - fat_cache_inval_inode(inode); - - i_start = free_start = MSDOS_I(inode)->i_start; - i_logstart = MSDOS_I(inode)->i_logstart; - MSDOS_I(inode)->i_start = 0; - MSDOS_I(inode)->i_logstart = 0; } - MSDOS_I(inode)->i_attrs |= ATTR_ARCH; - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; - if (wait) { - err = fat_sync_inode(inode); - if (err) - goto error; - } else - mark_inode_dirty(inode); inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); /* Freeing the remained cluster chain */ return fat_free_clusters(inode, free_start); - -error: - if (i_start) { - MSDOS_I(inode)->i_start = i_start; - MSDOS_I(inode)->i_logstart = i_logstart; - } - return err; } void fat_truncate(struct inode *inode) -- cgit v1.2.2 From 7656f328f68b351a8bb71ad465cedc8d0a039f9e Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Fri, 3 Feb 2006 03:04:48 -0800 Subject: [PATCH] debugfs: hard link count wrong Fix incorrect nlink of root inode for filesystems that use simple_fill_super(). Signed-off-by: Vincent Hanquez Cc: Greg KH Cc: Heiko Carstens Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/libfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 63c020e658..71fd08fa41 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -388,6 +388,7 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; + inode->i_nlink = 2; root = d_alloc_root(inode); if (!root) { iput(inode); -- cgit v1.2.2 From 99603966f5b44693901ea68cef2c1c21ce6a49c3 Mon Sep 17 00:00:00 2001 From: "KAMBAROV, ZAUR" Date: Fri, 3 Feb 2006 03:04:49 -0800 Subject: [PATCH] coverity: udf/balloc.c null deref fix It's doing if (obh) else dereference obh So presumably `obh' is never null in there. This defect was found automatically by Coverity Prevent, a static analysis tool. Signed-off-by: Zaur Kambarov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/balloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 4fae57d9d1..201049ac8a 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -579,10 +579,9 @@ static void udf_table_free_blocks(struct super_block * sb, { loffset = nextoffset; aed->lengthAllocDescs = cpu_to_le32(adsize); - if (obh) - sptr = UDF_I_DATA(inode) + nextoffset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode) - adsize; - else - sptr = obh->b_data + nextoffset - adsize; + sptr = UDF_I_DATA(inode) + nextoffset - + udf_file_entry_alloc_offset(inode) + + UDF_I_LENEATTR(inode) - adsize; dptr = nbh->b_data + sizeof(struct allocExtDesc); memcpy(dptr, sptr, adsize); nextoffset = sizeof(struct allocExtDesc) + adsize; -- cgit v1.2.2 From db9a369ec172c8251dbc6f7bf6bf13f6c5b6e7f5 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Fri, 3 Feb 2006 03:04:50 -0800 Subject: [PATCH] UDF: Fix issues reported by Coverity in namei.c This patch fixes an issue in fs/udf/namei.c reported by Coverity: Error reported(1776) CID: 1776 Checker: UNUSED_VALUE (help) File: fs/udf/namei.c Function: udf_lookup Description: Pointer returned from "udf_find_entry" is never used Patch description: remove unused variable fi. Signed-off-by: Jayachandran C. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ca732e79c4..ab9a7629d2 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -296,7 +296,7 @@ static struct dentry * udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode = NULL; - struct fileIdentDesc cfi, *fi; + struct fileIdentDesc cfi; struct udf_fileident_bh fibh; if (dentry->d_name.len > UDF_NAME_LEN-2) @@ -318,7 +318,7 @@ udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) else #endif /* UDF_RECOVERY */ - if ((fi = udf_find_entry(dir, dentry, &fibh, &cfi))) + if (udf_find_entry(dir, dentry, &fibh, &cfi)) { if (fibh.sbh != fibh.ebh) udf_release_data(fibh.ebh); -- cgit v1.2.2 From 8c5a950c9693aa24828d16dd7bc38bced3f37d48 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 6 Jan 2006 13:46:31 -0800 Subject: o Remove confusing Kconfig text for CONFIGFS_FS. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/Kconfig | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 93b5dc4082..e9749b0eec 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -883,8 +883,6 @@ config CONFIGFS_FS Both sysfs and configfs can and should exist together on the same system. One is not a replacement for the other. - If unsure, say N. - endmenu menu "Miscellaneous filesystems" -- cgit v1.2.2 From 0c6c98fb187524935a93fdd4f9a7193e7b110782 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 7 Jan 2006 20:07:02 +0100 Subject: [PATCH] OCFS2: __init / __exit problem Functions called by __init funtions mustn't be __exit. Reported by Jan-Benedict Glaw . Signed-off-by: Adrian Bunk Signed-off-by: Mark Fasheh --- fs/ocfs2/extent_map.c | 2 +- fs/ocfs2/uptodate.c | 2 +- fs/ocfs2/uptodate.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f2fb40cd29..eb2bd8a4ca 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -988,7 +988,7 @@ int __init init_ocfs2_extent_maps(void) return 0; } -void __exit exit_ocfs2_extent_maps(void) +void exit_ocfs2_extent_maps(void) { kmem_cache_destroy(ocfs2_em_ent_cachep); } diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 3a0458fd3e..50c8fb3de0 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -537,7 +537,7 @@ int __init init_ocfs2_uptodate_cache(void) return 0; } -void __exit exit_ocfs2_uptodate_cache(void) +void exit_ocfs2_uptodate_cache(void) { if (ocfs2_uptodate_cachep) kmem_cache_destroy(ocfs2_uptodate_cachep); diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index e5aacdf4ea..01cd32d26b 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -27,7 +27,7 @@ #define OCFS2_UPTODATE_H int __init init_ocfs2_uptodate_cache(void); -void __exit exit_ocfs2_uptodate_cache(void); +void exit_ocfs2_uptodate_cache(void); void ocfs2_metadata_cache_init(struct inode *inode); void ocfs2_metadata_cache_purge(struct inode *inode); -- cgit v1.2.2 From aee93ac4b7ad461255939248d0d51566cff77e05 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 9 Jan 2006 12:36:40 -0500 Subject: [PATCH] ocfs2/dlm: fix compilation on ia64 Including results in compilation failure on ia64 due to not including Including corrects the problem. Please apply. Signed-off-by: Jeff Mahoney Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index e1fdd28879..c3764f4744 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -27,7 +27,7 @@ * Boston, MA 021110-1307, USA. */ -#include +#include #include #include -- cgit v1.2.2 From 251b6eccbeff4f0f8a3509769b327705e899f5dd Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 10 Jan 2006 15:41:43 -0800 Subject: [OCFS2] Make ip_io_sem a mutex ip_io_sem is now ip_io_mutex. Signed-off-by: Mark Fasheh --- fs/ocfs2/buffer_head_io.c | 10 +++++----- fs/ocfs2/inode.c | 6 +++--- fs/ocfs2/inode.h | 4 ++-- fs/ocfs2/journal.c | 4 ++-- fs/ocfs2/super.c | 2 +- fs/ocfs2/uptodate.c | 10 +++++----- 6 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index d424041b38..bae3d7548b 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -58,7 +58,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, goto out; } - down(&OCFS2_I(inode)->ip_io_sem); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); lock_buffer(bh); set_buffer_uptodate(bh); @@ -82,7 +82,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, brelse(bh); } - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); out: mlog_exit(ret); return ret; @@ -125,13 +125,13 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, flags &= ~OCFS2_BH_CACHED; if (inode) - down(&OCFS2_I(inode)->ip_io_sem); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { if (inode) - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); status = -EIO; mlog_errno(status); goto bail; @@ -220,7 +220,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, ocfs2_set_buffer_uptodate(inode, bh); } if (inode) - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d4ecc06277..8122489c57 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -903,10 +903,10 @@ void ocfs2_clear_inode(struct inode *inode) "Clear inode of %"MLFu64", inode is locked\n", oi->ip_blkno); - mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), - "Clear inode of %"MLFu64", io_sem is locked\n", + mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), + "Clear inode of %"MLFu64", io_mutex is locked\n", oi->ip_blkno); - up(&oi->ip_io_sem); + mutex_unlock(&oi->ip_io_mutex); /* * down_trylock() returns 0, down_write_trylock() returns 1 diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 9b01774336..84c5079612 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -46,10 +46,10 @@ struct ocfs2_inode_info struct list_head ip_io_markers; int ip_orphaned_slot; - struct semaphore ip_io_sem; + struct mutex ip_io_mutex; /* Used by the journalling code to attach an inode to a - * handle. These are protected by ip_io_sem in order to lock + * handle. These are protected by ip_io_mutex in order to lock * out other I/O to the inode until we either commit or * abort. */ struct list_head ip_handle_list; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 303c8d9645..65bd69d1c7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -401,7 +401,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle, * j_trans_barrier for us. */ ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); - down(&OCFS2_I(inode)->ip_io_sem); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); switch (type) { case OCFS2_JOURNAL_ACCESS_CREATE: case OCFS2_JOURNAL_ACCESS_WRITE: @@ -416,7 +416,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle, status = -EINVAL; mlog(ML_ERROR, "Uknown access type!\n"); } - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); if (status < 0) mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 364d64bd5f..c44075d4b5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -932,7 +932,7 @@ static void ocfs2_inode_init_once(void *data, oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); - init_MUTEX(&(oi->ip_io_sem)); + mutex_init(&oi->ip_io_mutex); oi->ip_blkno = 0ULL; oi->ip_clusters = 0; diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 50c8fb3de0..300b5bedfb 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -388,7 +388,7 @@ out_free: } } -/* Item insertion is guarded by ip_io_sem, so the insertion path takes +/* Item insertion is guarded by ip_io_mutex, so the insertion path takes * advantage of this by not rechecking for a duplicate insert during * the slow case. Additionally, if the cache needs to be bumped up to * a tree, the code will not recheck after acquiring the lock -- @@ -418,7 +418,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode, (unsigned long long) bh->b_blocknr); /* No need to recheck under spinlock - insertion is guarded by - * ip_io_sem */ + * ip_io_mutex */ spin_lock(&oi->ip_lock); if (ocfs2_insert_can_use_array(oi, ci)) { /* Fast case - it's an array and there's a free @@ -440,7 +440,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode, /* Called against a newly allocated buffer. Most likely nobody should * be able to read this sort of metadata while it's still being - * allocated, but this is careful to take ip_io_sem anyway. */ + * allocated, but this is careful to take ip_io_mutex anyway. */ void ocfs2_set_new_buffer_uptodate(struct inode *inode, struct buffer_head *bh) { @@ -451,9 +451,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode, set_buffer_uptodate(bh); - down(&oi->ip_io_sem); + mutex_lock(&oi->ip_io_mutex); ocfs2_set_buffer_uptodate(inode, bh); - up(&oi->ip_io_sem); + mutex_unlock(&oi->ip_io_mutex); } /* Requires ip_lock. */ -- cgit v1.2.2 From e2faea4ce340f199c1957986c4c3dc2de76f5746 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Thu, 12 Jan 2006 14:24:55 -0800 Subject: [PATCH] ocfs2/dlm: fixes * fix a hang which can occur during shutdown migration * do not allow nodes to join during recovery * when restarting lock mastery, do not ignore nodes which come up * more than one node could become recovery master, fix this * sleep to allow some time for heartbeat state to catch up to network * extra debug info for bad recovery state problems * make DLM_RECO_NODE_DATA_DONE a valid state for non-master recovery nodes * prune all locks from dead nodes on $RECOVERY lock resources * do NOT automatically add new nodes to mle nodemaps until they have properly joined the domain * make sure dlm_pick_recovery_master only exits when all nodes have synced * properly handle dlmunlock errors in dlm_pick_recovery_master * do not propagate network errors in dlm_send_begin_reco_message * dead nodes were not being put in the recovery map sometimes, fix this * dlmunlock was failing to clear the unlock actions on DLM_DENIED Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmcommon.h | 1 + fs/ocfs2/dlm/dlmdomain.c | 18 +++- fs/ocfs2/dlm/dlmmaster.c | 24 +++-- fs/ocfs2/dlm/dlmrecovery.c | 249 ++++++++++++++++++++++++++++++++++++++------- fs/ocfs2/dlm/dlmunlock.c | 13 +++ 5 files changed, 256 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 3fecba0a60..42eb53b529 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -657,6 +657,7 @@ void dlm_complete_thread(struct dlm_ctxt *dlm); int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); void dlm_wait_for_recovery(struct dlm_ctxt *dlm); +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index da3c22045f..6ee3083738 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -573,8 +573,11 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm_domain_lock); dlm = __dlm_lookup_domain_full(query->domain, query->name_len); /* Once the dlm ctxt is marked as leaving then we don't want - * to be put in someone's domain map. */ + * to be put in someone's domain map. + * Also, explicitly disallow joining at certain troublesome + * times (ie. during recovery). */ if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + int bit = query->node_idx; spin_lock(&dlm->spinlock); if (dlm->dlm_state == DLM_CTXT_NEW && @@ -586,6 +589,19 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { /* Disallow parallel joins. */ response = JOIN_DISALLOW; + } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(ML_NOTICE, "node %u trying to join, but recovery " + "is ongoing.\n", bit); + response = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->recovery_map)) { + mlog(ML_NOTICE, "node %u trying to join, but it " + "still needs recovery.\n", bit); + response = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->domain_map)) { + mlog(ML_NOTICE, "node %u trying to join, but it " + "is still in the domain! needs recovery?\n", + bit); + response = JOIN_DISALLOW; } else { /* Alright we're fully a part of this domain * so we keep some state as to who's joining diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 27e984f7e4..a3194fe173 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1050,17 +1050,10 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, node = dlm_bitmap_diff_iter_next(&bdi, &sc); while (node >= 0) { if (sc == NODE_UP) { - /* a node came up. easy. might not even need - * to talk to it if its node number is higher - * or if we are already blocked. */ - mlog(0, "node up! %d\n", node); - if (blocked) - goto next; - - if (node > dlm->node_num) { - mlog(0, "node > this node. skipping.\n"); - goto next; - } + /* a node came up. clear any old vote from + * the response map and set it in the vote map + * then restart the mastery. */ + mlog(ML_NOTICE, "node %d up while restarting\n", node); /* redo the master request, but only for the new node */ mlog(0, "sending request to new node\n"); @@ -2005,6 +1998,15 @@ fail: break; mlog(0, "timed out during migration\n"); + /* avoid hang during shutdown when migrating lockres + * to a node which also goes down */ + if (dlm_is_node_dead(dlm, target)) { + mlog(0, "%s:%.*s: expected migration target %u " + "is no longer up. restarting.\n", + dlm->name, res->lockname.len, + res->lockname.name, target); + ret = -ERESTARTSYS; + } } if (ret == -ERESTARTSYS) { /* migration failed, detach and clean up mle */ diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0c8eb1093f..325c9f5529 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -256,6 +256,27 @@ static int dlm_recovery_thread(void *data) return 0; } +/* returns true when the recovery master has contacted us */ +static int dlm_reco_master_ready(struct dlm_ctxt *dlm) +{ + int ready; + spin_lock(&dlm->spinlock); + ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM); + spin_unlock(&dlm->spinlock); + return ready; +} + +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) +{ + int dead; + spin_lock(&dlm->spinlock); + dead = test_bit(node, dlm->domain_map); + spin_unlock(&dlm->spinlock); + return dead; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. * the dlm recovery thread will set this state when it begins @@ -297,6 +318,7 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm) static int dlm_do_recovery(struct dlm_ctxt *dlm) { int status = 0; + int ret; spin_lock(&dlm->spinlock); @@ -343,10 +365,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) goto master_here; if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { - /* choose a new master */ - if (!dlm_pick_recovery_master(dlm)) { + /* choose a new master, returns 0 if this node + * is the master, -EEXIST if it's another node. + * this does not return until a new master is chosen + * or recovery completes entirely. */ + ret = dlm_pick_recovery_master(dlm); + if (!ret) { /* already notified everyone. go. */ - dlm->reco.new_master = dlm->node_num; goto master_here; } mlog(0, "another node will master this recovery session.\n"); @@ -371,8 +396,13 @@ master_here: if (status < 0) { mlog(ML_ERROR, "error %d remastering locks for node %u, " "retrying.\n", status, dlm->reco.dead_node); + /* yield a bit to allow any final network messages + * to get handled on remaining nodes */ + msleep(100); } else { /* success! see if any other nodes need recovery */ + mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", + dlm->name, dlm->reco.dead_node, dlm->node_num); dlm_reset_recovery(dlm); } dlm_end_recovery(dlm); @@ -477,7 +507,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) BUG(); break; case DLM_RECO_NODE_DATA_DEAD: - mlog(0, "node %u died after " + mlog(ML_NOTICE, "node %u died after " "requesting recovery info for " "node %u\n", ndata->node_num, dead_node); @@ -485,6 +515,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) // start all over destroy = 1; status = -EAGAIN; + /* instead of spinning like crazy here, + * wait for the domain map to catch up + * with the network state. otherwise this + * can be hit hundreds of times before + * the node is really seen as dead. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); goto leave; case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: @@ -678,11 +721,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) dlm = item->dlm; dead_node = item->u.ral.dead_node; reco_master = item->u.ral.reco_master; + mres = (struct dlm_migratable_lockres *)data; + + if (dead_node != dlm->reco.dead_node || + reco_master != dlm->reco.new_master) { + /* show extra debug info if the recovery state is messed */ + mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " + "request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, dlm->reco.new_master, + dead_node, reco_master); + mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " + "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", + dlm->name, mres->lockname_len, mres->lockname, mres->master, + mres->num_locks, mres->total_locks, mres->flags, + mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags, + mres->ml[0].type, mres->ml[0].convert_type, + mres->ml[0].highest_blocked, mres->ml[0].node); + BUG(); + } BUG_ON(dead_node != dlm->reco.dead_node); BUG_ON(reco_master != dlm->reco.new_master); - mres = (struct dlm_migratable_lockres *)data; - /* lock resources should have already been moved to the * dlm->reco.resources list. now move items from that list * to a temp list if the dead owner matches. note that the @@ -757,15 +816,18 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) continue; switch (ndata->state) { + /* should have moved beyond INIT but not to FINALIZE yet */ case DLM_RECO_NODE_DATA_INIT: case DLM_RECO_NODE_DATA_DEAD: - case DLM_RECO_NODE_DATA_DONE: case DLM_RECO_NODE_DATA_FINALIZE_SENT: mlog(ML_ERROR, "bad ndata state for node %u:" " state=%d\n", ndata->node_num, ndata->state); BUG(); break; + /* these states are possible at this point, anywhere along + * the line of recovery */ + case DLM_RECO_NODE_DATA_DONE: case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: case DLM_RECO_NODE_DATA_REQUESTING: @@ -799,13 +861,31 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res; struct list_head *iter, *iter2; + struct dlm_lock *lock; spin_lock(&dlm->spinlock); list_for_each_safe(iter, iter2, &dlm->reco.resources) { res = list_entry (iter, struct dlm_lock_resource, recovering); + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, - res->lockname.len)) + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); continue; + } + if (res->owner == dead_node) { mlog(0, "found lockres owned by dead node while " "doing recovery for node %u. sending it.\n", @@ -1179,7 +1259,7 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) again: ret = dlm_lockres_master_requery(dlm, res, &real_master); if (ret < 0) { - mlog(0, "dlm_lockres_master_requery failure: %d\n", + mlog(0, "dlm_lockres_master_requery ret=%d\n", ret); goto again; } @@ -1757,6 +1837,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_lock_resource *res; int i; struct list_head *bucket; + struct dlm_lock *lock; /* purge any stale mles */ @@ -1780,10 +1861,25 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) bucket = &(dlm->resources[i]); list_for_each(iter, bucket) { res = list_entry (iter, struct dlm_lock_resource, list); + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, - res->lockname.len)) + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); continue; - + } spin_lock(&res->spinlock); /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, res, dead_node); @@ -1869,12 +1965,9 @@ void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) return; spin_lock(&dlm->spinlock); - set_bit(idx, dlm->live_nodes_map); - - /* notify any mles attached to the heartbeat events */ - dlm_hb_event_notify_attached(dlm, idx, 1); - + /* do NOT notify mle attached to the heartbeat events. + * new nodes are not interesting in mastery until joined. */ spin_unlock(&dlm->spinlock); dlm_put(dlm); @@ -1897,7 +1990,18 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) mlog(0, "unlockast for recovery lock fired!\n"); } - +/* + * dlm_pick_recovery_master will continually attempt to use + * dlmlock() on the special "$RECOVERY" lockres with the + * LKM_NOQUEUE flag to get an EX. every thread that enters + * this function on each node racing to become the recovery + * master will not stop attempting this until either: + * a) this node gets the EX (and becomes the recovery master), + * or b) dlm->reco.new_master gets set to some nodenum + * != O2NM_INVALID_NODE_NUM (another node will do the reco). + * so each time a recovery master is needed, the entire cluster + * will sync at this point. if the new master dies, that will + * be detected in dlm_do_recovery */ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) { enum dlm_status ret; @@ -1906,23 +2010,45 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); -retry: +again: memset(&lksb, 0, sizeof(lksb)); ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); + mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", + dlm->name, ret, lksb.status); + if (ret == DLM_NORMAL) { mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", dlm->name, dlm->node_num); - /* I am master, send message to all nodes saying - * that I am beginning a recovery session */ - status = dlm_send_begin_reco_message(dlm, - dlm->reco.dead_node); + + /* got the EX lock. check to see if another node + * just became the reco master */ + if (dlm_reco_master_ready(dlm)) { + mlog(0, "%s: got reco EX lock, but %u will " + "do the recovery\n", dlm->name, + dlm->reco.new_master); + status = -EEXIST; + } else { + status = dlm_send_begin_reco_message(dlm, + dlm->reco.dead_node); + /* this always succeeds */ + BUG_ON(status); + + /* set the new_master to this node */ + spin_lock(&dlm->spinlock); + dlm->reco.new_master = dlm->node_num; + spin_unlock(&dlm->spinlock); + } /* recovery lock is a special case. ast will not get fired, * so just go ahead and unlock it. */ ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); + if (ret == DLM_DENIED) { + mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n"); + ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm); + } if (ret != DLM_NORMAL) { /* this would really suck. this could only happen * if there was a network error during the unlock @@ -1930,20 +2056,42 @@ retry: * is actually "done" and the lock structure is * even freed. we can continue, but only * because this specific lock name is special. */ - mlog(0, "dlmunlock returned %d\n", ret); - } - - if (status < 0) { - mlog(0, "failed to send recovery message. " - "must retry with new node map.\n"); - goto retry; + mlog(ML_ERROR, "dlmunlock returned %d\n", ret); } } else if (ret == DLM_NOTQUEUED) { mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", dlm->name, dlm->node_num); /* another node is master. wait on - * reco.new_master != O2NM_INVALID_NODE_NUM */ + * reco.new_master != O2NM_INVALID_NODE_NUM + * for at most one second */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_reco_master_ready(dlm), + msecs_to_jiffies(1000)); + if (!dlm_reco_master_ready(dlm)) { + mlog(0, "%s: reco master taking awhile\n", + dlm->name); + goto again; + } + /* another node has informed this one that it is reco master */ + mlog(0, "%s: reco master %u is ready to recover %u\n", + dlm->name, dlm->reco.new_master, dlm->reco.dead_node); status = -EEXIST; + } else { + struct dlm_lock_resource *res; + + /* dlmlock returned something other than NOTQUEUED or NORMAL */ + mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), " + "lksb.status=%s\n", dlm->name, dlm_errname(ret), + dlm_errname(lksb.status)); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + BUG(); } return status; @@ -1982,7 +2130,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) mlog(0, "not sending begin reco to self\n"); continue; } - +retry: ret = -EINVAL; mlog(0, "attempting to send begin reco msg to %d\n", nodenum); @@ -1991,8 +2139,17 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) /* negative status is handled ok by caller here */ if (ret >= 0) ret = status; + if (dlm_is_host_down(ret)) { + /* node is down. not involved in recovery + * so just keep going */ + mlog(0, "%s: node %u was down when sending " + "begin reco msg (%d)\n", dlm->name, nodenum, ret); + ret = 0; + } if (ret < 0) { struct dlm_lock_resource *res; + /* this is now a serious problem, possibly ENOMEM + * in the network stack. must retry */ mlog_errno(ret); mlog(ML_ERROR, "begin reco of dlm %s to node %u " " returned %d\n", dlm->name, nodenum, ret); @@ -2004,7 +2161,10 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) } else { mlog(ML_ERROR, "recovery lock not found\n"); } - break; + /* sleep for a bit in hopes that we can avoid + * another ENOMEM */ + msleep(100); + goto retry; } } @@ -2027,19 +2187,34 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->spinlock); if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { - mlog(0, "new_master already set to %u!\n", - dlm->reco.new_master); + if (test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "%s: new_master %u died, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + } else { + mlog(0, "%s: new_master %u NOT DEAD, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + /* may not have seen the new master as dead yet */ + } } if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { - mlog(0, "dead_node already set to %u!\n", - dlm->reco.dead_node); + mlog(ML_NOTICE, "%s: dead_node previously set to %u, " + "node %u changing it to %u\n", dlm->name, + dlm->reco.dead_node, br->node_idx, br->dead_node); } dlm->reco.new_master = br->node_idx; dlm->reco.dead_node = br->dead_node; if (!test_bit(br->dead_node, dlm->recovery_map)) { - mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " + mlog(0, "recovery master %u sees %u as dead, but this " "node has not yet. marking %u as dead\n", br->node_idx, br->dead_node, br->dead_node); + if (!test_bit(br->dead_node, dlm->domain_map) || + !test_bit(br->dead_node, dlm->live_nodes_map)) + mlog(0, "%u not in domain/live_nodes map " + "so setting it in reco map manually\n", + br->dead_node); + set_bit(br->dead_node, dlm->recovery_map); __dlm_hb_node_down(dlm, br->dead_node); } spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index cec2ce1cd3..c95f08d2e9 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -188,6 +188,19 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, actions &= ~(DLM_UNLOCK_REMOVE_LOCK| DLM_UNLOCK_REGRANT_LOCK| DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* must clear the actions because this unlock + * is about to be retried. cannot free or do + * any list manipulation. */ + mlog(0, "%s:%.*s: clearing actions, %s\n", + dlm->name, res->lockname.len, + res->lockname.name, + status==DLM_RECOVERING?"recovering": + (status==DLM_MIGRATING?"migrating": + "forward")); + actions = 0; } if (flags & LKM_CANCEL) lock->cancel_pending = 0; -- cgit v1.2.2 From c74ec2f77a7763a4a56c6cb13ecab961e1bbb456 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 13 Jan 2006 21:54:23 -0800 Subject: [PATCH] ocfs2: Semaphore to mutex conversion. Semaphore to mutex conversion. The conversion was generated via scripts, and the result was validated automatically via a script as well. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Mark Fasheh --- fs/ocfs2/journal.c | 10 +++++----- fs/ocfs2/ocfs2.h | 3 ++- fs/ocfs2/super.c | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 65bd69d1c7..ccabed9a0a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1072,10 +1072,10 @@ restart: NULL); bail: - down(&osb->recovery_lock); + mutex_lock(&osb->recovery_lock); if (!status && !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); goto restart; } @@ -1083,7 +1083,7 @@ bail: mb(); /* sync with ocfs2_recovery_thread_running */ wake_up(&osb->recovery_event); - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); mlog_exit(status); /* no one is callint kthread_stop() for us so the kthread() api @@ -1098,7 +1098,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) mlog_entry("(node_num=%d, osb->node_num = %d)\n", node_num, osb->node_num); - down(&osb->recovery_lock); + mutex_lock(&osb->recovery_lock); if (osb->disable_recovery) goto out; @@ -1120,7 +1120,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) } out: - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); wake_up(&osb->recovery_event); mlog_exit_void(); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index f468c600cf..8d8e4779df 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -33,6 +33,7 @@ #include #include #include +#include #include "cluster/nodemanager.h" #include "cluster/heartbeat.h" @@ -233,7 +234,7 @@ struct ocfs2_super struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/ */ atomic_t vol_state; - struct semaphore recovery_lock; + struct mutex recovery_lock; struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c44075d4b5..e7e17bdf62 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1137,9 +1137,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* disable any new recovery threads and wait for any currently * running ones to exit. Do this before setting the vol_state. */ - down(&osb->recovery_lock); + mutex_lock(&osb->recovery_lock); osb->disable_recovery = 1; - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); /* At this point, we know that no more recovery threads can be @@ -1283,7 +1283,7 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); - init_MUTEX(&osb->recovery_lock); + mutex_init(&osb->recovery_lock); osb->disable_recovery = 0; osb->recovery_thread_task = NULL; -- cgit v1.2.2 From b4c7f538508adcde7a0a5162faec0b2ab19b90bd Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 14 Jan 2006 20:55:10 +0100 Subject: [PATCH] fs/ocfs2/dlm/dlmrecovery.c must #include fs/ocfs2/dlm/dlmrecovery.c does now use msleep(), and does therefore need to #include for getting the prototype of this function. Signed-off-by: Adrian Bunk Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmrecovery.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 325c9f5529..186e9a76aa 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "cluster/heartbeat.h" -- cgit v1.2.2 From ebdec83ba46c123fe3bfdcaacf62d0dfe8fe4187 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn / snakebyte Date: Fri, 27 Jan 2006 10:32:52 +0100 Subject: [PATCH] BUG_ON() Conversion in fs/ocfs2/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Mark Fasheh --- fs/ocfs2/extent_map.c | 10 ++++------ fs/ocfs2/journal.c | 12 ++++-------- fs/ocfs2/super.c | 3 +-- fs/ocfs2/sysfile.c | 6 ++---- 4 files changed, 11 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index eb2bd8a4ca..b6ba292e95 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -262,8 +262,7 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, el = &eb->h_list; } - if (el->l_tree_depth) - BUG(); + BUG_ON(el->l_tree_depth); for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { rec = &el->l_recs[i]; @@ -364,8 +363,8 @@ static int ocfs2_extent_map_lookup_read(struct inode *inode, return ret; } - if (ent->e_tree_depth) - BUG(); /* FIXME: Make sure this isn't a corruption */ + /* FIXME: Make sure this isn't a corruption */ + BUG_ON(ent->e_tree_depth); *ret_ent = ent; @@ -423,8 +422,7 @@ static int ocfs2_extent_map_try_insert(struct inode *inode, le32_to_cpu(rec->e_clusters), NULL, NULL); - if (!old_ent) - BUG(); + BUG_ON(!old_ent); ret = -EEXIST; if (old_ent->e_tree_depth < tree_depth) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ccabed9a0a..b71b3385fd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -147,8 +147,7 @@ struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, mlog_entry("(max_buffs = %d)\n", max_buffs); - if (!osb || !osb->journal->j_journal) - BUG(); + BUG_ON(!osb || !osb->journal->j_journal); if (ocfs2_is_hard_readonly(osb)) { ret = -EROFS; @@ -672,8 +671,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) mlog_entry_void(); - if (!osb) - BUG(); + BUG_ON(!osb); journal = osb->journal; if (!journal) @@ -805,8 +803,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) mlog_entry_void(); - if (!journal) - BUG(); + BUG_ON(!journal); status = journal_wipe(journal->j_journal, full); if (status < 0) { @@ -1271,8 +1268,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* Should not ever be called to recover ourselves -- in that * case we should've called ocfs2_journal_load instead. */ - if (osb->node_num == node_num) - BUG(); + BUG_ON(osb->node_num == node_num); slot_num = ocfs2_node_num_to_slot(si, node_num); if (slot_num == OCFS2_INVALID_SLOT) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e7e17bdf62..046824b6b6 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1254,8 +1254,7 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->sb = sb; /* Save off for ocfs2_rw_direct */ osb->s_sectsize_bits = blksize_bits(sector_size); - if (!osb->s_sectsize_bits) - BUG(); + BUG_ON(!osb->s_sectsize_bits); osb->net_response_ids = 0; spin_lock_init(&osb->net_response_lock); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 600a8bc5b5..fc29cb7a43 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -77,8 +77,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); - if (!inode) - BUG(); + BUG_ON(!inode); return inode; } @@ -89,8 +88,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, /* add one more if putting into array for first time */ if (arr && inode) { *arr = igrab(inode); - if (!*arr) - BUG(); + BUG_ON(!*arr); } return inode; } -- cgit v1.2.2 From 215c7f9fa11d3fc6ccd2df242d259c721ec7ae6a Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 1 Feb 2006 16:42:10 -0800 Subject: [PATCH] ocfs2: fix compile warnings Fix a couple of compile warnings found when compiling on a ppc64 build box. Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/heartbeat.c | 5 +++-- fs/ocfs2/cluster/tcp.c | 16 +++++++++------- fs/ocfs2/file.c | 10 ++++++---- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 7307ba5289..d08971d29b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -917,8 +917,9 @@ static int o2hb_thread(void *data) elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", - before_hb.tv_sec, before_hb.tv_usec, - after_hb.tv_sec, after_hb.tv_usec, elapsed_msec); + before_hb.tv_sec, (unsigned long) before_hb.tv_usec, + after_hb.tv_sec, (unsigned long) after_hb.tv_usec, + elapsed_msec); if (elapsed_msec < reg->hr_timeout_ms) { /* the kthread api has blocked signals for us so no diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 35d92c01a9..d22d4cf08d 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1285,14 +1285,16 @@ static void o2net_idle_timer(unsigned long data) mlog(ML_NOTICE, "here are some times that might help debug the " "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", - sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec, - now.tv_sec, now.tv_usec, - sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec, - sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec, - sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec, + sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, + now.tv_sec, (long) now.tv_usec, + sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, + sc->sc_tv_advance_start.tv_sec, + (long) sc->sc_tv_advance_start.tv_usec, + sc->sc_tv_advance_stop.tv_sec, + (long) sc->sc_tv_advance_stop.tv_usec, sc->sc_msg_key, sc->sc_msg_type, - sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec, - sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec); + sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, + sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index eaf33caa0a..1715bc90e7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1022,8 +1022,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, } newsize = count + saved_pos; - mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", - saved_pos, newsize, i_size_read(inode)); + mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", + (long long) saved_pos, (long long) newsize, + (long long) i_size_read(inode)); /* No need for a higher level metadata lock if we're * never going past i_size. */ @@ -1042,8 +1043,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, spin_unlock(&OCFS2_I(inode)->ip_lock); mlog(0, "Writing at EOF, may need more allocation: " - "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", - i_size_read(inode), newsize, clusters); + "i_size = %lld, newsize = %lld, need %u clusters\n", + (long long) i_size_read(inode), (long long) newsize, + clusters); /* We only want to continue the rest of this loop if * our extend will actually require more -- cgit v1.2.2 From 3d0f89bb169482d26d5aa4e82e763077e7e9bc4d Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 25 Jan 2006 13:31:07 -0800 Subject: configfs: Add permission and ownership to configfs objects. configfs always made item and attribute ownership root.root and permissions based on a umask of 022. Add ->setattr() to allow chown(2)/chmod(2), and persist the changes for the lifetime of the items and attributes. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/configfs_internal.h | 11 ++-- fs/configfs/dir.c | 36 +++++++++---- fs/configfs/file.c | 19 +++---- fs/configfs/inode.c | 117 ++++++++++++++++++++++++++++++++++++---- fs/configfs/mount.c | 28 ++++++++-- fs/configfs/symlink.c | 1 + 6 files changed, 176 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 8899d9c5f6..f70e46951b 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -36,6 +36,7 @@ struct configfs_dirent { int s_type; umode_t s_mode; struct dentry * s_dentry; + struct iattr * s_iattr; }; #define CONFIGFS_ROOT 0x0001 @@ -48,10 +49,11 @@ struct configfs_dirent { #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) extern struct vfsmount * configfs_mount; +extern kmem_cache_t *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); -extern struct inode * configfs_new_inode(mode_t mode); +extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *); extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); @@ -63,6 +65,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); +extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); extern int configfs_pin_fs(void); extern void configfs_release_fs(void); @@ -120,8 +123,10 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry static inline void release_configfs_dirent(struct configfs_dirent * sd) { - if (!(sd->s_type & CONFIGFS_ROOT)) - kfree(sd); + if (!(sd->s_type & CONFIGFS_ROOT)) { + kfree(sd->s_iattr); + kmem_cache_free(configfs_dir_cachep, sd); + } } static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b668ec6152..ca60e3abef 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -72,7 +72,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare { struct configfs_dirent * sd; - sd = kmalloc(sizeof(*sd), GFP_KERNEL); + sd = kmem_cache_alloc(configfs_dir_cachep, GFP_KERNEL); if (!sd) return NULL; @@ -136,13 +136,19 @@ static int create_dir(struct config_item * k, struct dentry * p, int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; - error = configfs_create(d, mode, init_dir); + error = configfs_make_dirent(p->d_fsdata, d, k, mode, + CONFIGFS_DIR); if (!error) { - error = configfs_make_dirent(p->d_fsdata, d, k, mode, - CONFIGFS_DIR); + error = configfs_create(d, mode, init_dir); if (!error) { p->d_inode->i_nlink++; (d)->d_op = &configfs_dentry_ops; + } else { + struct configfs_dirent *sd = d->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + configfs_put(sd); + } } } return error; @@ -182,12 +188,19 @@ int configfs_create_link(struct configfs_symlink *sl, int err = 0; umode_t mode = S_IFLNK | S_IRWXUGO; - err = configfs_create(dentry, mode, init_symlink); + err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, + CONFIGFS_ITEM_LINK); if (!err) { - err = configfs_make_dirent(parent->d_fsdata, dentry, sl, - mode, CONFIGFS_ITEM_LINK); + err = configfs_create(dentry, mode, init_symlink); if (!err) dentry->d_op = &configfs_dentry_ops; + else { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + configfs_put(sd); + } + } } return err; } @@ -241,13 +254,15 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den struct configfs_attribute * attr = sd->s_element; int error; + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file); - if (error) + if (error) { + configfs_put(sd); return error; + } dentry->d_op = &configfs_dentry_ops; - dentry->d_fsdata = configfs_get(sd); - sd->s_dentry = dentry; d_rehash(dentry); return 0; @@ -839,6 +854,7 @@ struct inode_operations configfs_dir_inode_operations = { .symlink = configfs_symlink, .unlink = configfs_unlink, .lookup = configfs_lookup, + .setattr = configfs_setattr, }; #if 0 diff --git a/fs/configfs/file.c b/fs/configfs/file.c index c26cd61f13..3921920d87 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -150,7 +149,7 @@ out: /** * fill_write_buffer - copy buffer from userspace. * @buffer: data buffer for file. - * @userbuf: data from user. + * @buf: data from user. * @count: number of bytes in @userbuf. * * Allocate @buffer->page if it hasn't been already, then @@ -177,8 +176,9 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size /** * flush_write_buffer - push buffer to config_item. - * @file: file pointer. + * @dentry: dentry to the attribute * @buffer: data buffer for file. + * @count: number of bytes * * Get the correct pointers for the config_item and the attribute we're * dealing with, then call the store() method for the attribute, @@ -217,15 +217,16 @@ static ssize_t configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct configfs_buffer * buffer = file->private_data; + ssize_t len; down(&buffer->sem); - count = fill_write_buffer(buffer,buf,count); - if (count > 0) - count = flush_write_buffer(file->f_dentry,buffer,count); - if (count > 0) - *ppos += count; + len = fill_write_buffer(buffer, buf, count); + if (len > 0) + len = flush_write_buffer(file->f_dentry, buffer, count); + if (len > 0) + *ppos += len; up(&buffer->sem); - return count; + return len; } static int check_perm(struct inode * inode, struct file * file) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 6577c588de..737842f276 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "configfs_internal.h" @@ -48,18 +49,107 @@ static struct backing_dev_info configfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, }; -struct inode * configfs_new_inode(mode_t mode) +static struct inode_operations configfs_inode_operations ={ + .setattr = configfs_setattr, +}; + +int configfs_setattr(struct dentry * dentry, struct iattr * iattr) +{ + struct inode * inode = dentry->d_inode; + struct configfs_dirent * sd = dentry->d_fsdata; + struct iattr * sd_iattr; + unsigned int ia_valid = iattr->ia_valid; + int error; + + if (!sd) + return -EINVAL; + + sd_iattr = sd->s_iattr; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + error = inode_setattr(inode, iattr); + if (error) + return error; + + if (!sd_iattr) { + /* setting attributes for the first time, allocate now */ + sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL); + if (!sd_iattr) + return -ENOMEM; + /* assign default attributes */ + memset(sd_iattr, 0, sizeof(struct iattr)); + sd_iattr->ia_mode = sd->s_mode; + sd_iattr->ia_uid = 0; + sd_iattr->ia_gid = 0; + sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; + sd->s_iattr = sd_iattr; + } + + /* attributes were changed atleast once in past */ + + if (ia_valid & ATTR_UID) + sd_iattr->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + sd_iattr->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + sd_iattr->ia_mode = sd->s_mode = mode; + } + + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +{ + inode->i_mode = mode; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_mode = iattr->ia_mode; + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->a_ops = &configfs_aops; inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + inode->i_op = &configfs_inode_operations; + + if (sd->s_iattr) { + /* sysfs_dirent has non-default attributes + * get them for the new inode from persistent copy + * in sysfs_dirent + */ + set_inode_attr(inode, sd->s_iattr); + } else + set_default_inode_attr(inode, mode); } return inode; } @@ -70,7 +160,8 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode * struct inode * inode = NULL; if (dentry) { if (!dentry->d_inode) { - if ((inode = configfs_new_inode(mode))) { + struct configfs_dirent *sd = dentry->d_fsdata; + if ((inode = configfs_new_inode(mode, sd))) { if (dentry->d_parent && dentry->d_parent->d_inode) { struct inode *p_inode = dentry->d_parent->d_inode; p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; @@ -103,7 +194,7 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode * */ const unsigned char * configfs_get_name(struct configfs_dirent *sd) { - struct attribute * attr; + struct configfs_attribute *attr; if (!sd || !sd->s_element) BUG(); @@ -114,7 +205,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd) if (sd->s_type & CONFIGFS_ITEM_ATTR) { attr = sd->s_element; - return attr->name; + return attr->ca_name; } return NULL; } @@ -130,13 +221,17 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { dget_locked(dentry); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - } else + } else { + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + } } } @@ -145,6 +240,10 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) struct configfs_dirent * sd; struct configfs_dirent * parent_sd = dir->d_fsdata; + if (dir->d_inode == NULL) + /* no inode means this hasn't been made visible yet */ + return; + mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 1a2f6f6a4d..f920d30478 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -38,6 +38,7 @@ struct vfsmount * configfs_mount = NULL; struct super_block * configfs_sb = NULL; +kmem_cache_t *configfs_dir_cachep; static int configfs_mnt_count = 0; static struct super_operations configfs_ops = { @@ -62,6 +63,7 @@ static struct configfs_dirent configfs_root = { .s_children = LIST_HEAD_INIT(configfs_root.s_children), .s_element = &configfs_root_group.cg_item, .s_type = CONFIGFS_ROOT, + .s_iattr = NULL, }; static int configfs_fill_super(struct super_block *sb, void *data, int silent) @@ -73,9 +75,11 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = CONFIGFS_MAGIC; sb->s_op = &configfs_ops; + sb->s_time_gran = 1; configfs_sb = sb; - inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); + inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &configfs_root); if (inode) { inode->i_op = &configfs_dir_inode_operations; inode->i_fop = &configfs_dir_operations; @@ -128,19 +132,31 @@ static decl_subsys(config, NULL, NULL); static int __init configfs_init(void) { - int err; + int err = -ENOMEM; + + configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", + sizeof(struct configfs_dirent), + 0, 0, NULL, NULL); + if (!configfs_dir_cachep) + goto out; kset_set_kset_s(&config_subsys, kernel_subsys); err = subsystem_register(&config_subsys); - if (err) - return err; + if (err) { + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + goto out; + } err = register_filesystem(&configfs_fs_type); if (err) { printk(KERN_ERR "configfs: Unable to register filesystem!\n"); subsystem_unregister(&config_subsys); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; } +out: return err; } @@ -148,11 +164,13 @@ static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); subsystem_unregister(&config_subsys); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; } MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.0.1"); +MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); module_init(configfs_init); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 50f5840521..99137026b4 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -277,5 +277,6 @@ struct inode_operations configfs_symlink_inode_operations = { .follow_link = configfs_follow_link, .readlink = generic_readlink, .put_link = configfs_put_link, + .setattr = configfs_setattr, }; -- cgit v1.2.2 From 1a1974fd4533afdb73873cdacb942d9a79ff7c9b Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn / snakebyte Date: Fri, 27 Jan 2006 10:32:24 +0100 Subject: [PATCH] BUG_ON() Conversion in fs/configfs/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/inode.c | 3 +-- fs/configfs/symlink.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 737842f276..c153bd9534 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -196,8 +196,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd) { struct configfs_attribute *attr; - if (!sd || !sd->s_element) - BUG(); + BUG_ON(!sd || !sd->s_element); /* These always have a dentry, so use that */ if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 99137026b4..e5512e295c 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -162,8 +162,7 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) if (!(sd->s_type & CONFIGFS_ITEM_LINK)) goto out; - if (dentry->d_parent == configfs_sb->s_root) - BUG(); + BUG_ON(dentry->d_parent == configfs_sb->s_root); sl = sd->s_element; -- cgit v1.2.2 From 6eff5790d57a5c9c01489c95946881808a4b2a2c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 18 Jan 2006 10:31:47 -0800 Subject: [PATCH] ocfs2: don't wait on recovery when locking journal The mount path had incorrectly asked the locking code to wait for recovery completion, which deadlocks things because recovery waits for mount to complete first. Signed-off-by: Mark Fasheh --- fs/ocfs2/journal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b71b3385fd..fa0bcac5ce 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -560,7 +560,11 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) SET_INODE_JOURNAL(inode); OCFS2_I(inode)->ip_open_count++; - status = ocfs2_meta_lock(inode, NULL, &bh, 1); + /* Skip recovery waits here - journal inode metadata never + * changes in a live cluster so it can be considered an + * exception to the rule. */ + status = ocfs2_meta_lock_full(inode, NULL, &bh, 1, + OCFS2_META_LOCK_RECOVERY); if (status < 0) { if (status != -ERESTARTSYS) mlog(ML_ERROR, "Could not get lock on journal!\n"); -- cgit v1.2.2 From 88a2a4ac6b671a4b0dd5d2d762418904c05f4104 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Feb 2006 23:27:36 -0800 Subject: [PATCH] percpu data: only iterate over possible CPUs percpu_data blindly allocates bootmem memory to store NR_CPUS instances of cpudata, instead of allocating memory only for possible cpus. As a preparation for changing that, we need to convert various 0 -> NR_CPUS loops to use for_each_cpu(). (The above only applies to users of asm-generic/percpu.h. powerpc has gone it alone and is presently only allocating memory for present CPUs, so it's currently corrupting memory). Signed-off-by: Eric Dumazet Cc: "David S. Miller" Cc: James Bottomley Acked-by: Ingo Molnar Cc: Jens Axboe Cc: Anton Blanchard Acked-by: William Irwin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index fd066b261c..cea7cbea11 100644 --- a/fs/file.c +++ b/fs/file.c @@ -379,7 +379,6 @@ static void __devinit fdtable_defer_list_init(int cpu) void __init files_defer_init(void) { int i; - /* Really early - can't use for_each_cpu */ - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) fdtable_defer_list_init(i); } -- cgit v1.2.2 From 7128ec2a747d7a5f3c764c37bef17081ccc2374c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 4 Feb 2006 23:27:40 -0800 Subject: [PATCH] fuse: fix request_end() vs fuse_reset_request() race The last fix for this function in fact opened up a much more often triggering race. It was uncommented tricky code, that was buggy. Add comment, make it less tricky and fix bug. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4526da8907..f556a0d5c0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -120,9 +120,9 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc) return do_get_request(fc); } +/* Must be called with fuse_lock held */ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) { - spin_lock(&fuse_lock); if (req->preallocated) { atomic_dec(&fc->num_waiting); list_add(&req->list, &fc->unused_list); @@ -134,10 +134,18 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) fc->outstanding_debt--; else up(&fc->outstanding_sem); - spin_unlock(&fuse_lock); } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (atomic_dec_and_test(&req->count)) { + spin_lock(&fuse_lock); + fuse_putback_request(fc, req); + spin_unlock(&fuse_lock); + } +} + +static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) fuse_putback_request(fc, req); @@ -163,26 +171,36 @@ void fuse_release_background(struct fuse_req *req) * still waiting), the 'end' callback is called if given, else the * reference to the request is released * + * Releasing extra reference for foreground requests must be done + * within the same locked region as setting state to finished. This + * is because fuse_reset_request() may be called after request is + * finished and it must be the sole possessor. If request is + * interrupted and put in the background, it will return with an error + * and hence never be reset and reused. + * * Called with fuse_lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - req->end = NULL; list_del(&req->list); req->state = FUSE_REQ_FINISHED; - spin_unlock(&fuse_lock); - if (req->background) { + if (!req->background) { + wake_up(&req->waitq); + fuse_put_request_locked(fc, req); + spin_unlock(&fuse_lock); + } else { + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + req->end = NULL; + spin_unlock(&fuse_lock); down_read(&fc->sbput_sem); if (fc->mounted) fuse_release_background(req); up_read(&fc->sbput_sem); + if (end) + end(fc, req); + else + fuse_put_request(fc, req); } - wake_up(&req->waitq); - if (end) - end(fc, req); - else - fuse_put_request(fc, req); } /* -- cgit v1.2.2 From fe1dcbc4f311c2e6c23b33c0fa8572461618ab3e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 4 Feb 2006 23:27:54 -0800 Subject: [PATCH] jbd: fix transaction batching Ben points out that: When writing files out using O_SYNC, jbd's 1 jiffy delay results in a significant drop in throughput as the disk sits idle. The patch below results in a 4-5x performance improvement (from 6.5MB/s to ~24-30MB/s on my IDE test box) when writing out files using O_SYNC. So optimise the batching code by omitting it entirely if the process which is doing a sync write is the same as the one which did the most recent sync write. If that's true, we're unlikely to get any other processes joining the transaction. (Has been in -mm for ages - it took me a long time to get on to performance testing it) Numbers, on write-cache-disabled IDE: /usr/bin/time -p synctest -n 10 -uf -t 1 -p 1 dir-name Unpatched: 40 seconds Patched: 35 seconds Batching disabled: 35 seconds This is the problematic single-process-doing-fsync case. With multiple fsyncing processes the numbers are AFACIT unaltered by the patch. Aside: performance testing and instrumentation shows that the transaction batching almost doesn't help (testing with synctest -n 1 -uf -t 100 -p 10 dir-name on non-writeback-caching IDE). This is because by the time one process is running a synchronous commit, a bunch of other processes already have a transaction handle open, so they're all going to batch into the same transaction anyway. The batching seems to offer maybe 5-10% speedup with this workload, but I'm pretty sure it was more important than that when it was first developed 4-odd years ago... Cc: "Stephen C. Tweedie" Cc: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/transaction.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 429f4b263c..ca917973c2 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1308,6 +1308,7 @@ int journal_stop(handle_t *handle) transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; int old_handle_count, err; + pid_t pid; J_ASSERT(transaction->t_updates > 0); J_ASSERT(journal_current_handle() == handle); @@ -1333,8 +1334,15 @@ int journal_stop(handle_t *handle) * It doesn't cost much - we're about to run a commit and sleep * on IO anyway. Speeds up many-threaded, many-dir operations * by 30x or more... + * + * But don't do this if this process was the most recent one to + * perform a synchronous write. We do this to detect the case where a + * single process is doing a stream of sync writes. No point in waiting + * for joiners in that case. */ - if (handle->h_sync) { + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + journal->j_last_sync_writer = pid; do { old_handle_count = transaction->t_handle_count; schedule_timeout_uninterruptible(1); -- cgit v1.2.2 From f55eab822b93864ef4eef3bd7eadac2a727c914b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 4 Feb 2006 23:28:01 -0800 Subject: [PATCH] VFS: Ensure LOOKUP_CONTINUE flag is preserved by link_path_walk() When walking a path, the LOOKUP_CONTINUE flag is used by some filesystems (for instance NFS) in order to determine whether or not it is looking up the last component of the path. It this is the case, it may have to look at the intent information in order to perform various tasks such as atomic open. A problem currently occurs when link_path_walk() hits a symlink. In this case LOOKUP_CONTINUE may be cleared prematurely when we hit the end of the path passed by __vfs_follow_link() (i.e. the end of the symlink path) rather than when we hit the end of the path passed by the user. The solution is to have link_path_walk() clear LOOKUP_CONTINUE if and only if that flag was unset when we entered the function. Signed-off-by: Trond Myklebust Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 7ac9fb4acb..b760e1e18b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -790,7 +790,7 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd) inode = nd->dentry->d_inode; if (nd->depth) - lookup_flags = LOOKUP_FOLLOW; + lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); /* At this point we know we have a real path component. */ for(;;) { @@ -885,7 +885,8 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd) last_with_slashes: lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; last_component: - nd->flags &= ~LOOKUP_CONTINUE; + /* Clear LOOKUP_CONTINUE iff it was previously unset */ + nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; if (lookup_flags & LOOKUP_PARENT) goto lookup_parent; if (this.name[0] == '.') switch (this.len) { -- cgit v1.2.2 From 170aa3d02614ae621d54af10555e2f48977ae8de Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 4 Feb 2006 23:28:02 -0800 Subject: [PATCH] namei.c: unlock missing in error case Signed-off-by: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index b760e1e18b..faf61c3530 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1070,6 +1070,8 @@ static int fastcall do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval = 0; + int fput_needed; + struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags; @@ -1091,29 +1093,22 @@ static int fastcall do_path_lookup(int dfd, const char *name, nd->mnt = mntget(current->fs->pwdmnt); nd->dentry = dget(current->fs->pwd); } else { - struct file *file; - int fput_needed; struct dentry *dentry; file = fget_light(dfd, &fput_needed); - if (!file) { - retval = -EBADF; - goto out_fail; - } + retval = -EBADF; + if (!file) + goto unlock_fail; dentry = file->f_dentry; - if (!S_ISDIR(dentry->d_inode->i_mode)) { - retval = -ENOTDIR; - fput_light(file, fput_needed); - goto out_fail; - } + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_unlock_fail; retval = file_permission(file, MAY_EXEC); - if (retval) { - fput_light(file, fput_needed); - goto out_fail; - } + if (retval) + goto fput_unlock_fail; nd->mnt = mntget(file->f_vfsmnt); nd->dentry = dget(dentry); @@ -1127,7 +1122,12 @@ out: if (unlikely(current->audit_context && nd && nd->dentry && nd->dentry->d_inode)) audit_inode(name, nd->dentry->d_inode, flags); -out_fail: + return retval; + +fput_unlock_fail: + fput_light(file, fput_needed); +unlock_fail: + read_unlock(¤t->fs->lock); return retval; } -- cgit v1.2.2 From 64419d93a5906600af5817ad0cae3c6ecf7fb389 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 5 Feb 2006 21:43:57 +0000 Subject: NTFS: We have struct kmem_cache now so use it instead of the typedef. Signed-off-by: Pekka Enberg Signed-off-by: Anton Altaparmakov --- fs/ntfs/ntfs.h | 10 +++++----- fs/ntfs/super.c | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h index 446b501411..653d2a5c48 100644 --- a/fs/ntfs/ntfs.h +++ b/fs/ntfs/ntfs.h @@ -50,11 +50,11 @@ typedef enum { /* Global variables. */ /* Slab caches (from super.c). */ -extern kmem_cache_t *ntfs_name_cache; -extern kmem_cache_t *ntfs_inode_cache; -extern kmem_cache_t *ntfs_big_inode_cache; -extern kmem_cache_t *ntfs_attr_ctx_cache; -extern kmem_cache_t *ntfs_index_ctx_cache; +extern struct kmem_cache *ntfs_name_cache; +extern struct kmem_cache *ntfs_inode_cache; +extern struct kmem_cache *ntfs_big_inode_cache; +extern struct kmem_cache *ntfs_attr_ctx_cache; +extern struct kmem_cache *ntfs_index_ctx_cache; /* The various operations structs defined throughout the driver files. */ extern struct address_space_operations ntfs_aops; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index c3a3f1a831..e9c0d80dfa 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2987,14 +2987,14 @@ err_out_now: * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN * (255) Unicode characters + a terminating NULL Unicode character. */ -kmem_cache_t *ntfs_name_cache; +struct kmem_cache *ntfs_name_cache; /* Slab caches for efficient allocation/deallocation of inodes. */ -kmem_cache_t *ntfs_inode_cache; -kmem_cache_t *ntfs_big_inode_cache; +struct kmem_cache *ntfs_inode_cache; +struct kmem_cache *ntfs_big_inode_cache; /* Init once constructor for the inode slab cache. */ -static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep, +static void ntfs_big_inode_init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) { ntfs_inode *ni = (ntfs_inode *)foo; @@ -3008,8 +3008,8 @@ static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep, * Slab caches to optimize allocations and deallocations of attribute search * contexts and index contexts, respectively. */ -kmem_cache_t *ntfs_attr_ctx_cache; -kmem_cache_t *ntfs_index_ctx_cache; +struct kmem_cache *ntfs_attr_ctx_cache; +struct kmem_cache *ntfs_index_ctx_cache; /* Driver wide semaphore. */ DECLARE_MUTEX(ntfs_lock); -- cgit v1.2.2 From 276e0c75f1e9a8b34b7b19e8fe188be958d420dd Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Wed, 25 Jan 2006 14:49:13 +0100 Subject: [PATCH] debugfs: trivial comment fix Fix trivial type mixup in the debugfs function comments. Signed-off-by: Vincent Hanquez Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index efc97d9b78..d575452cd9 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -56,7 +56,7 @@ static u64 debugfs_u8_get(void *data) DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); /** - * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. + * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write an unsigned 8 bit value. * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have @@ -98,7 +98,7 @@ static u64 debugfs_u16_get(void *data) DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); /** - * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. + * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write an unsigned 16 bit value. * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have @@ -140,7 +140,7 @@ static u64 debugfs_u32_get(void *data) DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); /** - * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. + * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write an unsigned 32 bit value. * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have -- cgit v1.2.2 From 9fddaca2293d768eb21ea115e5eedec7f1c13c1c Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 7 Feb 2006 20:27:24 +1100 Subject: [XFS] Account for the page we just wrote when we detect congestion during the clustering of extra pages in a buffered write. SGI-PV: 949210 SGI-Modid: xfs-linux-melb:xfs-kern:25130a Signed-off-by: David Chinner Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 9892268e30..8f2beec526 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -747,10 +747,11 @@ xfs_convert_page( struct backing_dev_info *bdi; bdi = inode->i_mapping->backing_dev_info; + wbc->nr_to_write--; if (bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; - } else if (--wbc->nr_to_write <= 0) { + } else if (wbc->nr_to_write <= 0) { done = 1; } } -- cgit v1.2.2 From 9bd6f13dfd1dfb2e8f20df50581ebe7344ba97bd Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 7 Feb 2006 20:27:44 +1100 Subject: [XFS] Fix missing inode atime update from the utime syscall. SGI-PV: 949214 SGI-Modid: xfs-linux-melb:xfs-kern:25136a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index eda7919b70..d7f6f2d8ac 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,6 +673,8 @@ linvfs_setattr( if (ia_valid & ATTR_ATIME) { vattr.va_mask |= XFS_AT_ATIME; vattr.va_atime = attr->ia_atime; + if (ia_valid & ATTR_ATIME_SET) + inode->i_atime = attr->ia_atime; } if (ia_valid & ATTR_MTIME) { vattr.va_mask |= XFS_AT_MTIME; -- cgit v1.2.2 From cbd0d51a3318583fabf03bccc7a987e158482361 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 7 Feb 2006 12:58:32 -0800 Subject: [PATCH] knfsd: fix nfs4_open lock leak I just noticed that my patch "don't create on open that fails due to ERR_GRACE" (recently commited as fb553c0f17444e090db951b96df4d2d71b4f4b6b) had an obvious problem that causes a deadlock on reboot recovery. Sending in this now since it seems like a clear 2.6.16 candidate.--b. We're returning with a lock held in some error cases. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4proc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a00fe86862..6d63f1d9e5 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -195,10 +195,12 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open /* Openowner is now set, so sequence id will get bumped. Now we need * these checks before we do any creates: */ + status = nfserr_grace; if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) - return nfserr_grace; + goto out; + status = nfserr_no_grace; if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) - return nfserr_no_grace; + goto out; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: -- cgit v1.2.2 From 3bc8414b079ec372485c99ed1f33c6c42ca9d756 Mon Sep 17 00:00:00 2001 From: Suzuki Date: Tue, 7 Feb 2006 12:58:36 -0800 Subject: [PATCH] Fix do_path_lookup() to add the check for error in link_path_walk() Fix do_path_lookup() to avoid accessing invalid dentry or inode when the link_path_walk() has failed. This should fix Bugme #5897. Signed-off-by: Suzuki K P Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index faf61c3530..e28de846c5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1119,9 +1119,11 @@ static int fastcall do_path_lookup(int dfd, const char *name, current->total_link_count = 0; retval = link_path_walk(name, nd); out: - if (unlikely(current->audit_context - && nd && nd->dentry && nd->dentry->d_inode)) + if (likely(retval == 0)) { + if (unlikely(current->audit_context && nd && nd->dentry && + nd->dentry->d_inode)) audit_inode(name, nd->dentry->d_inode, flags); + } return retval; fput_unlock_fail: -- cgit v1.2.2 From b5173119ff10c5538e92a7957a50887ae170b8da Mon Sep 17 00:00:00 2001 From: Robert Love Date: Tue, 7 Feb 2006 12:58:45 -0800 Subject: [PATCH] inotify: fix one-shot support Fix one-shot support in inotify. We currently drop the IN_ONESHOT flag during watch addition. Fix is to not do that. Signed-off-by: Robert Love Cc: John McCutchan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inotify.c b/fs/inotify.c index 878ccca612..3041503bde 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -967,7 +967,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) mask_add = 1; /* don't let user-space set invalid bits: we don't want flags set */ - mask &= IN_ALL_EVENTS; + mask &= IN_ALL_EVENTS | IN_ONESHOT; if (unlikely(!mask)) { ret = -EINVAL; goto out; -- cgit v1.2.2 From 7b4fe29e00a5ab4e778bb24be86d836a25570bc9 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 7 Feb 2006 12:58:48 -0800 Subject: [PATCH] More informative message on umount failure We had a user trigger this message on a box that had a lot of different mounts, all with different options. It might help narrow down wtf happened if we print out which device failed. Signed-off-by: Dave Jones Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index c177b92419..30294218fa 100644 --- a/fs/super.c +++ b/fs/super.c @@ -247,8 +247,9 @@ void generic_shutdown_super(struct super_block *sb) /* Forget any remaining inodes */ if (invalidate_inodes(sb)) { - printk("VFS: Busy inodes after unmount. " - "Self-destruct in 5 seconds. Have a nice day...\n"); + printk("VFS: Busy inodes after unmount of %s. " + "Self-destruct in 5 seconds. Have a nice day...\n", + sb->s_id); } unlock_kernel(); -- cgit v1.2.2 From 741a295130606143edbf9fc740f633dbc1e6225f Mon Sep 17 00:00:00 2001 From: JANAK DESAI Date: Tue, 7 Feb 2006 12:59:00 -0800 Subject: [PATCH] unshare system call -v5: unshare namespace If the namespace structure is being shared, allocate a new one and copy information from the current, shared, structure. Signed-off-by: Janak Desai Cc: Al Viro Cc: Christoph Hellwig Cc: Michael Kerrisk Cc: Andi Kleen Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 56 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index ce97becff4..a2bef5c810 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1325,27 +1325,17 @@ dput_out: return retval; } -int copy_namespace(int flags, struct task_struct *tsk) +/* + * Allocate a new namespace structure and populate it with contents + * copied from the namespace of the passed in task structure. + */ +struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) { struct namespace *namespace = tsk->namespace; struct namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; - struct fs_struct *fs = tsk->fs; struct vfsmount *p, *q; - if (!namespace) - return 0; - - get_namespace(namespace); - - if (!(flags & CLONE_NEWNS)) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - put_namespace(namespace); - return -EPERM; - } - new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); if (!new_ns) goto out; @@ -1396,8 +1386,6 @@ int copy_namespace(int flags, struct task_struct *tsk) } up_write(&namespace_sem); - tsk->namespace = new_ns; - if (rootmnt) mntput(rootmnt); if (pwdmnt) @@ -1405,12 +1393,40 @@ int copy_namespace(int flags, struct task_struct *tsk) if (altrootmnt) mntput(altrootmnt); - put_namespace(namespace); - return 0; +out: + return new_ns; +} + +int copy_namespace(int flags, struct task_struct *tsk) +{ + struct namespace *namespace = tsk->namespace; + struct namespace *new_ns; + int err = 0; + + if (!namespace) + return 0; + + get_namespace(namespace); + + if (!(flags & CLONE_NEWNS)) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + new_ns = dup_namespace(tsk, tsk->fs); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->namespace = new_ns; out: put_namespace(namespace); - return -ENOMEM; + return err; } asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, -- cgit v1.2.2 From 1b8623545b42c03eb92e51b28c84acf4b8ba00a3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Dec 2005 01:07:03 -0500 Subject: [PATCH] remove bogus asm/bug.h includes. A bunch of asm/bug.h includes are both not needed (since it will get pulled anyway) and bogus (since they are done too early). Removed. Signed-off-by: Al Viro --- fs/reiserfs/hashes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c index a3ec238fd9..e664ac16fa 100644 --- a/fs/reiserfs/hashes.c +++ b/fs/reiserfs/hashes.c @@ -21,7 +21,6 @@ #include #include #include -#include #define DELTA 0x9E3779B9 #define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ -- cgit v1.2.2 From e110ab94ebc714de57f75f0c7c576dde8cf80944 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Feb 2006 05:26:09 -0500 Subject: [PATCH] fix __user annotations in fs/select.c Signed-off-by: Al Viro --- fs/select.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index c0f02d36c6..bc60a3e14e 100644 --- a/fs/select.c +++ b/fs/select.c @@ -510,9 +510,9 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, if (sig) { if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) - || __get_user(up, (sigset_t * __user *)sig) + || __get_user(up, (sigset_t __user * __user *)sig) || __get_user(sigsetsize, - (size_t * __user)(sig+sizeof(void *)))) + (size_t __user *)(sig+sizeof(void *)))) return -EFAULT; } -- cgit v1.2.2 From 8854eddbdb3e45b8d381ecff2937a942d0cb2067 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 4 Jan 2006 01:44:17 -0500 Subject: [PATCH] nfsroot port= parameter fix [backport of 2.4 fix] Direct backport of 2.4 fix that didn't get propagated to 2.6; original comment follows: When I specify the NFS port for nfsroot (e.g., nfsroot=,port=2049), the kernel uses the wrong port. In my case it tries to use 264 (0x108) instead of 2049 (0x801). This patch adds the missing htons(). Eric Patch got applied in 2.4.21-pre6. Author: Eric Lammerts (, AFAICS). Signed-off-by: Al Viro --- fs/nfs/nfsroot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index e897e00c2c..c0a754ecde 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -465,10 +465,11 @@ static int __init root_nfs_ports(void) "number from server, using default\n"); port = nfsd_port; } - nfs_port = htons(port); + nfs_port = port; dprintk("Root-NFS: Portmapper on server returned %d " "as nfsd port\n", port); } + nfs_port = htons(nfs_port); if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { printk(KERN_ERR "Root-NFS: Unable to get mountd port " -- cgit v1.2.2 From f30ac319f1b91878cdc57a50930f15c36e0e103a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Feb 2006 07:53:21 -0500 Subject: [PATCH] umount_tree() decrements mount count on wrong dentry Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index a2bef5c810..058a44865b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -494,7 +494,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) p->mnt_namespace = NULL; list_del_init(&p->mnt_child); if (p->mnt_parent != p) - mnt->mnt_mountpoint->d_mounted--; + p->mnt_mountpoint->d_mounted--; change_mnt_propagation(p, MS_PRIVATE); } } -- cgit v1.2.2 From 6b2b4e5a26fe3795b1c6711cee0eae057844491d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Feb 2006 06:33:33 -0500 Subject: [PATCH] compat_ioctl __user annotations Signed-off-by: Al Viro --- fs/compat_ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 5dd0207ffd..057e60217f 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -931,8 +931,8 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) { int err, i; - sg_req_info_t *r; - struct compat_sg_req_info *o = (struct compat_sg_req_info *)arg; + sg_req_info_t __user *r; + struct compat_sg_req_info __user *o = (void __user *)arg; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); err = sys_ioctl(fd,cmd,(unsigned long)r); if (err < 0) @@ -2739,8 +2739,8 @@ static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned lon static int lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) { - struct compat_timeval *tc = (struct compat_timeval *)arg; - struct timeval *tn = compat_alloc_user_space(sizeof(struct timeval)); + struct compat_timeval __user *tc = (struct compat_timeval __user *)arg; + struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval)); struct timeval ts; if (get_user(ts.tv_sec, &tc->tv_sec) || get_user(ts.tv_usec, &tc->tv_usec) || -- cgit v1.2.2 From cff2b760096d1e6feaa31948e7af4abbefe47822 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 11 Feb 2006 17:55:47 -0800 Subject: [PATCH] fstatat64 support The *at patches introduced fstatat and, due to inusfficient research, I used the newfstat functions generally as the guideline. The result is that on 32-bit platforms we don't have all the information needed to implement fstatat64. This patch modifies the code to pass up 64-bit information if __ARCH_WANT_STAT64 is defined. I renamed the syscall entry point to make this clear. Other archs will continue to use the existing code. On x86-64 the compat code is implemented using a new sys32_ function. this is what is done for the other stat syscalls as well. This patch might break some other archs (those which define __ARCH_WANT_STAT64 and which already wired up the syscall). Yet others might need changes to accomodate the compatibility mode. I really don't want to do that work because all this stat handling is a mess (more so in glibc, but the kernel is also affected). It should be done by the arch maintainers. I'll provide some stand-alone test shortly. Those who are eager could compile glibc and run 'make check' (no installation needed). The patch below has been tested on x86 and x86-64. Signed-off-by: Ulrich Drepper Cc: Christoph Hellwig Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/stat.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/stat.c b/fs/stat.c index 24211b030f..9948cc1685 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -261,6 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf) return error; } +#ifndef __ARCH_WANT_STAT64 asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag) { @@ -281,6 +282,7 @@ asmlinkage long sys_newfstatat(int dfd, char __user *filename, out: return error; } +#endif asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf) { @@ -395,6 +397,26 @@ asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf) return error; } +asmlinkage long sys_fstatat64(int dfd, char __user *filename, + struct stat64 __user *statbuf, int flag) +{ + struct kstat stat; + int error = -EINVAL; + + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + if (flag & AT_SYMLINK_NOFOLLOW) + error = vfs_lstat_fd(dfd, filename, &stat); + else + error = vfs_stat_fd(dfd, filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + +out: + return error; +} #endif /* __ARCH_WANT_STAT64 */ void inode_add_bytes(struct inode *inode, loff_t bytes) -- cgit v1.2.2 From 643a654540579b0dcc7a206a4a7475276a41aff0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 11 Feb 2006 17:55:52 -0800 Subject: [PATCH] select: fix returned timeval With David Woodhouse select() presently has a habit of increasing the value of the user's `timeout' argument on return. We were writing back a timeout larger than the original. We _deliberately_ round up, since we know we must wait at _least_ as long as the caller asks us to. The patch adds a couple of helper functions for magnitude comparison of timespecs and of timevals, and uses them to prevent the various poll and select functions from returning a timeout which is larger than the one which was passed in. The patch also fixes a bug in compat_sys_pselect7(): it was adding the new timeout value to the old one and was returning that. It should just return the new timeout value. (We have various handy timespec/timeval-to-from-nsec conversion functions in time.h. But this code open-codes it all). Cc: "David S. Miller" Cc: Andi Kleen Cc: Ulrich Drepper Cc: Thomas Gleixner Cc: george anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 37 +++++++++++++++++++++++++------------ fs/select.c | 32 +++++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 70c5af4cc2..a2ba78bdf7 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1751,11 +1751,15 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, ret = compat_core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct compat_timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (compat_timeval_compare(&rtv, &tv) < 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -1822,13 +1826,17 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); if (tsp && !(current->personality & STICKY_TIMEOUTS)) { - ts.tv_sec += timeout / HZ; - ts.tv_nsec += (timeout % HZ) * (1000000000/HZ); - if (ts.tv_nsec >= 1000000000) { - ts.tv_sec++; - ts.tv_nsec -= 1000000000; + struct compat_timespec rts; + + rts.tv_sec = timeout / HZ; + rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ); + if (rts.tv_nsec >= NSEC_PER_SEC) { + rts.tv_sec++; + rts.tv_nsec -= NSEC_PER_SEC; } - (void)copy_to_user(tsp, &ts, sizeof(ts)); + if (compat_timespec_compare(&rts, &ts) < 0) + rts = ts; + copy_to_user(tsp, &rts, sizeof(rts)); } if (ret == -ERESTARTNOHAND) { @@ -1918,12 +1926,17 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct compat_timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. */ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (compat_timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only diff --git a/fs/select.c b/fs/select.c index bc60a3e14e..6ce68a9c89 100644 --- a/fs/select.c +++ b/fs/select.c @@ -398,11 +398,15 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (timeval_compare(&rtv, &tv) < 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -460,11 +464,16 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tsp) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only @@ -758,12 +767,17 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. */ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only -- cgit v1.2.2 From 89edc3d2b429136a0e25f40275fd82dc58f147fd Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Sun, 12 Feb 2006 14:34:55 -0800 Subject: [PATCH] reiserfs: disable automatic enabling of reiserfs inode attributes Unfortunately, the reiserfs_attrs_cleared bit in the superblock flag can lie. File systems have been observed with the bit set, yet still contain garbage in the stat data field, causing unpredictable results. This patch backs out the enable-by-default behavior. It eliminates the changes from: d50a5cd860ce721dbeac6a4f3c6e42abcde68cd8, and ef5e5414e7a83eb9b4295bbaba5464410b11e030. Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ef5e5414e7..d63da756eb 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1124,8 +1124,6 @@ static void handle_attrs(struct super_block *s) "reiserfs: cannot support attributes until flag is set in super-block"); REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); } - } else if (le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared) { - REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ATTRS); } } -- cgit v1.2.2 From 90947ef26fa689a3252aa8282a01f60648e70fdb Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 13 Feb 2006 11:12:36 -0500 Subject: [PATCH] reiserfs: fix potential (unlikely) oops in reiserfs_get_acl This fixes a potential oops if there is an error reported by posix_acl_from_disk(). This is mostly theoretical due to the use of magics and checksums in xattrs, but is still possible. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr_acl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 43de3ba833..ab8894c3b9 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -228,7 +228,8 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); } else { acl = posix_acl_from_disk(value, retval); - *p_acl = posix_acl_dup(acl); + if (!IS_ERR(acl)) + *p_acl = posix_acl_dup(acl); } kfree(value); -- cgit v1.2.2 From 7c8903f6373f9abecf060bad53ca36bc4ac037f2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 14 Feb 2006 13:53:03 -0800 Subject: [PATCH] jbd: revert checkpoint list changes This patch reverts commit f93ea411b73594f7d144855fd34278bcf34a9afc: [PATCH] jbd: split checkpoint lists This broke journal_flush() for OCFS2, which is its method of being sure that metadata is sent to disk for another node. And two related commits 8d3c7fce2d20ecc3264c8d8c91ae3beacdeaed1b and 43c3e6f5abdf6acac9b90c86bf03f995bf7d3d92 with the subjects: [PATCH] jbd: log_do_checkpoint fix [PATCH] jbd: remove_transaction fix These seem to be incremental bugfixes on the original patch and as such are no longer needed. Signed-off-by: Mark Fasheh Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/checkpoint.c | 418 ++++++++++++++++++++++------------------------------ fs/jbd/commit.c | 3 +- 2 files changed, 178 insertions(+), 243 deletions(-) (limited to 'fs') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e6265a0b56..543ed543d1 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -24,75 +24,29 @@ #include /* - * Unlink a buffer from a transaction checkpoint list. + * Unlink a buffer from a transaction. * * Called with j_list_lock held. */ -static void __buffer_unlink_first(struct journal_head *jh) +static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction; transaction = jh->b_cp_transaction; + jh->b_cp_transaction = NULL; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) { + if (transaction->t_checkpoint_list == jh) transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; - } -} - -/* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. - */ - -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - -/* - * Move a buffer from the checkpoint list to the checkpoint io list - * - * Called with j_list_lock held - */ - -static inline void __buffer_relink_io(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - __buffer_unlink_first(jh); - - if (!transaction->t_checkpoint_io_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_io_list; - jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_io_list = jh; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; } /* * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. - * + * Returns 1 if we released it. * Requires j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ @@ -103,11 +57,12 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __journal_remove_checkpoint(jh) + 1; + __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); + ret = 1; } else { jbd_unlock_bh_state(bh); } @@ -162,53 +117,83 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) } /* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. + * Clean up a transaction's checkpoint list. + * + * We wait for any pending IO to complete and make sure any clean + * buffers are removed from the transaction. + * + * Return 1 if we performed any actions which might have destroyed the + * checkpoint. (journal_remove_checkpoint() deletes the transaction when + * the last checkpoint buffer is cleansed) * * Called with j_list_lock held. */ - -static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) { - struct journal_head *jh; + struct journal_head *jh, *next_jh, *last_jh; struct buffer_head *bh; - tid_t this_tid; - int released = 0; - - this_tid = transaction->t_tid; -restart: - /* Didn't somebody clean up the transaction in the meanwhile */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; + int ret = 0; + + assert_spin_locked(&journal->j_list_lock); + jh = transaction->t_checkpoint_list; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + next_jh = jh; + do { + jh = next_jh; bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; + goto out_return_1; } + /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list + * This is foul */ - released = __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - } + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + goto out_return_1; + } + + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + goto out_return_1; + } + + /* + * AKPM: I think the buffer_jbddirty test is redundant - it + * shouldn't have NULL b_transaction? + */ + next_jh = jh->b_cpnext; + if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + ret = 1; + } else { + jbd_unlock_bh_state(bh); + } + } while (jh != last_jh); + + return ret; +out_return_1: + spin_lock(&journal->j_list_lock); + return 1; } #define NR_BATCH 64 @@ -218,7 +203,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + spin_unlock(&journal->j_list_lock); ll_rw_block(SWRITE, *batch_count, bhs); + spin_lock(&journal->j_list_lock); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -234,46 +221,19 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Return 1 if something happened which requires us to abort the current * scan of the checkpoint list. * - * Called with j_list_lock held and drops it if 1 is returned + * Called with j_list_lock held. * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count) +static int __flush_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count, + int *drop_count) { struct buffer_head *bh = jh2bh(jh); int ret = 0; - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - put_bh(bh); - ret = 1; - } - else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; + if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { + J_ASSERT_JH(jh, jh->b_transaction == NULL); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - ret = 1; - } - else if (!buffer_dirty(bh)) { - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - ret = 1; - } - else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. @@ -286,30 +246,45 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; - __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { - spin_unlock(&journal->j_list_lock); __flush_batch(journal, bhs, batch_count); ret = 1; } + } else { + int last_buffer = 0; + if (jh->b_cpnext == jh) { + /* We may be about to drop the transaction. Tell the + * caller that the lists have changed. + */ + last_buffer = 1; + } + if (__try_to_free_cp_buf(jh)) { + (*drop_count)++; + ret = last_buffer; + } } return ret; } /* - * Perform an actual checkpoint. We take the first transaction on the - * list of transactions to be checkpointed and send all its buffers - * to disk. We submit larger chunks of data at once. + * Perform an actual checkpoint. We don't write out only enough to + * satisfy the current blocked requests: rather we submit a reasonably + * sized chunk of the outstanding data to disk at once for + * efficiency. __log_wait_for_space() will retry if we didn't free enough. * + * However, we _do_ take into account the amount requested so that once + * the IO has been queued, we can return as soon as enough of it has + * completed to disk. + * * The journal should be locked before calling this function. */ int log_do_checkpoint(journal_t *journal) { - transaction_t *transaction; - tid_t this_tid; int result; + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; jbd_debug(1, "Start checkpoint\n"); @@ -324,70 +299,79 @@ int log_do_checkpoint(journal_t *journal) return result; /* - * OK, we need to start writing disk blocks. Take one transaction - * and write it. + * OK, we need to start writing disk blocks. Try to free up a + * quarter of the log in a single checkpoint if we can. */ - spin_lock(&journal->j_list_lock); - if (!journal->j_checkpoint_transactions) - goto out; - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; -restart: /* - * If someone cleaned up this transaction while we slept, we're - * done (maybe it's a new transaction, but it fell at the same - * address). + * AKPM: check this code. I had a feeling a while back that it + * degenerates into a busy loop at unmount time. */ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; - struct journal_head *jh; - int retry = 0; - - while (!retry && transaction->t_checkpoint_list) { + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions) { + transaction_t *transaction; + struct journal_head *jh, *last_jh, *next_jh; + int drop_count = 0; + int cleanup_ret, retry = 0; + tid_t this_tid; + + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; + jh = transaction->t_checkpoint_list; + last_jh = jh->b_cpprev; + next_jh = jh; + do { struct buffer_head *bh; - jh = transaction->t_checkpoint_list; + jh = next_jh; + next_jh = jh->b_cpnext; bh = jh2bh(jh); if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); retry = 1; break; } - retry = __process_buffer(journal, jh, bhs, - &batch_count); - if (!retry && - lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); + retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); + if (cond_resched_lock(&journal->j_list_lock)) { retry = 1; break; } - } + } while (jh != last_jh && !retry); if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } __flush_batch(journal, bhs, &batch_count); + retry = 1; } - if (retry) { - spin_lock(&journal->j_list_lock); - goto restart; - } /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one. + * If someone cleaned up this transaction while we slept, we're + * done + */ + if (journal->j_checkpoint_transactions != transaction) + break; + if (retry) + continue; + /* + * Maybe it's a new transaction, but it fell at the same + * address */ - __wait_cp_io(journal, transaction); + if (transaction->t_tid != this_tid) + continue; + /* + * We have walked the whole transaction list without + * finding anything to write to disk. We had better be + * able to make some progress or we are in trouble. + */ + cleanup_ret = __cleanup_transaction(journal, transaction); + J_ASSERT(drop_count != 0 || cleanup_ret != 0); + if (journal->j_checkpoint_transactions != transaction) + break; } -out: spin_unlock(&journal->j_list_lock); result = cleanup_journal_tail(journal); if (result < 0) return result; + return 0; } @@ -471,53 +455,6 @@ int cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ -/* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and release them. - * - * Called with the journal locked. - * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) - */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - int ret, freed = 0; - - *released = 0; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } - } - } - /* - * This function only frees up some memory if possible so we - * dont have an obligation to finish processing. Bail out if - * preemption requested: - */ - if (need_resched()) - return freed; - } while (jh != last_jh); - - return freed; -} - /* * journal_clean_checkpoint_list * @@ -525,38 +462,46 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns number of bufers reaped (for debug) */ int __journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0, released; + int ret = 0; transaction = journal->j_checkpoint_transactions; - if (!transaction) + if (transaction == 0) goto out; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { + struct journal_head *jh; + transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); - if (need_resched()) - goto out; - if (released) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - if (need_resched()) - goto out; + jh = transaction->t_checkpoint_list; + if (jh) { + struct journal_head *last_jh = jh->b_cpprev; + struct journal_head *next_jh = jh; + + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranknig */ + if (jbd_trylock_bh_state(jh2bh(jh))) + ret += __try_to_free_cp_buf(jh); + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + goto out; + } while (jh != last_jh); + } } while (transaction != last_transaction); out: return ret; @@ -571,22 +516,18 @@ out: * buffer updates committed in that transaction have safely been stored * elsewhere on disk. To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint - * lists until they have been rewritten, at which point this function is + * list until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's - * checkpoint lists. - * - * The function returns 1 if it frees the transaction, 0 otherwise. + * checkpoint list. * * This function is called with the journal locked. * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ -int __journal_remove_checkpoint(struct journal_head *jh) +void __journal_remove_checkpoint(struct journal_head *jh) { transaction_t *transaction; journal_t *journal; - int ret = 0; JBUFFER_TRACE(jh, "entry"); @@ -597,10 +538,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal; __buffer_unlink(jh); - jh->b_cp_transaction = NULL; - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) + if (transaction->t_checkpoint_list != NULL) goto out; JBUFFER_TRACE(jh, "transaction has no more buffers"); @@ -626,10 +565,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) /* Just in case anybody was waiting for more transactions to be checkpointed... */ wake_up(&journal->j_wait_logspace); - ret = 1; out: JBUFFER_TRACE(jh, "exit"); - return ret; } /* @@ -691,7 +628,6 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 29e62d98ba..002ad2bbc7 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -829,8 +829,7 @@ restart_loop: journal->j_committing_transaction = NULL; spin_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { -- cgit v1.2.2 From 5ac5f9d1ce8492163dbde5d357dc5d03becf7e36 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Feb 2006 13:53:04 -0800 Subject: [PATCH] NLM: Fix the NLM_GRANTED callback checks If 2 threads attached to the same process are blocking on different locks on different files (maybe even on different servers) but have the same lock arguments (i.e. same offset+length - actually quite common, since most processes try to lock the entire file) then the first GRANTED call that wakes one up will also wake the other. Currently when the NLM_GRANTED callback comes in, lockd walks the list of blocked locks in search of a match to the lock that the NLM server has granted. Although it checks the lock pid, start and end, it fails to check the filehandle and the server address. By checking the filehandle and server IP address, we ensure that this only happens if the locks truly are referencing the same file. Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/clntlock.c | 27 +++++++++++++++++---------- fs/lockd/svc4proc.c | 2 +- fs/lockd/svcproc.c | 2 +- 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 3eaf6e7010..da6354baa0 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -111,9 +111,10 @@ long nlmclnt_block(struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -u32 -nlmclnt_grant(struct nlm_lock *lock) +u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) { + const struct file_lock *fl = &lock->fl; + const struct nfs_fh *fh = &lock->fh; struct nlm_wait *block; u32 res = nlm_lck_denied; @@ -122,14 +123,20 @@ nlmclnt_grant(struct nlm_lock *lock) * Warning: must not use cookie to match it! */ list_for_each_entry(block, &nlm_blocked, b_list) { - if (nlm_compare_locks(block->b_lock, &lock->fl)) { - /* Alright, we found a lock. Set the return status - * and wake up the caller - */ - block->b_status = NLM_LCK_GRANTED; - wake_up(&block->b_wait); - res = nlm_granted; - } + struct file_lock *fl_blocked = block->b_lock; + + if (!nlm_compare_locks(fl_blocked, fl)) + continue; + if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + continue; + if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_dentry->d_inode) ,fh) != 0) + continue; + /* Alright, we found a lock. Set the return status + * and wake up the caller + */ + block->b_status = NLM_LCK_GRANTED; + wake_up(&block->b_wait); + res = nlm_granted; } return res; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4063095d84..b10f913aa0 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -228,7 +228,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3bc437e0cf..35681d9cf1 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -256,7 +256,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } -- cgit v1.2.2 From 93544cc6486bea12e127ed58ca33477bb6ceafe6 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 14 Feb 2006 22:30:52 -0600 Subject: [PATCH] CIFS: fix cifs_user_read oops when null SMB response on forcedirectio mount This patch fixes an oops reported by Adrian Bunk in cifs_user_read when a null read response is returned on a forcedirectio mount. Signed-off-by: Dave Kleikamp Signed-off-by: Steve French Signed-off-by: Linus Torvalds --- fs/cifs/file.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d17c97d07c..675bd25682 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1442,13 +1442,15 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, &bytes_read, &smb_read_data, &buf_type); pSMBr = (struct smb_com_read_rsp *)smb_read_data; - if (copy_to_user(current_offset, - smb_read_data + 4 /* RFC1001 hdr */ - + le16_to_cpu(pSMBr->DataOffset), - bytes_read)) { - rc = -EFAULT; - } if (smb_read_data) { + if (copy_to_user(current_offset, + smb_read_data + + 4 /* RFC1001 length field */ + + le16_to_cpu(pSMBr->DataOffset), + bytes_read)) { + rc = -EFAULT; + } + if(buf_type == CIFS_SMALL_BUFFER) cifs_small_buf_release(smb_read_data); else if(buf_type == CIFS_LARGE_BUFFER) -- cgit v1.2.2 From 5ecfbae093f0c37311e89b29bfc0c9d586eace87 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Feb 2006 22:50:10 +0300 Subject: [PATCH] fix zap_thread's ptrace related problems 1. The tracee can go from ptrace_stop() to do_signal_stop() after __ptrace_unlink(p). 2. It is unsafe to __ptrace_unlink(p) while p->parent may wait for tasklist_lock in ptrace_detach(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 055378d251..0e1c95074d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1403,7 +1403,7 @@ static void zap_threads (struct mm_struct *mm) do_each_thread(g,p) { if (mm == p->mm && p != tsk && p->ptrace && p->parent->mm == mm) { - __ptrace_unlink(p); + __ptrace_detach(p, 0); } } while_each_thread(g,p); write_unlock_irq(&tasklist_lock); -- cgit v1.2.2 From 898efface1a5076cbae5af87b935212b1869971b Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:01:25 -0800 Subject: [PATCH] ocfs2: recheck recovery state after getting lock * after successfully taking the $RECOVERY lock in EX mode, recheck to make sure that recovery has not already begun or completed on another node Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmrecovery.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 186e9a76aa..f9ce864966 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2032,6 +2032,30 @@ again: dlm->reco.new_master); status = -EEXIST; } else { + status = 0; + + /* see if recovery was already finished elsewhere */ + spin_lock(&dlm->spinlock); + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + status = -EINVAL; + mlog(0, "%s: got reco EX lock, but " + "node got recovered already\n", dlm->name); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + mlog(ML_ERROR, "%s: new master is %u " + "but no dead node!\n", + dlm->name, dlm->reco.new_master); + BUG(); + } + } + spin_unlock(&dlm->spinlock); + } + + /* if this node has actually become the recovery master, + * set the master and send the messages to begin recovery */ + if (!status) { + mlog(0, "%s: dead=%u, this=%u, sending " + "begin_reco now\n", dlm->name, + dlm->reco.dead_node, dlm->node_num); status = dlm_send_begin_reco_message(dlm, dlm->reco.dead_node); /* this always succeeds */ -- cgit v1.2.2 From e2b5e4506f5c5187b91d7a79fbad28fe3ebd2fc5 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:02:56 -0800 Subject: [PATCH] ocfs2: fix release of ast never reserved * fix a bug in dlm_convert_lock_handler where dlm_lockres_release_ast was being called even if no ast was ever reserved Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmconvert.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 6001b22a99..f5c2f1979a 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -421,7 +421,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; - int call_ast = 0, kick_thread = 0; + int call_ast = 0, kick_thread = 0, ast_reserved = 0; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); @@ -490,6 +490,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) status = __dlm_lockres_state_to_status(res); if (status == DLM_NORMAL) { __dlm_lockres_reserve_ast(res); + ast_reserved = 1; res->state |= DLM_LOCK_RES_IN_PROGRESS; status = __dlmconvert_master(dlm, res, lock, flags, cnv->requested_type, @@ -512,10 +513,10 @@ leave: else dlm_lock_put(lock); - /* either queue the ast or release it */ + /* either queue the ast or release it, if reserved */ if (call_ast) dlm_queue_ast(dlm, lock); - else + else if (ast_reserved) dlm_lockres_release_ast(dlm, res); if (kick_thread) -- cgit v1.2.2 From 44465a7daf7c4e34199b2b0ebb3c5101619dcb9d Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:05:38 -0800 Subject: [PATCH] ocfs2: add dlm_wait_for_node_death * add dlm_wait_for_node_death function to be used after receiving a network error. this will wait for the given timeout to allow the heartbeat callbacks to update the domain map. without this, some paths may spin and consume enough cpu that the heartbeat gets starved and never updates. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmcommon.h | 4 ++++ fs/ocfs2/dlm/dlmconvert.c | 5 +++++ fs/ocfs2/dlm/dlmlock.c | 14 +++++++++++++- fs/ocfs2/dlm/dlmrecovery.c | 18 ++++++++++++++++++ 4 files changed, 40 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 42eb53b529..23ceaa7127 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -208,6 +208,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 #define DLM_LOCK_RES_MIGRATING 0x00000020 +/* max milliseconds to wait to sync up a network failure with a node death */ +#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) + #define DLM_PURGE_INTERVAL_MS (8 * 1000) struct dlm_lock_resource @@ -658,6 +661,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); void dlm_wait_for_recovery(struct dlm_ctxt *dlm); int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index f5c2f1979a..f66e2d818c 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { + /* instead of logging the same network error over + * and over, sleep here and wait for the heartbeat + * to notice the node is dead. times out after 5s. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); ret = DLM_RECOVERING; mlog(0, "node %u died so returning DLM_RECOVERING " "from convert message!\n", res->owner); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index d1a0038557..e709412e6e 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -646,7 +646,19 @@ retry_lock: mlog(0, "retrying lock with migration/" "recovery/in progress\n"); msleep(100); - dlm_wait_for_recovery(dlm); + /* no waiting for dlm_reco_thread */ + if (recovery) { + if (status == DLM_RECOVERING) { + mlog(0, "%s: got RECOVERING " + "for $REOCVERY lock, master " + "was %u\n", dlm->name, + res->owner); + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + } + } else { + dlm_wait_for_recovery(dlm); + } goto retry_lock; } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f9ce864966..ed76bda1a5 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -278,6 +278,24 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) return dead; } +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(ML_NOTICE, "%s: waiting %dms for notification of " + "death of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(ML_NOTICE, "%s: waiting indefinitely for notification " + "of death of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. * the dlm recovery thread will set this state when it begins -- cgit v1.2.2 From 558c70c59b75a5a53ba496fe3bccea80a9e3e6fb Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:07:47 -0800 Subject: [PATCH] ocfs2: manually grant remote recovery lock * fix a hang in recovery that occurred in dlmlock_remote. the $RECOVERY lock was never moved to the granted queue even after getting DLM_NORMAL back from the master node. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmlock.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index e709412e6e..671d4ff222 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -220,6 +220,17 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, dlm_error(status); dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); + } else if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* special case for the $RECOVERY lock. + * there will never be an AST delivered to put + * this lock on the proper secondary queue + * (granted), so do it manually. */ + mlog(0, "%s: $RECOVERY lock for this node (%u) is " + "mastered by %u; got lock, manually granting (no ast)\n", + dlm->name, dlm->node_num, res->owner); + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); } spin_unlock(&res->spinlock); -- cgit v1.2.2 From 745ae8ba29e729ec922393fa4d9448c385673599 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 9 Feb 2006 13:23:39 -0800 Subject: [PATCH] ocfs2: only checkpoint journal when asked to Disable automatic checkpointing of the journal - this is a relic from older ocfs2 days. Worth quite a bit of performance on longer running single node tests. Signed-off-by: Mark Fasheh --- fs/ocfs2/journal.c | 7 +++---- fs/ocfs2/journal.h | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index fa0bcac5ce..d329c9df90 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1584,10 +1584,9 @@ static int ocfs2_commit_thread(void *arg) while (!(kthread_should_stop() && atomic_read(&journal->j_num_trans) == 0)) { - wait_event_interruptible_timeout(osb->checkpoint_event, - atomic_read(&journal->j_num_trans) - || kthread_should_stop(), - OCFS2_CHECKPOINT_INTERVAL); + wait_event_interruptible(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop()); status = ocfs2_commit_cache(osb); if (status < 0) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 7d0a816184..2f3a6acdac 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -29,8 +29,6 @@ #include #include -#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ) - enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, OCFS2_JOURNAL_LOADED, -- cgit v1.2.2 From f671c09bce88ea253d576c842f8f39d9a2a29028 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Tue, 14 Feb 2006 11:45:21 -0800 Subject: [PATCH] ocfs2: detach from heartbeat events before freeing mle Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmmaster.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a3194fe173..2e2e95e694 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2482,7 +2482,9 @@ top: atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - /* final put will take care of list removal */ + /* do not need events any longer, so detach + * from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); __dlm_put_mle(mle); } continue; @@ -2537,6 +2539,9 @@ top: spin_unlock(&res->spinlock); dlm_lockres_put(res); + /* about to get rid of mle, detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + /* dump the mle */ spin_lock(&dlm->master_lock); __dlm_put_mle(mle); -- cgit v1.2.2 From b2f49033d80c952a0ffc2d5647bc1a0b8a09c1b3 Mon Sep 17 00:00:00 2001 From: Peter Staubach Date: Fri, 17 Feb 2006 13:52:36 -0800 Subject: [PATCH] fix deadlock in ext2 Fix a deadlock possible in the ext2 file system implementation. This deadlock occurs when a file is removed from an ext2 file system which was mounted with the "sync" mount option. The problem is that ext2_xattr_delete_inode() was invoking the routine, sync_dirty_buffer(), using a buffer head which was previously locked via lock_buffer(). The first thing that sync_dirty_buffer() does is to lock the buffer head that it was passed. It does this via lock_buffer(). Oops. The solution is to unlock the buffer head in ext2_xattr_delete_inode() before invoking sync_dirty_buffer(). This makes the code in ext2_xattr_delete_inode() obey the same locking rules as all other callers of sync_dirty_buffer() in the ext2 file system implementation. Signed-off-by: Peter Staubach Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/xattr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index a2ca3107d4..86ae8e93ad 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -792,18 +792,20 @@ ext2_xattr_delete_inode(struct inode *inode) ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); + unlock_buffer(bh); } else { HDR(bh)->h_refcount = cpu_to_le32( le32_to_cpu(HDR(bh)->h_refcount) - 1); if (ce) mb_cache_entry_release(ce); + ea_bdebug(bh, "refcount now=%d", + le32_to_cpu(HDR(bh)->h_refcount)); + unlock_buffer(bh); mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); DQUOT_FREE_BLOCK(inode, 1); } - ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); - unlock_buffer(bh); EXT2_I(inode)->i_file_acl = 0; cleanup: -- cgit v1.2.2 From 77e7f250f88cd62844e24c42aff4d0e95969c746 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 17 Feb 2006 13:52:52 -0800 Subject: [PATCH] fuse: fix bug in aborted fuse_release_end() There's a rather theoretical case of the BUG triggering in fuse_reset_request(): - iget() fails because of OOM after a successful CREATE_OPEN request - during IO on the resulting RELEASE request the connection is aborted Fix and add warning to fuse_reset_request(). Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 6 ++++++ fs/fuse/file.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f556a0d5c0..0c9a2ee54c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -66,6 +66,12 @@ static void restore_sigs(sigset_t *oldset) sigprocmask(SIG_SETMASK, oldset, NULL); } +/* + * Reset request, so that it can be reused + * + * The caller must be _very_ careful to make sure, that it is holding + * the only reference to req + */ void fuse_reset_request(struct fuse_req *req) { int preallocated = req->preallocated; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 296351615b..6f05379b0a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -116,9 +116,14 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir) /* Special case for failed iget in CREATE */ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - u64 nodeid = req->in.h.nodeid; - fuse_reset_request(req); - fuse_send_forget(fc, req, nodeid, 1); + /* If called from end_io_requests(), req has more than one + reference and fuse_reset_request() cannot work */ + if (fc->connected) { + u64 nodeid = req->in.h.nodeid; + fuse_reset_request(req); + fuse_send_forget(fc, req, nodeid, 1); + } else + fuse_put_request(fc, req); } void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff, -- cgit v1.2.2 From 74910e6c7dc7471b286a883c1a7af70483ffd2ba Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 17 Feb 2006 13:52:58 -0800 Subject: [PATCH] select: time comparison fixes I got all of these backwards. We want to return min(input timeout, new timeout) to userspace to prevent increasing the time-remaining value. Thanks to Ernst Herzberg for reporting and diagnosing. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 6 +++--- fs/select.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index a2ba78bdf7..5333c7d742 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1757,7 +1757,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, goto sticky; rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); rtv.tv_sec = timeout; - if (compat_timeval_compare(&rtv, &tv) < 0) + if (compat_timeval_compare(&rtv, &tv) >= 0) rtv = tv; if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: @@ -1834,7 +1834,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, rts.tv_sec++; rts.tv_nsec -= NSEC_PER_SEC; } - if (compat_timespec_compare(&rts, &ts) < 0) + if (compat_timespec_compare(&rts, &ts) >= 0) rts = ts; copy_to_user(tsp, &rts, sizeof(rts)); } @@ -1934,7 +1934,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; rts.tv_sec = timeout; - if (compat_timespec_compare(&rts, &ts) < 0) + if (compat_timespec_compare(&rts, &ts) >= 0) rts = ts; if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: diff --git a/fs/select.c b/fs/select.c index 6ce68a9c89..1815a57d22 100644 --- a/fs/select.c +++ b/fs/select.c @@ -404,7 +404,7 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, goto sticky; rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); rtv.tv_sec = timeout; - if (timeval_compare(&rtv, &tv) < 0) + if (timeval_compare(&rtv, &tv) >= 0) rtv = tv; if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: @@ -471,7 +471,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; rts.tv_sec = timeout; - if (timespec_compare(&rts, &ts) < 0) + if (timespec_compare(&rts, &ts) >= 0) rts = ts; if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: @@ -775,7 +775,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; rts.tv_sec = timeout; - if (timespec_compare(&rts, &ts) < 0) + if (timespec_compare(&rts, &ts) >= 0) rts = ts; if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: -- cgit v1.2.2 From 76b6159ba094544e003a237cedcf555d82fa3bfe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Feb 2006 14:37:40 -0500 Subject: [PATCH] fix handling of st_nlink on procfs root 1) it should use nr_processes(), not nr_threads; otherwise we are getting very confused find(1) and friends, among other things. 2) better do that at stat() time than at every damn lookup in procfs root. Patch had been sitting in FC4 kernels for many months now... Signed-off-by: Al Viro --- fs/proc/inode.c | 4 ---- fs/proc/root.c | 17 +++++++++-------- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6573f31f1f..075d3e9456 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -204,10 +204,6 @@ int proc_fill_super(struct super_block *s, void *data, int silent) root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); if (!root_inode) goto out_no_root; - /* - * Fixup the root inode's nlink value - */ - root_inode->i_nlink += nr_processes(); root_inode->i_uid = 0; root_inode->i_gid = 0; s->s_root = d_alloc_root(root_inode); diff --git a/fs/proc/root.c b/fs/proc/root.c index 68896283c8..c3fd361111 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -80,16 +80,16 @@ void __init proc_root_init(void) proc_bus = proc_mkdir("bus", NULL); } -static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat +) { - /* - * nr_threads is actually protected by the tasklist_lock; - * however, it's conventional to do reads, especially for - * reporting, without any locking whatsoever. - */ - if (dir->i_ino == PROC_ROOT_INO) /* check for safety... */ - dir->i_nlink = proc_root.nlink + nr_threads; + generic_fillattr(dentry->d_inode, stat); + stat->nlink = proc_root.nlink + nr_processes(); + return 0; +} +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ if (!proc_lookup(dir, dentry, nd)) { return NULL; } @@ -134,6 +134,7 @@ static struct file_operations proc_root_operations = { */ static struct inode_operations proc_root_inode_operations = { .lookup = proc_root_lookup, + .getattr = proc_root_getattr, }; /* -- cgit v1.2.2 From e1c92117558261d5504c59712751f6c7925ff3ba Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Mon, 20 Feb 2006 18:28:05 -0800 Subject: [PATCH] v9fs: update documentation and fix debug flag Minor updates to the documentation to bring them into sync with current websites and available features. The debug flag was switched back to hex to match the documentation. Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/v9fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 5250c428fc..ef33865491 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -66,7 +66,7 @@ static match_table_t tokens = { {Opt_afid, "afid=%u"}, {Opt_rfdno, "rfdno=%u"}, {Opt_wfdno, "wfdno=%u"}, - {Opt_debug, "debug=%u"}, + {Opt_debug, "debug=%x"}, {Opt_name, "name=%s"}, {Opt_remotename, "aname=%s"}, {Opt_unix, "proto=unix"}, -- cgit v1.2.2 From fa675765afed59bb89adba3369094ebd428b930b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 22 Feb 2006 09:39:02 -0800 Subject: Revert mount/umount uevent removal This change reverts the 033b96fd30db52a710d97b06f87d16fc59fee0f1 commit from Kay Sievers that removed the mount/umount uevents from the kernel. Some older versions of HAL still depend on these events to detect when a new device has been mounted. These events are not correctly emitted, and are broken by design, and so, should not be relied upon by any future program. Instead, the /proc/mounts file should be polled to properly detect this kind of event. A feature-removal-schedule.txt entry has been added, noting when this interface will be removed from the kernel. Signed-off-by: Greg Kroah-Hartman --- fs/super.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 30294218fa..e20b5580af 100644 --- a/fs/super.c +++ b/fs/super.c @@ -666,6 +666,16 @@ static int test_bdev_super(struct super_block *s, void *data) return (void *)s->s_bdev == data; } +static void bdev_uevent(struct block_device *bdev, enum kobject_action action) +{ + if (bdev->bd_disk) { + if (bdev->bd_part) + kobject_uevent(&bdev->bd_part->kobj, action); + else + kobject_uevent(&bdev->bd_disk->kobj, action); + } +} + struct super_block *get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) @@ -707,8 +717,10 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, up_write(&s->s_umount); deactivate_super(s); s = ERR_PTR(error); - } else + } else { s->s_flags |= MS_ACTIVE; + bdev_uevent(bdev, KOBJ_MOUNT); + } } return s; @@ -724,6 +736,7 @@ void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; + bdev_uevent(bdev, KOBJ_UMOUNT); generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_excl(bdev); -- cgit v1.2.2 From 6cec2aed8686840906f6298391dc4fd04d9ba843 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 22 Feb 2006 17:31:52 -0600 Subject: [PATCH] CIFS: CIFSSMBRead was returning an invalid pointer in buf on socket error Thanks to Adrian Bunk for debugging the problem and to Shaggy for helping find the solution. Also added a fix for 64K pages we found in loosely-related testing Signed-off-by: Dave Kleikamp Signed-off-by: Steve French Signed-off-by: Linus Torvalds --- fs/cifs/cifssmb.c | 7 ++++--- fs/cifs/connect.c | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 217323b0c8..b41e8b3796 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1048,13 +1048,14 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, cifs_small_buf_release(iov[0].iov_base); else if(resp_buf_type == CIFS_LARGE_BUFFER) cifs_buf_release(iov[0].iov_base); - } else /* return buffer to caller to free */ /* BB FIXME how do we tell caller if it is not a large buffer */ { - *buf = iov[0].iov_base; + } else if(resp_buf_type != CIFS_NO_BUFFER) { + /* return buffer to caller to free */ + *buf = iov[0].iov_base; if(resp_buf_type == CIFS_SMALL_BUFFER) *pbuf_type = CIFS_SMALL_BUFFER; else if(resp_buf_type == CIFS_LARGE_BUFFER) *pbuf_type = CIFS_LARGE_BUFFER; - } + } /* else no valid buffer on return - leave as null */ /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e488603fb1..ef5ae6f93c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1795,10 +1795,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, conjunction with 52K kvec constraint on arch with 4K page size */ - if(cifs_sb->rsize < PAGE_CACHE_SIZE) { - cifs_sb->rsize = PAGE_CACHE_SIZE; - /* Windows ME does this */ - cFYI(1,("Attempt to set readsize for mount to less than one page (4096)")); + if(cifs_sb->rsize < 2048) { + cifs_sb->rsize = 2048; + /* Windows ME may prefer this */ + cFYI(1,("readsize set to minimum 2048")); } cifs_sb->mnt_uid = volume_info.linux_uid; cifs_sb->mnt_gid = volume_info.linux_gid; -- cgit v1.2.2 From 3672b638ec1d5b1020ea27986060b830f09c96c1 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Fri, 24 Feb 2006 09:55:07 +0000 Subject: NTFS: - Cope with attribute list attribute having invalid flags. Windows copes with this and even chkdsk does not detect or fix this so we have to cope with it, too. Thanks to Pawel Kot for reporting the problem. - Miscellaneous updates to layout.h. Signed-off-by: Anton Altaparmakov --- fs/ntfs/ChangeLog | 28 +++++++++++++++++++--------- fs/ntfs/inode.c | 49 +++++++++++++++++++++++++++++++++++++++---------- fs/ntfs/layout.h | 25 ++++++++++++++++++------- 3 files changed, 76 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index 02f44094bd..4a62201e84 100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog @@ -1,9 +1,9 @@ ToDo/Notes: - Find and fix bugs. - The only places in the kernel where a file is resized are - ntfs_file_write*() and ntfs_truncate() for both of which i_sem is + ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is held. Just have to be careful in read-/writepage and other helpers - not running under i_sem that we play nice... Also need to be careful + not running under i_mutex that we play nice. Also need to be careful with initialized_size extension in ntfs_file_write*() and writepage. UPDATE: The only things that need to be checked are the compressed write and the other attribute resize/write cases like index @@ -19,6 +19,16 @@ ToDo/Notes: - Enable the code for setting the NT4 compatibility flag when we start making NTFS 1.2 specific modifications. +2.1.26 - Minor bug fixes and updates. + + - We have struct kmem_cache now so use it instead of the typedef + kmem_cache_t. (Pekka Enberg) + - Miscellaneous updates to layout.h. + - Cope with attribute list attribute having invalid flags. Windows + copes with this and even chkdsk does not detect or fix this so we + have to cope with it, too. Thanks to Pawel Kot for reporting the + problem. + 2.1.25 - (Almost) fully implement write(2) and truncate(2). - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and @@ -373,7 +383,7 @@ ToDo/Notes: single one of them had an mst error. (Thanks to Ken MacFerrin for the bug report.) - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date() - where we failed to release i_sem on the $Quota/$Q attribute inode. + where we failed to release i_mutex on the $Quota/$Q attribute inode. - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup(). - Add mapping of unmapped buffers to all remaining code paths, i.e. fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(), @@ -874,7 +884,7 @@ ToDo/Notes: clusters. (Philipp Thomas) - attrib.c::load_attribute_list(): Fix bug when initialized_size is a multiple of the block_size but not the cluster size. (Szabolcs - Szakacsits ) + Szakacsits) 2.1.2 - Important bug fixes aleviating the hangs in statfs. @@ -884,7 +894,7 @@ ToDo/Notes: - Add handling for initialized_size != data_size in compressed files. - Reduce function local stack usage from 0x3d4 bytes to just noise in - fs/ntfs/upcase.c. (Randy Dunlap ) + fs/ntfs/upcase.c. (Randy Dunlap) - Remove compiler warnings for newer gcc. - Pages are no longer kmapped by mm/filemap.c::generic_file_write() around calls to ->{prepare,commit}_write. Adapt NTFS appropriately @@ -1201,11 +1211,11 @@ ToDo/Notes: the kernel. We probably want a kernel generic init_address_space() function... - Drop BKL from ntfs_readdir() after consultation with Al Viro. The - only caller of ->readdir() is vfs_readdir() which holds i_sem during - the call, and i_sem is sufficient protection against changes in the - directory inode (including ->i_size). + only caller of ->readdir() is vfs_readdir() which holds i_mutex + during the call, and i_mutex is sufficient protection against changes + in the directory inode (including ->i_size). - Use generic_file_llseek() for directories (as opposed to - default_llseek()) as this downs i_sem instead of the BKL which is + default_llseek()) as this downs i_mutex instead of the BKL which is what we now need for exclusion against ->f_pos changes considering we no longer take the BKL in ntfs_readdir(). diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index ea1bd3feea..55263b7de9 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -677,13 +677,28 @@ static int ntfs_read_locked_inode(struct inode *vi) ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); NInoSetAttrList(ni); a = ctx->attr; - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_COMPRESSION_MASK || - a->flags & ATTR_IS_SPARSE) { + if (a->flags & ATTR_COMPRESSION_MASK) { ntfs_error(vi->i_sb, "Attribute list attribute is " - "compressed/encrypted/sparse."); + "compressed."); goto unm_err_out; } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(vi->i_sb, "Non-resident attribute " + "list attribute is encrypted/" + "sparse."); + goto unm_err_out; + } + ntfs_warning(vi->i_sb, "Resident attribute list " + "attribute in inode 0x%lx is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set.", + vi->i_ino); + } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); @@ -1809,19 +1824,33 @@ int ntfs_read_inode_mount(struct inode *vi) } else /* if (!err) */ { ATTR_LIST_ENTRY *al_entry, *next_al_entry; u8 *al_end; + static const char *es = " Not allowed. $MFT is corrupt. " + "You should run chkdsk."; ntfs_debug("Attribute list attribute found in $MFT."); NInoSetAttrList(ni); a = ctx->attr; - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_COMPRESSION_MASK || - a->flags & ATTR_IS_SPARSE) { + if (a->flags & ATTR_COMPRESSION_MASK) { ntfs_error(sb, "Attribute list attribute is " - "compressed/encrypted/sparse. Not " - "allowed. $MFT is corrupt. You should " - "run chkdsk."); + "compressed.%s", es); goto put_err_out; } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(sb, "Non-resident attribute list " + "attribute is encrypted/" + "sparse.%s", es); + goto put_err_out; + } + ntfs_warning(sb, "Resident attribute list attribute " + "in $MFT system file is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set."); + } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index f5678d5d79..bb408d4dcb 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -838,15 +838,19 @@ enum { F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask is used to to obtain all flags that are valid for setting. */ - /* - * The following flags are only present in the FILE_NAME attribute (in + * The following flag is only present in the FILE_NAME attribute (in * the field file_attributes). */ FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this is a directory or not, i.e. whether it has an index root attribute or not. */ + /* + * The following flag is present both in the STANDARD_INFORMATION + * attribute and in the FILE_NAME attribute (in the field + * file_attributes). + */ FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this file has a view index present (eg. object id @@ -1071,9 +1075,15 @@ typedef struct { modified. */ /* 20*/ sle64 last_access_time; /* Time this mft record was last accessed. */ -/* 28*/ sle64 allocated_size; /* Byte size of allocated space for the - data attribute. NOTE: Is a multiple - of the cluster size. */ +/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space + for the data attribute. So for + normal $DATA, this is the + allocated_size from the unnamed + $DATA attribute and for compressed + and/or sparse $DATA, this is the + compressed_size from the unnamed + $DATA attribute. NOTE: This is a + multiple of the cluster size. */ /* 30*/ sle64 data_size; /* Byte size of actual data in data attribute. */ /* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ @@ -1904,12 +1914,13 @@ enum { VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010), VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020), + VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000), VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000), - VOLUME_FLAGS_MASK = const_cpu_to_le16(0x803f), + VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f), /* To make our life easier when checking if we must mount read-only. */ - VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0x8027), + VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027), } __attribute__ ((__packed__)); typedef le16 VOLUME_FLAGS; -- cgit v1.2.2 From 78af34f03d33d2ba179c9d35685860170b94a285 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Fri, 24 Feb 2006 10:32:33 +0000 Subject: NTFS: Implement support for sector sizes above 512 bytes (up to the maximum supported by NTFS which is 4096 bytes). --- fs/ntfs/ChangeLog | 6 +++ fs/ntfs/aops.c | 18 +++---- fs/ntfs/file.c | 8 +-- fs/ntfs/mft.c | 8 +-- fs/ntfs/super.c | 151 +++++++++++++++++++++++++++++++++++------------------- 5 files changed, 121 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index 4a62201e84..e66b4ac2fa 100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog @@ -21,8 +21,14 @@ ToDo/Notes: 2.1.26 - Minor bug fixes and updates. + - Fix a potential overflow in file.c where a cast to s64 was missing in + a left shift of a page index. + - The struct inode has had its i_sem semaphore changed to a mutex named + i_mutex. - We have struct kmem_cache now so use it instead of the typedef kmem_cache_t. (Pekka Enberg) + - Implement support for sector sizes above 512 bytes (up to the maximum + supported by NTFS which is 4096 bytes). - Miscellaneous updates to layout.h. - Cope with attribute list attribute having invalid flags. Windows copes with this and even chkdsk does not detect or fix this so we diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 1c0a431587..7e361da770 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -2,7 +2,7 @@ * aops.c - NTFS kernel address space operations and page cache handling. * Part of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -200,8 +200,8 @@ static int ntfs_read_block(struct page *page) /* $MFT/$DATA must have its complete runlist in memory at all times. */ BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); - blocksize_bits = VFS_I(ni)->i_blkbits; - blocksize = 1 << blocksize_bits; + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; if (!page_has_buffers(page)) { create_empty_buffers(page, blocksize, 0); @@ -569,10 +569,8 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) BUG_ON(!NInoNonResident(ni)); BUG_ON(NInoMstProtected(ni)); - - blocksize_bits = vi->i_blkbits; - blocksize = 1 << blocksize_bits; - + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; if (!page_has_buffers(page)) { BUG_ON(!PageUptodate(page)); create_empty_buffers(page, blocksize, @@ -949,8 +947,8 @@ static int ntfs_write_mst_block(struct page *page, */ BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); - bh_size_bits = vi->i_blkbits; - bh_size = 1 << bh_size_bits; + bh_size = vol->sb->s_blocksize; + bh_size_bits = vol->sb->s_blocksize_bits; max_bhs = PAGE_CACHE_SIZE / bh_size; BUG_ON(!max_bhs); BUG_ON(max_bhs > MAX_BUF_PER_PAGE); @@ -1596,7 +1594,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { BUG_ON(!PageUptodate(page)); end = ofs + ni->itype.index.block_size; - bh_size = 1 << VFS_I(ni)->i_blkbits; + bh_size = VFS_I(ni)->i_sb->s_blocksize; spin_lock(&mapping->private_lock); if (unlikely(!page_has_buffers(page))) { spin_unlock(&mapping->private_lock); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3a119a8768..5027d3d1b3 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1,7 +1,7 @@ /* * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -529,8 +529,8 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", vi->i_ino, ni->type, pages[0]->index, nr_pages, (long long)pos, bytes); - blocksize_bits = vi->i_blkbits; - blocksize = 1 << blocksize_bits; + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; u = 0; do { struct page *page = pages[u]; @@ -1525,7 +1525,7 @@ static inline int ntfs_commit_pages_after_non_resident_write( vi = pages[0]->mapping->host; ni = NTFS_I(vi); - blocksize = 1 << vi->i_blkbits; + blocksize = vi->i_sb->s_blocksize; end = pos + bytes; u = 0; do { diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 0c65cbb8c5..6499aafc22 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1,7 +1,7 @@ /** * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -473,7 +473,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, runlist_element *rl; unsigned int block_start, block_end, m_start, m_end, page_ofs; int i_bhs, nr_bhs, err = 0; - unsigned char blocksize_bits = vol->mftmirr_ino->i_blkbits; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; ntfs_debug("Entering for inode 0x%lx.", mft_no); BUG_ON(!max_bhs); @@ -672,8 +672,8 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) { ntfs_volume *vol = ni->vol; struct page *page = ni->page; - unsigned char blocksize_bits = vol->mft_ino->i_blkbits; - unsigned int blocksize = 1 << blocksize_bits; + unsigned int blocksize = vol->sb->s_blocksize; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; int max_bhs = vol->mft_record_size / blocksize; struct buffer_head *bhs[max_bhs]; struct buffer_head *bh, *head; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index e9c0d80dfa..489f704914 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@ /* * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * Copyright (c) 2001,2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -22,6 +22,7 @@ #include #include +#include #include #include #include /* For bdev_hardsect_size(). */ @@ -641,7 +642,7 @@ static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, { const char *read_err_str = "Unable to read %s boot sector."; struct buffer_head *bh_primary, *bh_backup; - long nr_blocks = NTFS_SB(sb)->nr_blocks; + sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; /* Try to read primary boot sector. */ if ((bh_primary = sb_bread(sb, 0))) { @@ -688,13 +689,18 @@ hotfix_primary_boot_sector: /* * If we managed to read sector zero and the volume is not * read-only, copy the found, valid backup boot sector to the - * primary boot sector. + * primary boot sector. Note we only copy the actual boot + * sector structure, not the actual whole device sector as that + * may be bigger and would potentially damage the $Boot system + * file (FIXME: Would be nice to know if the backup boot sector + * on a large sector device contains the whole boot loader or + * just the first 512 bytes). */ if (!(sb->s_flags & MS_RDONLY)) { ntfs_warning(sb, "Hot-fix: Recovering invalid primary " "boot sector from backup copy."); memcpy(bh_primary->b_data, bh_backup->b_data, - sb->s_blocksize); + NTFS_BLOCK_SIZE); mark_buffer_dirty(bh_primary); sync_dirty_buffer(bh_primary); if (buffer_uptodate(bh_primary)) { @@ -733,9 +739,13 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) vol->sector_size); ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, vol->sector_size_bits); - if (vol->sector_size != vol->sb->s_blocksize) - ntfs_warning(vol->sb, "The boot sector indicates a sector size " - "different from the device sector size."); + if (vol->sector_size < vol->sb->s_blocksize) { + ntfs_error(vol->sb, "Sector size (%i) is smaller than the " + "device block size (%lu). This is not " + "supported. Sorry.", vol->sector_size, + vol->sb->s_blocksize); + return FALSE; + } ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; ntfs_debug("sectors_per_cluster_bits = 0x%x", @@ -748,16 +758,11 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, vol->cluster_size); ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); - ntfs_debug("vol->cluster_size_bits = %i (0x%x)", - vol->cluster_size_bits, vol->cluster_size_bits); - if (vol->sector_size > vol->cluster_size) { - ntfs_error(vol->sb, "Sector sizes above the cluster size are " - "not supported. Sorry."); - return FALSE; - } - if (vol->sb->s_blocksize > vol->cluster_size) { - ntfs_error(vol->sb, "Cluster sizes smaller than the device " - "sector size are not supported. Sorry."); + ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); + if (vol->cluster_size < vol->sector_size) { + ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->cluster_size, vol->sector_size); return FALSE; } clusters_per_mft_record = b->clusters_per_mft_record; @@ -786,11 +791,18 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) * we store $MFT/$DATA, the table of mft records in the page cache. */ if (vol->mft_record_size > PAGE_CACHE_SIZE) { - ntfs_error(vol->sb, "Mft record size %i (0x%x) exceeds the " - "page cache size on your system %lu (0x%lx). " + ntfs_error(vol->sb, "Mft record size (%i) exceeds the " + "PAGE_CACHE_SIZE on your system (%lu). " "This is not supported. Sorry.", - vol->mft_record_size, vol->mft_record_size, - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE); + vol->mft_record_size, PAGE_CACHE_SIZE); + return FALSE; + } + /* We cannot support mft record sizes below the sector size. */ + if (vol->mft_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->mft_record_size, + vol->sector_size); return FALSE; } clusters_per_index_record = b->clusters_per_index_record; @@ -816,6 +828,14 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) ntfs_debug("vol->index_record_size_bits = %i (0x%x)", vol->index_record_size_bits, vol->index_record_size_bits); + /* We cannot support index record sizes below the sector size. */ + if (vol->index_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Index record size (%i) is smaller than " + "the sector size (%i). This is not " + "supported. Sorry.", vol->index_record_size, + vol->sector_size); + return FALSE; + } /* * Get the size of the volume in clusters and check for 64-bit-ness. * Windows currently only uses 32 bits to save the clusters so we do @@ -845,15 +865,18 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) } ll = sle64_to_cpu(b->mft_lcn); if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFT LCN is beyond end of volume. Weird."); + ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " + "volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); return FALSE; } vol->mft_lcn = ll; ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); ll = sle64_to_cpu(b->mftmirr_lcn); if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFTMirr LCN is beyond end of volume. " - "Weird."); + ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " + "of volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); return FALSE; } vol->mftmirr_lcn = ll; @@ -2685,7 +2708,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_volume *vol; struct buffer_head *bh; struct inode *tmp_ino; - int result; + int blocksize, result; ntfs_debug("Entering."); #ifndef NTFS_RW @@ -2724,60 +2747,85 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) if (!parse_options(vol, (char*)opt)) goto err_out_now; + /* We support sector sizes up to the PAGE_CACHE_SIZE. */ + if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + if (!silent) + ntfs_error(sb, "Device has unsupported sector size " + "(%i). The maximum supported sector " + "size on this architecture is %lu " + "bytes.", + bdev_hardsect_size(sb->s_bdev), + PAGE_CACHE_SIZE); + goto err_out_now; + } /* - * TODO: Fail safety check. In the future we should really be able to - * cope with this being the case, but for now just bail out. + * Setup the device access block size to NTFS_BLOCK_SIZE or the hard + * sector size, whichever is bigger. */ - if (bdev_hardsect_size(sb->s_bdev) > NTFS_BLOCK_SIZE) { + blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); + if (blocksize < NTFS_BLOCK_SIZE) { if (!silent) - ntfs_error(sb, "Device has unsupported hardsect_size."); + ntfs_error(sb, "Unable to set device block size."); goto err_out_now; } - - /* Setup the device access block size to NTFS_BLOCK_SIZE. */ - if (sb_set_blocksize(sb, NTFS_BLOCK_SIZE) != NTFS_BLOCK_SIZE) { + BUG_ON(blocksize != sb->s_blocksize); + ntfs_debug("Set device block size to %i bytes (block size bits %i).", + blocksize, sb->s_blocksize_bits); + /* Determine the size of the device in units of block_size bytes. */ + if (!i_size_read(sb->s_bdev->bd_inode)) { if (!silent) - ntfs_error(sb, "Unable to set block size."); + ntfs_error(sb, "Unable to determine device size."); goto err_out_now; } - - /* Get the size of the device in units of NTFS_BLOCK_SIZE bytes. */ vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> - NTFS_BLOCK_SIZE_BITS; - + sb->s_blocksize_bits; /* Read the boot sector and return unlocked buffer head to it. */ if (!(bh = read_ntfs_boot_sector(sb, silent))) { if (!silent) ntfs_error(sb, "Not an NTFS volume."); goto err_out_now; } - /* - * Extract the data from the boot sector and setup the ntfs super block + * Extract the data from the boot sector and setup the ntfs volume * using it. */ result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); - - /* Initialize the cluster and mft allocators. */ - ntfs_setup_allocators(vol); - brelse(bh); - if (!result) { if (!silent) ntfs_error(sb, "Unsupported NTFS filesystem."); goto err_out_now; } - /* - * TODO: When we start coping with sector sizes different from - * NTFS_BLOCK_SIZE, we now probably need to set the blocksize of the - * device (probably to NTFS_BLOCK_SIZE). + * If the boot sector indicates a sector size bigger than the current + * device block size, switch the device block size to the sector size. + * TODO: It may be possible to support this case even when the set + * below fails, we would just be breaking up the i/o for each sector + * into multiple blocks for i/o purposes but otherwise it should just + * work. However it is safer to leave disabled until someone hits this + * error message and then we can get them to try it without the setting + * so we know for sure that it works. */ - + if (vol->sector_size > blocksize) { + blocksize = sb_set_blocksize(sb, vol->sector_size); + if (blocksize != vol->sector_size) { + if (!silent) + ntfs_error(sb, "Unable to set device block " + "size to sector size (%i).", + vol->sector_size); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; + ntfs_debug("Changed device block size to %i bytes (block size " + "bits %i) to match volume sector size.", + blocksize, sb->s_blocksize_bits); + } + /* Initialize the cluster and mft allocators. */ + ntfs_setup_allocators(vol); /* Setup remaining fields in the super block. */ sb->s_magic = NTFS_SB_MAGIC; - /* * Ntfs allows 63 bits for the file size, i.e. correct would be: * sb->s_maxbytes = ~0ULL >> 1; @@ -2787,9 +2835,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) * without overflowing the index or to 2^63 - 1, whichever is smaller. */ sb->s_maxbytes = MAX_LFS_FILESIZE; - + /* Ntfs measures time in 100ns intervals. */ sb->s_time_gran = 100; - /* * Now load the metadata required for the page cache and our address * space operations to function. We do this by setting up a specialised -- cgit v1.2.2 From 1cf3109ffb26a6ea572fd02436bd10458b4b2187 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Fri, 24 Feb 2006 10:48:14 +0000 Subject: NTFS: Do more detailed reporting of why we cannot mount read-write by special casing the VOLUME_MODIFIED_BY_CHKDSK flag. Signed-off-by: Anton Altaparmakov --- fs/ntfs/ChangeLog | 2 ++ fs/ntfs/Makefile | 2 +- fs/ntfs/super.c | 34 +++++++++++++++++++++++++++------- fs/ntfs/upcase.c | 10 ++++------ fs/ntfs/volume.h | 28 +++++++++++++--------------- 5 files changed, 47 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index e66b4ac2fa..9d8ffa89e2 100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog @@ -29,6 +29,8 @@ ToDo/Notes: kmem_cache_t. (Pekka Enberg) - Implement support for sector sizes above 512 bytes (up to the maximum supported by NTFS which is 4096 bytes). + - Do more detailed reporting of why we cannot mount read-write by + special casing the VOLUME_MODIFIED_BY_CHKDSK flag. - Miscellaneous updates to layout.h. - Cope with attribute list attribute having invalid flags. Windows copes with this and even chkdsk does not detect or fix this so we diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index d0d45d1c85..d95fac7fde 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ unistr.o upcase.o -EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.25\" +EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.26\" ifeq ($(CONFIG_NTFS_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 489f704914..368a8ec106 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -472,9 +472,16 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_error(sb, "Volume is dirty and read-only%s", es); return -EROFS; } + if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + ntfs_error(sb, "Volume has been modified by chkdsk " + "and is read-only%s", es); + return -EROFS; + } if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - ntfs_error(sb, "Volume has unsupported flags set and " - "is read-only%s", es); + ntfs_error(sb, "Volume has unsupported flags set " + "(0x%x) and is read-only%s", + (unsigned)le16_to_cpu(vol->vol_flags), + es); return -EROFS; } if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { @@ -1845,11 +1852,24 @@ get_ctx_vol_failed: /* Make sure that no unsupported volume flags are set. */ if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { static const char *es1a = "Volume is dirty"; - static const char *es1b = "Volume has unsupported flags set"; - static const char *es2 = ". Run chkdsk and mount in Windows."; - const char *es1; - - es1 = vol->vol_flags & VOLUME_IS_DIRTY ? es1a : es1b; + static const char *es1b = "Volume has been modified by chkdsk"; + static const char *es1c = "Volume has unsupported flags set"; + static const char *es2a = ". Run chkdsk and mount in Windows."; + static const char *es2b = ". Mount in Windows."; + const char *es1, *es2; + + es2 = es2a; + if (vol->vol_flags & VOLUME_IS_DIRTY) + es1 = es1a; + else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + es1 = es1b; + es2 = es2b; + } else { + es1 = es1c; + ntfs_warning(sb, "Unsupported volume flags 0x%x " + "encountered.", + (unsigned)le16_to_cpu(vol->vol_flags)); + } /* If a read-write mount, convert it to a read-only mount. */ if (!(sb->s_flags & MS_RDONLY)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c index 879cdf1d5b..9101807dc8 100644 --- a/fs/ntfs/upcase.c +++ b/fs/ntfs/upcase.c @@ -3,10 +3,7 @@ * Part of the Linux-NTFS project. * * Copyright (c) 2001 Richard Russon - * Copyright (c) 2001-2004 Anton Altaparmakov - * - * Modified for mkntfs inclusion 9 June 2001 by Anton Altaparmakov. - * Modified for kernel inclusion 10 September 2001 by Anton Altparmakov. + * Copyright (c) 2001-2006 Anton Altaparmakov * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -75,12 +72,13 @@ ntfschar *generate_default_upcase(void) if (!uc) return uc; memset(uc, 0, default_upcase_len * sizeof(ntfschar)); + /* Generate the little endian Unicode upcase table used by ntfs. */ for (i = 0; i < default_upcase_len; i++) uc[i] = cpu_to_le16(i); for (r = 0; uc_run_table[r][0]; r++) for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) - uc[i] = cpu_to_le16((le16_to_cpu(uc[i]) + - uc_run_table[r][2])); + uc[i] = cpu_to_le16(le16_to_cpu(uc[i]) + + uc_run_table[r][2]); for (r = 0; uc_dup_table[r][0]; r++) for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1); diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h index 375cd20a9f..406ab55dfb 100644 --- a/fs/ntfs/volume.h +++ b/fs/ntfs/volume.h @@ -2,7 +2,7 @@ * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part * of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -41,10 +41,8 @@ typedef struct { * structure has stabilized... (AIA) */ /* Device specifics. */ - struct super_block *sb; /* Pointer back to the super_block, - so we don't have to get the offset - every time. */ - LCN nr_blocks; /* Number of NTFS_BLOCK_SIZE bytes + struct super_block *sb; /* Pointer back to the super_block. */ + LCN nr_blocks; /* Number of sb->s_blocksize bytes sized blocks on the device. */ /* Configuration provided by user at mount time. */ unsigned long flags; /* Miscellaneous flags, see below. */ @@ -141,8 +139,8 @@ typedef enum { NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ NV_CaseSensitive, /* 1: Treat file names as case sensitive and create filenames in the POSIX namespace. - Otherwise be case insensitive and create - file names in WIN32 namespace. */ + Otherwise be case insensitive but still + create file names in POSIX namespace. */ NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ @@ -153,7 +151,7 @@ typedef enum { * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() * functions. */ -#define NVOL_FNS(flag) \ +#define DEFINE_NVOL_BIT_OPS(flag) \ static inline int NVol##flag(ntfs_volume *vol) \ { \ return test_bit(NV_##flag, &(vol)->flags); \ @@ -168,12 +166,12 @@ static inline void NVolClear##flag(ntfs_volume *vol) \ } /* Emit the ntfs volume bitops functions. */ -NVOL_FNS(Errors) -NVOL_FNS(ShowSystemFiles) -NVOL_FNS(CaseSensitive) -NVOL_FNS(LogFileEmpty) -NVOL_FNS(QuotaOutOfDate) -NVOL_FNS(UsnJrnlStamped) -NVOL_FNS(SparseEnabled) +DEFINE_NVOL_BIT_OPS(Errors) +DEFINE_NVOL_BIT_OPS(ShowSystemFiles) +DEFINE_NVOL_BIT_OPS(CaseSensitive) +DEFINE_NVOL_BIT_OPS(LogFileEmpty) +DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) +DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) +DEFINE_NVOL_BIT_OPS(SparseEnabled) #endif /* _LINUX_NTFS_VOLUME_H */ -- cgit v1.2.2 From c04030e16dbea2f7581f82cc6688695927f6ac5b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 24 Feb 2006 13:04:21 -0800 Subject: [PATCH] flags parameter for linkat I'm currently at the POSIX meeting and one thing covered was the incompatibility of Linux's link() with the POSIX definition. The name. Linux does not follow symlinks, POSIX requires it does. Even if somebody thinks this is a good default behavior we cannot change this because it would break the ABI. But the fact remains that some application might want this behavior. We have one chance to help implementing this without breaking the behavior. For this we could use the new linkat interface which would need a new flags parameter. If the new parameter is AT_SYMLINK_FOLLOW the new behavior could be invoked. I do not want to introduce such a patch now. But we could add the parameter now, just don't use it. The patch below would do this. Can we get this late patch applied before the release more or less fixes the syscall API? Signed-off-by: Ulrich Drepper Signed-off-by: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index e28de846c5..557dcf395c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2224,13 +2224,17 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de * and other special files. --ADM */ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, - int newdfd, const char __user *newname) + int newdfd, const char __user *newname, + int flags) { struct dentry *new_dentry; struct nameidata nd, old_nd; int error; char * to; + if (flags != 0) + return -EINVAL; + to = getname(newname); if (IS_ERR(to)) return PTR_ERR(to); @@ -2263,7 +2267,7 @@ exit: asmlinkage long sys_link(const char __user *oldname, const char __user *newname) { - return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname); + return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } /* -- cgit v1.2.2 From 8dde0509e74ff6044cf1788c917a22facce9f68d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Feb 2006 13:04:23 -0800 Subject: [PATCH] ramfs: update dir mtime and ctime Phil Marek points out that ramfs forgets to update a directory's mtime and ctime when it is modified. Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index c66bd5e4c0..cde5d48994 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; + dir->i_mtime = dir->i_ctime = CURRENT_TIME; } return error; } -- cgit v1.2.2 From 5342fba5412cead88b61ead07168615dbeba1ee3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Sun, 26 Feb 2006 04:18:28 +0100 Subject: [PATCH] x86_64: Check for bad elf entry address. Fixes a local DOS on Intel systems that lead to an endless recursive fault. AMD machines don't seem to be affected. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1b117a4412..c2eac2a50b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -938,6 +938,11 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) kfree(elf_interpreter); } else { elf_entry = loc->elf_ex.e_entry; + if (BAD_ADDR(elf_entry)) { + send_sig(SIGSEGV, current, 0); + retval = -ENOEXEC; /* Nobody gets to see this, but.. */ + goto out_free_dentry; + } } kfree(elf_phdata); -- cgit v1.2.2 From fc5870f66279fabedc9dbba7c28451bbb8f47778 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 26 Feb 2006 04:18:55 +0100 Subject: [PATCH] x86_64: Fix ioctl compat code for /dev/rtc RTC_IRQP_SET/RTC_EPOCH_SET don't take a pointer to an argument, but the argument itself. This actually simplifies the code and makes it work. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 057e60217f..537ac70edf 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -2531,18 +2531,9 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) val32 = kval; return put_user(val32, (unsigned int __user *)arg); case RTC_IRQP_SET32: + return sys_ioctl(fd, RTC_IRQP_SET, arg); case RTC_EPOCH_SET32: - ret = get_user(val32, (unsigned int __user *)arg); - if (ret) - return ret; - kval = val32; - - set_fs(KERNEL_DS); - ret = sys_ioctl(fd, (cmd == RTC_IRQP_SET32) ? - RTC_IRQP_SET : RTC_EPOCH_SET, - (unsigned long)&kval); - set_fs(oldfs); - return ret; + return sys_ioctl(fd, RTC_EPOCH_SET, arg); default: /* unreached */ return -ENOIOCTLCMD; -- cgit v1.2.2 From 07ff2fa8fcb3d9207f1c16e5acf9086d5731ed8b Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 28 Feb 2006 12:29:51 +1100 Subject: [XFS] Fix a realtime allocator regression introduced by an old iget race fix. Noticed by Roger Willcocks. SGI-PV: 949821 SGI-Modid: xfs-linux-melb:xfs-kern:25257a Signed-off-by: Nathan Scott --- fs/xfs/xfs_rtalloc.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 06fc061c50..5b413946b1 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -130,7 +130,8 @@ xfs_growfs_rt_alloc( /* * Lock the inode. */ - if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) + if ((error = xfs_trans_iget(mp, tp, ino, 0, + XFS_ILOCK_EXCL, &ip))) goto error_exit; XFS_BMAP_INIT(&flist, &firstblock); /* @@ -170,8 +171,8 @@ xfs_growfs_rt_alloc( /* * Lock the bitmap inode. */ - if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, - &ip))) + if ((error = xfs_trans_iget(mp, tp, ino, 0, + XFS_ILOCK_EXCL, &ip))) goto error_exit; /* * Get a buffer for the block. @@ -2023,8 +2024,8 @@ xfs_growfs_rt( /* * Lock out other callers by grabbing the bitmap inode lock. */ - if ((error = xfs_trans_iget(mp, tp, 0, mp->m_sb.sb_rbmino, - XFS_ILOCK_EXCL, &ip))) + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, + XFS_ILOCK_EXCL, &ip))) goto error_exit; ASSERT(ip == mp->m_rbmip); /* @@ -2037,8 +2038,8 @@ xfs_growfs_rt( /* * Get the summary inode into the transaction. */ - if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, - 0, XFS_ILOCK_EXCL, &ip))) + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, + XFS_ILOCK_EXCL, &ip))) goto error_exit; ASSERT(ip == mp->m_rsumip); /* @@ -2158,10 +2159,9 @@ xfs_rtallocate_extent( /* * Lock out other callers by grabbing the bitmap inode lock. */ - error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); - if (error) { + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, + XFS_ILOCK_EXCL, &ip))) return error; - } sumbp = NULL; /* * Allocate by size, or near another block, or exactly at some block. @@ -2221,10 +2221,9 @@ xfs_rtfree_extent( /* * Synchronize by locking the bitmap inode. */ - error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); - if (error) { + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, + XFS_ILOCK_EXCL, &ip))) return error; - } #if defined(__KERNEL__) && defined(DEBUG) /* * Check to see that this whole range is currently allocated. @@ -2365,8 +2364,8 @@ xfs_rtpick_extent( __uint64_t seq; /* sequence number of file creation */ __uint64_t *seqp; /* pointer to seqno in inode */ - error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); - if (error) + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, + XFS_ILOCK_EXCL, &ip))) return error; ASSERT(ip == mp->m_rbmip); seqp = (__uint64_t *)&ip->i_d.di_atime; -- cgit v1.2.2 From dae81d4774ecbeb7d24bb9a6a4db9f9baee54d85 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 28 Feb 2006 12:30:13 +1100 Subject: [XFS] Reduce stack use during quota mounts (caused a panic). This regressed recently via the fix for inherited quota inode attributes. SGI-PV: 947312 SGI-Modid: xfs-linux-melb:xfs-kern:25318a Signed-off-by: Nathan Scott --- fs/xfs/quota/xfs_qm.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 53a00fb217..7c0e39dc61 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -68,6 +68,9 @@ kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; STATIC kmem_shaker_t xfs_qm_shaker; +STATIC cred_t xfs_zerocr; +STATIC xfs_inode_t xfs_zeroino; + STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); @@ -1393,8 +1396,6 @@ xfs_qm_qino_alloc( xfs_trans_t *tp; int error; unsigned long s; - cred_t zerocr; - xfs_inode_t zeroino; int committed; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); @@ -1406,11 +1407,9 @@ xfs_qm_qino_alloc( xfs_trans_cancel(tp, 0); return error; } - memset(&zerocr, 0, sizeof(zerocr)); - memset(&zeroino, 0, sizeof(zeroino)); - if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0, - &zerocr, 0, 1, ip, &committed))) { + if ((error = xfs_dir_ialloc(&tp, &xfs_zeroino, S_IFREG, 1, 0, + &xfs_zerocr, 0, 1, ip, &committed))) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); return error; -- cgit v1.2.2 From 2353e8e9b6ae29aad77935f21735a30f5cc419b4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 28 Feb 2006 12:30:30 +1100 Subject: [XFS] Don't map non-uptodate buffers in xfs_probe_cluster; also fixes obscure corruption case SGI-PV: 942658 SGI-Modid: xfs-linux-melb:xfs-kern:207119a Signed-off-by: Eric Sandeen Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 8f2beec526..74d8be87f9 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -540,7 +540,7 @@ xfs_probe_cluster( /* First sum forwards in this page */ do { - if (mapped != buffer_mapped(bh)) + if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) return total; total += bh->b_size; } while ((bh = bh->b_this_page) != head); -- cgit v1.2.2 From 50322fe7d46b544d5649edb58bdbe5c95dd44b98 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 28 Feb 2006 16:59:03 -0800 Subject: [PATCH] fuse: fix bug in negative lookup If negative entries (nodeid == 0) were sent in reply to LOOKUP requests, two bugs could be triggered: - looking up a negative entry would return -EIO, - revaildate on an entry which turned negative would send a FORGET request with zero nodeid, which would cause an abort() in the library. The above would only happen if the 'negative_timeout=N' option was used, otherwise lookups reply -ENOENT, which worked correctly. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 21fd59c7bc..c72a8a9793 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -111,6 +111,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) /* Doesn't hurt to "reset" the validity timeout */ fuse_invalidate_entry_cache(entry); + + /* For negative dentries, always do a fresh lookup */ if (!inode) return 0; @@ -122,6 +124,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); request_send(fc, req); err = req->out.h.error; + /* Zero nodeid is same as -ENOENT */ + if (!err && !outarg.nodeid) + err = -ENOENT; if (!err) { struct fuse_inode *fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { @@ -190,8 +195,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); err = req->out.h.error; - if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) || - !valid_mode(outarg.attr.mode))) + /* Zero nodeid is same as -ENOENT, but with valid timeout */ + if (!err && outarg.nodeid && + (invalid_nodeid(outarg.nodeid) || !valid_mode(outarg.attr.mode))) err = -EIO; if (!err && outarg.nodeid) { inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, -- cgit v1.2.2 From 0551fbd29e16fccd46e41b7d01bf0f8f39b14212 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 28 Feb 2006 16:59:19 -0800 Subject: [PATCH] Add mm->task_size and fix powerpc vdso This patch adds mm->task_size to keep track of the task size of a given mm and uses that to fix the powerpc vdso so that it uses the mm task size to decide what pages to fault in instead of the current thread flags (which broke when ptracing). (akpm: I expect that mm_struct.task_size will become the way in which we finally sort out the confusion between 32-bit processes and 32-bit mm's. It may need tweaks, but at this stage this patch is powerpc-only.) Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 0e1c95074d..0b515ac531 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -885,6 +885,12 @@ int flush_old_exec(struct linux_binprm * bprm) current->flags &= ~PF_RANDOMIZE; flush_thread(); + /* Set the new mm task size. We have to do that late because it may + * depend on TIF_32BIT which is only updated in flush_thread() on + * some architectures like powerpc + */ + current->mm->task_size = TASK_SIZE; + if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || file_permission(bprm->file, MAY_READ) || (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { -- cgit v1.2.2 From 6b7a6c94c9c15b2664b568ead83e6b3aaf60d65c Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 21 Feb 2006 11:57:30 -0500 Subject: [PATCH] ocfs2: fix -Wformat warnings when building UML on x86-64 The check to determine which format string is appopriate for u64 and friends works in most cases, but UML on x86_64 doesn't define CONFIG_X86_64, so it results in screen fulls of compile-time warnings. This patch fixes it to handle that case. fs/ocfs2/cluster/masklog.h | 2 +- 1 files changed, 1 insertion(+), 1 deletion(-) Signed-off-by: Jeff Mahoney Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/masklog.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index e8c56a3d9c..2cadc3009c 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -256,7 +256,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; } \ } while (0) -#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) +#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) || (defined(CONFIG_UML_X86) && defined(CONFIG_64BIT)) #define MLFi64 "lld" #define MLFu64 "llu" #define MLFx64 "llx" -- cgit v1.2.2 From d3178bcdd41b050e221337d7f5e30b3c58d4015a Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 24 Feb 2006 17:23:36 -0800 Subject: [PATCH] ocfs2: remove pointless max journal size limit Signed-off-by: Mark Fasheh --- fs/ocfs2/ocfs2_fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index dfb8a5bedf..c5b1ac547c 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -138,7 +138,6 @@ /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) -#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024) struct ocfs2_system_inode_info { char *si_name; -- cgit v1.2.2 From d267a56c883b350a2fa80f1daf4636809e3f8e67 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 23 Feb 2006 13:23:39 -0800 Subject: [PATCH] ocfs2: remove unused code Remove some #ifdef'd out code which was inadvertantly introduced in our initial merge. Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 51 +-------------------------------------------------- fs/ocfs2/ocfs2.h | 3 --- 2 files changed, 1 insertion(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1715bc90e7..8a4048b55f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -933,9 +933,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, struct file *filp = iocb->ki_filp; struct inode *inode = filp->f_dentry->d_inode; loff_t newsize, saved_pos; -#ifdef OCFS2_ORACORE_WORKAROUNDS - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -#endif mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, (unsigned int)count, @@ -951,14 +948,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, return -EIO; } -#ifdef OCFS2_ORACORE_WORKAROUNDS - /* ugh, work around some applications which open everything O_DIRECT + - * O_APPEND and really don't mean to use O_DIRECT. */ - if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && - (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) - filp->f_flags &= ~O_DIRECT; -#endif - mutex_lock(&inode->i_mutex); /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ if (filp->f_flags & O_DIRECT) { @@ -1079,27 +1068,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb); -#ifdef OCFS2_ORACORE_WORKAROUNDS - if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && - filp->f_flags & O_DIRECT) { - unsigned int saved_flags = filp->f_flags; - int sector_size = 1 << osb->s_sectsize_bits; - - if ((saved_pos & (sector_size - 1)) || - (count & (sector_size - 1)) || - ((unsigned long)buf & (sector_size - 1))) { - filp->f_flags |= O_SYNC; - filp->f_flags &= ~O_DIRECT; - } - - ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, - &iocb->ki_pos); - - filp->f_flags = saved_flags; - } else -#endif - ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, - &iocb->ki_pos); + ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); @@ -1140,9 +1109,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, int ret = 0, rw_level = -1, have_alloc_sem = 0; struct file *filp = iocb->ki_filp; struct inode *inode = filp->f_dentry->d_inode; -#ifdef OCFS2_ORACORE_WORKAROUNDS - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -#endif mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, (unsigned int)count, @@ -1155,21 +1121,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, goto bail; } -#ifdef OCFS2_ORACORE_WORKAROUNDS - if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) { - if (filp->f_flags & O_DIRECT) { - int sector_size = 1 << osb->s_sectsize_bits; - - if ((pos & (sector_size - 1)) || - (count & (sector_size - 1)) || - ((unsigned long)buf & (sector_size - 1)) || - (i_size_read(inode) & (sector_size -1))) { - filp->f_flags &= ~O_DIRECT; - } - } - } -#endif - /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8d8e4779df..19360e3d84 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -174,9 +174,6 @@ enum ocfs2_mount_options OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ -#ifdef OCFS2_ORACORE_WORKAROUNDS - OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */ -#endif }; #define OCFS2_OSB_SOFT_RO 0x0001 -- cgit v1.2.2 From 362342f68e331f080d0438f08af1e2c570b0b5fe Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 21 Feb 2006 16:46:33 -0800 Subject: [PATCH] ocfs2: remove non existing function prototypes Remove some prototypes from tcp.h for functions which have long been gone. Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/tcp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index a6f4585501..616ff2b843 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -85,13 +85,10 @@ enum { O2NET_DRIVER_READY, }; -int o2net_init_tcp_sock(struct inode *inode); int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, u8 target_node, int *status); int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, size_t veclen, u8 target_node, int *status); -int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, - struct inode *group); int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, o2net_msg_handler_func *func, void *data, @@ -107,7 +104,5 @@ void o2net_disconnect_node(struct o2nm_node *node); int o2net_init(void); void o2net_exit(void); -int o2net_proc_init(struct proc_dir_entry *parent); -void o2net_proc_exit(struct proc_dir_entry *parent); #endif /* O2CLUSTER_TCP_H */ -- cgit v1.2.2 From 895928b8380cc697ac56e9732cedf549c0a4f79c Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 21 Feb 2006 16:54:00 -0800 Subject: [PATCH] ocfs2: complete failure recovery for nodemanager init This patch finishes cleaning up the node manager allocations if it fails to initialize. Signed-off-by: Jeff Mahoney Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/nodemanager.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index cf7828f233..e1fceb8aa3 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -756,7 +756,7 @@ static int __init init_o2nm(void) if (!ocfs2_table_header) { printk(KERN_ERR "nodemanager: unable to register sysctl\n"); ret = -ENOMEM; /* or something. */ - goto out; + goto out_o2net; } ret = o2net_register_hb_callbacks(); @@ -780,6 +780,8 @@ out_callbacks: o2net_unregister_hb_callbacks(); out_sysctl: unregister_sysctl_table(ocfs2_table_header); +out_o2net: + o2net_exit(); out: return ret; } -- cgit v1.2.2 From b4df6ed8db0c387d38292e31f00adc4cd297ed5a Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 22 Feb 2006 17:35:08 -0800 Subject: [PATCH] ocfs2: fix orphan recovery deadlock Orphan dir recovery can deadlock with another process in ocfs2_delete_inode() in some corner cases. Fix this by tracking recovery state more closely and allowing it to handle inode wipes which might deadlock. Signed-off-by: Mark Fasheh --- fs/ocfs2/heartbeat.c | 1 + fs/ocfs2/inode.c | 46 ++++++++++++++++++- fs/ocfs2/journal.c | 124 ++++++++++++++++++++++++++++++++++++++------------- fs/ocfs2/ocfs2.h | 4 ++ fs/ocfs2/super.c | 11 +++++ 5 files changed, 154 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 0bbd22f46c..cbfd45a97a 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -67,6 +67,7 @@ void ocfs2_init_node_maps(struct ocfs2_super *osb) ocfs2_node_map_init(&osb->mounted_map); ocfs2_node_map_init(&osb->recovery_map); ocfs2_node_map_init(&osb->umount_map); + ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); } static void ocfs2_do_node_down(int node_num, diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 8122489c57..315472a5c1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -41,6 +41,7 @@ #include "dlmglue.h" #include "extent_map.h" #include "file.h" +#include "heartbeat.h" #include "inode.h" #include "journal.h" #include "namei.h" @@ -544,6 +545,42 @@ bail: return status; } +/* + * Serialize with orphan dir recovery. If the process doing + * recovery on this orphan dir does an iget() with the dir + * i_mutex held, we'll deadlock here. Instead we detect this + * and exit early - recovery will wipe this inode for us. + */ +static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, + int slot) +{ + int ret = 0; + + spin_lock(&osb->osb_lock); + if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { + mlog(0, "Recovery is happening on orphan dir %d, will skip " + "this inode\n", slot); + ret = -EDEADLK; + goto out; + } + /* This signals to the orphan recovery process that it should + * wait for us to handle the wipe. */ + osb->osb_orphan_wipes[slot]++; +out: + spin_unlock(&osb->osb_lock); + return ret; +} + +static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + osb->osb_orphan_wipes[slot]--; + spin_unlock(&osb->osb_lock); + + wake_up(&osb->osb_wipe_event); +} + static int ocfs2_wipe_inode(struct inode *inode, struct buffer_head *di_bh) { @@ -555,6 +592,11 @@ static int ocfs2_wipe_inode(struct inode *inode, /* We've already voted on this so it should be readonly - no * spinlock needed. */ orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + + status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); + if (status) + return status; + orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, orphaned_slot); @@ -597,6 +639,7 @@ bail_unlock_dir: brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); + ocfs2_signal_wipe_completion(osb, orphaned_slot); return status; } @@ -822,7 +865,8 @@ void ocfs2_delete_inode(struct inode *inode) status = ocfs2_wipe_inode(inode, di_bh); if (status < 0) { - mlog_errno(status); + if (status != -EDEADLK) + mlog_errno(status); goto bail_unlock_inode; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d329c9df90..4be801f455 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1408,21 +1408,17 @@ bail: return status; } -static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot) +static int ocfs2_queue_orphans(struct ocfs2_super *osb, + int slot, + struct inode **head) { - int status = 0; - int have_disk_lock = 0; - struct inode *inode = NULL; - struct inode *iter; + int status; struct inode *orphan_dir_inode = NULL; + struct inode *iter; unsigned long offset, blk, local; struct buffer_head *bh = NULL; struct ocfs2_dir_entry *de; struct super_block *sb = osb->sb; - struct ocfs2_inode_info *oi; - - mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, @@ -1430,17 +1426,15 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, if (!orphan_dir_inode) { status = -ENOENT; mlog_errno(status); - goto out; - } + return status; + } mutex_lock(&orphan_dir_inode->i_mutex); status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); mlog_errno(status); goto out; } - have_disk_lock = 1; offset = 0; iter = NULL; @@ -1451,11 +1445,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, if (!bh) status = -EINVAL; if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); if (bh) brelse(bh); mlog_errno(status); - goto out; + goto out_unlock; } local = 0; @@ -1465,11 +1458,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, if (!ocfs2_check_dir_entry(orphan_dir_inode, de, bh, local)) { - mutex_unlock(&orphan_dir_inode->i_mutex); status = -EINVAL; mlog_errno(status); brelse(bh); - goto out; + goto out_unlock; } local += le16_to_cpu(de->rec_len); @@ -1504,18 +1496,95 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, mlog(0, "queue orphan %"MLFu64"\n", OCFS2_I(iter)->ip_blkno); - OCFS2_I(iter)->ip_next_orphan = inode; - inode = iter; + /* No locking is required for the next_orphan + * queue as there is only ever a single + * process doing orphan recovery. */ + OCFS2_I(iter)->ip_next_orphan = *head; + *head = iter; } brelse(bh); } - mutex_unlock(&orphan_dir_inode->i_mutex); +out_unlock: ocfs2_meta_unlock(orphan_dir_inode, 0); - have_disk_lock = 0; - +out: + mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); - orphan_dir_inode = NULL; + return status; +} + +static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb, + int slot) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = !osb->osb_orphan_wipes[slot]; + spin_unlock(&osb->osb_lock); + return ret; +} + +static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + /* Mark ourselves such that new processes in delete_inode() + * know to quit early. */ + ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot); + while (osb->osb_orphan_wipes[slot]) { + /* If any processes are already in the middle of an + * orphan wipe on this dir, then we need to wait for + * them. */ + spin_unlock(&osb->osb_lock); + wait_event_interruptible(osb->osb_wipe_event, + ocfs2_orphan_recovery_can_continue(osb, slot)); + spin_lock(&osb->osb_lock); + } + spin_unlock(&osb->osb_lock); +} + +static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot); +} + +/* + * Orphan recovery. Each mounted node has it's own orphan dir which we + * must run during recovery. Our strategy here is to build a list of + * the inodes in the orphan dir and iget/iput them. The VFS does + * (most) of the rest of the work. + * + * Orphan recovery can happen at any time, not just mount so we have a + * couple of extra considerations. + * + * - We grab as many inodes as we can under the orphan dir lock - + * doing iget() outside the orphan dir risks getting a reference on + * an invalid inode. + * - We must be sure not to deadlock with other processes on the + * system wanting to run delete_inode(). This can happen when they go + * to lock the orphan dir and the orphan recovery process attempts to + * iget() inside the orphan dir lock. This can be avoided by + * advertising our state to ocfs2_delete_inode(). + */ +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot) +{ + int ret = 0; + struct inode *inode = NULL; + struct inode *iter; + struct ocfs2_inode_info *oi; + + mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); + + ocfs2_mark_recovering_orphan_dir(osb, slot); + ret = ocfs2_queue_orphans(osb, slot, &inode); + ocfs2_clear_recovering_orphan_dir(osb, slot); + + /* Error here should be noted, but we want to continue with as + * many queued inodes as we've got. */ + if (ret) + mlog_errno(ret); while (inode) { oi = OCFS2_I(inode); @@ -1541,14 +1610,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, inode = iter; } -out: - if (have_disk_lock) - ocfs2_meta_unlock(orphan_dir_inode, 0); - - if (orphan_dir_inode) - iput(orphan_dir_inode); - - return status; + return ret; } static int ocfs2_wait_on_mount(struct ocfs2_super *osb) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 19360e3d84..e89de9b6e4 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -287,6 +287,10 @@ struct ocfs2_super struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct work_struct osb_truncate_log_wq; + + struct ocfs2_node_map osb_recovering_orphan_dirs; + unsigned int *osb_orphan_wipes; + wait_queue_head_t osb_wipe_event; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 046824b6b6..8dd3aafec4 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1325,6 +1325,16 @@ static int ocfs2_initialize_super(struct super_block *sb, } mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); + init_waitqueue_head(&osb->osb_wipe_event); + osb->osb_orphan_wipes = kcalloc(osb->max_slots, + sizeof(*osb->osb_orphan_wipes), + GFP_KERNEL); + if (!osb->osb_orphan_wipes) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + osb->s_feature_compat = le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); osb->s_feature_ro_compat = @@ -1638,6 +1648,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) if (osb->slot_info) ocfs2_free_slot_info(osb->slot_info); + kfree(osb->osb_orphan_wipes); /* FIXME * This belongs in journal shutdown, but because we have to * allocate osb->journal at the start of ocfs2_initalize_osb(), -- cgit v1.2.2 From 93cc9ac4555a9b95c78b2f5dfe536fe8196002a7 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Mon, 27 Feb 2006 16:53:05 -0800 Subject: ocfs2: Set .owner on masklog sysfs attributes. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/masklog.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index fd741cea57..636593bf4d 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -74,6 +74,7 @@ struct mlog_attribute { #define define_mask(_name) { \ .attr = { \ .name = #_name, \ + .owner = THIS_MODULE, \ .mode = S_IRUGO | S_IWUSR, \ }, \ .mask = ML_##_name, \ -- cgit v1.2.2 From 110ba90858a7f619ff26c6b9b43c27b3c0872335 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Tue, 28 Feb 2006 17:58:36 -0800 Subject: ocfs2: Respond to on-disk corruption in the extent map code. The extent map code has long noticed when the on-disk extent information is corrupt. However, so far it has only returned an error. We should take the filesystem read-only, as it is corrupt. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/extent_map.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index b6ba292e95..e6f207eeba 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -181,6 +181,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, ret = -EBADR; if (rec_end > OCFS2_I(inode)->ip_clusters) { mlog_errno(ret); + ocfs2_error(inode->i_sb, + "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n", + i, + le64_to_cpu(rec->e_blkno), + OCFS2_I(inode)->ip_blkno, + OCFS2_I(inode)->ip_clusters); goto out_free; } @@ -226,6 +232,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, ret = -EBADR; if (blkno) { mlog_errno(ret); + ocfs2_error(inode->i_sb, + "Multiple extents for (cpos = %u, clusters = %u) on inode %"MLFu64"; e_blkno %"MLFu64" and rec %d at e_blkno %"MLFu64"\n", + cpos, clusters, + OCFS2_I(inode)->ip_blkno, + blkno, i, + le64_to_cpu(rec->e_blkno)); goto out_free; } @@ -238,6 +250,10 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, */ ret = -EBADR; if (!blkno) { + ocfs2_error(inode->i_sb, + "No record found for (cpos = %u, clusters = %u) on inode %"MLFu64"\n", + cpos, clusters, + OCFS2_I(inode)->ip_blkno); mlog_errno(ret); goto out_free; } @@ -266,6 +282,20 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { rec = &el->l_recs[i]; + + if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > + OCFS2_I(inode)->ip_clusters) { + ret = -EBADR; + mlog_errno(ret); + ocfs2_error(inode->i_sb, + "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n", + i, + le64_to_cpu(rec->e_blkno), + OCFS2_I(inode)->ip_blkno, + OCFS2_I(inode)->ip_clusters); + return ret; + } + ret = ocfs2_extent_map_insert(inode, rec, le16_to_cpu(el->l_tree_depth)); if (ret) { @@ -526,6 +556,10 @@ static int ocfs2_extent_map_insert(struct inode *inode, OCFS2_I(inode)->ip_map.em_clusters) { ret = -EBADR; mlog_errno(ret); + ocfs2_error(inode->i_sb, + "Zero e_clusters on non-tail extent record at e_blkno %"MLFu64" on inode %"MLFu64"\n", + le64_to_cpu(rec->e_blkno), + OCFS2_I(inode)->ip_blkno); return ret; } @@ -588,12 +622,12 @@ static int ocfs2_extent_map_insert(struct inode *inode, * Existing record in the extent map: * * cpos = 10, len = 10 - * |---------| + * |---------| * * New Record: * * cpos = 10, len = 20 - * |------------------| + * |------------------| * * The passed record is the new on-disk record. The new_clusters value * is how many clusters were added to the file. If the append is a -- cgit v1.2.2 From b7668c72d2ae004363fb0588600bfa942e1b245c Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 28 Feb 2006 23:28:01 -0800 Subject: [PATCH] ocfs2: added source addr to bind() in o2net_start_connect() to prevent confusion when a virtual ip is created on the same interface Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/tcp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d22d4cf08d..0f60cc0d39 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1318,7 +1318,7 @@ static void o2net_start_connect(void *arg) { struct o2net_node *nn = arg; struct o2net_sock_container *sc = NULL; - struct o2nm_node *node = NULL; + struct o2nm_node *node = NULL, *mynode = NULL; struct socket *sock = NULL; struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; int ret = 0; @@ -1334,6 +1334,12 @@ static void o2net_start_connect(void *arg) goto out; } + mynode = o2nm_get_node_by_num(o2nm_this_node()); + if (mynode == NULL) { + ret = 0; + goto out; + } + spin_lock(&nn->nn_lock); /* see if we already have one pending or have given up */ if (nn->nn_sc || nn->nn_persistent_error) @@ -1361,12 +1367,14 @@ static void o2net_start_connect(void *arg) sock->sk->sk_allocation = GFP_ATOMIC; myaddr.sin_family = AF_INET; + myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address; myaddr.sin_port = (__force u16)htons(0); /* any port */ ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, sizeof(myaddr)); if (ret) { - mlog(0, "bind failed: %d\n", ret); + mlog(ML_ERROR, "bind failed with %d at address %u.%u.%u.%u\n", + ret, NIPQUAD(mynode->nd_ipv4_address)); goto out; } @@ -1407,6 +1415,8 @@ out: sc_put(sc); if (node) o2nm_node_put(node); + if (mynode) + o2nm_node_put(mynode); return; } -- cgit v1.2.2 From 81f2094a631df1ba275f4d4bd7ea5bacfd8dbcfc Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 28 Feb 2006 17:31:22 -0800 Subject: [PATCH] ocfs2: use hlists for lockres hash Switch from list_head to hlist_head. Make the size of the hash dependent upon the allocated area, rather than a constant. Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmcommon.h | 8 +++----- fs/ocfs2/dlm/dlmdebug.c | 12 +++++------- fs/ocfs2/dlm/dlmdomain.c | 39 +++++++++++++++++++-------------------- fs/ocfs2/dlm/dlmmaster.c | 4 ++-- fs/ocfs2/dlm/dlmrecovery.c | 23 ++++++++++++----------- 5 files changed, 41 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 23ceaa7127..9c77258374 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,9 +37,7 @@ #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_BITS 7 -#define DLM_HASH_SIZE (1 << DLM_HASH_BITS) -#define DLM_HASH_MASK (DLM_HASH_SIZE - 1) +#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) enum dlm_ast_type { DLM_AST = 0, @@ -87,7 +85,7 @@ enum dlm_ctxt_state { struct dlm_ctxt { struct list_head list; - struct list_head *resources; + struct hlist_head *lockres_hash; struct list_head dirty_list; struct list_head purge_list; struct list_head pending_asts; @@ -217,7 +215,7 @@ struct dlm_lock_resource { /* WARNING: Please see the comment in dlm_init_lockres before * adding fields here. */ - struct list_head list; + struct hlist_node hash_node; struct kref refs; /* please keep these next 3 in this order diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index f339fe2797..54f61b76ab 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -117,8 +117,8 @@ EXPORT_SYMBOL_GPL(dlm_print_one_lock); void dlm_dump_lock_resources(struct dlm_ctxt *dlm) { struct dlm_lock_resource *res; - struct list_head *iter; - struct list_head *bucket; + struct hlist_node *iter; + struct hlist_head *bucket; int i; mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n", @@ -129,12 +129,10 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm) } spin_lock(&dlm->spinlock); - for (i=0; iresources[i]); - list_for_each(iter, bucket) { - res = list_entry(iter, struct dlm_lock_resource, list); + for (i=0; ilockres_hash[i]); + hlist_for_each_entry(res, iter, bucket, hash_node) dlm_print_one_lock_resource(res); - } } spin_unlock(&dlm->spinlock); } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6ee3083738..8f3a9e3106 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -77,26 +77,26 @@ static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) { - list_del_init(&lockres->list); + hlist_del_init(&lockres->hash_node); dlm_lockres_put(lockres); } void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - struct list_head *bucket; + struct hlist_head *bucket; struct qstr *q; assert_spin_locked(&dlm->spinlock); q = &res->lockname; q->hash = full_name_hash(q->name, q->len); - bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]); + bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]); /* get a reference for our hashtable */ dlm_lockres_get(res); - list_add_tail(&res->list, bucket); + hlist_add_head(&res->hash_node, bucket); } struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, @@ -104,9 +104,9 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, unsigned int len) { unsigned int hash; - struct list_head *iter; + struct hlist_node *iter; struct dlm_lock_resource *tmpres=NULL; - struct list_head *bucket; + struct hlist_head *bucket; mlog_entry("%.*s\n", len, name); @@ -114,11 +114,11 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, hash = full_name_hash(name, len); - bucket = &(dlm->resources[hash & DLM_HASH_MASK]); + bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]); /* check for pre-existing lock */ - list_for_each(iter, bucket) { - tmpres = list_entry(iter, struct dlm_lock_resource, list); + hlist_for_each(iter, bucket) { + tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node); if (tmpres->lockname.len == len && memcmp(tmpres->lockname.name, name, len) == 0) { dlm_lockres_get(tmpres); @@ -193,8 +193,8 @@ static int dlm_wait_on_domain_helper(const char *domain) static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) { - if (dlm->resources) - free_page((unsigned long) dlm->resources); + if (dlm->lockres_hash) + free_page((unsigned long) dlm->lockres_hash); if (dlm->name) kfree(dlm->name); @@ -303,10 +303,10 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) mlog(0, "Migrating locks from domain %s\n", dlm->name); restart: spin_lock(&dlm->spinlock); - for (i=0; iresources[i])) { - res = list_entry(dlm->resources[i].next, - struct dlm_lock_resource, list); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + while (!hlist_empty(&dlm->lockres_hash[i])) { + res = hlist_entry(dlm->lockres_hash[i].first, + struct dlm_lock_resource, hash_node); /* need reference when manually grabbing lockres */ dlm_lockres_get(res); /* this should unhash the lockres @@ -1191,18 +1191,17 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL); - if (!dlm->resources) { + dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); + if (!dlm->lockres_hash) { mlog_errno(-ENOMEM); kfree(dlm->name); kfree(dlm); dlm = NULL; goto leave; } - memset(dlm->resources, 0, PAGE_SIZE); - for (i=0; iresources[i]); + for (i=0; ilockres_hash[i]); strcpy(dlm->name, domain); dlm->key = key; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 2e2e95e694..847dd3cc4c 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -564,7 +564,7 @@ static void dlm_lockres_release(struct kref *kref) /* By the time we're ready to blow this guy away, we shouldn't * be on any lists. */ - BUG_ON(!list_empty(&res->list)); + BUG_ON(!hlist_unhashed(&res->hash_node)); BUG_ON(!list_empty(&res->granted)); BUG_ON(!list_empty(&res->converting)); BUG_ON(!list_empty(&res->blocked)); @@ -605,7 +605,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, init_waitqueue_head(&res->wq); spin_lock_init(&res->spinlock); - INIT_LIST_HEAD(&res->list); + INIT_HLIST_NODE(&res->hash_node); INIT_LIST_HEAD(&res->granted); INIT_LIST_HEAD(&res->converting); INIT_LIST_HEAD(&res->blocked); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ed76bda1a5..1e232000f3 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1693,7 +1693,10 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, u8 dead_node, u8 new_master) { int i; - struct list_head *iter, *iter2, *bucket; + struct list_head *iter, *iter2; + struct hlist_node *hash_iter; + struct hlist_head *bucket; + struct dlm_lock_resource *res; mlog_entry_void(); @@ -1717,10 +1720,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, * for now we need to run the whole hash, clear * the RECOVERING state and set the owner * if necessary */ - for (i=0; iresources[i]); - list_for_each(iter, bucket) { - res = list_entry (iter, struct dlm_lock_resource, list); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = &(dlm->lockres_hash[i]); + hlist_for_each_entry(res, hash_iter, bucket, hash_node) { if (res->state & DLM_LOCK_RES_RECOVERING) { if (res->owner == dead_node) { mlog(0, "(this=%u) res %.*s owner=%u " @@ -1852,10 +1854,10 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) { - struct list_head *iter; + struct hlist_node *iter; struct dlm_lock_resource *res; int i; - struct list_head *bucket; + struct hlist_head *bucket; struct dlm_lock *lock; @@ -1876,10 +1878,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) * can be kicked again to see if any ASTs or BASTs * need to be fired as a result. */ - for (i=0; iresources[i]); - list_for_each(iter, bucket) { - res = list_entry (iter, struct dlm_lock_resource, list); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = &(dlm->lockres_hash[i]); + hlist_for_each_entry(res, iter, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, -- cgit v1.2.2 From 6a3124a3946c16159c3faf83e62ffdb5d1134b3a Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Thu, 2 Mar 2006 02:54:30 -0800 Subject: [PATCH] v9fs: fix atomic create open In order to assure atomic create+open v9fs stores the open fid produced by v9fs_vfs_create in the dentry, from where v9fs_file_open retrieves it and associates it with the open file. This patch modifies v9fs to use nameidata.intent.open values to do the atomic create+open. Signed-off-by: Latchesar Ionkov Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/fid.c | 73 ++------- fs/9p/fid.h | 5 +- fs/9p/v9fs_vfs.h | 1 + fs/9p/vfs_file.c | 106 +++++------- fs/9p/vfs_inode.c | 478 +++++++++++++++++++++++++++++++++++------------------- fs/9p/vfs_super.c | 12 +- 6 files changed, 379 insertions(+), 296 deletions(-) (limited to 'fs') diff --git a/fs/9p/fid.c b/fs/9p/fid.c index eda449778f..c27f546dd2 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -40,7 +40,7 @@ * */ -static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) +int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) { struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid, @@ -68,14 +68,11 @@ static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) * */ -struct v9fs_fid *v9fs_fid_create(struct dentry *dentry, - struct v9fs_session_info *v9ses, int fid, int create) +struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid) { struct v9fs_fid *new; - dprintk(DEBUG_9P, "fid create dentry %p, fid %d, create %d\n", - dentry, fid, create); - + dprintk(DEBUG_9P, "fid create fid %d\n", fid); new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); if (new == NULL) { dprintk(DEBUG_ERROR, "Out of Memory\n"); @@ -85,19 +82,13 @@ struct v9fs_fid *v9fs_fid_create(struct dentry *dentry, new->fid = fid; new->v9ses = v9ses; new->fidopen = 0; - new->fidcreate = create; new->fidclunked = 0; new->iounit = 0; new->rdir_pos = 0; new->rdir_fcall = NULL; + INIT_LIST_HEAD(&new->list); - if (v9fs_fid_insert(new, dentry) == 0) return new; - else { - dprintk(DEBUG_ERROR, "Problems inserting to dentry\n"); - kfree(new); - return NULL; - } } /** @@ -119,7 +110,7 @@ void v9fs_fid_destroy(struct v9fs_fid *fid) static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry) { int fidnum, cfidnum, err; - struct v9fs_fid *cfid; + struct v9fs_fid *cfid, *fid; struct dentry *cde; struct v9fs_session_info *v9ses; @@ -158,7 +149,16 @@ static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry) cde = cde->d_parent; } - return v9fs_fid_create(dentry, v9ses, fidnum, 0); + fid = v9fs_fid_create(v9ses, fidnum); + if (fid) { + err = v9fs_fid_insert(fid, dentry); + if (err < 0) { + kfree(fid); + goto clunk_fid; + } + } + + return fid; clunk_fid: v9fs_t_clunk(v9ses, fidnum); @@ -179,29 +179,12 @@ clunk_fid: struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) { struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; - struct v9fs_fid *current_fid = NULL; - struct v9fs_fid *temp = NULL; struct v9fs_fid *return_fid = NULL; dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry); - if (fid_list) { - list_for_each_entry_safe(current_fid, temp, fid_list, list) { - if (!current_fid->fidcreate) { - return_fid = current_fid; - break; - } - } - - if (!return_fid) - return_fid = current_fid; - } - - /* we are at the root but didn't match */ - if ((!return_fid) && (dentry->d_parent == dentry)) { - /* TODO: clone attach with new uid */ - return_fid = current_fid; - } + if (fid_list) + return_fid = list_entry(fid_list->next, struct v9fs_fid, list); if (!return_fid) { struct dentry *par = current->fs->pwd->d_parent; @@ -228,25 +211,3 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) return return_fid; } - -struct v9fs_fid *v9fs_fid_get_created(struct dentry *dentry) -{ - struct list_head *fid_list; - struct v9fs_fid *fid, *ftmp, *ret; - - dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry); - fid_list = (struct list_head *)dentry->d_fsdata; - ret = NULL; - if (fid_list) { - list_for_each_entry_safe(fid, ftmp, fid_list, list) { - if (fid->fidcreate && fid->pid == current->pid) { - list_del(&fid->list); - ret = fid; - break; - } - } - } - - dprintk(DEBUG_9P, "return %p\n", ret); - return ret; -} diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 84c673a44c..7ccf0d064e 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -33,7 +33,6 @@ struct v9fs_fid { u32 fid; unsigned char fidopen; /* set when fid is opened */ - unsigned char fidcreate; /* set when fid was just created */ unsigned char fidclunked; /* set when fid has already been clunked */ struct v9fs_qid qid; @@ -56,5 +55,5 @@ struct v9fs_fid { struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry); struct v9fs_fid *v9fs_fid_get_created(struct dentry *); void v9fs_fid_destroy(struct v9fs_fid *fid); -struct v9fs_fid *v9fs_fid_create(struct dentry *, - struct v9fs_session_info *v9ses, int fid, int create); +struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *, int fid); +int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry); diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 69cf2905dc..a759278aca 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -51,3 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat); void v9fs_dentry_release(struct dentry *); +int v9fs_uflags2omode(int uflags); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index c7e14d9172..de3a129698 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -53,94 +53,70 @@ int v9fs_file_open(struct inode *inode, struct file *file) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - struct v9fs_fid *v9fid, *fid; + struct v9fs_fid *vfid; struct v9fs_fcall *fcall = NULL; - int open_mode = 0; - unsigned int iounit = 0; - int newfid = -1; - long result = -1; + int omode; + int fid = V9FS_NOFID; + int err; dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file); - v9fid = v9fs_fid_get_created(file->f_dentry); - if (!v9fid) - v9fid = v9fs_fid_lookup(file->f_dentry); - - if (!v9fid) { + vfid = v9fs_fid_lookup(file->f_dentry); + if (!vfid) { dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n"); return -EBADF; } - if (!v9fid->fidcreate) { - fid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); - if (fid == NULL) { - dprintk(DEBUG_ERROR, "Out of Memory\n"); - return -ENOMEM; - } - - fid->fidopen = 0; - fid->fidcreate = 0; - fid->fidclunked = 0; - fid->iounit = 0; - fid->v9ses = v9ses; - - newfid = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { + fid = v9fs_get_idpool(&v9ses->fidpool); + if (fid < 0) { eprintk(KERN_WARNING, "newfid fails!\n"); return -ENOSPC; } - result = - v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL, NULL); - - if (result < 0) { - v9fs_put_idpool(newfid, &v9ses->fidpool); + err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, NULL); + if (err < 0) { dprintk(DEBUG_ERROR, "rewalk didn't work\n"); - return -EBADF; + goto put_fid; + } + + vfid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); + if (vfid == NULL) { + dprintk(DEBUG_ERROR, "out of memory\n"); + goto clunk_fid; } - fid->fid = newfid; - v9fid = fid; /* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */ /* translate open mode appropriately */ - open_mode = file->f_flags & 0x3; + omode = v9fs_uflags2omode(file->f_flags); + err = v9fs_t_open(v9ses, fid, omode, &fcall); + if (err < 0) { + PRINT_FCALL_ERROR("open failed", fcall); + goto destroy_vfid; + } - if (file->f_flags & O_EXCL) - open_mode |= V9FS_OEXCL; + file->private_data = vfid; + vfid->fid = fid; + vfid->fidopen = 1; + vfid->fidclunked = 0; + vfid->iounit = fcall->params.ropen.iounit; + vfid->rdir_pos = 0; + vfid->rdir_fcall = NULL; + vfid->filp = file; + kfree(fcall); - if (v9ses->extended) { - if (file->f_flags & O_TRUNC) - open_mode |= V9FS_OTRUNC; + return 0; - if (file->f_flags & O_APPEND) - open_mode |= V9FS_OAPPEND; - } +destroy_vfid: + v9fs_fid_destroy(vfid); - result = v9fs_t_open(v9ses, newfid, open_mode, &fcall); - if (result < 0) { - PRINT_FCALL_ERROR("open failed", fcall); - kfree(fcall); - return result; - } +clunk_fid: + v9fs_t_clunk(v9ses, fid); - iounit = fcall->params.ropen.iounit; +put_fid: + v9fs_put_idpool(fid, &v9ses->fidpool); kfree(fcall); - } else { - /* create case */ - newfid = v9fid->fid; - iounit = v9fid->iounit; - v9fid->fidcreate = 0; - } - - file->private_data = v9fid; - - v9fid->rdir_pos = 0; - v9fid->rdir_fcall = NULL; - v9fid->fidopen = 1; - v9fid->filp = file; - v9fid->iounit = iounit; - return 0; + return err; } /** @@ -289,9 +265,7 @@ v9fs_file_write(struct file *filp, const char __user * data, total += result; } while (count); - if(inode->i_mapping->nrpages) invalidate_inode_pages2(inode->i_mapping); - return total; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 63e5b0398e..dce729d428 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -125,6 +125,38 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) return res; } +int v9fs_uflags2omode(int uflags) +{ + int ret; + + ret = 0; + switch (uflags&3) { + default: + case O_RDONLY: + ret = V9FS_OREAD; + break; + + case O_WRONLY: + ret = V9FS_OWRITE; + break; + + case O_RDWR: + ret = V9FS_ORDWR; + break; + } + + if (uflags & O_EXCL) + ret |= V9FS_OEXCL; + + if (uflags & O_TRUNC) + ret |= V9FS_OTRUNC; + + if (uflags & O_APPEND) + ret |= V9FS_OAPPEND; + + return ret; +} + /** * v9fs_blank_wstat - helper function to setup a 9P stat structure * @v9ses: 9P session info (for determining extended mode) @@ -163,7 +195,7 @@ v9fs_blank_wstat(struct v9fs_wstat *wstat) struct inode *v9fs_get_inode(struct super_block *sb, int mode) { - struct inode *inode = NULL; + struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); @@ -222,171 +254,135 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) return inode; } -/** - * v9fs_create - helper function to create files and directories - * @dir: directory inode file is being created in - * @file_dentry: dentry file is being created in - * @perm: permissions file is being created with - * @open_mode: resulting open mode for file - * - */ - static int -v9fs_create(struct inode *dir, - struct dentry *file_dentry, - unsigned int perm, unsigned int open_mode) +v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, + u32 perm, u8 mode, u32 *fidp, struct v9fs_qid *qid, u32 *iounit) { - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct super_block *sb = dir->i_sb; - struct v9fs_fid *dirfid = - v9fs_fid_lookup(file_dentry->d_parent); - struct v9fs_fid *fid = NULL; - struct inode *file_inode = NULL; - struct v9fs_fcall *fcall = NULL; - struct v9fs_qid qid; - int dirfidnum = -1; - long newfid = -1; - int result = 0; - unsigned int iounit = 0; - int wfidno = -1; + u32 fid; int err; + struct v9fs_fcall *fcall; - perm = unixmode2p9mode(v9ses, perm); - - dprintk(DEBUG_VFS, "dir: %p dentry: %p perm: %o mode: %o\n", dir, - file_dentry, perm, open_mode); - - if (!dirfid) - return -EBADF; - - dirfidnum = dirfid->fid; - if (dirfidnum < 0) { - dprintk(DEBUG_ERROR, "No fid for the directory #%lu\n", - dir->i_ino); - return -EBADF; - } - - if (file_dentry->d_inode) { - dprintk(DEBUG_ERROR, - "Odd. There is an inode for dir %lu, name :%s:\n", - dir->i_ino, file_dentry->d_name.name); - return -EEXIST; - } - - newfid = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { + fid = v9fs_get_idpool(&v9ses->fidpool); + if (fid < 0) { eprintk(KERN_WARNING, "no free fids available\n"); - return -ENOSPC; + err = -ENOSPC; + goto error; } - result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall); - if (result < 0) { + err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall); + if (err < 0) { PRINT_FCALL_ERROR("clone error", fcall); - v9fs_put_idpool(newfid, &v9ses->fidpool); - newfid = -1; - goto CleanUpFid; + goto error; } - kfree(fcall); - fcall = NULL; - result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name, - perm, open_mode, &fcall); - if (result < 0) { + err = v9fs_t_create(v9ses, fid, name, perm, mode, &fcall); + if (err < 0) { PRINT_FCALL_ERROR("create fails", fcall); - goto CleanUpFid; + goto error; } - iounit = fcall->params.rcreate.iounit; - qid = fcall->params.rcreate.qid; + if (iounit) + *iounit = fcall->params.rcreate.iounit; + + if (qid) + *qid = fcall->params.rcreate.qid; + + if (fidp) + *fidp = fid; + kfree(fcall); - fcall = NULL; + return 0; - if (!(perm&V9FS_DMDIR)) { - fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); - dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); - if (!fid) { - result = -ENOMEM; - goto CleanUpFid; - } +error: + if (fid >= 0) + v9fs_put_idpool(fid, &v9ses->fidpool); - fid->qid = qid; - fid->iounit = iounit; - } else { - err = v9fs_t_clunk(v9ses, newfid); - newfid = -1; - if (err < 0) - dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err); - } + kfree(fcall); + return err; +} + +static struct v9fs_fid* +v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) +{ + int err; + u32 nfid; + struct v9fs_fid *ret; + struct v9fs_fcall *fcall; - /* walk to the newly created file and put the fid in the dentry */ - wfidno = v9fs_get_idpool(&v9ses->fidpool); - if (wfidno < 0) { + nfid = v9fs_get_idpool(&v9ses->fidpool); + if (nfid < 0) { eprintk(KERN_WARNING, "no free fids available\n"); - return -ENOSPC; + err = -ENOSPC; + goto error; } - result = v9fs_t_walk(v9ses, dirfidnum, wfidno, - (char *) file_dentry->d_name.name, &fcall); - if (result < 0) { - PRINT_FCALL_ERROR("clone error", fcall); - v9fs_put_idpool(wfidno, &v9ses->fidpool); - wfidno = -1; - goto CleanUpFid; + err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name, + &fcall); + + if (err < 0) { + PRINT_FCALL_ERROR("walk error", fcall); + v9fs_put_idpool(nfid, &v9ses->fidpool); + goto error; } + kfree(fcall); fcall = NULL; + ret = v9fs_fid_create(v9ses, nfid); + if (!ret) { + err = -ENOMEM; + goto clunk_fid; + } - if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) { - v9fs_put_idpool(wfidno, &v9ses->fidpool); - - goto CleanUpFid; + err = v9fs_fid_insert(ret, dentry); + if (err < 0) { + v9fs_fid_destroy(ret); + goto clunk_fid; } - if ((perm & V9FS_DMSYMLINK) || (perm & V9FS_DMLINK) || - (perm & V9FS_DMNAMEDPIPE) || (perm & V9FS_DMSOCKET) || - (perm & V9FS_DMDEVICE)) - return 0; + return ret; - result = v9fs_t_stat(v9ses, wfidno, &fcall); - if (result < 0) { - PRINT_FCALL_ERROR("stat error", fcall); - goto CleanUpFid; - } +clunk_fid: + v9fs_t_clunk(v9ses, nfid); + +error: + kfree(fcall); + return ERR_PTR(err); +} +struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, u32 fid, + struct super_block *sb) +{ + int err, umode; + struct inode *ret; + struct v9fs_fcall *fcall; - file_inode = v9fs_get_inode(sb, - p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode)); + ret = NULL; + err = v9fs_t_stat(v9ses, fid, &fcall); + if (err) { + PRINT_FCALL_ERROR("stat error", fcall); + goto error; + } - if ((!file_inode) || IS_ERR(file_inode)) { - dprintk(DEBUG_ERROR, "create inode failed\n"); - result = -EBADF; - goto CleanUpFid; + umode = p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode); + ret = v9fs_get_inode(sb, umode); + if (IS_ERR(ret)) { + err = PTR_ERR(ret); + ret = NULL; + goto error; } - v9fs_stat2inode(&fcall->params.rstat.stat, file_inode, sb); + v9fs_stat2inode(&fcall->params.rstat.stat, ret, sb); kfree(fcall); - fcall = NULL; - file_dentry->d_op = &v9fs_dentry_operations; - d_instantiate(file_dentry, file_inode); - - return 0; + return ret; - CleanUpFid: +error: kfree(fcall); - fcall = NULL; + if (ret) + iput(ret); - if (newfid >= 0) { - err = v9fs_t_clunk(v9ses, newfid); - if (err < 0) - dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); - } - if (wfidno >= 0) { - err = v9fs_t_clunk(v9ses, wfidno); - if (err < 0) - dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); - } - return result; + return ERR_PTR(err); } /** @@ -440,20 +436,97 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) return result; } +static int +v9fs_open_created(struct inode *inode, struct file *file) +{ + return 0; +} + /** * v9fs_vfs_create - VFS hook to create files * @inode: directory inode that is being deleted * @dentry: dentry that is being deleted - * @perm: create permissions + * @mode: create permissions * @nd: path information * */ static int -v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm, +v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - return v9fs_create(inode, dentry, perm, O_RDWR); + int err; + u32 fid, perm, iounit; + int flags; + struct v9fs_session_info *v9ses; + struct v9fs_fid *dfid, *vfid, *ffid; + struct inode *inode; + struct v9fs_qid qid; + struct file *filp; + + inode = NULL; + vfid = NULL; + v9ses = v9fs_inode2v9ses(dir); + dfid = v9fs_fid_lookup(dentry->d_parent); + perm = unixmode2p9mode(v9ses, mode); + + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else + flags = O_RDWR; + + err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, + perm, v9fs_uflags2omode(flags), &fid, &qid, &iounit); + + if (err) + goto error; + + vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); + if (IS_ERR(vfid)) { + err = PTR_ERR(vfid); + vfid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto error; + } + + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + + if (nd && nd->flags & LOOKUP_OPEN) { + ffid = v9fs_fid_create(v9ses, fid); + if (!ffid) + return -ENOMEM; + + filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); + if (IS_ERR(filp)) { + v9fs_fid_destroy(ffid); + return PTR_ERR(filp); + } + + ffid->rdir_pos = 0; + ffid->rdir_fcall = NULL; + ffid->fidopen = 1; + ffid->iounit = iounit; + ffid->filp = filp; + filp->private_data = ffid; + } + + return 0; + +error: + if (vfid) + v9fs_fid_destroy(vfid); + + if (inode) + iput(inode); + + return err; } /** @@ -464,9 +537,57 @@ v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm, * */ -static int v9fs_vfs_mkdir(struct inode *inode, struct dentry *dentry, int mode) +static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - return v9fs_create(inode, dentry, mode | S_IFDIR, O_RDONLY); + int err; + u32 fid, perm; + struct v9fs_session_info *v9ses; + struct v9fs_fid *dfid, *vfid; + struct inode *inode; + + inode = NULL; + vfid = NULL; + v9ses = v9fs_inode2v9ses(dir); + dfid = v9fs_fid_lookup(dentry->d_parent); + perm = unixmode2p9mode(v9ses, mode | S_IFDIR); + + err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, + perm, V9FS_OREAD, &fid, NULL, NULL); + + if (err) { + dprintk(DEBUG_ERROR, "create error %d\n", err); + goto error; + } + + err = v9fs_t_clunk(v9ses, fid); + if (err) { + dprintk(DEBUG_ERROR, "clunk error %d\n", err); + goto error; + } + + vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); + if (IS_ERR(vfid)) { + err = PTR_ERR(vfid); + vfid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto error; + } + + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + return 0; + +error: + if (vfid) + v9fs_fid_destroy(vfid); + + return err; } /** @@ -516,9 +637,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOSPC); } - result = - v9fs_t_walk(v9ses, dirfidnum, newfid, (char *)dentry->d_name.name, - NULL); + result = v9fs_t_walk(v9ses, dirfidnum, newfid, + (char *)dentry->d_name.name, NULL); if (result < 0) { v9fs_put_idpool(newfid, &v9ses->fidpool); if (result == -ENOENT) { @@ -551,13 +671,17 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); - fid = v9fs_fid_create(dentry, v9ses, newfid, 0); + fid = v9fs_fid_create(v9ses, newfid); if (fid == NULL) { dprintk(DEBUG_ERROR, "couldn't insert\n"); result = -ENOMEM; goto FreeFcall; } + result = v9fs_fid_insert(fid, dentry); + if (result < 0) + goto FreeFcall; + fid->qid = fcall->params.rstat.stat.qid; dentry->d_op = &v9fs_dentry_operations; @@ -886,8 +1010,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) } /* copy extension buffer into buffer */ - if (fcall->params.rstat.stat.extension.len+1 < buflen) - buflen = fcall->params.rstat.stat.extension.len + 1; + if (fcall->params.rstat.stat.extension.len < buflen) + buflen = fcall->params.rstat.stat.extension.len; memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1); buffer[buflen-1] = 0; @@ -951,7 +1075,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) if (!link) link = ERR_PTR(-ENOMEM); else { - len = v9fs_readlink(dentry, link, PATH_MAX); + len = v9fs_readlink(dentry, link, strlen(link)); if (len < 0) { __putname(link); @@ -983,53 +1107,75 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, int mode, const char *extension) { - int err, retval; + int err; + u32 fid, perm; struct v9fs_session_info *v9ses; + struct v9fs_fid *dfid, *vfid; + struct inode *inode; struct v9fs_fcall *fcall; - struct v9fs_fid *fid; struct v9fs_wstat wstat; - v9ses = v9fs_inode2v9ses(dir); - retval = -EPERM; fcall = NULL; + inode = NULL; + vfid = NULL; + v9ses = v9fs_inode2v9ses(dir); + dfid = v9fs_fid_lookup(dentry->d_parent); + perm = unixmode2p9mode(v9ses, mode); if (!v9ses->extended) { dprintk(DEBUG_ERROR, "not extended\n"); - goto free_mem; + return -EPERM; } - /* issue a create */ - retval = v9fs_create(dir, dentry, mode, 0); - if (retval != 0) - goto free_mem; + err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, + perm, V9FS_OREAD, &fid, NULL, NULL); - fid = v9fs_fid_get_created(dentry); - if (!fid) { - dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n"); - goto free_mem; + if (err) + goto error; + + err = v9fs_t_clunk(v9ses, fid); + if (err) + goto error; + + vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); + if (IS_ERR(vfid)) { + err = PTR_ERR(vfid); + vfid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto error; } /* issue a Twstat */ v9fs_blank_wstat(&wstat); wstat.muid = v9ses->name; wstat.extension = (char *) extension; - retval = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); - if (retval < 0) { - PRINT_FCALL_ERROR("wstat error", fcall); - goto free_mem; - } - - err = v9fs_t_clunk(v9ses, fid->fid); + err = v9fs_t_wstat(v9ses, vfid->fid, &wstat, &fcall); if (err < 0) { - dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); - goto free_mem; + PRINT_FCALL_ERROR("wstat error", fcall); + goto error; } - d_drop(dentry); /* FID - will this also clunk? */ + kfree(fcall); + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + return 0; -free_mem: +error: kfree(fcall); - return retval; + if (vfid) + v9fs_fid_destroy(vfid); + + if (inode) + iput(inode); + + return err; + } /** diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 2c4fa75be0..0c85872be5 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -146,7 +146,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type inode->i_gid = gid; root = d_alloc_root(inode); - if (!root) { retval = -ENOMEM; goto put_back_sb; @@ -157,24 +156,27 @@ static struct super_block *v9fs_get_sb(struct file_system_type stat_result = v9fs_t_stat(v9ses, newfid, &fcall); if (stat_result < 0) { dprintk(DEBUG_ERROR, "stat error\n"); + kfree(fcall); v9fs_t_clunk(v9ses, newfid); - v9fs_put_idpool(newfid, &v9ses->fidpool); } else { /* Setup the Root Inode */ - root_fid = v9fs_fid_create(root, v9ses, newfid, 0); + kfree(fcall); + root_fid = v9fs_fid_create(v9ses, newfid); if (root_fid == NULL) { retval = -ENOMEM; goto put_back_sb; } + retval = v9fs_fid_insert(root_fid, root); + if (retval < 0) + goto put_back_sb; + root_fid->qid = fcall->params.rstat.stat.qid; root->d_inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb); } - kfree(fcall); - if (stat_result < 0) { retval = stat_result; goto put_back_sb; -- cgit v1.2.2 From 74b8054c730785cd9db093e48f53337e521b6270 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Thu, 2 Mar 2006 02:54:32 -0800 Subject: [PATCH] v9fs: fix bug in atomic create open fix Lucho's atomic create+open fix had a bug in the super block initialization causing all mounts to fail. He was freeing an fcall too early. This patch fixes that oversight. Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 0c85872be5..cdf787ee08 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -160,7 +160,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type v9fs_t_clunk(v9ses, newfid); } else { /* Setup the Root Inode */ - kfree(fcall); root_fid = v9fs_fid_create(v9ses, newfid); if (root_fid == NULL) { retval = -ENOMEM; @@ -168,8 +167,10 @@ static struct super_block *v9fs_get_sb(struct file_system_type } retval = v9fs_fid_insert(root_fid, root); - if (retval < 0) + if (retval < 0) { + kfree(fcall); goto put_back_sb; + } root_fid->qid = fcall->params.rstat.stat.qid; root->d_inode->i_ino = @@ -177,6 +178,8 @@ static struct super_block *v9fs_get_sb(struct file_system_type v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb); } + kfree(fcall); + if (stat_result < 0) { retval = stat_result; goto put_back_sb; -- cgit v1.2.2 From 46f6dac259717551916405ee3388de89fb152bca Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Thu, 2 Mar 2006 02:54:33 -0800 Subject: [PATCH] v9fs: simplify fid mapping v9fs has been plagued by an over-complicated approach trying to map Linux dentry semantics to Plan 9 fid semantics. Our previous approach called for aggressive flushing of the dcache resulting in several problems (including wierd cwd behavior when running /bin/pwd). This patch dramatically simplifies our handling of this fid management. Fids will not be clunked as promptly, but the new approach is more functionally correct. We now clunk un-open fids only when their dentry ref_count reaches 0 (and d_delete is called). Another simplification is we no longer seek to match fids to the process-id or uid of the action initiator. The uid-matching will need to be revisited when we fix the security model. Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/fid.c | 94 ++++-------------------------------------------------- fs/9p/fid.h | 1 - fs/9p/v9fs.c | 1 + fs/9p/vfs_dentry.c | 45 +++++--------------------- 4 files changed, 15 insertions(+), 126 deletions(-) (limited to 'fs') diff --git a/fs/9p/fid.c b/fs/9p/fid.c index c27f546dd2..c4d13bf904 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -1,7 +1,7 @@ /* * V9FS FID Management * - * Copyright (C) 2005 by Eric Van Hensbergen + * Copyright (C) 2005, 2006 by Eric Van Hensbergen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -57,7 +57,6 @@ int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) } fid->uid = current->uid; - fid->pid = current->pid; list_add(&fid->list, fid_list); return 0; } @@ -88,7 +87,7 @@ struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid) new->rdir_fcall = NULL; INIT_LIST_HEAD(&new->list); - return new; + return new; } /** @@ -103,76 +102,14 @@ void v9fs_fid_destroy(struct v9fs_fid *fid) kfree(fid); } -/** - * v9fs_fid_walk_up - walks from the process current directory - * up to the specified dentry. - */ -static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry) -{ - int fidnum, cfidnum, err; - struct v9fs_fid *cfid, *fid; - struct dentry *cde; - struct v9fs_session_info *v9ses; - - v9ses = v9fs_inode2v9ses(current->fs->pwd->d_inode); - cfid = v9fs_fid_lookup(current->fs->pwd); - if (cfid == NULL) { - dprintk(DEBUG_ERROR, "process cwd doesn't have a fid\n"); - return ERR_PTR(-ENOENT); - } - - cfidnum = cfid->fid; - cde = current->fs->pwd; - /* TODO: take advantage of multiwalk */ - - fidnum = v9fs_get_idpool(&v9ses->fidpool); - if (fidnum < 0) { - dprintk(DEBUG_ERROR, "could not get a new fid num\n"); - err = -ENOENT; - goto clunk_fid; - } - - while (cde != dentry) { - if (cde == cde->d_parent) { - dprintk(DEBUG_ERROR, "can't find dentry\n"); - err = -ENOENT; - goto clunk_fid; - } - - err = v9fs_t_walk(v9ses, cfidnum, fidnum, "..", NULL); - if (err < 0) { - dprintk(DEBUG_ERROR, "problem walking to parent\n"); - goto clunk_fid; - } - - cfidnum = fidnum; - cde = cde->d_parent; - } - - fid = v9fs_fid_create(v9ses, fidnum); - if (fid) { - err = v9fs_fid_insert(fid, dentry); - if (err < 0) { - kfree(fid); - goto clunk_fid; - } - } - - return fid; - -clunk_fid: - v9fs_t_clunk(v9ses, fidnum); - return ERR_PTR(err); -} - /** * v9fs_fid_lookup - retrieve the right fid from a particular dentry * @dentry: dentry to look for fid in * @type: intent of lookup (operation or traversal) * - * search list of fids associated with a dentry for a fid with a matching - * thread id or uid. If that fails, look up the dentry's parents to see if you - * can find a matching fid. + * find a fid in the dentry + * + * TODO: only match fids that have the same uid as current user * */ @@ -187,26 +124,7 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) return_fid = list_entry(fid_list->next, struct v9fs_fid, list); if (!return_fid) { - struct dentry *par = current->fs->pwd->d_parent; - int count = 1; - while (par != NULL) { - if (par == dentry) - break; - count++; - if (par == par->d_parent) { - dprintk(DEBUG_ERROR, - "got to root without finding dentry\n"); - break; - } - par = par->d_parent; - } - -/* XXX - there may be some duplication we can get rid of */ - if (par == dentry) { - return_fid = v9fs_fid_walk_up(dentry); - if (IS_ERR(return_fid)) - return_fid = NULL; - } + dprintk(DEBUG_ERROR, "Couldn't find a fid in dentry\n"); } return return_fid; diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 7ccf0d064e..1fc2dd08d7 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -44,7 +44,6 @@ struct v9fs_fid { struct v9fs_fcall *rdir_fcall; /* management stuff */ - pid_t pid; /* thread associated with this fid */ uid_t uid; /* user associated with this fid */ /* private data */ diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index ef33865491..61352491ba 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -397,6 +397,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, } if (v9ses->afid != ~0) { + dprintk(DEBUG_ERROR, "afid not equal to ~0\n"); if (v9fs_t_clunk(v9ses, v9ses->afid)) dprintk(DEBUG_ERROR, "clunk failed\n"); } diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 2dd806dac9..12c9cc926b 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -43,47 +43,18 @@ #include "fid.h" /** - * v9fs_dentry_validate - VFS dcache hook to validate cache - * @dentry: dentry that is being validated - * @nd: path data + * v9fs_dentry_delete - called when dentry refcount equals 0 + * @dentry: dentry in question * - * dcache really shouldn't be used for 9P2000 as at all due to - * potential attached semantics to directory traversal (walk). - * - * FUTURE: look into how to use dcache to allow multi-stage - * walks in Plan 9 & potential for better dcache operation which - * would remain valid for Plan 9 semantics. Older versions - * had validation via stat for those interested. However, since - * stat has the same approximate overhead as walk there really - * is no difference. The only improvement would be from a - * time-decay cache like NFS has and that undermines the - * synchronous nature of 9P2000. + * By returning 1 here we should remove cacheing of unused + * dentry components. * */ -static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd) +int v9fs_dentry_delete(struct dentry *dentry) { - struct dentry *dc = current->fs->pwd; - - dprintk(DEBUG_VFS, "dentry: %s (%p)\n", dentry->d_iname, dentry); - if (v9fs_fid_lookup(dentry)) { - dprintk(DEBUG_VFS, "VALID\n"); - return 1; - } - - while (dc != NULL) { - if (dc == dentry) { - dprintk(DEBUG_VFS, "VALID\n"); - return 1; - } - if (dc == dc->d_parent) - break; - - dc = dc->d_parent; - } - - dprintk(DEBUG_VFS, "INVALID\n"); - return 0; + dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + return 1; } /** @@ -118,6 +89,6 @@ void v9fs_dentry_release(struct dentry *dentry) } struct dentry_operations v9fs_dentry_operations = { - .d_revalidate = v9fs_dentry_validate, + .d_delete = v9fs_dentry_delete, .d_release = v9fs_dentry_release, }; -- cgit v1.2.2 From c499ec24c31edf270e777a868ffd0daddcfe7ebd Mon Sep 17 00:00:00 2001 From: "Vladimir V. Saveliev" Date: Thu, 2 Mar 2006 02:54:39 -0800 Subject: [PATCH] reiserfs: do not check if unsigned < 0 This patch fixes bugs in reiserfs where unsigned integers were checked whether they are less then 0. Signed-off-by: Vladimir V. Saveliev Cc: Neil Brown Signed-off-by: Hans Reiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 14 +++++++------- fs/reiserfs/inode.c | 8 ++------ fs/reiserfs/journal.c | 3 +-- 3 files changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index f3473176c8..be12879bb1 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1464,13 +1464,11 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t partially overwritten pages, if needed. And lock the pages, so that nobody else can access these until we are done. We get number of actual blocks needed as a result. */ - blocks_to_allocate = - reiserfs_prepare_file_region_for_write(inode, pos, - num_pages, - write_bytes, - prepared_pages); - if (blocks_to_allocate < 0) { - res = blocks_to_allocate; + res = reiserfs_prepare_file_region_for_write(inode, pos, + num_pages, + write_bytes, + prepared_pages); + if (res < 0) { reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - @@ -1478,6 +1476,8 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t break; } + blocks_to_allocate = res; + /* First we correct our estimate of how many blocks we need */ reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index b33d67bba2..d60f6238c6 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -627,11 +627,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_write_lock(inode->i_sb); version = get_inode_item_key_version(inode); - if (block < 0) { - reiserfs_write_unlock(inode->i_sb); - return -EIO; - } - if (!file_capable(inode, block)) { reiserfs_write_unlock(inode->i_sb); return -EFBIG; @@ -934,12 +929,13 @@ int reiserfs_get_block(struct inode *inode, sector_t block, //pos_in_item * inode->i_sb->s_blocksize, TYPE_INDIRECT, 3); // key type is unimportant + RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), + "green-805: invalid offset"); blocks_needed = 1 + ((cpu_key_k_offset(&key) - cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> s_blocksize_bits); - RFALSE(blocks_needed < 0, "green-805: invalid offset"); if (blocks_needed == 1) { un = &unf_single; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index b7a179560a..5a9d2722fa 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2319,8 +2319,7 @@ static int journal_read(struct super_block *p_s_sb) return 1; } jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); - if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && - le32_to_cpu(jh->j_first_unflushed_offset) < + if (le32_to_cpu(jh->j_first_unflushed_offset) < SB_ONDISK_JOURNAL_SIZE(p_s_sb) && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { oldest_start = -- cgit v1.2.2 From 3af1efe8a301f5b1c813f5f761cb1e10d6175605 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 2 Mar 2006 13:25:26 -0500 Subject: [PATCH] reiserfs: fix unaligned bitmap usage The bitmaps associated with generation numbers for directory entries are declared as an array of ints. On some platforms, this causes alignment exceptions. The following patch uses the standard bitmap declaration macros to declare the bitmaps, fixing the problem. Originally from Takashi Iwai. Signed-off-by: Takashi Iwai Acked-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index c8123308e0..284f7852de 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -247,7 +247,7 @@ static int linear_search_in_dir_item(struct cpu_key *key, /* mark, that this generation number is used */ if (de->de_gen_number_bit_string) set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), - (unsigned long *)de->de_gen_number_bit_string); + de->de_gen_number_bit_string); // calculate pointer to name and namelen de->de_entry_num = i; @@ -431,7 +431,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, struct reiserfs_de_head *deh; INITIALIZE_PATH(path); struct reiserfs_dir_entry de; - int bit_string[MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1]; + DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); int gen_number; char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc if we create file with short name */ @@ -486,7 +486,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, /* find the proper place for the new entry */ memset(bit_string, 0, sizeof(bit_string)); - de.de_gen_number_bit_string = (char *)bit_string; + de.de_gen_number_bit_string = bit_string; retval = reiserfs_find_entry(dir, name, namelen, &path, &de); if (retval != NAME_NOT_FOUND) { if (buffer != small_buf) @@ -508,7 +508,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, } gen_number = - find_first_zero_bit((unsigned long *)bit_string, + find_first_zero_bit(bit_string, MAX_GENERATION_NUMBER + 1); if (gen_number > MAX_GENERATION_NUMBER) { /* there is no free generation number */ -- cgit v1.2.2 From e77e6f3be93763ef88ccbaa9e0ebda5360d92f7c Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 5 Mar 2006 03:39:55 +0000 Subject: [CIFS] Always match oplock break (cache notification) to the right tcp session when multiply mounted. Fixes slow response when cifs client is mounted to shares on multiple servers and oplock break occurs (usually due to attempt to multiply open a file). When treeids on mutiple mounted shares match and we find the wrong match first, we searched for the wrong cached files to send oplock break response for which usually meant that no matching file was found and thus the server would have to timeout the notification. Oplock break timeout is about 20 seconds on some servers so this could cause significantly slower performance on file open calls in a few cases (in particular when multiple shares are mounted from multiple servers, tree ids match, and we have a cached file which is later opened multiple times). This was the most important of the bugs that was found and fixed at Connectathon (interoperability testing event) this week. Acked-by: Shaggy (shaggy@austin.ibm.com) Signed-off-by: Steve French (sfrench@us.ibm.com) --- fs/cifs/cifsproto.h | 2 +- fs/cifs/connect.c | 2 +- fs/cifs/misc.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3c03aadaff..7b25463d3c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -52,7 +52,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *, int * /* type of buf returned */ , const int long_op); extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid); extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length); -extern int is_valid_oplock_break(struct smb_hdr *smb); +extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); extern int is_size_safe_to_change(struct cifsInodeInfo *); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); extern unsigned int smbCalcSize(struct smb_hdr *ptr); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ef5ae6f93c..2a0c1f4ca0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -630,7 +630,7 @@ multi_t2_fnd: smallbuf = NULL; } wake_up_process(task_to_wake); - } else if ((is_valid_oplock_break(smb_buffer) == FALSE) + } else if ((is_valid_oplock_break(smb_buffer, server) == FALSE) && (isMultiRsp == FALSE)) { cERROR(1, ("No task to wake, unknown frame rcvd!")); cifs_dump_mem("Received Data is: ",(char *)smb_buffer, diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 812c6bb0fe..432ba15e2c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -475,7 +475,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length) return 0; } int -is_valid_oplock_break(struct smb_hdr *buf) +is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) { struct smb_com_lock_req * pSMB = (struct smb_com_lock_req *)buf; struct list_head *tmp; @@ -535,7 +535,7 @@ is_valid_oplock_break(struct smb_hdr *buf) read_lock(&GlobalSMBSeslock); list_for_each(tmp, &GlobalTreeConnectionList) { tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); - if (tcon->tid == buf->Tid) { + if ((tcon->tid == buf->Tid) && (srv == tcon->ses->server)) { cifs_stats_inc(&tcon->num_oplock_brks); list_for_each(tmp1,&tcon->openFileList){ netfile = list_entry(tmp1,struct cifsFileInfo, -- cgit v1.2.2 From ff3aea0e68bfd46120ce2d08bc1f8240fa2bd36a Mon Sep 17 00:00:00 2001 From: Dave Johnson Date: Mon, 6 Mar 2006 15:42:36 -0800 Subject: [PATCH] cramfs mounts provide corrupted content since 2.6.15 Fix handling of cramfs images created by util-linux containing empty regular files. Images created by cramfstools 1.x were ok. Fill out inode contents in cramfs_iget5_set() instead of get_cramfs_inode() to prevent issues if cramfs_iget5_test() is called with I_LOCK|I_NEW still set. Signed-off-by: Dave Johnson Cc: Olaf Hering Cc: Chris Mason Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cramfs/inode.c | 60 +++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 7fe85415ae..8ad52f5bf2 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -36,7 +36,7 @@ static DECLARE_MUTEX(read_mutex); /* These two macros may change in future, to provide better st_ino semantics. */ -#define CRAMINO(x) ((x)->offset?(x)->offset<<2:1) +#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1) #define OFFSET(x) ((x)->i_ino) @@ -66,8 +66,36 @@ static int cramfs_iget5_test(struct inode *inode, void *opaque) static int cramfs_iget5_set(struct inode *inode, void *opaque) { + static struct timespec zerotime; struct cramfs_inode *cramfs_inode = opaque; + inode->i_mode = cramfs_inode->mode; + inode->i_uid = cramfs_inode->uid; + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_gid = cramfs_inode->gid; + /* Struct copy intentional */ + inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; inode->i_ino = CRAMINO(cramfs_inode); + /* inode->i_nlink is left 1 - arguably wrong for directories, + but it's the best we can do without reading the directory + contents. 1 yields the right result in GNU find, even + without -noleaf option. */ + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + } else { + inode->i_size = 0; + inode->i_blocks = 0; + init_special_inode(inode, inode->i_mode, + old_decode_dev(cramfs_inode->size)); + } return 0; } @@ -77,37 +105,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), cramfs_iget5_test, cramfs_iget5_set, cramfs_inode); - static struct timespec zerotime; - if (inode && (inode->i_state & I_NEW)) { - inode->i_mode = cramfs_inode->mode; - inode->i_uid = cramfs_inode->uid; - inode->i_size = cramfs_inode->size; - inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; - inode->i_blksize = PAGE_CACHE_SIZE; - inode->i_gid = cramfs_inode->gid; - /* Struct copy intentional */ - inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; - inode->i_ino = CRAMINO(cramfs_inode); - /* inode->i_nlink is left 1 - arguably wrong for directories, - but it's the best we can do without reading the directory - contents. 1 yields the right result in GNU find, even - without -noleaf option. */ - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &generic_ro_fops; - inode->i_data.a_ops = &cramfs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &cramfs_dir_inode_operations; - inode->i_fop = &cramfs_directory_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_data.a_ops = &cramfs_aops; - } else { - inode->i_size = 0; - inode->i_blocks = 0; - init_special_inode(inode, inode->i_mode, - old_decode_dev(cramfs_inode->size)); - } unlock_new_inode(inode); } return inode; -- cgit v1.2.2 From ecbd3a632c8198744655b769c5c2b5a1455c1fba Mon Sep 17 00:00:00 2001 From: Peter Staubach Date: Mon, 6 Mar 2006 15:42:56 -0800 Subject: [PATCH] ramfs needs to update directory m/ctime on symlink ramfs neglects to update the directory mtime and ctime fields when creating a new symbolic link. Ramfs was modified in 2.6.15 to update these fields when other types of entries are created. The symlink support is separate from that other support, so that change did not cover quite all of the possibilities. All of the directory content manipulation entry points now seem to be covered with respect to these time field updates. Signed-off-by: Peter Staubach Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index cde5d48994..14bd2246fb 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -137,6 +137,7 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * inode->i_gid = dir->i_gid; d_instantiate(dentry, inode); dget(dentry); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; } else iput(inode); } -- cgit v1.2.2 From 5ddfae16bddb12104fff63c36fb5901f1a3729fc Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 6 Mar 2006 15:42:57 -0800 Subject: [PATCH] smaps: hugepages fix smaps doesn't have a hugepage pagetable walker. Skip walking hugepage vmas. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0eaad41f46..35787f907f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -289,7 +289,7 @@ static int show_smap(struct seq_file *m, void *v) struct mem_size_stats mss; memset(&mss, 0, sizeof mss); - if (vma->vm_mm) + if (vma->vm_mm && !is_vm_hugetlb_page(vma)) smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); return show_map_internal(m, v, &mss); } -- cgit v1.2.2 From ad820c5dd47dff9397ef1e94388bc6577983f68b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 6 Mar 2006 15:42:58 -0800 Subject: [PATCH] smaps: shared fix The point of the smaps "shared" is to count the number of pages that are mapped by more than one process, according to Mauricio Lin. However, smaps uses page_count for this, so it will return a false positive for every page that is mapped by just that one process, which is also in pagecache or swapcache. There are false positive situations for anonymous pages not in swapcache as well: - page reclaim, migration - get_user_pages (eg. direct-io, ptrace) Use page_mapcount instead, to count the number of mappings to the page. Use vm_normal_page so that weird things like /dev/mem aren't counted either. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 35787f907f..91b7c15ab3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -204,7 +204,6 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, { pte_t *pte, ptent; spinlock_t *ptl; - unsigned long pfn; struct page *page; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -214,12 +213,12 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; mss->resident += PAGE_SIZE; - pfn = pte_pfn(ptent); - if (!pfn_valid(pfn)) + + page = vm_normal_page(vma, addr, ptent); + if (!page) continue; - page = pfn_to_page(pfn); - if (page_count(page) >= 2) { + if (page_mapcount(page) >= 2) { if (pte_dirty(ptent)) mss->shared_dirty += PAGE_SIZE; else -- cgit v1.2.2 From d19e9974084b4024abcfcfc9d8676c90d26994bb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 7 Mar 2006 09:16:35 -0800 Subject: Simplify fifo_open() locking logic We don't do interruptible waits for the pipe mutex anywhere else any more either, so don't do it in fifo_open() either. Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- fs/fifo.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index 923371b753..d13fcd3ec8 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -34,10 +34,7 @@ static int fifo_open(struct inode *inode, struct file *filp) { int ret; - ret = -ERESTARTSYS; - if (mutex_lock_interruptible(PIPE_MUTEX(*inode))) - goto err_nolock_nocleanup; - + mutex_lock(PIPE_MUTEX(*inode)); if (!inode->i_pipe) { ret = -ENOMEM; if(!pipe_new(inode)) @@ -140,8 +137,6 @@ err: err_nocleanup: mutex_unlock(PIPE_MUTEX(*inode)); - -err_nolock_nocleanup: return ret; } -- cgit v1.2.2 From a19cbd4bf258840ade3b6ee9e9256006d0644e09 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Mar 2006 14:03:09 -0800 Subject: Mark the pipe file operations static They aren't used (nor even really usable) outside of pipe.c anyway Signed-off-by: Linus Torvalds --- fs/pipe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index d722579df7..8aada8e426 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -605,7 +605,7 @@ struct file_operations rdwr_fifo_fops = { .fasync = pipe_rdwr_fasync, }; -struct file_operations read_pipe_fops = { +static struct file_operations read_pipe_fops = { .llseek = no_llseek, .read = pipe_read, .readv = pipe_readv, @@ -617,7 +617,7 @@ struct file_operations read_pipe_fops = { .fasync = pipe_read_fasync, }; -struct file_operations write_pipe_fops = { +static struct file_operations write_pipe_fops = { .llseek = no_llseek, .read = bad_pipe_r, .write = pipe_write, @@ -629,7 +629,7 @@ struct file_operations write_pipe_fops = { .fasync = pipe_write_fasync, }; -struct file_operations rdwr_pipe_fops = { +static struct file_operations rdwr_pipe_fops = { .llseek = no_llseek, .read = pipe_read, .readv = pipe_readv, -- cgit v1.2.2 From 4d6660eb3665f22d16aff466eb9d45df6102b254 Mon Sep 17 00:00:00 2001 From: Phillip Susi Date: Tue, 7 Mar 2006 21:55:24 -0800 Subject: [PATCH] udf: fix uid/gid options and add uid/gid=ignore and forget options Fix a bug in udf where it would write uid/gid = 0 to the disk for files owned by the id given with the uid=/gid= mount options. It also adds 4 new mount options: uid/gid=forget and uid/gid=ignore. Without any options the id in core and on disk always match. Giving uid/gid=nnn specifies a default ID to be used in core when the on disk ID is -1. uid/gid=ignore forces the in core ID to allways be used no matter what the on disk ID is. uid/gid=forget forces the on disk ID to always be written out as -1. The use of these options allows you to override ownerships on a disk or disable ownwership information from being written, allowing the media to be used portably between different computers and possibly different users without permissions issues that would require root to correct. Signed-off-by: Phillip Susi Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/inode.c | 16 ++++++++++++---- fs/udf/super.c | 18 +++++++++++++++++- fs/udf/udf_sb.h | 4 ++++ 3 files changed, 33 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 395e582ee5..d04cff2273 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1045,10 +1045,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) } inode->i_uid = le32_to_cpu(fe->uid); - if ( inode->i_uid == -1 ) inode->i_uid = UDF_SB(inode->i_sb)->s_uid; + if (inode->i_uid == -1 || UDF_QUERY_FLAG(inode->i_sb, + UDF_FLAG_UID_IGNORE)) + inode->i_uid = UDF_SB(inode->i_sb)->s_uid; inode->i_gid = le32_to_cpu(fe->gid); - if ( inode->i_gid == -1 ) inode->i_gid = UDF_SB(inode->i_sb)->s_gid; + if (inode->i_gid == -1 || UDF_QUERY_FLAG(inode->i_sb, + UDF_FLAG_GID_IGNORE)) + inode->i_gid = UDF_SB(inode->i_sb)->s_gid; inode->i_nlink = le16_to_cpu(fe->fileLinkCount); if (!inode->i_nlink) @@ -1335,10 +1339,14 @@ udf_update_inode(struct inode *inode, int do_sync) return err; } - if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid) + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) + fe->uid = cpu_to_le32(-1); + else if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid) fe->uid = cpu_to_le32(inode->i_uid); - if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid) + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) + fe->gid = cpu_to_le32(-1); + else if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid) fe->gid = cpu_to_le32(inode->i_gid); udfperms = ((inode->i_mode & S_IRWXO) ) | diff --git a/fs/udf/super.c b/fs/udf/super.c index 4a6f49adc6..368d8f81fe 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -269,7 +269,7 @@ enum { Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, Opt_rootdir, Opt_utf8, Opt_iocharset, - Opt_err + Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore }; static match_table_t tokens = { @@ -282,6 +282,10 @@ static match_table_t tokens = { {Opt_adinicb, "adinicb"}, {Opt_shortad, "shortad"}, {Opt_longad, "longad"}, + {Opt_uforget, "uid=forget"}, + {Opt_uignore, "uid=ignore"}, + {Opt_gforget, "gid=forget"}, + {Opt_gignore, "gid=ignore"}, {Opt_gid, "gid=%u"}, {Opt_uid, "uid=%u"}, {Opt_umask, "umask=%o"}, @@ -414,6 +418,18 @@ udf_parse_options(char *options, struct udf_options *uopt) uopt->flags |= (1 << UDF_FLAG_NLS_MAP); break; #endif + case Opt_uignore: + uopt->flags |= (1 << UDF_FLAG_UID_IGNORE); + break; + case Opt_uforget: + uopt->flags |= (1 << UDF_FLAG_UID_FORGET); + break; + case Opt_gignore: + uopt->flags |= (1 << UDF_FLAG_GID_IGNORE); + break; + case Opt_gforget: + uopt->flags |= (1 << UDF_FLAG_GID_FORGET); + break; default: printk(KERN_ERR "udf: bad mount option \"%s\" " "or missing value\n", p); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 663669810b..110f8d6261 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -20,6 +20,10 @@ #define UDF_FLAG_VARCONV 8 #define UDF_FLAG_NLS_MAP 9 #define UDF_FLAG_UTF8 10 +#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ +#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */ +#define UDF_FLAG_GID_FORGET 13 +#define UDF_FLAG_GID_IGNORE 14 #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 -- cgit v1.2.2 From 529bf6be5c04f2e869d07bfdb122e9fd98ade714 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 7 Mar 2006 21:55:35 -0800 Subject: [PATCH] fix file counting I have benchmarked this on an x86_64 NUMA system and see no significant performance difference on kernbench. Tested on both x86_64 and powerpc. The way we do file struct accounting is not very suitable for batched freeing. For scalability reasons, file accounting was constructor/destructor based. This meant that nr_files was decremented only when the object was removed from the slab cache. This is susceptible to slab fragmentation. With RCU based file structure, consequent batched freeing and a test program like Serge's, we just speed this up and end up with a very fragmented slab - llm22:~ # cat /proc/sys/fs/file-nr 587730 0 758844 At the same time, I see only a 2000+ objects in filp cache. The following patch I fixes this problem. This patch changes the file counting by removing the filp_count_lock. Instead we use a separate percpu counter, nr_files, for now and all accesses to it are through get_nr_files() api. In the sysctl handler for nr_files, we populate files_stat.nr_files before returning to user. Counting files as an when they are created and destroyed (as opposed to inside slab) allows us to correctly count open files with RCU. Signed-off-by: Dipankar Sarma Cc: "Paul E. McKenney" Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- fs/file_table.c | 87 ++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index a173bba326..11dc83092d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1736,7 +1736,7 @@ void __init vfs_caches_init(unsigned long mempages) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); diff --git a/fs/file_table.c b/fs/file_table.c index 768b581675..44fabeaa94 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -5,6 +5,7 @@ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ +#include #include #include #include @@ -19,52 +20,67 @@ #include #include #include +#include +#include + +#include /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -EXPORT_SYMBOL(files_stat); /* Needed by unix.o */ - /* public. Not pretty! */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); -static DEFINE_SPINLOCK(filp_count_lock); +static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* slab constructors and destructors are called from arbitrary - * context and must be fully threaded - use a local spinlock - * to protect files_stat.nr_files - */ -void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags) +static inline void file_free_rcu(struct rcu_head *head) { - if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files++; - spin_unlock_irqrestore(&filp_count_lock, flags); - } + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + kmem_cache_free(filp_cachep, f); } -void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags) +static inline void file_free(struct file *f) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files--; - spin_unlock_irqrestore(&filp_count_lock, flags); + percpu_counter_dec(&nr_files); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } -static inline void file_free_rcu(struct rcu_head *head) +/* + * Return the total number of open files in the system + */ +static int get_nr_files(void) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); - kmem_cache_free(filp_cachep, f); + return percpu_counter_read_positive(&nr_files); } -static inline void file_free(struct file *f) +/* + * Return the maximum number of open files in the system + */ +int get_max_files(void) { - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + return files_stat.max_files; } +EXPORT_SYMBOL_GPL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_stat.nr_files = get_nr_files(); + return proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +#else +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or @@ -78,14 +94,20 @@ struct file *get_empty_filp(void) /* * Privileged users can go above max_files */ - if (files_stat.nr_files >= files_stat.max_files && - !capable(CAP_SYS_ADMIN)) - goto over; + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. Do an expensive check before + * we go and fail. + */ + if (percpu_counter_sum(&nr_files) >= files_stat.max_files) + goto over; + } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; + percpu_counter_inc(&nr_files); memset(f, 0, sizeof(*f)); if (security_file_alloc(f)) goto fail_sec; @@ -101,10 +123,10 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ - if (files_stat.nr_files > old_max) { + if (get_nr_files() > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", - files_stat.max_files); - old_max = files_stat.nr_files; + get_max_files()); + old_max = get_nr_files(); } goto fail; @@ -276,4 +298,5 @@ void __init files_init(unsigned long mempages) if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; files_defer_init(); + percpu_counter_init(&nr_files); } -- cgit v1.2.2 From e96fb230cc97760e448327c0de612cfba94ca7bf Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 7 Mar 2006 21:55:36 -0800 Subject: [PATCH] jffs2: avoid divide-by-zero Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 3e51dd1da8..cf55b221fc 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -233,7 +233,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) c->nextblock->dirty_size = 0; } #ifdef CONFIG_JFFS2_FS_WRITEBUFFER - if (!jffs2_can_mark_obsolete(c) && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { + if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { /* If we're going to start writing into a block which already contains data, and the end of the data isn't page-aligned, skip a little and align it. */ -- cgit v1.2.2 From 90f0094dc607abe384a412bfb7199fb667ab0735 Mon Sep 17 00:00:00 2001 From: Horst Hummel Date: Tue, 7 Mar 2006 21:55:39 -0800 Subject: [PATCH] s390: dasd partition detection DASD allows to open a device as soon as gendisk is registered, which means the device is a fake device (capacity=0) and we do know nothing about blocksize and partitions at that point of time. In case the device is opened by someone, the bdev and inode creation is done with the fake device info and the following partition detection code is just using the wrong data. To avoid this modify the DASD state machine to make sure that the open is rejected until the device analysis is either finished or an unformatted device was detected. Signed-off-by: Horst Hummel Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/ibm.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 78010ad60e..1e4a93835f 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -52,6 +52,7 @@ int ibm_partition(struct parsed_partitions *state, struct block_device *bdev) { int blocksize, offset, size; + loff_t i_size; dasd_information_t *info; struct hd_geometry *geo; char type[5] = {0,}; @@ -63,6 +64,13 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) unsigned char *data; Sector sect; + blocksize = bdev_hardsect_size(bdev); + if (blocksize <= 0) + return 0; + i_size = i_size_read(bdev->bd_inode); + if (i_size == 0) + return 0; + if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL) goto out_noinfo; if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL) @@ -73,9 +81,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 || ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) goto out_noioctl; - - if ((blocksize = bdev_hardsect_size(bdev)) <= 0) - goto out_badsect; /* * Get volume label, extract name and type. @@ -111,7 +116,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) } else { printk("CMS1/%8s:", name); offset = (info->label_block + 1); - size = bdev->bd_inode->i_size >> 9; + size = i_size >> 9; } put_partition(state, 1, offset*(blocksize >> 9), size-offset*(blocksize >> 9)); @@ -168,7 +173,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) else printk("(nonl)/%8s:", name); offset = (info->label_block + 1); - size = (bdev->bd_inode->i_size >> 9); + size = i_size >> 9; put_partition(state, 1, offset*(blocksize >> 9), size-offset*(blocksize >> 9)); } @@ -180,7 +185,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) return 1; out_readerr: -out_badsect: out_noioctl: kfree(label); out_nolab: -- cgit v1.2.2 From 731805b49489055c1548f7ccfbd44c9b84013264 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Tue, 7 Mar 2006 21:55:42 -0800 Subject: [PATCH] v9fs: fix for access to unitialized variables or freed memory Miscellaneous fixes related to accessing uninitialized variables or memory that was already freed. Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/9p.c | 1 - fs/9p/trans_fd.c | 1 + fs/9p/vfs_inode.c | 8 +++----- fs/9p/vfs_super.c | 1 - 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/9p.c b/fs/9p/9p.c index 1a6d08761f..f86a28d1d6 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -111,7 +111,6 @@ static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, if (!rc) return; - dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id); v9ses = a; if (rc->id == RCLUNK) v9fs_put_idpool(fid, &v9ses->fidpool); diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c index 1a28ef97a3..5b2ce21b10 100644 --- a/fs/9p/trans_fd.c +++ b/fs/9p/trans_fd.c @@ -80,6 +80,7 @@ static int v9fs_fd_send(struct v9fs_transport *trans, void *v, int len) if (!trans || trans->status != Connected || !ts) return -EIO; + oldfs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ ret = vfs_write(ts->out_file, (void __user *)v, len, &ts->out_file->f_pos); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index dce729d428..3ad8455f85 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -265,8 +265,7 @@ v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, fid = v9fs_get_idpool(&v9ses->fidpool); if (fid < 0) { eprintk(KERN_WARNING, "no free fids available\n"); - err = -ENOSPC; - goto error; + return -ENOSPC; } err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall); @@ -313,8 +312,7 @@ v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) nfid = v9fs_get_idpool(&v9ses->fidpool); if (nfid < 0) { eprintk(KERN_WARNING, "no free fids available\n"); - err = -ENOSPC; - goto error; + return ERR_PTR(-ENOSPC); } err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name, @@ -612,7 +610,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, int result = 0; dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_iname, dentry, nameidata); + dir, dentry->d_name.name, dentry, nameidata); sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index cdf787ee08..d05318fa68 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -156,7 +156,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type stat_result = v9fs_t_stat(v9ses, newfid, &fcall); if (stat_result < 0) { dprintk(DEBUG_ERROR, "stat error\n"); - kfree(fcall); v9fs_t_clunk(v9ses, newfid); } else { /* Setup the Root Inode */ -- cgit v1.2.2 From 1efa3c05f8640c37ba89d54dfaa18504d21986ce Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 8 Mar 2006 16:46:08 -0800 Subject: [NET] compat ifconf: fix limits A recent change to compat. dev_ifconf() in fs/compat_ioctl.c causes ifconf data to be truncated 1 entry too early when copying it to userspace. The correct amount of data (length) is returned, but the final entry is empty (zero, not filled in). The for-loop 'i' check should use <= to allow the final struct ifreq32 to be copied. I also used the ifconf-corruption program in kernel bugzilla #4746 to make sure that this change does not re-introduce the corruption. Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- fs/compat_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 537ac70edf..c666769a87 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -446,7 +446,7 @@ static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg) ifr = ifc.ifc_req; ifr32 = compat_ptr(ifc32.ifcbuf); for (i = 0, j = 0; - i + sizeof (struct ifreq32) < ifc32.ifc_len && j < ifc.ifc_len; + i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len; i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) { if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32))) return -EFAULT; -- cgit v1.2.2 From 0ef675d491bd65028fa838015ebc6ce8abefab6f Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Thu, 9 Mar 2006 17:33:38 -0800 Subject: [PATCH] mtd: 64 bit fixes Fix some bugs in mtd/jffs2 on 64bit platform. The MEMGETBADBLOCK/MEMSETBADBLOCK ioctl are not listed in compat_ioctl.h. And some variables in jffs2 are declared as uint32_t but used to hold size_t values. Signed-off-by: Atsushi Nemoto Cc: Thomas Gleixner Acked-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/nodelist.c | 3 ++- fs/jffs2/readinode.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index b635e167a3..d4d0c41490 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -406,7 +406,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info int err = 0, pointed = 0; struct jffs2_eraseblock *jeb; unsigned char *buffer; - uint32_t crc, ofs, retlen, len; + uint32_t crc, ofs, len; + size_t retlen; BUG_ON(tn->csize == 0); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 5f0652df5d..f1695642d0 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -112,7 +112,7 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r * negative error code on failure. */ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, - struct jffs2_raw_dirent *rd, uint32_t read, struct jffs2_full_dirent **fdp, + struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp, uint32_t *latest_mctime, uint32_t *mctime_ver) { struct jffs2_full_dirent *fd; -- cgit v1.2.2 From 0adb25d2e71ab047423d6fc63d5d184590d0a66f Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sat, 11 Mar 2006 03:27:13 -0800 Subject: [PATCH] ext3: ext3_symlink should use GFP_NOFS allocations inside This patch fixes illegal __GFP_FS allocation inside ext3 transaction in ext3_symlink(). Such allocation may re-enter ext3 code from try_to_free_pages. But JBD/ext3 code keeps a pointer to current journal handle in task_struct and, hence, is not reentrable. This bug led to "Assertion failure in journal_dirty_metadata()" messages. http://bugzilla.openvz.org/show_bug.cgi?id=115 Signed-off-by: Andrey Savochkin Signed-off-by: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/namei.c | 3 ++- fs/namei.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 8bd8ac0777..b8f5cd1e54 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2141,7 +2141,8 @@ retry: * We have a transaction open. All is sweetness. It also sets * i_size in generic_commit_write(). */ - err = page_symlink(inode, symname, l); + err = __page_symlink(inode, symname, l, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { ext3_dec_count(handle, inode); ext3_mark_inode_dirty(handle, inode); diff --git a/fs/namei.c b/fs/namei.c index 557dcf395c..8dc2b038d5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2613,13 +2613,15 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) } } -int page_symlink(struct inode *inode, const char *symname, int len) +int __page_symlink(struct inode *inode, const char *symname, int len, + gfp_t gfp_mask) { struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct page *page; int err = -ENOMEM; char *kaddr; + page = find_or_create_page(mapping, 0, gfp_mask); if (!page) goto fail; err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); @@ -2654,6 +2656,12 @@ fail: return err; } +int page_symlink(struct inode *inode, const char *symname, int len) +{ + return __page_symlink(inode, symname, len, + mapping_gfp_mask(inode->i_mapping)); +} + struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, @@ -2672,6 +2680,7 @@ EXPORT_SYMBOL(lookup_one_len); EXPORT_SYMBOL(page_follow_link_light); EXPORT_SYMBOL(page_put_link); EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); EXPORT_SYMBOL(path_lookup); -- cgit v1.2.2 From cd6ef84e6ac9454080707f2f338360f5d7e556fc Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sat, 11 Mar 2006 03:27:14 -0800 Subject: [PATCH] ext3: fix nobh mode for chattr +j inodes One can do "chattr +j" on a file to change its journalling mode. Fix writeback mode with "nobh" handling for it. Even though, we mount ext3 filesystem in writeback mode with "nobh" option, some one can do "chattr +j" on a single file to force it to do journalled mode. In order to do journaling, ext3_block_truncate_page() need to fallback to default case of creating buffers and adding them to transaction etc. Signed-off-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3fc4238e97..0384e539b8 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1624,15 +1624,14 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, * For "nobh" option, we can only work if we don't need to * read-in the page - otherwise we create buffers to do the IO. */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) { - if (PageUptodate(page)) { - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, length); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - set_page_dirty(page); - goto unlock; - } + if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && + ext3_should_writeback_data(inode) && PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + goto unlock; } if (!page_has_buffers(page)) -- cgit v1.2.2 From 143f412eb4c7cc48b9eb4381f9133b7d36c68075 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Mar 2006 21:20:46 -0800 Subject: [PATCH] NFS: Fix a potential panic in O_DIRECT Based on an original patch by Mike O'Connor and Greg Banks of SGI. Mike states: A normal user can panic an NFS client and cause a local DoS with 'judicious'(?) use of O_DIRECT. Any O_DIRECT write to an NFS file where the user buffer starts with a valid mapped page and contains an unmapped page, will crash in this way. I haven't followed the code, but O_DIRECT reads with similar user buffers will probably also crash albeit in different ways. Details: when nfs_get_user_pages() calls get_user_pages(), it detects and correctly handles get_user_pages() returning an error, which happens if the first page covered by the user buffer's address range is unmapped. However, if the first page is mapped but some subsequent page isn't, get_user_pages() will return a positive number which is less than the number of pages requested (this behaviour is sort of analagous to a short write() call and appears to be intentional). nfs_get_user_pages() doesn't detect this and hands off the array of pages (whose last few elements are random rubbish from the newly allocated array memory) to it's caller, whence they go to nfs_direct_write_seg(), which then totally ignores the nr_pages it's given, and calculates its own idea of how many pages are in the array from the user buffer length. Needless to say, when it comes to transmit those uninitialised page* pointers, we see a crash in the network stack. Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/direct.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 04ab2fc360..4e9b3a1b36 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -57,6 +57,7 @@ #define NFSDBG_FACILITY NFSDBG_VFS #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) +static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty); static kmem_cache_t *nfs_direct_cachep; /* @@ -107,6 +108,15 @@ nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, page_count, (rw == READ), 0, *pages, NULL); up_read(¤t->mm->mmap_sem); + /* + * If we got fewer pages than expected from get_user_pages(), + * the user buffer runs off the end of a mapping; return EFAULT. + */ + if (result >= 0 && result < page_count) { + nfs_free_user_pages(*pages, result, 0); + *pages = NULL; + result = -EFAULT; + } } return result; } -- cgit v1.2.2 From c12e87f4652b1ba3be168b4f63a440399b941928 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Mar 2006 21:20:47 -0800 Subject: [PATCH] NFSv4: fix mount segfault on errors returned that are < -1000 It turns out that nfs4_proc_get_root() may return raw NFSv4 errors instead of mapping them to kernel errors. Problem spotted by Neil Horman Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 984ca3454d..f8c0066e02 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1430,7 +1430,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, if (status == 0) status = nfs4_do_fsinfo(server, fhandle, info); out: - return status; + return nfs4_map_errors(status); } static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) -- cgit v1.2.2 From 30f4e20a0d3492668f5065af582b5af2d1e4256b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Mar 2006 21:20:49 -0800 Subject: [PATCH] NLM: Ensure we do not Oops in the case of an unlock In theory, NLM specs assure us that the server will only reply LCK_GRANTED or LCK_DENIED_GRACE_PERIOD to our NLM_UNLOCK request. In practice, we should not assume this to be the case, and the code will currently Oops if we do. Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/clntproc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 220058d861..970b6a6aa3 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -662,12 +662,18 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) * reclaimed while we're stuck in the unlock call. */ fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED; + /* + * Note: the server is supposed to either grant us the unlock + * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either + * case, we want to unlock. + */ + do_vfs_lock(fl); + if (req->a_flags & RPC_TASK_ASYNC) { status = nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); /* Hrmf... Do the unlock early since locks_remove_posix() * really expects us to free the lock synchronously */ - do_vfs_lock(fl); if (status < 0) { nlmclnt_release_lockargs(req); kfree(req); @@ -680,7 +686,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) if (status < 0) return status; - do_vfs_lock(fl); if (resp->status == NLM_LCK_GRANTED) return 0; -- cgit v1.2.2 From a488edc914aa1d766a4e2c982b5ae03d5657ec1b Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Tue, 14 Mar 2006 13:44:00 -0600 Subject: [PATCH] JFS: Take logsync lock before testing mp->lsn This fixes a race where lsn could be cleared before taking the lock Signed-off-by: Dave Kleikamp Signed-off-by: Linus Torvalds --- fs/jfs/jfs_dmap.c | 7 ++----- fs/jfs/jfs_imap.c | 6 ++---- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 2967b73934..79b5404db1 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -532,10 +532,10 @@ dbUpdatePMap(struct inode *ipbmap, lastlblkno = lblkno; + LOGSYNC_LOCK(log, flags); if (mp->lsn != 0) { /* inherit older/smaller lsn */ logdiff(diffp, mp->lsn, log); - LOGSYNC_LOCK(log, flags); if (difft < diffp) { mp->lsn = lsn; @@ -548,20 +548,17 @@ dbUpdatePMap(struct inode *ipbmap, logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } else { mp->log = log; mp->lsn = lsn; /* insert bp after tblock in logsync list */ - LOGSYNC_LOCK(log, flags); - log->count++; list_add(&mp->synclist, &tblk->synclist); mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } + LOGSYNC_UNLOCK(log, flags); } /* write the last buffer. */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 31b4aa13dd..4efa0d0eec 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -2844,11 +2844,11 @@ diUpdatePMap(struct inode *ipimap, */ lsn = tblk->lsn; log = JFS_SBI(tblk->sb)->log; + LOGSYNC_LOCK(log, flags); if (mp->lsn != 0) { /* inherit older/smaller lsn */ logdiff(difft, lsn, log); logdiff(diffp, mp->lsn, log); - LOGSYNC_LOCK(log, flags); if (difft < diffp) { mp->lsn = lsn; /* move mp after tblock in logsync list */ @@ -2860,17 +2860,15 @@ diUpdatePMap(struct inode *ipimap, logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } else { mp->log = log; mp->lsn = lsn; /* insert mp after tblock in logsync list */ - LOGSYNC_LOCK(log, flags); log->count++; list_add(&mp->synclist, &tblk->synclist); mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } + LOGSYNC_UNLOCK(log, flags); write_metapage(mp); return (0); } -- cgit v1.2.2