From a19cbd4bf258840ade3b6ee9e9256006d0644e09 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Mar 2006 14:03:09 -0800 Subject: Mark the pipe file operations static They aren't used (nor even really usable) outside of pipe.c anyway Signed-off-by: Linus Torvalds --- fs/pipe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index d722579df79a..8aada8e426f4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -605,7 +605,7 @@ struct file_operations rdwr_fifo_fops = { .fasync = pipe_rdwr_fasync, }; -struct file_operations read_pipe_fops = { +static struct file_operations read_pipe_fops = { .llseek = no_llseek, .read = pipe_read, .readv = pipe_readv, @@ -617,7 +617,7 @@ struct file_operations read_pipe_fops = { .fasync = pipe_read_fasync, }; -struct file_operations write_pipe_fops = { +static struct file_operations write_pipe_fops = { .llseek = no_llseek, .read = bad_pipe_r, .write = pipe_write, @@ -629,7 +629,7 @@ struct file_operations write_pipe_fops = { .fasync = pipe_write_fasync, }; -struct file_operations rdwr_pipe_fops = { +static struct file_operations rdwr_pipe_fops = { .llseek = no_llseek, .read = pipe_read, .readv = pipe_readv, -- cgit v1.2.2 From 4d6660eb3665f22d16aff466eb9d45df6102b254 Mon Sep 17 00:00:00 2001 From: Phillip Susi Date: Tue, 7 Mar 2006 21:55:24 -0800 Subject: [PATCH] udf: fix uid/gid options and add uid/gid=ignore and forget options Fix a bug in udf where it would write uid/gid = 0 to the disk for files owned by the id given with the uid=/gid= mount options. It also adds 4 new mount options: uid/gid=forget and uid/gid=ignore. Without any options the id in core and on disk always match. Giving uid/gid=nnn specifies a default ID to be used in core when the on disk ID is -1. uid/gid=ignore forces the in core ID to allways be used no matter what the on disk ID is. uid/gid=forget forces the on disk ID to always be written out as -1. The use of these options allows you to override ownerships on a disk or disable ownwership information from being written, allowing the media to be used portably between different computers and possibly different users without permissions issues that would require root to correct. Signed-off-by: Phillip Susi Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/inode.c | 16 ++++++++++++---- fs/udf/super.c | 18 +++++++++++++++++- fs/udf/udf_sb.h | 4 ++++ 3 files changed, 33 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 395e582ee542..d04cff2273b6 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1045,10 +1045,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) } inode->i_uid = le32_to_cpu(fe->uid); - if ( inode->i_uid == -1 ) inode->i_uid = UDF_SB(inode->i_sb)->s_uid; + if (inode->i_uid == -1 || UDF_QUERY_FLAG(inode->i_sb, + UDF_FLAG_UID_IGNORE)) + inode->i_uid = UDF_SB(inode->i_sb)->s_uid; inode->i_gid = le32_to_cpu(fe->gid); - if ( inode->i_gid == -1 ) inode->i_gid = UDF_SB(inode->i_sb)->s_gid; + if (inode->i_gid == -1 || UDF_QUERY_FLAG(inode->i_sb, + UDF_FLAG_GID_IGNORE)) + inode->i_gid = UDF_SB(inode->i_sb)->s_gid; inode->i_nlink = le16_to_cpu(fe->fileLinkCount); if (!inode->i_nlink) @@ -1335,10 +1339,14 @@ udf_update_inode(struct inode *inode, int do_sync) return err; } - if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid) + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) + fe->uid = cpu_to_le32(-1); + else if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid) fe->uid = cpu_to_le32(inode->i_uid); - if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid) + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) + fe->gid = cpu_to_le32(-1); + else if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid) fe->gid = cpu_to_le32(inode->i_gid); udfperms = ((inode->i_mode & S_IRWXO) ) | diff --git a/fs/udf/super.c b/fs/udf/super.c index 4a6f49adc609..368d8f81fe54 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -269,7 +269,7 @@ enum { Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, Opt_rootdir, Opt_utf8, Opt_iocharset, - Opt_err + Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore }; static match_table_t tokens = { @@ -282,6 +282,10 @@ static match_table_t tokens = { {Opt_adinicb, "adinicb"}, {Opt_shortad, "shortad"}, {Opt_longad, "longad"}, + {Opt_uforget, "uid=forget"}, + {Opt_uignore, "uid=ignore"}, + {Opt_gforget, "gid=forget"}, + {Opt_gignore, "gid=ignore"}, {Opt_gid, "gid=%u"}, {Opt_uid, "uid=%u"}, {Opt_umask, "umask=%o"}, @@ -414,6 +418,18 @@ udf_parse_options(char *options, struct udf_options *uopt) uopt->flags |= (1 << UDF_FLAG_NLS_MAP); break; #endif + case Opt_uignore: + uopt->flags |= (1 << UDF_FLAG_UID_IGNORE); + break; + case Opt_uforget: + uopt->flags |= (1 << UDF_FLAG_UID_FORGET); + break; + case Opt_gignore: + uopt->flags |= (1 << UDF_FLAG_GID_IGNORE); + break; + case Opt_gforget: + uopt->flags |= (1 << UDF_FLAG_GID_FORGET); + break; default: printk(KERN_ERR "udf: bad mount option \"%s\" " "or missing value\n", p); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 663669810be6..110f8d62616f 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -20,6 +20,10 @@ #define UDF_FLAG_VARCONV 8 #define UDF_FLAG_NLS_MAP 9 #define UDF_FLAG_UTF8 10 +#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ +#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */ +#define UDF_FLAG_GID_FORGET 13 +#define UDF_FLAG_GID_IGNORE 14 #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 -- cgit v1.2.2 From 529bf6be5c04f2e869d07bfdb122e9fd98ade714 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 7 Mar 2006 21:55:35 -0800 Subject: [PATCH] fix file counting I have benchmarked this on an x86_64 NUMA system and see no significant performance difference on kernbench. Tested on both x86_64 and powerpc. The way we do file struct accounting is not very suitable for batched freeing. For scalability reasons, file accounting was constructor/destructor based. This meant that nr_files was decremented only when the object was removed from the slab cache. This is susceptible to slab fragmentation. With RCU based file structure, consequent batched freeing and a test program like Serge's, we just speed this up and end up with a very fragmented slab - llm22:~ # cat /proc/sys/fs/file-nr 587730 0 758844 At the same time, I see only a 2000+ objects in filp cache. The following patch I fixes this problem. This patch changes the file counting by removing the filp_count_lock. Instead we use a separate percpu counter, nr_files, for now and all accesses to it are through get_nr_files() api. In the sysctl handler for nr_files, we populate files_stat.nr_files before returning to user. Counting files as an when they are created and destroyed (as opposed to inside slab) allows us to correctly count open files with RCU. Signed-off-by: Dipankar Sarma Cc: "Paul E. McKenney" Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- fs/file_table.c | 87 ++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index a173bba32666..11dc83092d4a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1736,7 +1736,7 @@ void __init vfs_caches_init(unsigned long mempages) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); diff --git a/fs/file_table.c b/fs/file_table.c index 768b58167543..44fabeaa9415 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -5,6 +5,7 @@ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ +#include #include #include #include @@ -19,52 +20,67 @@ #include #include #include +#include +#include + +#include /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -EXPORT_SYMBOL(files_stat); /* Needed by unix.o */ - /* public. Not pretty! */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); -static DEFINE_SPINLOCK(filp_count_lock); +static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* slab constructors and destructors are called from arbitrary - * context and must be fully threaded - use a local spinlock - * to protect files_stat.nr_files - */ -void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags) +static inline void file_free_rcu(struct rcu_head *head) { - if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files++; - spin_unlock_irqrestore(&filp_count_lock, flags); - } + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + kmem_cache_free(filp_cachep, f); } -void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags) +static inline void file_free(struct file *f) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files--; - spin_unlock_irqrestore(&filp_count_lock, flags); + percpu_counter_dec(&nr_files); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } -static inline void file_free_rcu(struct rcu_head *head) +/* + * Return the total number of open files in the system + */ +static int get_nr_files(void) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); - kmem_cache_free(filp_cachep, f); + return percpu_counter_read_positive(&nr_files); } -static inline void file_free(struct file *f) +/* + * Return the maximum number of open files in the system + */ +int get_max_files(void) { - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + return files_stat.max_files; } +EXPORT_SYMBOL_GPL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_stat.nr_files = get_nr_files(); + return proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +#else +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or @@ -78,14 +94,20 @@ struct file *get_empty_filp(void) /* * Privileged users can go above max_files */ - if (files_stat.nr_files >= files_stat.max_files && - !capable(CAP_SYS_ADMIN)) - goto over; + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. Do an expensive check before + * we go and fail. + */ + if (percpu_counter_sum(&nr_files) >= files_stat.max_files) + goto over; + } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; + percpu_counter_inc(&nr_files); memset(f, 0, sizeof(*f)); if (security_file_alloc(f)) goto fail_sec; @@ -101,10 +123,10 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ - if (files_stat.nr_files > old_max) { + if (get_nr_files() > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", - files_stat.max_files); - old_max = files_stat.nr_files; + get_max_files()); + old_max = get_nr_files(); } goto fail; @@ -276,4 +298,5 @@ void __init files_init(unsigned long mempages) if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; files_defer_init(); + percpu_counter_init(&nr_files); } -- cgit v1.2.2 From e96fb230cc97760e448327c0de612cfba94ca7bf Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 7 Mar 2006 21:55:36 -0800 Subject: [PATCH] jffs2: avoid divide-by-zero Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 3e51dd1da8aa..cf55b221fc2b 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -233,7 +233,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) c->nextblock->dirty_size = 0; } #ifdef CONFIG_JFFS2_FS_WRITEBUFFER - if (!jffs2_can_mark_obsolete(c) && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { + if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { /* If we're going to start writing into a block which already contains data, and the end of the data isn't page-aligned, skip a little and align it. */ -- cgit v1.2.2 From 90f0094dc607abe384a412bfb7199fb667ab0735 Mon Sep 17 00:00:00 2001 From: Horst Hummel Date: Tue, 7 Mar 2006 21:55:39 -0800 Subject: [PATCH] s390: dasd partition detection DASD allows to open a device as soon as gendisk is registered, which means the device is a fake device (capacity=0) and we do know nothing about blocksize and partitions at that point of time. In case the device is opened by someone, the bdev and inode creation is done with the fake device info and the following partition detection code is just using the wrong data. To avoid this modify the DASD state machine to make sure that the open is rejected until the device analysis is either finished or an unformatted device was detected. Signed-off-by: Horst Hummel Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/ibm.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 78010ad60e47..1e4a93835fed 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -52,6 +52,7 @@ int ibm_partition(struct parsed_partitions *state, struct block_device *bdev) { int blocksize, offset, size; + loff_t i_size; dasd_information_t *info; struct hd_geometry *geo; char type[5] = {0,}; @@ -63,6 +64,13 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) unsigned char *data; Sector sect; + blocksize = bdev_hardsect_size(bdev); + if (blocksize <= 0) + return 0; + i_size = i_size_read(bdev->bd_inode); + if (i_size == 0) + return 0; + if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL) goto out_noinfo; if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL) @@ -73,9 +81,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 || ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) goto out_noioctl; - - if ((blocksize = bdev_hardsect_size(bdev)) <= 0) - goto out_badsect; /* * Get volume label, extract name and type. @@ -111,7 +116,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) } else { printk("CMS1/%8s:", name); offset = (info->label_block + 1); - size = bdev->bd_inode->i_size >> 9; + size = i_size >> 9; } put_partition(state, 1, offset*(blocksize >> 9), size-offset*(blocksize >> 9)); @@ -168,7 +173,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) else printk("(nonl)/%8s:", name); offset = (info->label_block + 1); - size = (bdev->bd_inode->i_size >> 9); + size = i_size >> 9; put_partition(state, 1, offset*(blocksize >> 9), size-offset*(blocksize >> 9)); } @@ -180,7 +185,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) return 1; out_readerr: -out_badsect: out_noioctl: kfree(label); out_nolab: -- cgit v1.2.2 From 731805b49489055c1548f7ccfbd44c9b84013264 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Tue, 7 Mar 2006 21:55:42 -0800 Subject: [PATCH] v9fs: fix for access to unitialized variables or freed memory Miscellaneous fixes related to accessing uninitialized variables or memory that was already freed. Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/9p.c | 1 - fs/9p/trans_fd.c | 1 + fs/9p/vfs_inode.c | 8 +++----- fs/9p/vfs_super.c | 1 - 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/9p.c b/fs/9p/9p.c index 1a6d08761f39..f86a28d1d6a6 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -111,7 +111,6 @@ static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, if (!rc) return; - dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id); v9ses = a; if (rc->id == RCLUNK) v9fs_put_idpool(fid, &v9ses->fidpool); diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c index 1a28ef97a3d1..5b2ce21b10fa 100644 --- a/fs/9p/trans_fd.c +++ b/fs/9p/trans_fd.c @@ -80,6 +80,7 @@ static int v9fs_fd_send(struct v9fs_transport *trans, void *v, int len) if (!trans || trans->status != Connected || !ts) return -EIO; + oldfs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ ret = vfs_write(ts->out_file, (void __user *)v, len, &ts->out_file->f_pos); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index dce729d42869..3ad8455f8577 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -265,8 +265,7 @@ v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, fid = v9fs_get_idpool(&v9ses->fidpool); if (fid < 0) { eprintk(KERN_WARNING, "no free fids available\n"); - err = -ENOSPC; - goto error; + return -ENOSPC; } err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall); @@ -313,8 +312,7 @@ v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) nfid = v9fs_get_idpool(&v9ses->fidpool); if (nfid < 0) { eprintk(KERN_WARNING, "no free fids available\n"); - err = -ENOSPC; - goto error; + return ERR_PTR(-ENOSPC); } err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name, @@ -612,7 +610,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, int result = 0; dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_iname, dentry, nameidata); + dir, dentry->d_name.name, dentry, nameidata); sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index cdf787ee08de..d05318fa684e 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -156,7 +156,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type stat_result = v9fs_t_stat(v9ses, newfid, &fcall); if (stat_result < 0) { dprintk(DEBUG_ERROR, "stat error\n"); - kfree(fcall); v9fs_t_clunk(v9ses, newfid); } else { /* Setup the Root Inode */ -- cgit v1.2.2 From 1efa3c05f8640c37ba89d54dfaa18504d21986ce Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 8 Mar 2006 16:46:08 -0800 Subject: [NET] compat ifconf: fix limits A recent change to compat. dev_ifconf() in fs/compat_ioctl.c causes ifconf data to be truncated 1 entry too early when copying it to userspace. The correct amount of data (length) is returned, but the final entry is empty (zero, not filled in). The for-loop 'i' check should use <= to allow the final struct ifreq32 to be copied. I also used the ifconf-corruption program in kernel bugzilla #4746 to make sure that this change does not re-introduce the corruption. Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- fs/compat_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 537ac70edfe5..c666769a875d 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -446,7 +446,7 @@ static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg) ifr = ifc.ifc_req; ifr32 = compat_ptr(ifc32.ifcbuf); for (i = 0, j = 0; - i + sizeof (struct ifreq32) < ifc32.ifc_len && j < ifc.ifc_len; + i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len; i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) { if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32))) return -EFAULT; -- cgit v1.2.2 From 0ef675d491bd65028fa838015ebc6ce8abefab6f Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Thu, 9 Mar 2006 17:33:38 -0800 Subject: [PATCH] mtd: 64 bit fixes Fix some bugs in mtd/jffs2 on 64bit platform. The MEMGETBADBLOCK/MEMSETBADBLOCK ioctl are not listed in compat_ioctl.h. And some variables in jffs2 are declared as uint32_t but used to hold size_t values. Signed-off-by: Atsushi Nemoto Cc: Thomas Gleixner Acked-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/nodelist.c | 3 ++- fs/jffs2/readinode.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index b635e167a3fa..d4d0c41490cd 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -406,7 +406,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info int err = 0, pointed = 0; struct jffs2_eraseblock *jeb; unsigned char *buffer; - uint32_t crc, ofs, retlen, len; + uint32_t crc, ofs, len; + size_t retlen; BUG_ON(tn->csize == 0); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 5f0652df5d47..f1695642d0f7 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -112,7 +112,7 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r * negative error code on failure. */ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, - struct jffs2_raw_dirent *rd, uint32_t read, struct jffs2_full_dirent **fdp, + struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp, uint32_t *latest_mctime, uint32_t *mctime_ver) { struct jffs2_full_dirent *fd; -- cgit v1.2.2 From 0adb25d2e71ab047423d6fc63d5d184590d0a66f Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sat, 11 Mar 2006 03:27:13 -0800 Subject: [PATCH] ext3: ext3_symlink should use GFP_NOFS allocations inside This patch fixes illegal __GFP_FS allocation inside ext3 transaction in ext3_symlink(). Such allocation may re-enter ext3 code from try_to_free_pages. But JBD/ext3 code keeps a pointer to current journal handle in task_struct and, hence, is not reentrable. This bug led to "Assertion failure in journal_dirty_metadata()" messages. http://bugzilla.openvz.org/show_bug.cgi?id=115 Signed-off-by: Andrey Savochkin Signed-off-by: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/namei.c | 3 ++- fs/namei.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 8bd8ac077704..b8f5cd1e540d 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2141,7 +2141,8 @@ retry: * We have a transaction open. All is sweetness. It also sets * i_size in generic_commit_write(). */ - err = page_symlink(inode, symname, l); + err = __page_symlink(inode, symname, l, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { ext3_dec_count(handle, inode); ext3_mark_inode_dirty(handle, inode); diff --git a/fs/namei.c b/fs/namei.c index 557dcf395ca1..8dc2b038d5d9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2613,13 +2613,15 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) } } -int page_symlink(struct inode *inode, const char *symname, int len) +int __page_symlink(struct inode *inode, const char *symname, int len, + gfp_t gfp_mask) { struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct page *page; int err = -ENOMEM; char *kaddr; + page = find_or_create_page(mapping, 0, gfp_mask); if (!page) goto fail; err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); @@ -2654,6 +2656,12 @@ fail: return err; } +int page_symlink(struct inode *inode, const char *symname, int len) +{ + return __page_symlink(inode, symname, len, + mapping_gfp_mask(inode->i_mapping)); +} + struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, @@ -2672,6 +2680,7 @@ EXPORT_SYMBOL(lookup_one_len); EXPORT_SYMBOL(page_follow_link_light); EXPORT_SYMBOL(page_put_link); EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); EXPORT_SYMBOL(path_lookup); -- cgit v1.2.2 From cd6ef84e6ac9454080707f2f338360f5d7e556fc Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sat, 11 Mar 2006 03:27:14 -0800 Subject: [PATCH] ext3: fix nobh mode for chattr +j inodes One can do "chattr +j" on a file to change its journalling mode. Fix writeback mode with "nobh" handling for it. Even though, we mount ext3 filesystem in writeback mode with "nobh" option, some one can do "chattr +j" on a single file to force it to do journalled mode. In order to do journaling, ext3_block_truncate_page() need to fallback to default case of creating buffers and adding them to transaction etc. Signed-off-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3fc4238e9703..0384e539b88f 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1624,15 +1624,14 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, * For "nobh" option, we can only work if we don't need to * read-in the page - otherwise we create buffers to do the IO. */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) { - if (PageUptodate(page)) { - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, length); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - set_page_dirty(page); - goto unlock; - } + if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && + ext3_should_writeback_data(inode) && PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + goto unlock; } if (!page_has_buffers(page)) -- cgit v1.2.2 From 143f412eb4c7cc48b9eb4381f9133b7d36c68075 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Mar 2006 21:20:46 -0800 Subject: [PATCH] NFS: Fix a potential panic in O_DIRECT Based on an original patch by Mike O'Connor and Greg Banks of SGI. Mike states: A normal user can panic an NFS client and cause a local DoS with 'judicious'(?) use of O_DIRECT. Any O_DIRECT write to an NFS file where the user buffer starts with a valid mapped page and contains an unmapped page, will crash in this way. I haven't followed the code, but O_DIRECT reads with similar user buffers will probably also crash albeit in different ways. Details: when nfs_get_user_pages() calls get_user_pages(), it detects and correctly handles get_user_pages() returning an error, which happens if the first page covered by the user buffer's address range is unmapped. However, if the first page is mapped but some subsequent page isn't, get_user_pages() will return a positive number which is less than the number of pages requested (this behaviour is sort of analagous to a short write() call and appears to be intentional). nfs_get_user_pages() doesn't detect this and hands off the array of pages (whose last few elements are random rubbish from the newly allocated array memory) to it's caller, whence they go to nfs_direct_write_seg(), which then totally ignores the nr_pages it's given, and calculates its own idea of how many pages are in the array from the user buffer length. Needless to say, when it comes to transmit those uninitialised page* pointers, we see a crash in the network stack. Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/direct.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 04ab2fc360e7..4e9b3a1b36c5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -57,6 +57,7 @@ #define NFSDBG_FACILITY NFSDBG_VFS #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) +static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty); static kmem_cache_t *nfs_direct_cachep; /* @@ -107,6 +108,15 @@ nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, page_count, (rw == READ), 0, *pages, NULL); up_read(¤t->mm->mmap_sem); + /* + * If we got fewer pages than expected from get_user_pages(), + * the user buffer runs off the end of a mapping; return EFAULT. + */ + if (result >= 0 && result < page_count) { + nfs_free_user_pages(*pages, result, 0); + *pages = NULL; + result = -EFAULT; + } } return result; } -- cgit v1.2.2 From c12e87f4652b1ba3be168b4f63a440399b941928 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Mar 2006 21:20:47 -0800 Subject: [PATCH] NFSv4: fix mount segfault on errors returned that are < -1000 It turns out that nfs4_proc_get_root() may return raw NFSv4 errors instead of mapping them to kernel errors. Problem spotted by Neil Horman Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 984ca3454d04..f8c0066e02e1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1430,7 +1430,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, if (status == 0) status = nfs4_do_fsinfo(server, fhandle, info); out: - return status; + return nfs4_map_errors(status); } static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) -- cgit v1.2.2 From 30f4e20a0d3492668f5065af582b5af2d1e4256b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Mar 2006 21:20:49 -0800 Subject: [PATCH] NLM: Ensure we do not Oops in the case of an unlock In theory, NLM specs assure us that the server will only reply LCK_GRANTED or LCK_DENIED_GRACE_PERIOD to our NLM_UNLOCK request. In practice, we should not assume this to be the case, and the code will currently Oops if we do. Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/clntproc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 220058d8616d..970b6a6aa337 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -662,12 +662,18 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) * reclaimed while we're stuck in the unlock call. */ fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED; + /* + * Note: the server is supposed to either grant us the unlock + * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either + * case, we want to unlock. + */ + do_vfs_lock(fl); + if (req->a_flags & RPC_TASK_ASYNC) { status = nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); /* Hrmf... Do the unlock early since locks_remove_posix() * really expects us to free the lock synchronously */ - do_vfs_lock(fl); if (status < 0) { nlmclnt_release_lockargs(req); kfree(req); @@ -680,7 +686,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) if (status < 0) return status; - do_vfs_lock(fl); if (resp->status == NLM_LCK_GRANTED) return 0; -- cgit v1.2.2 From a488edc914aa1d766a4e2c982b5ae03d5657ec1b Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Tue, 14 Mar 2006 13:44:00 -0600 Subject: [PATCH] JFS: Take logsync lock before testing mp->lsn This fixes a race where lsn could be cleared before taking the lock Signed-off-by: Dave Kleikamp Signed-off-by: Linus Torvalds --- fs/jfs/jfs_dmap.c | 7 ++----- fs/jfs/jfs_imap.c | 6 ++---- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 2967b7393415..79b5404db100 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -532,10 +532,10 @@ dbUpdatePMap(struct inode *ipbmap, lastlblkno = lblkno; + LOGSYNC_LOCK(log, flags); if (mp->lsn != 0) { /* inherit older/smaller lsn */ logdiff(diffp, mp->lsn, log); - LOGSYNC_LOCK(log, flags); if (difft < diffp) { mp->lsn = lsn; @@ -548,20 +548,17 @@ dbUpdatePMap(struct inode *ipbmap, logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } else { mp->log = log; mp->lsn = lsn; /* insert bp after tblock in logsync list */ - LOGSYNC_LOCK(log, flags); - log->count++; list_add(&mp->synclist, &tblk->synclist); mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } + LOGSYNC_UNLOCK(log, flags); } /* write the last buffer. */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 31b4aa13dd4b..4efa0d0eec39 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -2844,11 +2844,11 @@ diUpdatePMap(struct inode *ipimap, */ lsn = tblk->lsn; log = JFS_SBI(tblk->sb)->log; + LOGSYNC_LOCK(log, flags); if (mp->lsn != 0) { /* inherit older/smaller lsn */ logdiff(difft, lsn, log); logdiff(diffp, mp->lsn, log); - LOGSYNC_LOCK(log, flags); if (difft < diffp) { mp->lsn = lsn; /* move mp after tblock in logsync list */ @@ -2860,17 +2860,15 @@ diUpdatePMap(struct inode *ipimap, logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } else { mp->log = log; mp->lsn = lsn; /* insert mp after tblock in logsync list */ - LOGSYNC_LOCK(log, flags); log->count++; list_add(&mp->synclist, &tblk->synclist); mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } + LOGSYNC_UNLOCK(log, flags); write_metapage(mp); return (0); } -- cgit v1.2.2 From 3fb962bde48c413bfa419ec4413037e87955dcb6 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 15 Mar 2006 15:14:45 +1100 Subject: Fix a direct I/O locking issue revealed by the new mutex code. Affects only XFS (i.e. DIO_OWN_LOCKING case) - currently it is not possible to get i_mutex locking correct when using DIO_OWN direct I/O locking in a filesystem due to indeterminism in the possible return code/lock/unlock combinations. This can cause a direct read to attempt a double i_mutex unlock inside XFS. We're now ensuring __blockdev_direct_IO always exits with the inode i_mutex (still) held for a direct reader. Tested with the three different locking modes (via direct block device access, ext3 and XFS) - both reading and writing; cannot find any regressions resulting from this change, and it clearly fixes the mutex_unlock warning originally reported here: http://marc.theaimsgroup.com/?l=linux-kernel&m=114189068126253&w=2 Signed-off-by: Nathan Scott Acked-by: Christoph Hellwig --- fs/direct-io.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 848044af7e16..27f3e787faca 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1155,15 +1155,16 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * For writes, i_mutex is not held on entry; it is never taken. * * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_mutex and return with i_mutex held, even though - * it is internally dropped. + * For writes we are called under i_mutex and return with i_mutex held, even + * though it is internally dropped. * For reads, i_mutex is not held on entry, but it is taken and dropped before * returning. * * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of * uninitialised data, allowing parallel direct readers and writers) * For writes we are called without i_mutex, return without it, never touch it. - * For reads, i_mutex is held on entry and will be released before returning. + * For reads we are called under i_mutex and return with i_mutex held, even + * though it may be internally dropped. * * Additional i_alloc_sem locking requirements described inline below. */ @@ -1182,7 +1183,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int reader_with_isem = (rw == READ && dio_lock_type == DIO_OWN_LOCKING); + int release_i_mutex = 0; + int acquire_i_mutex = 0; if (rw & WRITE) current->flags |= PF_SYNCWRITE; @@ -1225,7 +1227,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * writers need to grab i_alloc_sem only (i_mutex is already held) * For regular files using DIO_OWN_LOCKING, * neither readers nor writers take any locks here - * (i_mutex is already held and release for writers here) */ dio->lock_type = dio_lock_type; if (dio_lock_type != DIO_NO_LOCKING) { @@ -1236,7 +1237,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, mapping = iocb->ki_filp->f_mapping; if (dio_lock_type != DIO_OWN_LOCKING) { mutex_lock(&inode->i_mutex); - reader_with_isem = 1; + release_i_mutex = 1; } retval = filemap_write_and_wait_range(mapping, offset, @@ -1248,7 +1249,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (dio_lock_type == DIO_OWN_LOCKING) { mutex_unlock(&inode->i_mutex); - reader_with_isem = 0; + acquire_i_mutex = 1; } } @@ -1269,11 +1270,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, nr_segs, blkbits, get_blocks, end_io, dio); if (rw == READ && dio_lock_type == DIO_LOCKING) - reader_with_isem = 0; + release_i_mutex = 0; out: - if (reader_with_isem) + if (release_i_mutex) mutex_unlock(&inode->i_mutex); + else if (acquire_i_mutex) + mutex_lock(&inode->i_mutex); if (rw & WRITE) current->flags &= ~PF_SYNCWRITE; return retval; -- cgit v1.2.2 From 4983da07f1e2e8dc81cb9d640fbf35b899cdbdf2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 14 Mar 2006 19:50:19 -0800 Subject: [PATCH] page migration: fail if page is in a vma flagged VM_LOCKED page migration currently simply retries a couple of times if try_to_unmap() fails without inspecting the return code. However, SWAP_FAIL indicates that the page is in a vma that has the VM_LOCKED flag set (if ignore_refs ==1). We can check for that return code and avoid retrying the migration. migrate_page_remove_references() now needs to return a reason why the failure occured. So switch migrate_page_remove_references to use -Exx style error messages. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 62cfd17dc5fe..a9b399402007 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,6 +3060,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page) { struct address_space *mapping = page->mapping; struct buffer_head *bh, *head; + int rc; if (!mapping) return -EAGAIN; @@ -3069,8 +3070,9 @@ int buffer_migrate_page(struct page *newpage, struct page *page) head = page_buffers(page); - if (migrate_page_remove_references(newpage, page, 3)) - return -EAGAIN; + rc = migrate_page_remove_references(newpage, page, 3); + if (rc) + return rc; bh = head; do { -- cgit v1.2.2 From f13b83580acef03a36c785dccc534ccdd7e43084 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 15 Mar 2006 17:37:32 +0100 Subject: [PATCH] fs/namespace.c:dup_namespace(): fix a use after free The Coverity checker spotted the following bug in dup_namespace(): <-- snip --> if (!new_ns->root) { up_write(&namespace_sem); kfree(new_ns); goto out; } ... out: return new_ns; <-- snip --> Callers expect a non-NULL result to not be freed. Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- fs/namespace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 058a44865beb..39c81a8d6316 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1338,7 +1338,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); if (!new_ns) - goto out; + return NULL; atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); @@ -1352,7 +1352,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) if (!new_ns->root) { up_write(&namespace_sem); kfree(new_ns); - goto out; + return NULL; } spin_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); @@ -1393,7 +1393,6 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) if (altrootmnt) mntput(altrootmnt); -out: return new_ns; } -- cgit v1.2.2 From 2d7f2ea9c989853310c7f6e8be52cc090cc8e66b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 15 Mar 2006 21:41:59 +0000 Subject: [PATCH] Fix ext2 readdir f_pos re-validation logic This fixes not one, but _two_, silly (but admittedly hard to hit) bugs in the ext2 filesystem "readdir()" function. It also cleans up the code to avoid the unnecessary goto mess. The bugs were related to re-valiating the f_pos value after somebody had either done an "lseek()" on the directory to an invalid offset, or when the offset had become invalid due to a file being unlinked in the directory. The code would not only set the f_version too eagerly, it would also not update f_pos appropriately for when the offset fixup took place. When that happened, we'd occasionally subsequently fail the readdir() even when we shouldn't (no real harm done, but an ugly printk, and obviously you would end up not necessarily seeing all entries). Thanks to Masoud Sharbiani who noticed the problem and had a test-case for it, and also fixed up a thinko in the first version of this patch. Signed-off-by: Al Viro Acked-by: Masoud Sharbiani Signed-off-by: Linus Torvalds --- fs/ext2/dir.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 7442bdd1267a..b3dbd716cd3a 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -256,11 +256,10 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); unsigned char *types = NULL; - int need_revalidate = (filp->f_version != inode->i_version); - int ret; + int need_revalidate = filp->f_version != inode->i_version; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) - goto success; + return 0; if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) types = ext2_filetype_table; @@ -275,12 +274,15 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) "bad page in #%lu", inode->i_ino); filp->f_pos += PAGE_CACHE_SIZE - offset; - ret = -EIO; - goto done; + return -EIO; } kaddr = page_address(page); - if (need_revalidate) { - offset = ext2_validate_entry(kaddr, offset, chunk_mask); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ext2_validate_entry(kaddr, offset, chunk_mask); + filp->f_pos = (n<f_version = inode->i_version; need_revalidate = 0; } de = (ext2_dirent *)(kaddr+offset); @@ -289,9 +291,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) if (de->rec_len == 0) { ext2_error(sb, __FUNCTION__, "zero-length directory entry"); - ret = -EIO; ext2_put_page(page); - goto done; + return -EIO; } if (de->inode) { int over; @@ -306,19 +307,14 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) le32_to_cpu(de->inode), d_type); if (over) { ext2_put_page(page); - goto success; + return 0; } } filp->f_pos += le16_to_cpu(de->rec_len); } ext2_put_page(page); } - -success: - ret = 0; -done: - filp->f_version = inode->i_version; - return ret; + return 0; } /* -- cgit v1.2.2 From 8532159f5521ba24e697f0d25970ae89ff62a1f2 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Thu, 16 Mar 2006 23:04:04 -0800 Subject: [PATCH] v9fs: fix overzealous dropping of dentry which breaks dcache There is a d_drop in dir_release which caused problems as it invalidates dcache entries too soon. This was likely a part of the wierd cwd behavior folks were seeing. Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_dir.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index ae6d032b9b59..cd5eeb032d64 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -202,7 +202,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) filp->private_data = NULL; } - d_drop(filp->f_dentry); return 0; } -- cgit v1.2.2 From 85c6932ef0c7a82c309f8728ddf29768001d794e Mon Sep 17 00:00:00 2001 From: Peter Staubach Date: Thu, 16 Mar 2006 23:04:02 -0800 Subject: [PATCH] nfsservctl(): remove user-triggerable printk A user can use nfsservctl() to spam the logs. This can happen because the arguments to the nfsservctl() system call are versioned. This is a good thing. However, when a bad version is detected, the kernel prints a message and then returns an error. Signed-off-by: Peter Staubach Cc: Trond Myklebust Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsctl.c b/fs/nfsctl.c index 0d4cf9486068..1c72c7f85ddc 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -98,10 +98,8 @@ asmlinkage sys_nfsservctl(int cmd, struct nfsctl_arg __user *arg, void __user *r if (copy_from_user(&version, &arg->ca_version, sizeof(int))) return -EFAULT; - if (version != NFSCTL_VERSION) { - printk(KERN_WARNING "nfsd: incompatible version in syscall.\n"); + if (version != NFSCTL_VERSION) return -EINVAL; - } if (cmd < 0 || cmd >= sizeof(map)/sizeof(map[0]) || !map[cmd].name) return -EINVAL; -- cgit v1.2.2