From 48ce8b056c88920c8ac187781048f5dae33c81b9 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Mon, 5 Jun 2006 08:21:03 -0500 Subject: JFS: commit_mutex cleanups I look at code, and see that 1)locks wasn't release in the opposite order in which they were taken 2)in jfs_rename we lock new_ip, and in "error path" we didn't unlock it 3)I see strange expression: "! !" May be this worth to fix? Signed-off-by: Evgeniy Dushistov Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_txnmgr.c | 2 +- fs/jfs/namei.c | 33 ++++++++++++++++----------------- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index ac3d66948e..49618dd94f 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2944,7 +2944,7 @@ int jfs_sync(void *arg) * Inode is being freed */ list_del_init(&jfs_ip->anon_inode_list); - } else if (! !mutex_trylock(&jfs_ip->commit_mutex)) { + } else if (mutex_trylock(&jfs_ip->commit_mutex)) { /* * inode will be removed from anonymous list * when it is committed diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 09ea03f622..295268ad23 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -165,8 +165,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, out3: txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; @@ -300,8 +300,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) out3: txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; @@ -384,8 +384,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) if (rc == -EIO) txAbort(tid, 1); txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + 
mutex_unlock(&JFS_IP(dip)->commit_mutex); goto out2; } @@ -422,8 +422,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); /* * Truncating the directory index table is not guaranteed. It @@ -503,8 +503,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) if (rc == -EIO) txAbort(tid, 1); /* Marks FS Dirty */ txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); IWRITE_UNLOCK(ip); goto out1; } @@ -527,8 +527,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) if ((new_size = commitZeroLink(tid, ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); IWRITE_UNLOCK(ip); rc = new_size; goto out1; @@ -556,9 +556,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); - + mutex_unlock(&JFS_IP(dip)->commit_mutex); while (new_size && (rc == 0)) { tid = txBegin(dip->i_sb, 0); @@ -847,8 +846,8 @@ static int jfs_link(struct dentry *old_dentry, out: txEnd(tid); - mutex_unlock(&JFS_IP(dir)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); jfs_info("jfs_link: rc:%d", rc); return rc; @@ -1037,8 +1036,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, out3: txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; @@ -1160,10 +1159,11 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(new_ip->i_mode)) { new_ip->i_nlink--; if 
(new_ip->i_nlink) { - mutex_unlock(&JFS_IP(new_dir)->commit_mutex); - mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); if (old_dir != new_dir) mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); if (!S_ISDIR(old_ip->i_mode) && new_ip) IWRITE_UNLOCK(new_ip); jfs_error(new_ip->i_sb, @@ -1281,13 +1281,12 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, out4: txEnd(tid); - - mutex_unlock(&JFS_IP(new_dir)->commit_mutex); - mutex_unlock(&JFS_IP(old_ip)->commit_mutex); - if (old_dir != new_dir) - mutex_unlock(&JFS_IP(old_dir)->commit_mutex); if (new_ip) mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); while (new_size && (rc == 0)) { tid = txBegin(new_ip->i_sb, 0); -- cgit v1.2.2 From 8ba10ab128e88bfbe58f7164543827ef3c3a2c88 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 8 Jul 2006 02:17:40 +0000 Subject: [CIFS] CIFS_DEBUG2 depends on CIFS Signed-off-by: Steve French --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 00aa3d5c5a..7db05742a9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1791,6 +1791,7 @@ config CIFS_POSIX config CIFS_DEBUG2 bool "Enable additional CIFS debugging routines" + depends on CIFS help Enabling this option adds a few more debugging routines to the cifs code which slightly increases the size of -- cgit v1.2.2 From aadd06e5c56b9ff5117ec77e59eada43dc46e2fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jul 2006 11:00:01 +0200 Subject: [PATCH] splice: fix problems with sys_tee() Several issues noticed/fixed: - We cannot reliably block in link_pipe() while holding both input and output mutexes. 
So do preparatory checks before locking down both mutexes and doing the link. - The ipipe->nrbufs vs i check was bad, because we could have dropped the ipipe lock in-between. This causes us to potentially look at unknown buffers if we were racing with someone else reading this pipe. Signed-off-by: Jens Axboe --- fs/splice.c | 238 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 133 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 05fd2787be..684bca3d3a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1306,6 +1306,85 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, return error; } +/* + * Make sure there's data to read. Wait for input if we can, otherwise + * return an appropriate error. + */ +static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs) + return 0; + + ret = 0; + mutex_lock(&pipe->inode->i_mutex); + + while (!pipe->nrbufs) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + pipe_wait(pipe); + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +/* + * Make sure there's writeable room. Wait for room if we can, otherwise + * return an appropriate error. + */ +static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. 
+ */ + if (pipe->nrbufs < PIPE_BUFFERS) + return 0; + + ret = 0; + mutex_lock(&pipe->inode->i_mutex); + + while (pipe->nrbufs >= PIPE_BUFFERS) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + break; + } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + /* * Link contents of ipipe to opipe. */ @@ -1314,9 +1393,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret, do_wakeup, i, ipipe_first; - - ret = do_wakeup = ipipe_first = 0; + int ret = 0, i = 0, nbuf; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1324,126 +1401,62 @@ static int link_pipe(struct pipe_inode_info *ipipe, * could deadlock (one doing tee from A -> B, the other from B -> A). */ if (ipipe->inode < opipe->inode) { - ipipe_first = 1; - mutex_lock(&ipipe->inode->i_mutex); - mutex_lock(&opipe->inode->i_mutex); + mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_CHILD); } else { - mutex_lock(&opipe->inode->i_mutex); - mutex_lock(&ipipe->inode->i_mutex); + mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_CHILD); } - for (i = 0;; i++) { + do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - if (ipipe->nrbufs - i) { - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - /* - * If we have room, fill this buffer - */ - if (opipe->nrbufs < PIPE_BUFFERS) { - int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); - - /* - * Get a reference to this pipe buffer, - * so we can copy the contents over. 
- */ - ibuf->ops->get(ipipe, ibuf); - - obuf = opipe->bufs + nbuf; - *obuf = *ibuf; - - /* - * Don't inherit the gift flag, we need to - * prevent multiple steals of this page. - */ - obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - - if (obuf->len > len) - obuf->len = len; - - opipe->nrbufs++; - do_wakeup = 1; - ret += obuf->len; - len -= obuf->len; - - if (!len) - break; - if (opipe->nrbufs < PIPE_BUFFERS) - continue; - } - - /* - * We have input available, but no output room. - * If we already copied data, return that. If we - * need to drop the opipe lock, it must be ordered - * last to avoid deadlocks. - */ - if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { - if (!ret) - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - do_wakeup = 0; - } + /* + * If we have iterated all input buffers or ran out of + * output room, break. + */ + if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + break; - opipe->waiting_writers++; - pipe_wait(opipe); - opipe->waiting_writers--; - continue; - } + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); /* - * No input buffers, do the usual checks for available - * writers and blocking and wait if necessary + * Get a reference to this pipe buffer, + * so we can copy the contents over. */ - if (!ipipe->writers) - break; - if (!ipipe->waiting_writers) { - if (ret) - break; - } + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + /* - * pipe_wait() drops the ipipe mutex. To avoid deadlocks - * with another process, we can only safely do that if - * the ipipe lock is ordered last. + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. 
*/ - if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { - if (!ret) - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - if (waitqueue_active(&ipipe->wait)) - wake_up_interruptible_sync(&ipipe->wait); - kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + if (obuf->len > len) + obuf->len = len; - pipe_wait(ipipe); - } + opipe->nrbufs++; + ret += obuf->len; + len -= obuf->len; + i++; + } while (len); mutex_unlock(&ipipe->inode->i_mutex); mutex_unlock(&opipe->inode->i_mutex); - if (do_wakeup) { + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) { smp_mb(); if (waitqueue_active(&opipe->wait)) wake_up_interruptible(&opipe->wait); @@ -1464,14 +1477,29 @@ static long do_tee(struct file *in, struct file *out, size_t len, { struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + int ret = -EINVAL; /* - * Link ipipe to the two output pipes, consuming as we go along. + * Duplicate the contents of ipipe to opipe without actually + * copying the data. */ - if (ipipe && opipe) - return link_pipe(ipipe, opipe, len, flags); + if (ipipe && opipe && ipipe != opipe) { + /* + * Keep going, unless we encounter an error. The ipipe/opipe + * ordering doesn't really matter. 
+ */ + ret = link_ipipe_prep(ipipe, flags); + if (!ret) { + ret = link_opipe_prep(opipe, flags); + if (!ret) { + ret = link_pipe(ipipe, opipe, len, flags); + if (!ret && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + } + } + } - return -EINVAL; + return ret; } asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) -- cgit v1.2.2 From 73ce5934e2d855db436566297f12966eb507a435 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 10 Jul 2006 04:43:56 -0700 Subject: [PATCH] reiserfs: fix journaling issue regarding fsync() When write() extends a file(i_size is increased) and fsync() is called, change of inode must be written to journaling area through fsync(). But,currently the i_trans_id is not correctly updated when i_size is increased. So fsync() does not kick the journal writer. Reiserfs_file_write() already updates the transaction when blocks are allocated, but the case when i_size increases and new blocks are not added is not correctly treated. Following patch fix this bug. 
Signed-off-by: Hisashi Hifumi Cc: Jeff Mahoney Cc: Chris Mason Cc: Hans Reiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 752cea12e3..f318b58510 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -860,8 +860,12 @@ static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_han // this sets the proper flags for O_SYNC to trigger a commit mark_inode_dirty(inode); reiserfs_write_unlock(inode->i_sb); - } else + } else { + reiserfs_write_lock(inode->i_sb); + reiserfs_update_inode_transaction(inode); mark_inode_dirty(inode); + reiserfs_write_unlock(inode->i_sb); + } sd_update = 1; } -- cgit v1.2.2 From 25e206b54b9a20e63b6f5194aeebfa13d37e015c Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Mon, 10 Jul 2006 04:44:00 -0700 Subject: [PATCH] partitions: let partitions inherit policy from disk Change the partition code in fs/partitions/check.c to initialize a newly detected partition's policy field with that of the containing block device (see patch below). My reasoning is that function set_disk_ro() in block/genhd.c modifies the policy field (read-only indicator) of a disk and all contained partitions. When a partition is detected after the call to set_disk_ro(), the policy field of this partition will currently not inherit the disk's policy field. This behavior poses a problem in cases where a block device can be 'logically de- and reactivated' like e.g. the s390 DASD driver because partition detection may run after the policy field has been modified. 
Signed-off-by: Peter Oberparleiter Acked-by: Al Viro Makes-sense-to: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/check.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 839634026e..51c6a748df 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -339,6 +339,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) p->start_sect = start; p->nr_sects = len; p->partno = part; + p->policy = disk->policy; if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1])) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part); -- cgit v1.2.2 From 69c3a5b8fd8cfa67be22f6d7ae5c681c6777d817 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:23 -0700 Subject: [PATCH] fs/read_write.c: EXPORT_UNUSED_SYMBOL This patch marks an unused export as EXPORT_UNUSED_SYMBOL. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/read_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 5bc0e9234f..d4cb3183c9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -436,7 +436,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) return seg; } -EXPORT_SYMBOL(iov_shorten); +EXPORT_UNUSED_SYMBOL(iov_shorten); /* June 2006 */ /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) -- cgit v1.2.2 From b6174df5eec9cdfd598c03d6d0807e344e109213 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Mon, 10 Jul 2006 04:44:49 -0700 Subject: [PATCH] mmap zero-length hugetlb file with PROT_NONE to protect a hugetlb virtual area Sometimes, applications need below call to be successful although "/mnt/hugepages/file1" doesn't exist. 
fd = open("/mnt/hugepages/file1", O_CREAT|O_RDWR, 0755); *addr = mmap(NULL, 0x1024*1024*256, PROT_NONE, 0, fd, 0); As for regular pages (or files), above call does work, but as for huge pages, above call would fail because hugetlbfs_file_mmap would fail if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size). This capability on huge page is useful on ia64 when the process wants to protect one area on region 4, so other threads couldn't read/write this area. A famous JVM (Java Virtual Machine) implementation on IA64 needs the capability. Signed-off-by: Zhang Yanmin Cc: David Gibson Cc: Hugh Dickins [ Expand-on-mmap semantics again... this time matching normal fs's. wli ] Acked-by: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6449cb6979..c3920c96da 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -83,8 +83,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) - goto out; if (vma->vm_flags & VM_MAYSHARE && hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), @@ -93,7 +91,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; hugetlb_prefault_arch_hook(vma->vm_mm); - if (inode->i_size < len) + if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: mutex_unlock(&inode->i_mutex); -- cgit v1.2.2 From 1aeb21d626327ee909fef03f72aea6e8a60e6c0c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:50 -0700 Subject: [PATCH] FDPIC: Fix FDPIC compile errors Fix FDPIC compile errors. 
(akpm: we suspect it fixes a warning) Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index eba4e23b9c..07624b95ae 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -459,6 +459,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, */ hwcap = ELF_HWCAP; k_platform = ELF_PLATFORM; + u_platform = NULL; if (k_platform) { platform_len = strlen(k_platform) + 1; -- cgit v1.2.2 From 21ff821630c0e64f5d2fab96ced72000d77fa90b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:52 -0700 Subject: [PATCH] NOMMU: Fix execution off of ramfs with mmap() Fix execution through the FDPIC binfmt of programs stored on ramfs by preventing the ramfs mmap() returning successfully on a private mapping of a ramfs file. This causes NOMMU mmap to make a copy of the mapped portion of the file and map that instead. This could be improved by granting direct mapping access to read-only private mappings for which the data is stored on a contiguous run of pages. However, this is only likely to be the case if the file was extended with truncate before being written. ramfs is left to map the file directly for shared mappings so that SYSV IPC and POSIX shared memory both still work. 
Signed-off-by: David Howells Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 99fffc9e1b..677139b48e 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -283,9 +283,9 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, /*****************************************************************************/ /* - * set up a mapping + * set up a mapping for shared memory segments */ int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { - return 0; + return vma->vm_flags & VM_SHARED ? 0 : -ENOSYS; } -- cgit v1.2.2 From 8a2ab7f5df76b920d62b908919d987d3b8a82856 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:53 -0700 Subject: [PATCH] FDPIC: Adjust the ELF-FDPIC driver to conform more to the CodingStyle Adjust the ELF-FDPIC binfmt driver to conform much more to the CodingStyle, silly though it may be. Further changes: (*) Drop the casts to long for addresses in kdebug() statements (they're unsigned long already). (*) Use extra variables to avoid expressions longer than 80 chars by splitting the statement into multiple statements and letting the compiler optimise them back together. (*) Eliminate duplicate call of ksize() when working out how much space was actually allocated for the stack. (*) Discard the commented-out load_shlib prototype and op pointer as this will not be supported in ELF-FDPIC for the foreseeable future. 
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 305 +++++++++++++++++++++++++++----------------------- 1 file changed, 168 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 07624b95ae..a4ff873898 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1,6 +1,6 @@ /* binfmt_elf_fdpic.c: FDPIC ELF binary format * - * Copyright (C) 2003, 2004 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * Derived from binfmt_elf.c * @@ -50,43 +50,45 @@ typedef char *elf_caddr_t; MODULE_LICENSE("GPL"); -static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs); -//static int load_elf_fdpic_library(struct file *); -static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file); -static int elf_fdpic_map_file(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm, - const char *what); +static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); +static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, + struct mm_struct *, const char *); -static int create_elf_fdpic_tables(struct linux_binprm *bprm, - struct mm_struct *mm, - struct elf_fdpic_params *exec_params, - struct elf_fdpic_params *interp_params); +static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, + struct elf_fdpic_params *, + struct elf_fdpic_params *); #ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *_sp); -static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm); +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, + unsigned long *); 
+static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, + struct file *, + struct mm_struct *); #endif -static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm); +static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, + struct file *, struct mm_struct *); static struct linux_binfmt elf_fdpic_format = { .module = THIS_MODULE, .load_binary = load_elf_fdpic_binary, -// .load_shlib = load_elf_fdpic_library, // .core_dump = elf_fdpic_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, }; -static int __init init_elf_fdpic_binfmt(void) { return register_binfmt(&elf_fdpic_format); } -static void __exit exit_elf_fdpic_binfmt(void) { unregister_binfmt(&elf_fdpic_format); } +static int __init init_elf_fdpic_binfmt(void) +{ + return register_binfmt(&elf_fdpic_format); +} -module_init(init_elf_fdpic_binfmt) -module_exit(exit_elf_fdpic_binfmt) +static void __exit exit_elf_fdpic_binfmt(void) +{ + unregister_binfmt(&elf_fdpic_format); +} + +module_init(init_elf_fdpic_binfmt); +module_exit(exit_elf_fdpic_binfmt); static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) { @@ -105,7 +107,8 @@ static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) /* * read the program headers table into memory */ -static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file) +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, + struct file *file) { struct elf32_phdr *phdr; unsigned long size; @@ -121,7 +124,8 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *f if (!params->phdrs) return -ENOMEM; - retval = kernel_read(file, params->hdr.e_phoff, (char *) params->phdrs, size); + retval = kernel_read(file, params->hdr.e_phoff, + (char *) params->phdrs, size); if (retval < 0) return retval; @@ -141,17 +145,24 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *f } return 0; -} /* end 
elf_fdpic_fetch_phdrs() */ +} /*****************************************************************************/ /* * load an fdpic binary into various bits of memory */ -static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_elf_fdpic_binary(struct linux_binprm *bprm, + struct pt_regs *regs) { struct elf_fdpic_params exec_params, interp_params; struct elf_phdr *phdr; - unsigned long stack_size; + unsigned long stack_size, entryaddr; +#ifndef CONFIG_MMU + unsigned long fullsize; +#endif +#ifdef ELF_FDPIC_PLAT_INIT + unsigned long dynaddr; +#endif struct file *interpreter = NULL; /* to shut gcc up */ char *interpreter_name = NULL; int executable_stack; @@ -212,7 +223,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs goto error; } - retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); + retval = kernel_read(interpreter, 0, bprm->buf, + BINPRM_BUF_SIZE); if (retval < 0) goto error; @@ -295,7 +307,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs &current->mm->start_stack, &current->mm->start_brk); - retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); + retval = setup_arg_pages(bprm, current->mm->start_stack, + executable_stack); if (retval < 0) { send_sig(SIGKILL, current, 0); goto error_kill; @@ -303,7 +316,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs #endif /* load the executable and interpreter into memory */ - retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, "executable"); + retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, + "executable"); if (retval < 0) goto error_kill; @@ -324,7 +338,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs if (!current->mm->start_brk) current->mm->start_brk = current->mm->end_data; - current->mm->brk = current->mm->start_brk = PAGE_ALIGN(current->mm->start_brk); + current->mm->brk = 
current->mm->start_brk = + PAGE_ALIGN(current->mm->start_brk); #else /* create a stack and brk area big enough for everyone @@ -336,47 +351,45 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs stack_size = PAGE_SIZE * 2; down_write(&current->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, - 0, - stack_size, + current->mm->start_brk = do_mmap(NULL, 0, stack_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0); - if (IS_ERR((void *) current->mm->start_brk)) { + if (IS_ERR_VALUE(current->mm->start_brk)) { up_write(&current->mm->mmap_sem); retval = current->mm->start_brk; current->mm->start_brk = 0; goto error_kill; } - if (do_mremap(current->mm->start_brk, - stack_size, - ksize((char *) current->mm->start_brk), - 0, 0 - ) == current->mm->start_brk - ) - stack_size = ksize((char *) current->mm->start_brk); + /* expand the stack mapping to use up the entire allocation granule */ + fullsize = ksize((char *) current->mm->start_brk); + if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size, + fullsize, 0, 0))) + stack_size = fullsize; up_write(&current->mm->mmap_sem); current->mm->brk = current->mm->start_brk; current->mm->context.end_brk = current->mm->start_brk; - current->mm->context.end_brk += (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0; + current->mm->context.end_brk += + (stack_size > PAGE_SIZE) ? 
(stack_size - PAGE_SIZE) : 0; current->mm->start_stack = current->mm->start_brk + stack_size; #endif compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; - if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) + if (create_elf_fdpic_tables(bprm, current->mm, + &exec_params, &interp_params) < 0) goto error_kill; - kdebug("- start_code %lx", (long) current->mm->start_code); - kdebug("- end_code %lx", (long) current->mm->end_code); - kdebug("- start_data %lx", (long) current->mm->start_data); - kdebug("- end_data %lx", (long) current->mm->end_data); - kdebug("- start_brk %lx", (long) current->mm->start_brk); - kdebug("- brk %lx", (long) current->mm->brk); - kdebug("- start_stack %lx", (long) current->mm->start_stack); + kdebug("- start_code %lx", current->mm->start_code); + kdebug("- end_code %lx", current->mm->end_code); + kdebug("- start_data %lx", current->mm->start_data); + kdebug("- end_data %lx", current->mm->end_data); + kdebug("- start_brk %lx", current->mm->start_brk); + kdebug("- brk %lx", current->mm->brk); + kdebug("- start_stack %lx", current->mm->start_stack); #ifdef ELF_FDPIC_PLAT_INIT /* @@ -385,21 +398,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs * example. This macro performs whatever initialization to * the regs structure is required. */ - ELF_FDPIC_PLAT_INIT(regs, - exec_params.map_addr, - interp_params.map_addr, - interp_params.dynamic_addr ?: exec_params.dynamic_addr - ); + dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr; + ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr, + dynaddr); #endif /* everything is now ready... 
get the userspace context ready to roll */ - start_thread(regs, - interp_params.entry_addr ?: exec_params.entry_addr, - current->mm->start_stack); + entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; + start_thread(regs, entryaddr, current->mm->start_stack); if (unlikely(current->ptrace & PT_PTRACED)) { if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); + ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); else send_sig(SIGTRAP, current, 0); } @@ -419,11 +429,11 @@ error: return retval; /* unrecoverable error - kill the process */ - error_kill: +error_kill: send_sig(SIGSEGV, current, 0); goto error; -} /* end load_elf_fdpic_binary() */ +} /*****************************************************************************/ /* @@ -471,11 +481,11 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, #if defined(__i386__) && defined(CONFIG_SMP) /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions - * by the processes running on the same package. One thing we can do - * is to shuffle the initial stack for them. + * by the processes running on the same package. One thing we can do is + * to shuffle the initial stack for them. * - * the conditionals here are unneeded, but kept in to make the - * code behaviour the same as pre change unless we have hyperthreaded + * the conditionals here are unneeded, but kept in to make the code + * behaviour the same as pre change unless we have hyperthreaded * processors. 
This keeps Mr Marcelo Person happier but should be * removed for 2.5 */ @@ -498,11 +508,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, if (interp_params->loadmap) { len = sizeof(struct elf32_fdpic_loadmap); - len += sizeof(struct elf32_fdpic_loadseg) * interp_params->loadmap->nsegs; + len += sizeof(struct elf32_fdpic_loadseg) * + interp_params->loadmap->nsegs; sp = (sp - len) & ~7UL; interp_params->map_addr = sp; - if (copy_to_user((void __user *) sp, interp_params->loadmap, len) != 0) + if (copy_to_user((void __user *) sp, interp_params->loadmap, + len) != 0) return -EFAULT; current->mm->context.interp_fdpic_loadmap = (unsigned long) sp; @@ -526,34 +538,37 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp -= sp & 15UL; /* put the ELF interpreter info on the stack */ -#define NEW_AUX_ENT(nr, id, val) \ - do { \ - struct { unsigned long _id, _val; } __user *ent = (void __user *) csp; \ - __put_user((id), &ent[nr]._id); \ - __put_user((val), &ent[nr]._val); \ +#define NEW_AUX_ENT(nr, id, val) \ + do { \ + struct { unsigned long _id, _val; } __user *ent; \ + \ + ent = (void __user *) csp; \ + __put_user((id), &ent[nr]._id); \ + __put_user((val), &ent[nr]._val); \ } while (0) csp -= 2 * sizeof(unsigned long); NEW_AUX_ENT(0, AT_NULL, 0); if (k_platform) { csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(0, AT_PLATFORM, (elf_addr_t)(unsigned long) u_platform); + NEW_AUX_ENT(0, AT_PLATFORM, + (elf_addr_t) (unsigned long) u_platform); } csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); - NEW_AUX_ENT( 0, AT_HWCAP, hwcap); - NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); - NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); - NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); - NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); - NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); - NEW_AUX_ENT( 7, AT_FLAGS, 0); - NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT( 9, AT_UID, 
(elf_addr_t) current->uid); - NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); - NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); - NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); + NEW_AUX_ENT( 0, AT_HWCAP, hwcap); + NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); + NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); + NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); + NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); + NEW_AUX_ENT( 7, AT_FLAGS, 0); + NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); + NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid); + NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); + NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); + NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); #ifdef ARCH_DLINFO /* ARCH_DLINFO must come last so platform specific code can enforce @@ -579,7 +594,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, #ifdef CONFIG_MMU current->mm->arg_start = bprm->p; #else - current->mm->arg_start = current->mm->start_stack - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); + current->mm->arg_start = current->mm->start_stack - + (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); #endif p = (char __user *) current->mm->arg_start; @@ -607,7 +623,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, mm->start_stack = (unsigned long) sp; return 0; -} /* end create_elf_fdpic_tables() */ +} /*****************************************************************************/ /* @@ -615,7 +631,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, * the stack */ #ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *_sp) +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *_sp) { unsigned long index, stop, sp; char *src; @@ -636,9 +653,9 @@ static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, 
unsigned *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; - out: +out: return ret; -} /* end elf_fdpic_transfer_args_to_stack() */ +} #endif /*****************************************************************************/ @@ -713,17 +730,18 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, seg = loadmap->segs; for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { if (params->hdr.e_entry >= seg->p_vaddr && - params->hdr.e_entry < seg->p_vaddr + seg->p_memsz - ) { + params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) { params->entry_addr = - (params->hdr.e_entry - seg->p_vaddr) + seg->addr; + (params->hdr.e_entry - seg->p_vaddr) + + seg->addr; break; } } } /* determine where the program header table has wound up if mapped */ - stop = params->hdr.e_phoff + params->hdr.e_phnum * sizeof (struct elf_phdr); + stop = params->hdr.e_phoff; + stop += params->hdr.e_phnum * sizeof (struct elf_phdr); phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { @@ -737,9 +755,11 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, seg = loadmap->segs; for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { if (phdr->p_vaddr >= seg->p_vaddr && - phdr->p_vaddr + phdr->p_filesz <= seg->p_vaddr + seg->p_memsz - ) { - params->ph_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr + + phdr->p_vaddr + phdr->p_filesz <= + seg->p_vaddr + seg->p_memsz) { + params->ph_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr + params->hdr.e_phoff - phdr->p_offset; break; } @@ -756,18 +776,22 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, seg = loadmap->segs; for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { if (phdr->p_vaddr >= seg->p_vaddr && - phdr->p_vaddr + phdr->p_memsz <= seg->p_vaddr + seg->p_memsz - ) { - params->dynamic_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr; - - /* check the dynamic section contains at least one item, and that - * the last item is a NULL entry */ + phdr->p_vaddr + 
phdr->p_memsz <= + seg->p_vaddr + seg->p_memsz) { + params->dynamic_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr; + + /* check the dynamic section contains at least + * one item, and that the last item is a NULL + * entry */ if (phdr->p_memsz == 0 || phdr->p_memsz % sizeof(Elf32_Dyn) != 0) goto dynamic_error; tmp = phdr->p_memsz / sizeof(Elf32_Dyn); - if (((Elf32_Dyn *) params->dynamic_addr)[tmp - 1].d_tag != 0) + if (((Elf32_Dyn *) + params->dynamic_addr)[tmp - 1].d_tag != 0) goto dynamic_error; break; } @@ -776,8 +800,8 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, } /* now elide adjacent segments in the load map on MMU linux - * - on uClinux the holes between may actually be filled with system stuff or stuff from - * other processes + * - on uClinux the holes between may actually be filled with system + * stuff or stuff from other processes */ #ifdef CONFIG_MMU nloads = loadmap->nsegs; @@ -788,7 +812,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) { load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz); if (load_addr == (seg->addr & PAGE_MASK)) { - mseg->p_memsz += load_addr - (mseg->addr + mseg->p_memsz); + mseg->p_memsz += + load_addr - + (mseg->addr + mseg->p_memsz); mseg->p_memsz += seg->addr & ~PAGE_MASK; mseg->p_memsz += seg->p_memsz; loadmap->nsegs--; @@ -816,20 +842,21 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, return 0; - dynamic_error: +dynamic_error: printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", what, file->f_dentry->d_inode->i_ino); return -ELIBBAD; -} /* end elf_fdpic_map_file() */ +} /*****************************************************************************/ /* * map a file with constant displacement under uClinux */ #ifndef CONFIG_MMU -static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm) +static int 
elf_fdpic_map_file_constdisp_on_uclinux( + struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm) { struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; @@ -840,7 +867,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para load_addr = params->load_addr; seg = params->loadmap->segs; - /* determine the bounds of the contiguous overall allocation we must make */ + /* determine the bounds of the contiguous overall allocation we must + * make */ phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { if (params->phdrs[loop].p_type != PT_LOAD) @@ -861,7 +889,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para maddr = do_mmap(NULL, load_addr, top - base, PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); up_write(&mm->mmap_sem); - if (IS_ERR((void *) maddr)) + if (IS_ERR_VALUE(maddr)) return (int) maddr; if (load_addr != 0) @@ -879,7 +907,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para seg->p_vaddr = phdr->p_vaddr; seg->p_memsz = phdr->p_memsz; - ret = file->f_op->read(file, (void *) seg->addr, phdr->p_filesz, &fpos); + ret = file->f_op->read(file, (void *) seg->addr, + phdr->p_filesz, &fpos); if (ret < 0) return ret; @@ -896,8 +925,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para if (phdr->p_flags & PF_X) { mm->start_code = seg->addr; mm->end_code = seg->addr + phdr->p_memsz; - } - else if (!mm->start_data) { + } else if (!mm->start_data) { mm->start_data = seg->addr; #ifndef CONFIG_MMU mm->end_data = seg->addr + phdr->p_memsz; @@ -914,7 +942,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para } return 0; -} /* end elf_fdpic_map_file_constdisp_on_uclinux() */ +} #endif /*****************************************************************************/ @@ -975,14 +1003,14 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, 
case ELF_FDPIC_FLAG_CONSTDISP: /* constant displacement - * - can be mapped anywhere, but must be mapped as a unit + * - can be mapped anywhere, but must be mapped as a + * unit */ if (!dvset) { maddr = load_addr; delta_vaddr = phdr->p_vaddr; dvset = 1; - } - else { + } else { maddr = load_addr + phdr->p_vaddr - delta_vaddr; flags |= MAP_FIXED; } @@ -1006,13 +1034,14 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, up_write(&mm->mmap_sem); kdebug("mmap[%d] sz=%lx pr=%x fl=%x of=%lx --> %08lx", - loop, phdr->p_memsz + disp, prot, flags, phdr->p_offset - disp, - maddr); + loop, phdr->p_memsz + disp, prot, flags, + phdr->p_offset - disp, maddr); - if (IS_ERR((void *) maddr)) + if (IS_ERR_VALUE(maddr)) return (int) maddr; - if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == ELF_FDPIC_FLAG_CONTIGUOUS) + if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == + ELF_FDPIC_FLAG_CONTIGUOUS) load_addr += PAGE_ALIGN(phdr->p_memsz + disp); seg->addr = maddr + disp; @@ -1023,7 +1052,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_offset == 0) params->elfhdr_addr = seg->addr; - /* clear the bit between beginning of mapping and beginning of PT_LOAD */ + /* clear the bit between beginning of mapping and beginning of + * PT_LOAD */ if (prot & PROT_WRITE && disp > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); clear_user((void __user *) maddr, disp); @@ -1039,19 +1069,20 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); #ifdef CONFIG_MMU - if (excess > excess1) { unsigned long xaddr = maddr + phdr->p_filesz + excess1; unsigned long xmaddr; flags |= MAP_FIXED | MAP_ANONYMOUS; down_write(&mm->mmap_sem); - xmaddr = do_mmap(NULL, xaddr, excess - excess1, prot, flags, 0); + xmaddr = do_mmap(NULL, xaddr, excess - excess1, + prot, flags, 0); up_write(&mm->mmap_sem); kdebug("mmap[%d] " " ad=%lx sz=%lx pr=%x 
fl=%x of=0 --> %08lx", - loop, xaddr, excess - excess1, prot, flags, xmaddr); + loop, xaddr, excess - excess1, prot, flags, + xmaddr); if (xmaddr != xaddr) return -ENOMEM; @@ -1060,7 +1091,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (prot & PROT_WRITE && excess1 > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess1); - clear_user((void __user *) maddr + phdr->p_filesz, excess1); + clear_user((void __user *) maddr + phdr->p_filesz, + excess1); } #else @@ -1075,8 +1107,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_flags & PF_X) { mm->start_code = maddr; mm->end_code = maddr + phdr->p_memsz; - } - else if (!mm->start_data) { + } else if (!mm->start_data) { mm->start_data = maddr; mm->end_data = maddr + phdr->p_memsz; } @@ -1086,4 +1117,4 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, } return 0; -} /* end elf_fdpic_map_file_by_direct_mmap() */ +} -- cgit v1.2.2 From b4cac1a0227a6f84be0381cd350a3c8730a4a671 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:54 -0700 Subject: [PATCH] FDPIC: Move roundup() into linux/kernel.h Move the roundup() macro from binfmt_elf.c into linux/kernel.h as it's generally useful. 
[akpm@osdl.org: nuke all the other implementations] Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 -- fs/proc/kcore.c | 2 -- fs/xfs/linux-2.6/xfs_linux.h | 1 - 3 files changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f42e64210e..672a3b90bc 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1185,8 +1185,6 @@ static int maydump(struct vm_area_struct *vma) return 1; } -#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) - /* An ELF note in memory */ struct memelfnote { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 036d14d836..8d6d85d740 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -42,8 +42,6 @@ const struct file_operations proc_kcore_operations = { #define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) #endif -#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) - /* An ELF note in memory */ struct memelfnote { diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 8c021dc57d..a13f75c1a9 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -215,7 +215,6 @@ BUFFER_FNS(PrivateStart, unwritten); #define MIN(a,b) (min(a,b)) #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) -#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* * Various platform dependent calls that don't fit anywhere else -- cgit v1.2.2 From 6d8c4e3b0150ff537902477ed62f8a8e9e70007b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:55 -0700 Subject: [PATCH] FDPIC: Add coredump capability for the ELF-FDPIC binfmt Add coredump capability for the ELF-FDPIC binfmt. 
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 676 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 674 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index a4ff873898..2f33658292 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -24,7 +24,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -48,6 +50,12 @@ typedef char *elf_caddr_t; #define kdebug(fmt, ...) do {} while(0) #endif +#if 0 +#define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) +#else +#define kdcore(fmt, ...) do {} while(0) +#endif + MODULE_LICENSE("GPL"); static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); @@ -70,10 +78,16 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, struct file *, struct mm_struct *); +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) +static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *); +#endif + static struct linux_binfmt elf_fdpic_format = { .module = THIS_MODULE, .load_binary = load_elf_fdpic_binary, -// .core_dump = elf_fdpic_core_dump, +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) + .core_dump = elf_fdpic_core_dump, +#endif .min_coredump = ELF_EXEC_PAGESIZE, }; @@ -87,7 +101,7 @@ static void __exit exit_elf_fdpic_binfmt(void) unregister_binfmt(&elf_fdpic_format); } -module_init(init_elf_fdpic_binfmt); +core_initcall(init_elf_fdpic_binfmt); module_exit(exit_elf_fdpic_binfmt); static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) @@ -1118,3 +1132,661 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, return 0; } + +/*****************************************************************************/ +/* + * ELF-FDPIC core dumper + * + * Modelled on 
fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge + * + * Modelled on fs/binfmt_elf.c core dumper + */ +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) + +/* + * These are the only things you should do on a core-file: use only these + * functions to write out all the necessary info. + */ +static int dump_write(struct file *file, const void *addr, int nr) +{ + return file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} + +static int dump_seek(struct file *file, loff_t off) +{ + if (file->f_op->llseek) { + if (file->f_op->llseek(file, off, SEEK_SET) != off) + return 0; + } else { + file->f_pos = off; + } + return 1; +} + +/* + * Decide whether a segment is worth dumping; default is yes to be + * sure (missing info is worse than too much; etc). + * Personally I'd include everything, and use the coredump limit... + * + * I think we should skip something. But I am not sure how. H.J. + */ +static int maydump(struct vm_area_struct *vma) +{ + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* If we may not read the contents, don't allow us to dump + * them either. "dump_write()" can't handle it anyway. + */ + if (!(vma->vm_flags & VM_READ)) { + kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* Dump shared memory only if mapped from an anonymous file. 
*/ + if (vma->vm_flags & VM_SHARED) { + if (vma->vm_file->f_dentry->d_inode->i_nlink == 0) { + kdcore("%08lx: %08lx: no (share)", vma->vm_start, vma->vm_flags); + return 1; + } + + kdcore("%08lx: %08lx: no (share)", vma->vm_start, vma->vm_flags); + return 0; + } + +#ifdef CONFIG_MMU + /* If it hasn't been written to, don't write it out */ + if (!vma->anon_vma) { + kdcore("%08lx: %08lx: no (!anon)", vma->vm_start, vma->vm_flags); + return 0; + } +#endif + + kdcore("%08lx: %08lx: yes", vma->vm_start, vma->vm_flags); + return 1; +} + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup(strlen(en->name) + 1, 4); + sz += roundup(en->datasz, 4); + + return sz; +} + +/* #define DEBUG */ + +#define DUMP_WRITE(addr, nr) \ + do { if (!dump_write(file, (addr), (nr))) return 0; } while(0) +#define DUMP_SEEK(off) \ + do { if (!dump_seek(file, (off))) return 0; } while(0) + +static int writenote(struct memelfnote *men, struct file *file) +{ + struct elf_note en; + + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + DUMP_WRITE(&en, sizeof(en)); + DUMP_WRITE(men->name, en.n_namesz); + /* XXX - cast from long long to long to avoid need for libgcc.a */ + DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ + DUMP_WRITE(men->data, men->datasz); + DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ + + return 1; +} +#undef DUMP_WRITE +#undef DUMP_SEEK + +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ + goto end_coredump; +#define DUMP_SEEK(off) \ + if (!dump_seek(file, (off))) \ + goto end_coredump; + +static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) +{ + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + 
elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_FDPIC_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + return; +} + +static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static inline void fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except + * registers which need to be filled up seperately. + */ +static void fill_prstatus(struct elf_prstatus *prstatus, + struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + prstatus->pr_pid = p->pid; + prstatus->pr_ppid = p->parent->pid; + prstatus->pr_pgrp = process_group(p); + prstatus->pr_sid = p->signal->session; + if (thread_group_leader(p)) { + /* + * This is the record for the group leader. Add in the + * cumulative times of previous dead threads. This total + * won't include the time of each live thread whose state + * is included in the core dump. 
The final total reported + * to our parent process when it calls wait4 will include + * those sums as well as the little bit more time it takes + * this and each other thread to finish dying after the + * core dump synchronization phase. + */ + cputime_to_timeval(cputime_add(p->utime, p->signal->utime), + &prstatus->pr_utime); + cputime_to_timeval(cputime_add(p->stime, p->signal->stime), + &prstatus->pr_stime); + } else { + cputime_to_timeval(p->utime, &prstatus->pr_utime); + cputime_to_timeval(p->stime, &prstatus->pr_stime); + } + cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + + prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; + prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; +} + +static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, + struct mm_struct *mm) +{ + unsigned int i, len; + + /* first copy the parameters from user space */ + memset(psinfo, 0, sizeof(struct elf_prpsinfo)); + + len = mm->arg_end - mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ - 1; + if (copy_from_user(&psinfo->pr_psargs, + (const char __user *) mm->arg_start, len)) + return -EFAULT; + for (i = 0; i < len; i++) + if (psinfo->pr_psargs[i] == 0) + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + + psinfo->pr_pid = p->pid; + psinfo->pr_ppid = p->parent->pid; + psinfo->pr_pgrp = process_group(p); + psinfo->pr_sid = p->signal->session; + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + SET_UID(psinfo->pr_uid, p->uid); + SET_GID(psinfo->pr_gid, p->gid); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + + return 0; +} + +/* Here is the structure in which status of each thread is captured. 
*/ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ + struct task_struct *thread; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* NT_PRXFPREG */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every thread's pr_status and then create + * a single section for them in the final core file. + */ +static int elf_dump_thread_status(long signr, struct elf_thread_status *t) +{ + struct task_struct *p = t->thread; + int sz = 0; + + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &t->prstatus); + t->num_notes++; + sz += notesize(&t->notes[0]); + + t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu); + if (t->prstatus.pr_fpvalid) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + &t->fpu); + t->num_notes++; + sz += notesize(&t->notes[1]); + } + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu), + &t->xfpu); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + return sz; +} + +/* + * dump the segments for an MMU process + */ +#ifdef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, struct mm_struct *mm, + size_t *size, unsigned long *limit) +{ + struct vm_area_struct *vma; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + unsigned long addr; + + if (!maydump(vma)) + continue; + + for (addr = vma->vm_start; + addr < vma->vm_end; + addr += PAGE_SIZE + ) { + struct vm_area_struct *vma; + struct page *page; + + if (get_user_pages(current, current->mm, addr, 1, 0, 1, + &page, &vma) <= 0) { + DUMP_SEEK(file->f_pos + PAGE_SIZE); + } + else if (page == 
ZERO_PAGE(addr)) { + DUMP_SEEK(file->f_pos + PAGE_SIZE); + page_cache_release(page); + } + else { + void *kaddr; + + flush_cache_page(vma, addr, page_to_pfn(page)); + kaddr = kmap(page); + if ((*size += PAGE_SIZE) > *limit || + !dump_write(file, kaddr, PAGE_SIZE) + ) { + kunmap(page); + page_cache_release(page); + return -EIO; + } + kunmap(page); + page_cache_release(page); + } + } + } + + return 0; + +end_coredump: + return -EFBIG; +} +#endif + +/* + * dump the segments for a NOMMU process + */ +#ifndef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, struct mm_struct *mm, + size_t *size, unsigned long *limit) +{ + struct vm_list_struct *vml; + + for (vml = current->mm->context.vmlist; vml; vml = vml->next) { + struct vm_area_struct *vma = vml->vma; + + if (!maydump(vma)) + continue; + + if ((*size += PAGE_SIZE) > *limit) + return -EFBIG; + + if (!dump_write(file, (void *) vma->vm_start, + vma->vm_end - vma->vm_start)) + return -EIO; + } + + return 0; +} +#endif + +/* + * Actual dumper + * + * This is a two-pass process; first we find the offsets of the bits, + * and then they are actually written out. If we run out of core limit + * we just truncate. 
+ */ +static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, + struct file *file) +{ +#define NUM_NOTES 6 + int has_dumped = 0; + mm_segment_t fs; + int segs; + size_t size = 0; + int i; + struct vm_area_struct *vma; + struct elfhdr *elf = NULL; + loff_t offset = 0, dataoff; + unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; + int numnote; + struct memelfnote *notes = NULL; + struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ + struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ + struct task_struct *g, *p; + LIST_HEAD(thread_list); + struct list_head *t; + elf_fpregset_t *fpu = NULL; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t *xfpu = NULL; +#endif + int thread_status_size = 0; +#ifndef CONFIG_MMU + struct vm_list_struct *vml; +#endif + elf_addr_t *auxv; + + /* + * We no longer stop all VM operations. + * + * This is because those proceses that could possibly change map_count + * or the mmap / vma pages are now blocked in do_exit on current + * finishing this core dump. + * + * Only ptrace can touch these memory addresses, but it doesn't change + * the map_count or the pages allocated. So no possibility of crashing + * exists while dumping the mm->vm_next areas to the core file. 
+ */ + + /* alloc memory for large data structures: too large to be on stack */ + elf = kmalloc(sizeof(*elf), GFP_KERNEL); + if (!elf) + goto cleanup; + prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); + if (!prstatus) + goto cleanup; + psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); + if (!psinfo) + goto cleanup; + notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL); + if (!notes) + goto cleanup; + fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); + if (!fpu) + goto cleanup; +#ifdef ELF_CORE_COPY_XFPREGS + xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); + if (!xfpu) + goto cleanup; +#endif + + if (signr) { + struct elf_thread_status *tmp; + read_lock(&tasklist_lock); + do_each_thread(g,p) + if (current->mm == p->mm && current != p) { + tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); + if (!tmp) { + read_unlock(&tasklist_lock); + goto cleanup; + } + INIT_LIST_HEAD(&tmp->list); + tmp->thread = p; + list_add(&tmp->list, &thread_list); + } + while_each_thread(g,p); + read_unlock(&tasklist_lock); + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp; + int sz; + + tmp = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(signr, tmp); + thread_status_size += sz; + } + } + + /* now collect the dump for the current */ + fill_prstatus(prstatus, current, signr); + elf_core_copy_regs(&prstatus->pr_reg, regs); + +#ifdef CONFIG_MMU + segs = current->mm->map_count; +#else + segs = 0; + for (vml = current->mm->context.vmlist; vml; vml = vml->next) + segs++; +#endif +#ifdef ELF_CORE_EXTRA_PHDRS + segs += ELF_CORE_EXTRA_PHDRS; +#endif + + /* Set up header */ + fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ + + has_dumped = 1; + current->flags |= PF_DUMPCORE; + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. 
+ */ + + fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); + fill_psinfo(psinfo, current->group_leader, current->mm); + fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + + numnote = 2; + + auxv = (elf_addr_t *) current->mm->saved_auxv; + + i = 0; + do + i += 2; + while (auxv[i - 2] != AT_NULL); + fill_note(&notes[numnote++], "CORE", NT_AUXV, + i * sizeof(elf_addr_t), auxv); + + /* Try to dump the FPU. */ + if ((prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, regs, fpu))) + fill_note(notes + numnote++, + "CORE", NT_PRFPREG, sizeof(*fpu), fpu); +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, xfpu)) + fill_note(notes + numnote++, + "LINUX", NT_PRXFPREG, sizeof(*xfpu), xfpu); +#endif + + fs = get_fs(); + set_fs(KERNEL_DS); + + DUMP_WRITE(elf, sizeof(*elf)); + offset += sizeof(*elf); /* Elf header */ + offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ + + /* Write notes phdr entry */ + { + struct elf_phdr phdr; + int sz = 0; + + for (i = 0; i < numnote; i++) + sz += notesize(notes + i); + + sz += thread_status_size; + + fill_elf_note_phdr(&phdr, sz, offset); + offset += sz; + DUMP_WRITE(&phdr, sizeof(phdr)); + } + + /* Page-align dumped data */ + dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); + + /* write program headers for segments dump */ + for ( +#ifdef CONFIG_MMU + vma = current->mm->mmap; vma; vma = vma->vm_next +#else + vml = current->mm->context.vmlist; vml; vml = vml->next +#endif + ) { + struct elf_phdr phdr; + size_t sz; + +#ifndef CONFIG_MMU + vma = vml->vma; +#endif + + sz = vma->vm_end - vma->vm_start; + + phdr.p_type = PT_LOAD; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = maydump(vma) ? sz : 0; + phdr.p_memsz = sz; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; + if (vma->vm_flags & VM_WRITE) + phdr.p_flags |= PF_W; + if (vma->vm_flags & VM_EXEC) + phdr.p_flags |= PF_X; + phdr.p_align = ELF_EXEC_PAGESIZE; + + DUMP_WRITE(&phdr, sizeof(phdr)); + } + +#ifdef ELF_CORE_WRITE_EXTRA_PHDRS + ELF_CORE_WRITE_EXTRA_PHDRS; +#endif + + /* write out the notes section */ + for (i = 0; i < numnote; i++) + if (!writenote(notes + i, file)) + goto end_coredump; + + /* write out the thread status notes section */ + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp = + list_entry(t, struct elf_thread_status, list); + + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], file)) + goto end_coredump; + } + + DUMP_SEEK(dataoff); + + if (elf_fdpic_dump_segments(file, current->mm, &size, &limit) < 0) + goto end_coredump; + +#ifdef ELF_CORE_WRITE_EXTRA_DATA + ELF_CORE_WRITE_EXTRA_DATA; +#endif + + if (file->f_pos != offset) { + /* Sanity check */ + printk(KERN_WARNING + "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", + file->f_pos, offset); + } + +end_coredump: + set_fs(fs); + +cleanup: + while (!list_empty(&thread_list)) { + struct list_head *tmp = thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + + kfree(elf); + kfree(prstatus); + kfree(psinfo); + kfree(notes); + kfree(fpu); +#ifdef ELF_CORE_COPY_XFPREGS + kfree(xfpu); +#endif + return has_dumped; +#undef NUM_NOTES +} + +#endif /* USE_ELF_CORE_DUMP */ -- cgit v1.2.2 From 92eb7a2f28d551acedeb5752263267a64b1f5ddf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jul 2006 04:45:31 -0700 Subject: [PATCH] fix weird logic in alloc_fdtable() There's a fairly obvious infinite loop in there. Also, use roundup_pow_of_two() rather than open-coding stuff. 
Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 55f4e70225..3f35608606 100644 --- a/fs/file.c +++ b/fs/file.c @@ -240,13 +240,9 @@ static struct fdtable *alloc_fdtable(int nr) if (!fdt) goto out; - nfds = 8 * L1_CACHE_BYTES; - /* Expand to the max in easy steps */ - while (nfds <= nr) { - nfds = nfds * 2; - if (nfds > NR_OPEN) - nfds = NR_OPEN; - } + nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nfds)); + if (nfds > NR_OPEN) + nfds = NR_OPEN; new_openset = alloc_fdset(nfds); new_execset = alloc_fdset(nfds); -- cgit v1.2.2 From 36cf96f5e7c098731a1ad9d79694d6f591b18e7f Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 10 Jul 2006 04:45:33 -0700 Subject: [PATCH] Remove leftover ext3 acl declarations These functions no longer exist; remove their declarations. Signed-off-by: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/acl.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 92d50b53a9..0d1e6279cb 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -62,9 +62,6 @@ extern int ext3_permission (struct inode *, int, struct nameidata *); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); -extern int init_ext3_acl(void); -extern void exit_ext3_acl(void); - #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include #define ext3_permission NULL -- cgit v1.2.2 From e2b209509ca33743864846aef2e1b2afc21f7915 Mon Sep 17 00:00:00 2001 From: Shankar Anand Date: Mon, 10 Jul 2006 04:45:44 -0700 Subject: [PATCH] knfsd: nfsd4: add per-operation server stats Add an nfs4 operations count array to nfsd_stats structure. The count is incremented in nfsd4_proc_compound() where all the operations are handled by the nfsv4 server. 
This count of individual nfsv4 operations is also entered into /proc filesystem. Signed-off-by: Shankar Anand Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4proc.c | 8 ++++++++ fs/nfsd/stats.c | 10 ++++++++++ 2 files changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b0e095ea0c..ee4eff27ae 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -721,6 +721,12 @@ nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) return nfs_ok; } +static inline void nfsd4_increment_op_stats(u32 opnum) +{ + if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) + nfsdstats.nfs4_opcount[opnum]++; +} + /* * COMPOUND call. @@ -930,6 +936,8 @@ encode_op: /* XXX Ugh, we need to get rid of this kind of special case: */ if (op->opnum == OP_READ && op->u.read.rd_filp) fput(op->u.read.rd_filp); + + nfsd4_increment_op_stats(op->opnum); } out: diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 57265d5638..71944cddf6 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -72,6 +72,16 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) /* show my rpc info */ svc_seq_show(seq, &nfsd_svcstats); +#ifdef CONFIG_NFSD_V4 + /* Show count for individual nfsv4 operations */ + /* Writing operation numbers 0 1 2 also for maintaining uniformity */ + seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); + for (i = 0; i <= LAST_NFS4_OP; i++) + seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + + seq_putc(seq, '\n'); +#endif + return 0; } -- cgit v1.2.2 From d579091b4385e9386e244622d593fe064aa8e8e7 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Wed, 12 Jul 2006 09:03:05 -0700 Subject: [PATCH] fix fdset leakage When found, it is obvious. nfds calculated when allocating fdsets is rewritten by calculation of size of fdtable, and when we are unlucky, we try to free fdsets of wrong size. Found due to OpenVZ resource management (User Beancounters). 
Signed-off-by: Alexey Kuznetsov Signed-off-by: Kirill Korotaev Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 3f35608606..c8f1b0af8e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -273,11 +273,13 @@ static struct fdtable *alloc_fdtable(int nr) } while (nfds <= nr); new_fds = alloc_fd_array(nfds); if (!new_fds) - goto out; + goto out2; fdt->fd = new_fds; fdt->max_fds = nfds; fdt->free_files = NULL; return fdt; +out2: + nfds = fdt->max_fdset; out: if (new_openset) free_fdset(new_openset, nfds); -- cgit v1.2.2 From 232ba9dbd68bb084d5d90c511f207d18eae614da Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 12 Jul 2006 09:03:06 -0700 Subject: [PATCH] lockdep: annotate the sysfs i_mutex to be a separate class sysfs has a different i_mutex lock order behavior for i_mutex than the other filesystems; sysfs i_mutex is called in many places with subsystem locks held. At the same time, many of the VFS locking rules do not apply to sysfs at all (cross directory rename for example). To untangle this mess (which gives false positives in lockdep), we're giving sysfs inodes their own class for i_mutex. Signed-off-by: Arjan van de Ven Cc: Ingo Molnar Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sysfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 5e0e31cc46..9889e54e1f 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -109,6 +109,17 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } + +/* + * sysfs has a different i_mutex lock order behavior for i_mutex than other + * filesystems; sysfs i_mutex is called in many places with subsystem locks + * held. 
At the same time, many of the VFS locking rules do not apply to + * sysfs at all (cross directory rename for example). To untangle this mess + * (which gives false positives in lockdep), we're giving sysfs inodes their + * own class for i_mutex. + */ +static struct lock_class_key sysfs_inode_imutex_key; + struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) { struct inode * inode = new_inode(sysfs_sb); @@ -118,6 +129,7 @@ struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; + lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); if (sd->s_iattr) { /* sysfs_dirent has non-default attributes -- cgit v1.2.2 From 0635170b544b01b46a81b4ac5cff5020ab59d1fc Mon Sep 17 00:00:00 2001 From: "Adam B. Jerome" Date: Wed, 12 Jul 2006 09:03:07 -0700 Subject: [PATCH] /fs/proc/: 'larger than buffer size' memory accessed by clear_user() Address a potential 'larger than buffer size' memory access by clear_user(). Without this patch, this call to clear_user() can attempt to clear too many (tsz) bytes resulting in a wrong (-EFAULT) return code by read_kcore(). Signed-off-by: Adam B. Jerome Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8d6d85d740..6a984f64ed 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -382,7 +382,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) */ if (n) { if (clear_user(buffer + tsz - n, - tsz - n)) + n)) return -EFAULT; } } else { -- cgit v1.2.2 From a29b0b74e73b66674d20a170e463fe9032f2272a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Jul 2006 09:03:08 -0700 Subject: [PATCH] alloc_fdtable() expansion fix We're supposed to go the next power of two if nfds==nr. 
Of `nr', not of `nfds'. Spotted by Rene Scharfe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index c8f1b0af8e..b3c6b82e6a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -240,7 +240,7 @@ static struct fdtable *alloc_fdtable(int nr) if (!fdt) goto out; - nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nfds)); + nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nr + 1)); if (nfds > NR_OPEN) nfds = NR_OPEN; -- cgit v1.2.2 From 18b0bbd8ca6d3cb90425aa0d77b99a762c6d6de3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 14 Jul 2006 16:51:34 -0700 Subject: Fix nasty /proc vulnerability We have a bad interaction with both the kernel and user space being able to change some of the /proc file status. This fixes the most obvious part of it, but I expect we'll also make it harder for users to modify even their "own" files in /proc. Signed-off-by: Linus Torvalds --- fs/proc/base.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 243a94af04..0cb8f20d00 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1338,6 +1338,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) } else { inode->i_uid = 0; inode->i_gid = 0; + inode->i_mode = 0; } security_task_to_inode(task, inode); put_task_struct(task); -- cgit v1.2.2 From 9ee8ab9fbf21e6b87ad227cd46c0a4be41ab749b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 14 Jul 2006 21:48:03 -0700 Subject: Relax /proc fix a bit Clearing all of i_mode was a bit draconian. We only really care about S_ISUID/ISGID, after all.
Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0cb8f20d00..474eae3450 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1338,8 +1338,8 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) } else { inode->i_uid = 0; inode->i_gid = 0; - inode->i_mode = 0; } + inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); put_task_struct(task); return 1; @@ -1390,6 +1390,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_uid = 0; inode->i_gid = 0; } + inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); put_task_struct(task); return 1; -- cgit v1.2.2 From de45921535bfc3b1f63b426c2a9739635f864283 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Fri, 14 Jul 2006 00:23:49 -0700 Subject: [PATCH] struct file leakage 2.6.16 leaks like hell. While testing, I found massive leakage (reproduced in openvz) in: *filp *size-4096 And 1 object leaks in *size-32 *size-64 *size-128 It is the fix for the first one. filp leaks in the bowels of namei.c. Seems, size-4096 is file table leaking in expand_fdtables. I have no idea what are the rest and why they show only accompanying another leaks. Some debugging structs? [akpm@osdl.org, Trond: remove the IS_ERR() check] Signed-off-by: Alexey Kuznetsov Cc: Kirill Korotaev Cc: Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index c9750d755a..e01070d7bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1712,8 +1712,14 @@ do_link: if (error) goto exit_dput; error = __do_follow_link(&path, nd); - if (error) + if (error) { + /* Does someone understand code flow here? Or it is only + * me so stupid? Anathema to whoever designed this non-sense + * with "intent.open". 
+ */ + release_open_intent(nd); return error; + } nd->flags &= ~LOOKUP_PARENT; if (nd->last_type == LAST_BIND) goto ok; -- cgit v1.2.2 From 6fbe82a952790c634ea6035c223a01a81377daf1 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 14 Jul 2006 00:24:22 -0700 Subject: [PATCH] reiserfs: fix handling of device names with /'s in them On systems with block devices containing a slash (virtual dasd, cciss, etc), reiserfs will fail to initialize /proc/fs/reiserfs/ due to it being interpreted as a subdirectory. The generic block device code changes the / to ! for use in the sysfs tree. This patch uses that convention. Tested by making dm devices use dm/ rather than dm- [akpm@osdl.org: name variables consistently] Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/procfs.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 5d8a8cfebc..c533ec1bca 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -492,9 +492,17 @@ static void add_file(struct super_block *sb, char *name, int reiserfs_proc_info_init(struct super_block *sb) { + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + spin_lock_init(&__PINFO(sb).lock); - REISERFS_SB(sb)->procdir = - proc_mkdir(reiserfs_bdevname(sb), proc_info_root); + REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); if (REISERFS_SB(sb)->procdir) { REISERFS_SB(sb)->procdir->owner = THIS_MODULE; REISERFS_SB(sb)->procdir->data = sb; @@ -508,13 +516,22 @@ int reiserfs_proc_info_init(struct super_block *sb) return 0; } reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s", - proc_info_root_name, reiserfs_bdevname(sb)); + proc_info_root_name, b); return 1; } int reiserfs_proc_info_done(struct super_block *sb) { struct proc_dir_entry *de = 
REISERFS_SB(sb)->procdir; + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + if (de) { remove_proc_entry("journal", de); remove_proc_entry("oidmap", de); @@ -528,7 +545,7 @@ int reiserfs_proc_info_done(struct super_block *sb) __PINFO(sb).exiting = 1; spin_unlock(&__PINFO(sb).lock); if (proc_info_root) { - remove_proc_entry(reiserfs_bdevname(sb), proc_info_root); + remove_proc_entry(b, proc_info_root); REISERFS_SB(sb)->procdir = NULL; } return 0; -- cgit v1.2.2 From d247e2c661f28a21e5f9a8d672e1e88a7c1c5d4a Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 14 Jul 2006 00:24:23 -0700 Subject: [PATCH] add function documentation for register_chrdev() Documentation for register_chrdev() was missing completely. [akpm@osdl.org: kerneldocification] Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/char_dev.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index a4cbc6706e..3483d3cf80 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -182,6 +182,28 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, return 0; } +/** + * register_chrdev() - Register a major number for character devices. + * @major: major device number or 0 for dynamic allocation + * @name: name of this range of devices + * @fops: file operations associated with this devices + * + * If @major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If @major > 0 this function will attempt to reserve a device with the given + * major number and will return zero on success. + * + * Returns a -ve errno on failure. + * + * The name of this device has nothing to do with the name of the device in + * /dev. It only helps to keep track of the different owners of devices. 
If + * your module name has only one type of devices it's ok to use e.g. the name + * of the module here. + * + * This function registers a range of 256 minor numbers. The first minor number + * is 0. + */ int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { -- cgit v1.2.2 From 25890454667b3295f67b3372352be90705f8667c Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:43 -0700 Subject: [PATCH] per-task-delay-accounting: /proc export of aggregated block I/O delays Export I/O delays seen by a task through /proc/<tgid>/stats for use in top etc. Note that delays for I/O done for swapping in pages (swapin I/O) is clubbed together with all other I/O here (this is not the case in the netlink interface where the swapin I/O is kept distinct) [akpm@osdl.org: printk warning fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 7495d3e207..0b615d62a1 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -411,7 +412,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %llu\n", task->pid, tcomm, state, @@ -455,7 +456,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) task->exit_signal, task_cpu(task), task->rt_priority, - task->policy); + task->policy, + (unsigned long long)delayacct_blkio_ticks(task)); if(mm) mmput(mm); return res; -- cgit v1.2.2 From
92d032855e64834283de5acfb0463232e0ab128e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Jul 2006 12:20:05 -0700 Subject: Mark /proc MS_NOSUID and MS_NOEXEC Not that we really need this any more, but at the same time there's no reason not to do this. Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6dcef089e1..49dfb2ab78 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -192,7 +192,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent) { struct inode * root_inode; - s->s_flags |= MS_NODIRATIME; + s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; -- cgit v1.2.2 From 6d76fa58b050044994fe25f8753b8023f2b36737 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Jul 2006 12:26:45 -0700 Subject: Don't allow chmod() on the /proc/<pid>/ files This just turns off chmod() on the /proc/<pid>/ files, since there is no good reason to allow it, and had we disallowed it originally, the nasty /proc race exploit wouldn't have been possible. The other patches already fixed the problem chmod() could cause, so this is really just some final mop-up.. This particular version is based off a patch by Eugene and Marcel which had much better naming than my original equivalent one.
Signed-off-by: Eugene Teo Signed-off-by: Marcel Holtmann Signed-off-by: Linus Torvalds --- fs/proc/base.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 474eae3450..fe8d55fb17 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -551,6 +551,27 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } +static int proc_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = dentry->d_inode; + + if (attr->ia_valid & ATTR_MODE) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (!error) { + error = security_inode_setattr(dentry, attr); + if (!error) + error = inode_setattr(inode, attr); + } + return error; +} + +static struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, +}; + extern struct seq_operations mounts_op; struct proc_mounts { struct seq_file m; @@ -1111,7 +1132,8 @@ out: static struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, }; static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) @@ -1285,6 +1307,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st ei = PROC_I(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); + inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. 
@@ -1529,11 +1552,13 @@ static struct file_operations proc_task_operations = { */ static struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, + .setattr = proc_setattr, }; static struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, + .setattr = proc_setattr, }; #ifdef CONFIG_SECURITY @@ -1847,11 +1872,13 @@ static struct file_operations proc_tid_base_operations = { static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; #ifdef CONFIG_SECURITY @@ -1894,11 +1921,13 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir, static struct inode_operations proc_tgid_attr_inode_operations = { .lookup = proc_tgid_attr_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; static struct inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; #endif -- cgit v1.2.2 From 2a293b7d5aa2f0d1e3d87b642f7ac263c2d664e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Jul 2006 17:04:26 +1000 Subject: [XFS] All xfs_disk_dquot_t values are (as the name says) disk endian. Before putting them into struct statfs they should be endian-swapped. SGI-PV: 954580 SGI-Modid: xfs-linux-melb:xfs-kern:26550a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/quota/xfs_qm_bhv.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index e95e99f716..f137856c32 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -217,17 +217,24 @@ xfs_qm_statvfs( return 0; dp = &dqp->q_core; - limit = dp->d_blk_softlimit ? 
dp->d_blk_softlimit : dp->d_blk_hardlimit; + limit = dp->d_blk_softlimit ? + be64_to_cpu(dp->d_blk_softlimit) : + be64_to_cpu(dp->d_blk_hardlimit); if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; - statp->f_bfree = (statp->f_blocks > dp->d_bcount) ? - (statp->f_blocks - dp->d_bcount) : 0; + statp->f_bfree = + (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? + (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; } - limit = dp->d_ino_softlimit ? dp->d_ino_softlimit : dp->d_ino_hardlimit; + + limit = dp->d_ino_softlimit ? + be64_to_cpu(dp->d_ino_softlimit) : + be64_to_cpu(dp->d_ino_hardlimit); if (limit && statp->f_files > limit) { statp->f_files = limit; - statp->f_ffree = (statp->f_files > dp->d_icount) ? - (statp->f_ffree - dp->d_icount) : 0; + statp->f_ffree = + (statp->f_files > be64_to_cpu(dp->d_icount)) ? + (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; } xfs_qm_dqput(dqp); -- cgit v1.2.2 From f5faad799475c4058416264f672bb33bf8b5ef41 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:04:44 +1000 Subject: [XFS] Fix remount vs no/barrier options by ensuring we clear unwanted flags from iclog buffers before submitting them for writing. 
SGI-PV: 954772 SGI-Modid: xfs-linux-melb:xfs-kern:26605a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_buf.h | 4 ++-- fs/xfs/xfs_log.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index ceda3a2859..7858703ed8 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -246,8 +246,8 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define BUF_BUSY XBF_DONT_BLOCK #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) -#define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI)) +#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ + ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) #define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e730328636..21ac1a67e3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1413,7 +1413,7 @@ xlog_sync(xlog_t *log, ops = iclog->ic_header.h_num_logops; INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops); - bp = iclog->ic_bp; + bp = iclog->ic_bp; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT))); @@ -1430,15 +1430,14 @@ xlog_sync(xlog_t *log, } XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ + XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); /* * Do an ordered write for the log block. - * - * It may not be needed to flush the first split block in the log wrap - * case, but do it anyways to be safe -AK + * Its unnecessary to flush the first split block in the log wrap case. 
*/ - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER)) XFS_BUF_ORDERED(bp); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); @@ -1460,7 +1459,7 @@ xlog_sync(xlog_t *log, return error; } if (split) { - bp = iclog->ic_log->l_xbuf; + bp = iclog->ic_log->l_xbuf; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); @@ -1468,6 +1467,7 @@ xlog_sync(xlog_t *log, XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ (__psint_t)count), split); XFS_BUF_SET_FSPRIVATE(bp, iclog); + XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) -- cgit v1.2.2 From b2ea401bac39e75ebb64038609ed22efbc799905 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:05:13 +1000 Subject: [XFS] Fix a barrier related forced shutdown on mounts with quota enabled. SGI-PV: 912426 SGI-Modid: xfs-linux-melb:xfs-kern:26622a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_super.c | 7 +++++++ fs/xfs/xfs_vfsops.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9bdef9d519..4754f342a5 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -314,6 +314,13 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) return; } + if (xfs_readonly_buftarg(mp->m_ddev_targp)) { + xfs_fs_cmn_err(CE_NOTE, mp, + "Disabling barriers, underlying device is readonly"); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; + } + error = xfs_barrier_test(mp); if (error) { xfs_fs_cmn_err(CE_NOTE, mp, diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 6c96391f3f..b427d220a1 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -515,7 +515,7 @@ xfs_mount( if (error) goto error2; - if ((mp->m_flags & XFS_MOUNT_BARRIER) && !(vfsp->vfs_flag & VFS_RDONLY)) + if (mp->m_flags & XFS_MOUNT_BARRIER) 
xfs_mountfs_check_barriers(mp); error = XFS_IOINIT(vfsp, args, flags); -- cgit v1.2.2 From 41ff715abc49324fb2cb20e66bc4e0290cfdbe51 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:05:51 +1000 Subject: [XFS] Ensure bulkstat from an invalid inode number gets caught always with EINVAL. SGI-PV: 953819 SGI-Modid: xfs-linux-melb:xfs-kern:26629a Signed-off-by: Nathan Scott --- fs/xfs/xfs_inode.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 86c1bf0bba..1f8ecff855 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -334,10 +334,9 @@ xfs_itobp( #if !defined(__KERNEL__) ni = 0; #elif defined(DEBUG) - ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : - (BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog); + ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; #else /* usual case */ - ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 1; + ni = 1; #endif for (i = 0; i < ni; i++) { @@ -348,11 +347,15 @@ xfs_itobp( (i << mp->m_sb.sb_inodelog)); di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (imap_flags & XFS_IMAP_BULKSTAT) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EINVAL); + } #ifdef DEBUG - if (!(imap_flags & XFS_IMAP_BULKSTAT)) - cmn_err(CE_ALERT, + cmn_err(CE_ALERT, "Device %s - bad inode magic/vsn " "daddr %lld #%d (magic=%x)", XFS_BUFTARG_NAME(mp->m_ddev_targp), -- cgit v1.2.2 From 2ccb48ebb4de139eef4fcefd5f2bb823cb0d81b9 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sun, 30 Jul 2006 03:03:01 -0700 Subject: [PATCH] ext3: avoid triggering ext3_error on bad NFS file handle The inode number out of an NFS file handle gets passed eventually to 
ext3_get_inode_block() without any checking. If ext3_get_inode_block() allows it to trigger an error, then bad filehandles can have unpleasant effect - ext3_error() will usually cause a forced read-only remount, or a panic if `errors=panic' was used. So remove the call to ext3_error there and put a matching check in ext3/namei.c where inode numbers are read off storage. [akpm@osdl.org: fix off-by-one error] Signed-off-by: Neil Brown Signed-off-by: Jan Kara Cc: Marcel Holtmann Cc: Cc: "Stephen C. Tweedie" Cc: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 13 +++++++------ fs/ext3/namei.c | 15 +++++++++++++-- 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index f804d5e9d6..ab034d3053 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2402,14 +2402,15 @@ static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, struct buffer_head *bh; struct ext3_group_desc * gdp; - - if ((ino != EXT3_ROOT_INO && ino != EXT3_JOURNAL_INO && - ino != EXT3_RESIZE_INO && ino < EXT3_FIRST_INO(sb)) || - ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) { - ext3_error(sb, "ext3_get_inode_block", - "bad inode number: %lu", ino); + if (!ext3_valid_inum(sb, ino)) { + /* + * This error is already checked for in namei.c unless we are + * looking at an NFS filehandle, in which case no error + * report is needed + */ return 0; } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); if (block_group >= EXT3_SB(sb)->s_groups_count) { ext3_error(sb,"ext3_get_inode_block","group >= groups count"); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index d9176dba36..2aa7101b27 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1000,7 +1000,12 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str if (bh) { unsigned long ino = le32_to_cpu(de->inode); brelse (bh); - inode = iget(dir->i_sb, ino); + if (!ext3_valid_inum(dir->i_sb, ino)) { + 
ext3_error(dir->i_sb, "ext3_lookup", + "bad inode number: %lu", ino); + inode = NULL; + } else + inode = iget(dir->i_sb, ino); if (!inode) return ERR_PTR(-EACCES); @@ -1028,7 +1033,13 @@ struct dentry *ext3_get_parent(struct dentry *child) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); - inode = iget(child->d_inode->i_sb, ino); + + if (!ext3_valid_inum(child->d_inode->i_sb, ino)) { + ext3_error(child->d_inode->i_sb, "ext3_get_parent", + "bad inode number: %lu", ino); + inode = NULL; + } else + inode = iget(child->d_inode->i_sb, ino); if (!inode) return ERR_PTR(-EACCES); -- cgit v1.2.2 From d1bbf14f37261c2c0dba71404602e1ddcec069d2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 30 Jul 2006 03:03:16 -0700 Subject: [PATCH] knfsd: Fix stale file handle problem with subtree_checking. A recent commit (7fc90ec93a5eb71f4b08403baf5ba7176b3ec6b1) moved the call to nfsd_setuser out of the 'find a dentry for a filehandle' branch of fh_verify so that it would always be called. This had the unfortunate side-effect of moving it *after* the call to decode_fh, so the proper fsuid was not set when nfsd_acceptable was called, and the 'permission' check did the wrong thing. This patch moves the nfsd_setuser call back where it was, and adds a call in the other branch of the if. Cc: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfsfh.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ecc439d256..501d838845 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -187,6 +187,11 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) goto out; } + /* Set user creds for this exportpoint */ + error = nfserrno(nfsd_setuser(rqstp, exp)); + if (error) + goto out; + /* * Look up the dentry using the NFS file handle.
*/ @@ -241,16 +246,17 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) dprintk("nfsd: fh_verify - just checking\n"); dentry = fhp->fh_dentry; exp = fhp->fh_export; + /* Set user creds for this exportpoint; necessary even + * in the "just checking" case because this may be a + * filehandle that was created by fh_compose, and that + * is about to be used in another nfsv4 compound + * operation */ + error = nfserrno(nfsd_setuser(rqstp, exp)); + if (error) + goto out; } cache_get(&exp->h); - /* Set user creds for this exportpoint; necessary even in the "just - * checking" case because this may be a filehandle that was created by - * fh_compose, and that is about to be used in another nfsv4 compound - * operation */ - error = nfserrno(nfsd_setuser(rqstp, exp)); - if (error) - goto out; error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); if (error) -- cgit v1.2.2 From 0e1dfc66b6ec94984a4778132147a8aa36461d58 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 30 Jul 2006 03:03:28 -0700 Subject: [PATCH] invalidate_bdev() speedup We can immediately bail from invalidate_bdev() if the blockdev has no pagecache. This solves the huge IPI storms which hald is causing on the big ia64 machines when it polls CDROM drives. Acked-by: Jes Sorensen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 3660dcb975..71649ef9b6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -470,13 +470,18 @@ out: pass does the actual I/O. */ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) { + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + invalidate_bh_lrus(); /* * FIXME: what about destroy_dirty_buffers? * We really want to use invalidate_inode_pages2() for * that, but not until that's cleaned up. 
*/ - invalidate_inode_pages(bdev->bd_inode->i_mapping); + invalidate_inode_pages(mapping); } /* -- cgit v1.2.2 From cfa224e928f782e1593b5222688fad84c2cad3e8 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Sun, 30 Jul 2006 03:03:51 -0700 Subject: [PATCH] enable mac partition label per default on pmac Enable mac partition table support per default also for a powermac config. Signed-off-by: Olaf Hering Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig index c9a4780992..e478f19418 100644 --- a/fs/partitions/Kconfig +++ b/fs/partitions/Kconfig @@ -99,7 +99,7 @@ config IBM_PARTITION config MAC_PARTITION bool "Macintosh partition map support" if PARTITION_ADVANCED - default y if MAC + default y if (MAC || PPC_PMAC) help Say Y here if you would like to use hard disks under Linux which were partitioned on a Macintosh. -- cgit v1.2.2 From 5b6509aa8c2f292caea7c0602ec361f920951508 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 30 Jul 2006 03:03:54 -0700 Subject: [PATCH] inotify: fix deadlock found by lockdep This is a real deadlock, a nice complex one: (warning: long explanation follows so that Andrew can have a complete patch description) it's an ABCDA deadlock: A iprune_mutex B inode->inotify_mutex C ih->mutex D dev->ev_mutex The AB relationship comes straight from invalidate_inodes() int invalidate_inodes(struct super_block * sb) { int busy; LIST_HEAD(throw_away); mutex_lock(&iprune_mutex); spin_lock(&inode_lock); inotify_unmount_inodes(&sb->s_inodes); where inotify_unmount_inodes() takes the mutex_lock(&inode->inotify_mutex); The BC relationship comes directly from inotify_find_update_watch(): s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode, u32 mask) { ... 
mutex_lock(&inode->inotify_mutex); mutex_lock(&ih->mutex); The CD relationship comes from inotify_rm_wd: inotify_rm_wd does mutex_lock(&inode->inotify_mutex); mutex_lock(&ih->mutex) and then calls inotify_remove_watch_locked() which calls inotify_dev_queue_event() which does mutex_lock(&dev->ev_mutex); (this strictly is a BCD relationship) The DA relationship comes from the most interesting part: [] shrink_icache_memory+0x42/0x270 [] shrink_slab+0x11d/0x1c9 [] try_to_free_pages+0x187/0x244 [] __alloc_pages+0x1cd/0x2e0 [] cache_alloc_refill+0x3f8/0x821 [] kmem_cache_alloc+0x85/0xcb [] kernel_event+0x2e/0x122 [] inotify_dev_queue_event+0xcc/0x140 inotify_dev_queue_event schedules a kernel_event which does a kmem_cache_alloc( , GFP_KERNEL) which may try to shrink slabs, including the inode cache .. which then takes iprune_mutex. And voila, there is an AB, a BC, a CD relationship (even a direct BCD), and also now a DA relationship -> a circular type AB-BA deadlock but involving 4 locks. The solution is simple: kernel_event() is NOT allowed to use GFP_KERNEL, but must use GFP_NOFS to not cause recursion into the VFS. 
Signed-off-by: Arjan van de Ven Acked-by: Ingo Molnar Acked-by: Robert Love Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inotify_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inotify_user.c b/fs/inotify_user.c index f2386442ad..017cb0f134 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -187,7 +187,7 @@ static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, { struct inotify_kernel_event *kevent; - kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL); + kevent = kmem_cache_alloc(event_cachep, GFP_NOFS); if (unlikely(!kevent)) return NULL; -- cgit v1.2.2 From 6ecbc4e1a395062a8e99e4f5fe328f6ba166d9c8 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:03:56 -0700 Subject: [PATCH] Remove incorrect unlock_kernel from allocation failure path in coda_open() Commit 398c53a757702e1e3a7a2c24860c7ad26acb53ed (in the historical GIT tree) moved the lock_kernel() in coda_open after the allocation of a coda_file_info struct, but left an unlock_kernel() in the allocation failure error path; remove it. 
Signed-off-by: Josh Triplett Acked-by: Jan Harkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coda/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/coda/file.c b/fs/coda/file.c index cc66c681bd..dbfbcfa5b3 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -136,10 +136,8 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) coda_vfs_stat.open++; cfi = kmalloc(sizeof(struct coda_file_info), GFP_KERNEL); - if (!cfi) { - unlock_kernel(); + if (!cfi) return -ENOMEM; - } lock_kernel(); -- cgit v1.2.2 From 0aa9e4f147880b2d7d1eef1f0b45112af0e36f9f Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:03:58 -0700 Subject: [PATCH] efs: Remove incorrect unlock_kernel from failure path in efs_symlink_readpage() If efs_symlink_readpage hits the -ENAMETOOLONG error path, it will call unlock_kernel without ever having called lock_kernel(); fix this by creating and jumping to a new label fail_notlocked rather than the fail label used after calling lock_kernel(). 
Signed-off-by: Josh Triplett Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/efs/symlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c index e249cf733a..1d30d2ff44 100644 --- a/fs/efs/symlink.c +++ b/fs/efs/symlink.c @@ -22,7 +22,7 @@ static int efs_symlink_readpage(struct file *file, struct page *page) err = -ENAMETOOLONG; if (size > 2 * EFS_BLOCKSIZE) - goto fail; + goto fail_notlocked; lock_kernel(); /* read first 512 bytes of link target */ @@ -47,6 +47,7 @@ static int efs_symlink_readpage(struct file *file, struct page *page) return 0; fail: unlock_kernel(); +fail_notlocked: SetPageError(page); kunmap(page); unlock_page(page); -- cgit v1.2.2 From 344fe78669d2d1cff9e8939598f6d0d865b6a75b Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:03:59 -0700 Subject: [PATCH] ufs: remove incorrect unlock_kernel from failure path in ufs_symlink() ufs_symlink, in one of its error paths, calls unlock_kernel without ever having called lock_kernel(); fix this by creating and jumping to a new label out_notlocked rather than the out label used after calling lock_kernel(). 
Signed-off-by: Josh Triplett Cc: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index abd5f23a42..d344b411e2 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -129,7 +129,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, struct inode * inode; if (l > sb->s_blocksize) - goto out; + goto out_notlocked; lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); @@ -155,6 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, err = ufs_add_nondir(dentry, inode); out: unlock_kernel(); +out_notlocked: return err; out_fail: -- cgit v1.2.2 From 685d16ddb07b74537fb18972784e6214840fdd20 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 30 Jul 2006 03:04:08 -0700 Subject: [PATCH] fuse: fix zero timeout An attribute and entry timeout of zero should mean, that the entity is invalidated immediately after the operation. Previously invalidation only happened at the next clock tick. Reported and tested by Craig Davies. 
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 72a74cde6d..6db66ec386 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -25,8 +25,11 @@ */ static unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) { - struct timespec ts = {sec, nsec}; - return jiffies + timespec_to_jiffies(&ts); + if (sec || nsec) { + struct timespec ts = {sec, nsec}; + return jiffies + timespec_to_jiffies(&ts); + } else + return jiffies - 1; } /* -- cgit v1.2.2 From 0a0898cf413876d4ed6e371f3e04bf38600a9205 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 30 Jul 2006 03:04:10 -0700 Subject: [PATCH] fuse: use jiffies_64 It is entirely possible (though rare) that jiffies half-wraps around, while a dentry/inode remains in the cache. This could mean that the dentry/inode is not invalidated for another half wraparound-time. To get around this problem, use 64-bit jiffies. The only problem with this is that dentry->d_time is 32 bits on 32-bit archs. So use d_fsdata as the high 32 bits. This is an ugly hack, but far simpler, than having to allocate private data just for this purpose. Since 64-bit jiffies can be assumed never to wrap around, simple comparison can be used, and a zero time value can represent "invalid". 
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 44 ++++++++++++++++++++++++++++++++++++-------- fs/fuse/fuse_i.h | 2 +- fs/fuse/inode.c | 2 +- 3 files changed, 38 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 6db66ec386..409ce6a7cc 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,33 @@ #include #include +#if BITS_PER_LONG >= 64 +static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; +} + +static inline u64 fuse_dentry_time(struct dentry *entry) +{ + return entry->d_time; +} +#else +/* + * On 32 bit archs store the high 32 bits of time in d_fsdata + */ +static void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; + entry->d_fsdata = (void *) (unsigned long) (time >> 32); +} + +static u64 fuse_dentry_time(struct dentry *entry) +{ + return (u64) entry->d_time + + ((u64) (unsigned long) entry->d_fsdata << 32); +} +#endif + /* * FUSE caches dentries and attributes with separate timeout. 
The * time in jiffies until the dentry/attributes are valid is stored in @@ -23,13 +50,13 @@ /* * Calculate the time in jiffies until a dentry/attributes are valid */ -static unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) +static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) { if (sec || nsec) { struct timespec ts = {sec, nsec}; - return jiffies + timespec_to_jiffies(&ts); + return get_jiffies_64() + timespec_to_jiffies(&ts); } else - return jiffies - 1; + return 0; } /* @@ -38,7 +65,8 @@ static unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) */ static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) { - entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec); + fuse_dentry_settime(entry, + time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); if (entry->d_inode) get_fuse_inode(entry->d_inode)->i_time = time_to_jiffies(o->attr_valid, o->attr_valid_nsec); @@ -50,7 +78,7 @@ static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) */ void fuse_invalidate_attr(struct inode *inode) { - get_fuse_inode(inode)->i_time = jiffies - 1; + get_fuse_inode(inode)->i_time = 0; } /* @@ -63,7 +91,7 @@ void fuse_invalidate_attr(struct inode *inode) */ static void fuse_invalidate_entry_cache(struct dentry *entry) { - entry->d_time = jiffies - 1; + fuse_dentry_settime(entry, 0); } /* @@ -105,7 +133,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) if (inode && is_bad_inode(inode)) return 0; - else if (time_after(jiffies, entry->d_time)) { + else if (fuse_dentry_time(entry) < get_jiffies_64()) { int err; struct fuse_entry_out outarg; struct fuse_conn *fc; @@ -669,7 +697,7 @@ static int fuse_revalidate(struct dentry *entry) if (!fuse_allow_task(fc, current)) return -EACCES; if (get_node_id(inode) != FUSE_ROOT_ID && - time_before_eq(jiffies, fi->i_time)) + fi->i_time >= get_jiffies_64()) return 0; return fuse_do_getattr(inode); diff --git 
a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0dbf966218..69c7750d55 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -59,7 +59,7 @@ struct fuse_inode { struct fuse_req *forget_req; /** Time in jiffies until the file attributes are valid */ - unsigned long i_time; + u64 i_time; }; /** FUSE specific file data */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index dcaaabd3b9..7d25092262 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -51,7 +51,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) return NULL; fi = get_fuse_inode(inode); - fi->i_time = jiffies - 1; + fi->i_time = 0; fi->nodeid = 0; fi->nlookup = 0; fi->forget_req = fuse_request_alloc(); -- cgit v1.2.2 From 873302c71c0e60234eb187b15f83c2d79e84c40a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 30 Jul 2006 03:04:10 -0700 Subject: [PATCH] fuse: fix typo Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/control.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a3bce3a772..46fe60b2da 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -105,7 +105,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, /* * Add a connection to the control filesystem (if it exists). Caller - * must host fuse_mutex + * must hold fuse_mutex */ int fuse_ctl_add_conn(struct fuse_conn *fc) { @@ -139,7 +139,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) /* * Remove a connection from the control filesystem (if it exists). 
- * Caller must host fuse_mutex + * Caller must hold fuse_mutex */ void fuse_ctl_remove_conn(struct fuse_conn *fc) { -- cgit v1.2.2 From bc65ac6a0ffc66c56d1e6893685d7fe87c63cc44 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:04:12 -0700 Subject: [PATCH] freevxfs: Add missing lock_kernel() to vxfs_readdir Commit 7b2fd697427e73c81d5fa659efd91bd07d303b0e in the historical GIT tree stopped calling the readdir member of a file_operations struct with the big kernel lock held, and fixed up all the readdir functions to do their own locking. However, that change added calls to unlock_kernel() in vxfs_readdir, but no call to lock_kernel(). Fix this by adding a call to lock_kernel(). Signed-off-by: Josh Triplett Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/freevxfs/vxfs_lookup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 29cce456c7..43886fa00a 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -246,6 +246,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) u_long page, npages, block, pblocks, nblocks, offset; loff_t pos; + lock_kernel(); + switch ((long)fp->f_pos) { case 0: if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) -- cgit v1.2.2 From 0e31f51d8177320d61ec5786ca4aafa7b7a749b4 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 30 Jul 2006 03:04:14 -0700 Subject: [PATCH] ext3 -nobh option causes oops For files other than IFREG, the nobh option doesn't make sense. Modifications to them are journalled, which needs buffer heads. Without this patch, we get kernel oops in page_buffers(). 
Signed-off-by: Badari Pulavarty Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ab034d3053..c5ee9f0691 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1158,7 +1158,7 @@ retry: ret = PTR_ERR(handle); goto out; } - if (test_opt(inode->i_sb, NOBH)) + if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) ret = nobh_prepare_write(page, from, to, ext3_get_block); else ret = block_prepare_write(page, from, to, ext3_get_block); @@ -1244,7 +1244,7 @@ static int ext3_writeback_commit_write(struct file *file, struct page *page, if (new_i_size > EXT3_I(inode)->i_disksize) EXT3_I(inode)->i_disksize = new_i_size; - if (test_opt(inode->i_sb, NOBH)) + if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) ret = nobh_commit_write(file, page, from, to); else ret = generic_commit_write(file, page, from, to); @@ -1494,7 +1494,7 @@ static int ext3_writeback_writepage(struct page *page, goto out_fail; } - if (test_opt(inode->i_sb, NOBH)) + if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) ret = nobh_writepage(page, ext3_get_block, wbc); else ret = block_write_full_page(page, ext3_get_block, wbc); -- cgit v1.2.2 From 4c90c68aca278f425afc0b48d86298b960fbc0ce Mon Sep 17 00:00:00 2001 From: Russ Ross Date: Sun, 30 Jul 2006 03:04:15 -0700 Subject: [PATCH] 9p: fix marshalling bug in tcreate with empty extension field Signed-off-by: Russ Ross Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/conv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 1e898144eb..56d88c1a09 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -673,8 +673,10 @@ struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode, struct cbuf *bufp = &buffer; size = 4 
+ 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */ - if (extended && extension!=NULL) - size += 2 + strlen(extension); /* extension[s] */ + if (extended) { + size += 2 + /* extension[s] */ + (extension == NULL ? 0 : strlen(extension)); + } fc = v9fs_create_common(bufp, size, TCREATE); if (IS_ERR(fc)) -- cgit v1.2.2 From 834a9b8ca7a01c34570be021f88e18884a29f048 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Sun, 30 Jul 2006 03:04:16 -0700 Subject: [PATCH] 9p: fix fid behavior on failed remove Based on a bug report from Russ Ross According to the spec: "The remove request asks the file server both to remove the file represented by fid and to clunk the fid, even if the remove fails." but the Linux client seems to expect the fid to be valid after a failed remove attempt. Specifically, I'm getting this behavior when attempting to remove a non-empty directory. Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 2f580a197b..eae50c9d6d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -434,11 +434,11 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) result = v9fs_t_remove(v9ses, fid, &fcall); if (result < 0) { PRINT_FCALL_ERROR("remove fails", fcall); - } else { - v9fs_put_idpool(fid, &v9ses->fidpool); - v9fs_fid_destroy(v9fid); } + v9fs_put_idpool(fid, &v9ses->fidpool); + v9fs_fid_destroy(v9fid); + kfree(fcall); return result; } -- cgit v1.2.2