aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Dave Hansen <haveblue@us.ibm.com> 2008-02-15 17:37:48 -0500
committer: Al Viro <viro@zeniv.linux.org.uk> 2008-04-19 00:29:25 -0400
commit: 4a3fd211ccfc08a88edc824300e25a87785c6a5f (patch)
tree: 99f1a76a99fa78464b8de731f7fdb5bcc9667a5e
parent: 42a74f206b914db13ee1f5ae932dcd91a77c8579 (diff)
[PATCH] r/o bind mounts: elevate write count for open()s
This is the first really tricky patch in the series. It elevates the writer count on a mount each time a non-special file is opened for write.

We used to do this in may_open(), but Miklos pointed out that __dentry_open() is used as well to create filps. This will cover even those cases, while a call in may_open() would not have.

There is also an elevated count around the vfs_create() call in open_namei(). See the comments for more details, but we need this to fix a 'create, remount, fail r/w open()' race.

Some filesystems forgo the use of normal vfs calls to create struct files. Make sure that these users elevate the mnt writer count because they will get __fput(), and we need to make sure they're balanced.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r-- fs/file_table.c 14
-rw-r--r-- fs/namei.c 75
-rw-r--r-- fs/open.c 36
-rw-r--r-- ipc/mqueue.c 16
4 files changed, 127 insertions, 14 deletions
diff --git a/fs/file_table.c b/fs/file_table.c
index 3f73eb1f195a..71efc7000226 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -199,6 +199,17 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
199 file->f_mapping = dentry->d_inode->i_mapping; 199 file->f_mapping = dentry->d_inode->i_mapping;
200 file->f_mode = mode; 200 file->f_mode = mode;
201 file->f_op = fop; 201 file->f_op = fop;
202
203 /*
204 * These mounts don't really matter in practice
205 * for r/o bind mounts. They aren't userspace-
206 * visible. We do this for consistency, and so
207 * that we can do debugging checks at __fput()
208 */
209 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
210 error = mnt_want_write(mnt);
211 WARN_ON(error);
212 }
202 return error; 213 return error;
203} 214}
204EXPORT_SYMBOL(init_file); 215EXPORT_SYMBOL(init_file);
@@ -221,10 +232,13 @@ EXPORT_SYMBOL(fput);
221 */ 232 */
222void drop_file_write_access(struct file *file) 233void drop_file_write_access(struct file *file)
223{ 234{
235 struct vfsmount *mnt = file->f_path.mnt;
224 struct dentry *dentry = file->f_path.dentry; 236 struct dentry *dentry = file->f_path.dentry;
225 struct inode *inode = dentry->d_inode; 237 struct inode *inode = dentry->d_inode;
226 238
227 put_write_access(inode); 239 put_write_access(inode);
240 if (!special_file(inode->i_mode))
241 mnt_drop_write(mnt);
228} 242}
229EXPORT_SYMBOL_GPL(drop_file_write_access); 243EXPORT_SYMBOL_GPL(drop_file_write_access);
230 244
diff --git a/fs/namei.c b/fs/namei.c
index 83c843b3fea3..e179f71bfcb0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1623 return -EACCES; 1623 return -EACCES;
1624 1624
1625 flag &= ~O_TRUNC; 1625 flag &= ~O_TRUNC;
1626 } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE)) 1626 }
1627 return -EROFS;
1628 1627
1629 error = vfs_permission(nd, acc_mode); 1628 error = vfs_permission(nd, acc_mode);
1630 if (error) 1629 if (error)
@@ -1724,18 +1723,32 @@ static inline int open_to_namei_flags(int flag)
1724 return flag; 1723 return flag;
1725} 1724}
1726 1725
1726static int open_will_write_to_fs(int flag, struct inode *inode)
1727{
1728 /*
1729 * We'll never write to the fs underlying
1730 * a device file.
1731 */
1732 if (special_file(inode->i_mode))
1733 return 0;
1734 return (flag & O_TRUNC);
1735}
1736
1727/* 1737/*
1728 * Note that the low bits of "flag" aren't the same as in the open 1738 * Note that the low bits of the passed in "open_flag"
1729 * system call. See open_to_namei_flags(). 1739 * are not the same as in the local variable "flag". See
1740 * open_to_namei_flags() for more details.
1730 */ 1741 */
1731struct file *do_filp_open(int dfd, const char *pathname, 1742struct file *do_filp_open(int dfd, const char *pathname,
1732 int open_flag, int mode) 1743 int open_flag, int mode)
1733{ 1744{
1745 struct file *filp;
1734 struct nameidata nd; 1746 struct nameidata nd;
1735 int acc_mode, error; 1747 int acc_mode, error;
1736 struct path path; 1748 struct path path;
1737 struct dentry *dir; 1749 struct dentry *dir;
1738 int count = 0; 1750 int count = 0;
1751 int will_write;
1739 int flag = open_to_namei_flags(open_flag); 1752 int flag = open_to_namei_flags(open_flag);
1740 1753
1741 acc_mode = ACC_MODE(flag); 1754 acc_mode = ACC_MODE(flag);
@@ -1791,17 +1804,30 @@ do_last:
1791 } 1804 }
1792 1805
1793 if (IS_ERR(nd.intent.open.file)) { 1806 if (IS_ERR(nd.intent.open.file)) {
1794 mutex_unlock(&dir->d_inode->i_mutex);
1795 error = PTR_ERR(nd.intent.open.file); 1807 error = PTR_ERR(nd.intent.open.file);
1796 goto exit_dput; 1808 goto exit_mutex_unlock;
1797 } 1809 }
1798 1810
1799 /* Negative dentry, just create the file */ 1811 /* Negative dentry, just create the file */
1800 if (!path.dentry->d_inode) { 1812 if (!path.dentry->d_inode) {
1801 error = __open_namei_create(&nd, &path, flag, mode); 1813 /*
1814 * This write is needed to ensure that a
1815 * ro->rw transition does not occur between
1816 * the time when the file is created and when
1817 * a permanent write count is taken through
1818 * the 'struct file' in nameidata_to_filp().
1819 */
1820 error = mnt_want_write(nd.path.mnt);
1802 if (error) 1821 if (error)
1822 goto exit_mutex_unlock;
1823 error = __open_namei_create(&nd, &path, flag, mode);
1824 if (error) {
1825 mnt_drop_write(nd.path.mnt);
1803 goto exit; 1826 goto exit;
1804 return nameidata_to_filp(&nd, open_flag); 1827 }
1828 filp = nameidata_to_filp(&nd, open_flag);
1829 mnt_drop_write(nd.path.mnt);
1830 return filp;
1805 } 1831 }
1806 1832
1807 /* 1833 /*
@@ -1831,11 +1857,40 @@ do_last:
1831 if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) 1857 if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
1832 goto exit; 1858 goto exit;
1833ok: 1859ok:
1860 /*
1861 * Consider:
1862 * 1. may_open() truncates a file
1863 * 2. a rw->ro mount transition occurs
1864 * 3. nameidata_to_filp() fails due to
1865 * the ro mount.
1866 * That would be inconsistent, and should
1867 * be avoided. Taking this mnt write here
1868 * ensures that (2) can not occur.
1869 */
1870 will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
1871 if (will_write) {
1872 error = mnt_want_write(nd.path.mnt);
1873 if (error)
1874 goto exit;
1875 }
1834 error = may_open(&nd, acc_mode, flag); 1876 error = may_open(&nd, acc_mode, flag);
1835 if (error) 1877 if (error) {
1878 if (will_write)
1879 mnt_drop_write(nd.path.mnt);
1836 goto exit; 1880 goto exit;
1837 return nameidata_to_filp(&nd, open_flag); 1881 }
1882 filp = nameidata_to_filp(&nd, open_flag);
1883 /*
1884 * It is now safe to drop the mnt write
1885 * because the filp has had a write taken
1886 * on its behalf.
1887 */
1888 if (will_write)
1889 mnt_drop_write(nd.path.mnt);
1890 return filp;
1838 1891
1892exit_mutex_unlock:
1893 mutex_unlock(&dir->d_inode->i_mutex);
1839exit_dput: 1894exit_dput:
1840 path_put_conditional(&path, &nd); 1895 path_put_conditional(&path, &nd);
1841exit: 1896exit:
diff --git a/fs/open.c b/fs/open.c
index 8111947905d8..e12f17010324 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -730,6 +730,35 @@ out:
730 return error; 730 return error;
731} 731}
732 732
733/*
734 * You have to be very careful that these write
735 * counts get cleaned up in error cases and
736 * upon __fput(). This should probably never
737 * be called outside of __dentry_open().
738 */
739static inline int __get_file_write_access(struct inode *inode,
740 struct vfsmount *mnt)
741{
742 int error;
743 error = get_write_access(inode);
744 if (error)
745 return error;
746 /*
747 * Do not take mount writer counts on
748 * special files since no writes to
749 * the mount itself will occur.
750 */
751 if (!special_file(inode->i_mode)) {
752 /*
753 * Balanced in __fput()
754 */
755 error = mnt_want_write(mnt);
756 if (error)
757 put_write_access(inode);
758 }
759 return error;
760}
761
733static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, 762static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
734 int flags, struct file *f, 763 int flags, struct file *f,
735 int (*open)(struct inode *, struct file *)) 764 int (*open)(struct inode *, struct file *))
@@ -742,7 +771,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
742 FMODE_PREAD | FMODE_PWRITE; 771 FMODE_PREAD | FMODE_PWRITE;
743 inode = dentry->d_inode; 772 inode = dentry->d_inode;
744 if (f->f_mode & FMODE_WRITE) { 773 if (f->f_mode & FMODE_WRITE) {
745 error = get_write_access(inode); 774 error = __get_file_write_access(inode, mnt);
746 if (error) 775 if (error)
747 goto cleanup_file; 776 goto cleanup_file;
748 } 777 }
@@ -784,8 +813,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
784 813
785cleanup_all: 814cleanup_all:
786 fops_put(f->f_op); 815 fops_put(f->f_op);
787 if (f->f_mode & FMODE_WRITE) 816 if (f->f_mode & FMODE_WRITE) {
788 put_write_access(inode); 817 put_write_access(inode);
818 if (!special_file(inode->i_mode))
819 mnt_drop_write(mnt);
820 }
789 file_kill(f); 821 file_kill(f);
790 f->f_path.dentry = NULL; 822 f->f_path.dentry = NULL;
791 f->f_path.mnt = NULL; 823 f->f_path.mnt = NULL;
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 34262c11f480..94fd3b08fb77 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -598,6 +598,7 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,
598 int oflag, mode_t mode, struct mq_attr __user *u_attr) 598 int oflag, mode_t mode, struct mq_attr __user *u_attr)
599{ 599{
600 struct mq_attr attr; 600 struct mq_attr attr;
601 struct file *result;
601 int ret; 602 int ret;
602 603
603 if (u_attr) { 604 if (u_attr) {
@@ -612,13 +613,24 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,
612 } 613 }
613 614
614 mode &= ~current->fs->umask; 615 mode &= ~current->fs->umask;
616 ret = mnt_want_write(mqueue_mnt);
617 if (ret)
618 goto out;
615 ret = vfs_create(dir->d_inode, dentry, mode, NULL); 619 ret = vfs_create(dir->d_inode, dentry, mode, NULL);
616 dentry->d_fsdata = NULL; 620 dentry->d_fsdata = NULL;
617 if (ret) 621 if (ret)
618 goto out; 622 goto out_drop_write;
619 623
620 return dentry_open(dentry, mqueue_mnt, oflag); 624 result = dentry_open(dentry, mqueue_mnt, oflag);
625 /*
626 * dentry_open() took a persistent mnt_want_write(),
627 * so we can now drop this one.
628 */
629 mnt_drop_write(mqueue_mnt);
630 return result;
621 631
632out_drop_write:
633 mnt_drop_write(mqueue_mnt);
622out: 634out:
623 dput(dentry); 635 dput(dentry);
624 mntput(mqueue_mnt); 636 mntput(mqueue_mnt);