[PATCH] r/o bind mounts: elevate write count for open()s

This is the first really tricky patch in the series. It elevates the writer count on a mount each time a non-special file is opened for write. We used to do this in may_open(), but Miklos pointed out that __dentry_open() is used as well to create filps. This will cover even those cases, while a call in may_open() would not have. There is also an elevated count around the vfs_create() call in open_namei(). See the comments for more details, but we need this to fix a 'create, remount, fail r/w open()' race. Some filesystems forego the use of normal vfs calls to create struct files. Make sure that these users elevate the mnt writer count because they will get __fput(), and we need to make sure they're balanced. Acked-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
author: Dave Hansen <haveblue@us.ibm.com> 2008-02-15 17:37:48 -0500
committer: Al Viro <viro@zeniv.linux.org.uk> 2008-04-19 00:29:25 -0400
commit: 4a3fd211ccfc08a88edc824300e25a87785c6a5f (patch)
tree: 99f1a76a99fa78464b8de731f7fdb5bcc9667a5e /fs/namei.c
parent: 42a74f206b914db13ee1f5ae932dcd91a77c8579 (diff)
1 files changed, 65 insertions, 10 deletions
diff --git a/fs/namei.c b/fs/namei.c
index 83c843b3fea3..e179f71bfcb0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
                        return -EACCES;
                flag &= ~O_TRUNC;
-        } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
+        }
-                return -EROFS;
        error = vfs_permission(nd, acc_mode);
        if (error)
@@ -1724,18 +1723,32 @@ static inline int open_to_namei_flags(int flag)
        return flag;
 }
+static int open_will_write_to_fs(int flag, struct inode *inode)
+{
+        /*
+         * We'll never write to the fs underlying
+         * a device file.
+         */
+        if (special_file(inode->i_mode))
+                return 0;
+        return (flag & O_TRUNC);
+}
 /*
- * Note that the low bits of "flag" aren't the same as in the open
+ * Note that the low bits of the passed in "open_flag"
- * system call.  See open_to_namei_flags().
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
 */
 struct file *do_filp_open(int dfd, const char *pathname,
                int open_flag, int mode)
 {
+        struct file *filp;
        struct nameidata nd;
        int acc_mode, error;
        struct path path;
        struct dentry *dir;
        int count = 0;
+        int will_write;
        int flag = open_to_namei_flags(open_flag);
        acc_mode = ACC_MODE(flag);
@@ -1791,17 +1804,30 @@ do_last:
        }
        if (IS_ERR(nd.intent.open.file)) {
-                mutex_unlock(&dir->d_inode->i_mutex);
                error = PTR_ERR(nd.intent.open.file);
-                goto exit_dput;
+                goto exit_mutex_unlock;
        }
        /* Negative dentry, just create the file */
        if (!path.dentry->d_inode) {
-                error = __open_namei_create(&nd, &path, flag, mode);
+                /*
+                 * This write is needed to ensure that a
+                 * rw->ro transition does not occur between
+                 * the time when the file is created and when
+                 * a permanent write count is taken through
+                 * the 'struct file' in nameidata_to_filp().
+                 */
+                error = mnt_want_write(nd.path.mnt);
                if (error)
+                        goto exit_mutex_unlock;
+                error = __open_namei_create(&nd, &path, flag, mode);
+                if (error) {
+                        mnt_drop_write(nd.path.mnt);
                        goto exit;
-                return nameidata_to_filp(&nd, open_flag);
+                }
+                filp = nameidata_to_filp(&nd, open_flag);
+                mnt_drop_write(nd.path.mnt);
+                return filp;
        }
        /*
@@ -1831,11 +1857,40 @@ do_last:
        if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
                goto exit;
 ok:
+        /*
+         * Consider:
+         * 1. may_open() truncates a file
+         * 2. a rw->ro mount transition occurs
+         * 3. nameidata_to_filp() fails due to
+         *    the ro mount.
+         * That would be inconsistent, and should
+         * be avoided. Taking this mnt write here
+         * ensures that (2) can not occur.
+         */
+        will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
+        if (will_write) {
+                error = mnt_want_write(nd.path.mnt);
+                if (error)
+                        goto exit;
+        }
        error = may_open(&nd, acc_mode, flag);
-        if (error)
+        if (error) {
+                if (will_write)
+                        mnt_drop_write(nd.path.mnt);
                goto exit;
-        return nameidata_to_filp(&nd, open_flag);
+        }
+        filp = nameidata_to_filp(&nd, open_flag);
+        /*
+         * It is now safe to drop the mnt write
+         * because the filp has had a write taken
+         * on its behalf.
+         */
+        if (will_write)
+                mnt_drop_write(nd.path.mnt);
+        return filp;
+exit_mutex_unlock:
+        mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
        path_put_conditional(&path, &nd);
 exit:
author	Dave Hansen <haveblue@us.ibm.com>	2008-02-15 17:37:48 -0500
committer	Al Viro <viro@zeniv.linux.org.uk>	2008-04-19 00:29:25 -0400
commit	4a3fd211ccfc08a88edc824300e25a87785c6a5f (patch)
tree	99f1a76a99fa78464b8de731f7fdb5bcc9667a5e /fs/namei.c
parent	42a74f206b914db13ee1f5ae932dcd91a77c8579 (diff)

diff --git a/fs/namei.c b/fs/namei.c index 83c843b3fea3..e179f71bfcb0 100644 --- a/fs/namei.c +++ b/fs/namei.c
@@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1623	return -EACCES;	1623	return -EACCES;
1624		1624
1625	flag &= ~O_TRUNC;	1625	flag &= ~O_TRUNC;
1626	} else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))	1626	}
1627	return -EROFS;
1628		1627
1629	error = vfs_permission(nd, acc_mode);	1628	error = vfs_permission(nd, acc_mode);
1630	if (error)	1629	if (error)
@@ -1724,18 +1723,32 @@ static inline int open_to_namei_flags(int flag)
1724	return flag;	1723	return flag;
1725	}	1724	}
1726		1725
		1726	static int open_will_write_to_fs(int flag, struct inode *inode)
		1727	{
		1728	/*
		1729	* We'll never write to the fs underlying
		1730	* a device file.
		1731	*/
		1732	if (special_file(inode->i_mode))
		1733	return 0;
		1734	return (flag & O_TRUNC);
		1735	}
		1736
1727	/*	1737	/*
1728	* Note that the low bits of "flag" aren't the same as in the open	1738	* Note that the low bits of the passed in "open_flag"
1729	* system call. See open_to_namei_flags().	1739	* are not the same as in the local variable "flag". See
		1740	* open_to_namei_flags() for more details.
1730	*/	1741	*/
1731	struct file *do_filp_open(int dfd, const char *pathname,	1742	struct file *do_filp_open(int dfd, const char *pathname,
1732	int open_flag, int mode)	1743	int open_flag, int mode)
1733	{	1744	{
		1745	struct file *filp;
1734	struct nameidata nd;	1746	struct nameidata nd;
1735	int acc_mode, error;	1747	int acc_mode, error;
1736	struct path path;	1748	struct path path;
1737	struct dentry *dir;	1749	struct dentry *dir;
1738	int count = 0;	1750	int count = 0;
		1751	int will_write;
1739	int flag = open_to_namei_flags(open_flag);	1752	int flag = open_to_namei_flags(open_flag);
1740		1753
1741	acc_mode = ACC_MODE(flag);	1754	acc_mode = ACC_MODE(flag);
@@ -1791,17 +1804,30 @@ do_last:
1791	}	1804	}
1792		1805
1793	if (IS_ERR(nd.intent.open.file)) {	1806	if (IS_ERR(nd.intent.open.file)) {
1794	mutex_unlock(&dir->d_inode->i_mutex);
1795	error = PTR_ERR(nd.intent.open.file);	1807	error = PTR_ERR(nd.intent.open.file);
1796	goto exit_dput;	1808	goto exit_mutex_unlock;
1797	}	1809	}
1798		1810
1799	/* Negative dentry, just create the file */	1811	/* Negative dentry, just create the file */
1800	if (!path.dentry->d_inode) {	1812	if (!path.dentry->d_inode) {
1801	error = __open_namei_create(&nd, &path, flag, mode);	1813	/*
		1814	* This write is needed to ensure that a
		1815	* rw->ro transition does not occur between
		1816	* the time when the file is created and when
		1817	* a permanent write count is taken through
		1818	* the 'struct file' in nameidata_to_filp().
		1819	*/
		1820	error = mnt_want_write(nd.path.mnt);
1802	if (error)	1821	if (error)
		1822	goto exit_mutex_unlock;
		1823	error = __open_namei_create(&nd, &path, flag, mode);
		1824	if (error) {
		1825	mnt_drop_write(nd.path.mnt);
1803	goto exit;	1826	goto exit;
1804	return nameidata_to_filp(&nd, open_flag);	1827	}
		1828	filp = nameidata_to_filp(&nd, open_flag);
		1829	mnt_drop_write(nd.path.mnt);
		1830	return filp;
1805	}	1831	}
1806		1832
1807	/*	1833	/*
@@ -1831,11 +1857,40 @@ do_last:
1831	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))	1857	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
1832	goto exit;	1858	goto exit;
1833	ok:	1859	ok:
		1860	/*
		1861	* Consider:
		1862	* 1. may_open() truncates a file
		1863	* 2. a rw->ro mount transition occurs
		1864	* 3. nameidata_to_filp() fails due to
		1865	* the ro mount.
		1866	* That would be inconsistent, and should
		1867	* be avoided. Taking this mnt write here
		1868	* ensures that (2) can not occur.
		1869	*/
		1870	will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
		1871	if (will_write) {
		1872	error = mnt_want_write(nd.path.mnt);
		1873	if (error)
		1874	goto exit;
		1875	}
1834	error = may_open(&nd, acc_mode, flag);	1876	error = may_open(&nd, acc_mode, flag);
1835	if (error)	1877	if (error) {
		1878	if (will_write)
		1879	mnt_drop_write(nd.path.mnt);
1836	goto exit;	1880	goto exit;
1837	return nameidata_to_filp(&nd, open_flag);	1881	}
		1882	filp = nameidata_to_filp(&nd, open_flag);
		1883	/*
		1884	* It is now safe to drop the mnt write
		1885	* because the filp has had a write taken
		1886	* on its behalf.
		1887	*/
		1888	if (will_write)
		1889	mnt_drop_write(nd.path.mnt);
		1890	return filp;
1838		1891
		1892	exit_mutex_unlock:
		1893	mutex_unlock(&dir->d_inode->i_mutex);
1839	exit_dput:	1894	exit_dput:
1840	path_put_conditional(&path, &nd);	1895	path_put_conditional(&path, &nd);
1841	exit:	1896	exit: