diff options
author | Dave Hansen <haveblue@us.ibm.com> | 2008-02-15 17:37:48 -0500 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2008-04-19 00:29:25 -0400 |
commit | 4a3fd211ccfc08a88edc824300e25a87785c6a5f (patch) | |
tree | 99f1a76a99fa78464b8de731f7fdb5bcc9667a5e | |
parent | 42a74f206b914db13ee1f5ae932dcd91a77c8579 (diff) |
[PATCH] r/o bind mounts: elevate write count for open()s
This is the first really tricky patch in the series. It elevates the writer
count on a mount each time a non-special file is opened for write.
We used to do this in may_open(), but Miklos pointed out that __dentry_open()
is used as well to create filps. This will cover even those cases, while a
call in may_open() would not have.
There is also an elevated count around the vfs_create() call in open_namei().
See the comments for more details, but we need this to fix a 'create, remount,
fail r/w open()' race.
Some filesystems forgo the use of normal vfs calls to create
struct files. Make sure that these users elevate the mnt
writer count because they will get __fput(), and we need
to make sure they're balanced.
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r-- | fs/file_table.c | 14 | ||||
-rw-r--r-- | fs/namei.c | 75 | ||||
-rw-r--r-- | fs/open.c | 36 | ||||
-rw-r--r-- | ipc/mqueue.c | 16 |
4 files changed, 127 insertions, 14 deletions
diff --git a/fs/file_table.c b/fs/file_table.c index 3f73eb1f195a..71efc7000226 100644 --- a/fs/file_table.c +++ b/fs/file_table.c | |||
@@ -199,6 +199,17 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, | |||
199 | file->f_mapping = dentry->d_inode->i_mapping; | 199 | file->f_mapping = dentry->d_inode->i_mapping; |
200 | file->f_mode = mode; | 200 | file->f_mode = mode; |
201 | file->f_op = fop; | 201 | file->f_op = fop; |
202 | |||
203 | /* | ||
204 | * These mounts don't really matter in practice | ||
205 | * for r/o bind mounts. They aren't userspace- | ||
206 | * visible. We do this for consistency, and so | ||
207 | * that we can do debugging checks at __fput() | ||
208 | */ | ||
209 | if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { | ||
210 | error = mnt_want_write(mnt); | ||
211 | WARN_ON(error); | ||
212 | } | ||
202 | return error; | 213 | return error; |
203 | } | 214 | } |
204 | EXPORT_SYMBOL(init_file); | 215 | EXPORT_SYMBOL(init_file); |
@@ -221,10 +232,13 @@ EXPORT_SYMBOL(fput); | |||
221 | */ | 232 | */ |
222 | void drop_file_write_access(struct file *file) | 233 | void drop_file_write_access(struct file *file) |
223 | { | 234 | { |
235 | struct vfsmount *mnt = file->f_path.mnt; | ||
224 | struct dentry *dentry = file->f_path.dentry; | 236 | struct dentry *dentry = file->f_path.dentry; |
225 | struct inode *inode = dentry->d_inode; | 237 | struct inode *inode = dentry->d_inode; |
226 | 238 | ||
227 | put_write_access(inode); | 239 | put_write_access(inode); |
240 | if (!special_file(inode->i_mode)) | ||
241 | mnt_drop_write(mnt); | ||
228 | } | 242 | } |
229 | EXPORT_SYMBOL_GPL(drop_file_write_access); | 243 | EXPORT_SYMBOL_GPL(drop_file_write_access); |
230 | 244 | ||
diff --git a/fs/namei.c b/fs/namei.c index 83c843b3fea3..e179f71bfcb0 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) | |||
1623 | return -EACCES; | 1623 | return -EACCES; |
1624 | 1624 | ||
1625 | flag &= ~O_TRUNC; | 1625 | flag &= ~O_TRUNC; |
1626 | } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE)) | 1626 | } |
1627 | return -EROFS; | ||
1628 | 1627 | ||
1629 | error = vfs_permission(nd, acc_mode); | 1628 | error = vfs_permission(nd, acc_mode); |
1630 | if (error) | 1629 | if (error) |
@@ -1724,18 +1723,32 @@ static inline int open_to_namei_flags(int flag) | |||
1724 | return flag; | 1723 | return flag; |
1725 | } | 1724 | } |
1726 | 1725 | ||
1726 | static int open_will_write_to_fs(int flag, struct inode *inode) | ||
1727 | { | ||
1728 | /* | ||
1729 | * We'll never write to the fs underlying | ||
1730 | * a device file. | ||
1731 | */ | ||
1732 | if (special_file(inode->i_mode)) | ||
1733 | return 0; | ||
1734 | return (flag & O_TRUNC); | ||
1735 | } | ||
1736 | |||
1727 | /* | 1737 | /* |
1728 | * Note that the low bits of "flag" aren't the same as in the open | 1738 | * Note that the low bits of the passed in "open_flag" |
1729 | * system call. See open_to_namei_flags(). | 1739 | * are not the same as in the local variable "flag". See |
1740 | * open_to_namei_flags() for more details. | ||
1730 | */ | 1741 | */ |
1731 | struct file *do_filp_open(int dfd, const char *pathname, | 1742 | struct file *do_filp_open(int dfd, const char *pathname, |
1732 | int open_flag, int mode) | 1743 | int open_flag, int mode) |
1733 | { | 1744 | { |
1745 | struct file *filp; | ||
1734 | struct nameidata nd; | 1746 | struct nameidata nd; |
1735 | int acc_mode, error; | 1747 | int acc_mode, error; |
1736 | struct path path; | 1748 | struct path path; |
1737 | struct dentry *dir; | 1749 | struct dentry *dir; |
1738 | int count = 0; | 1750 | int count = 0; |
1751 | int will_write; | ||
1739 | int flag = open_to_namei_flags(open_flag); | 1752 | int flag = open_to_namei_flags(open_flag); |
1740 | 1753 | ||
1741 | acc_mode = ACC_MODE(flag); | 1754 | acc_mode = ACC_MODE(flag); |
@@ -1791,17 +1804,30 @@ do_last: | |||
1791 | } | 1804 | } |
1792 | 1805 | ||
1793 | if (IS_ERR(nd.intent.open.file)) { | 1806 | if (IS_ERR(nd.intent.open.file)) { |
1794 | mutex_unlock(&dir->d_inode->i_mutex); | ||
1795 | error = PTR_ERR(nd.intent.open.file); | 1807 | error = PTR_ERR(nd.intent.open.file); |
1796 | goto exit_dput; | 1808 | goto exit_mutex_unlock; |
1797 | } | 1809 | } |
1798 | 1810 | ||
1799 | /* Negative dentry, just create the file */ | 1811 | /* Negative dentry, just create the file */ |
1800 | if (!path.dentry->d_inode) { | 1812 | if (!path.dentry->d_inode) { |
1801 | error = __open_namei_create(&nd, &path, flag, mode); | 1813 | /* |
1814 | * This write is needed to ensure that a | ||
1815 | * ro->rw transition does not occur between | ||
1816 | * the time when the file is created and when | ||
1817 | * a permanent write count is taken through | ||
1818 | * the 'struct file' in nameidata_to_filp(). | ||
1819 | */ | ||
1820 | error = mnt_want_write(nd.path.mnt); | ||
1802 | if (error) | 1821 | if (error) |
1822 | goto exit_mutex_unlock; | ||
1823 | error = __open_namei_create(&nd, &path, flag, mode); | ||
1824 | if (error) { | ||
1825 | mnt_drop_write(nd.path.mnt); | ||
1803 | goto exit; | 1826 | goto exit; |
1804 | return nameidata_to_filp(&nd, open_flag); | 1827 | } |
1828 | filp = nameidata_to_filp(&nd, open_flag); | ||
1829 | mnt_drop_write(nd.path.mnt); | ||
1830 | return filp; | ||
1805 | } | 1831 | } |
1806 | 1832 | ||
1807 | /* | 1833 | /* |
@@ -1831,11 +1857,40 @@ do_last: | |||
1831 | if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) | 1857 | if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) |
1832 | goto exit; | 1858 | goto exit; |
1833 | ok: | 1859 | ok: |
1860 | /* | ||
1861 | * Consider: | ||
1862 | * 1. may_open() truncates a file | ||
1863 | * 2. a rw->ro mount transition occurs | ||
1864 | * 3. nameidata_to_filp() fails due to | ||
1865 | * the ro mount. | ||
1866 | * That would be inconsistent, and should | ||
1867 | * be avoided. Taking this mnt write here | ||
1868 | * ensures that (2) can not occur. | ||
1869 | */ | ||
1870 | will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode); | ||
1871 | if (will_write) { | ||
1872 | error = mnt_want_write(nd.path.mnt); | ||
1873 | if (error) | ||
1874 | goto exit; | ||
1875 | } | ||
1834 | error = may_open(&nd, acc_mode, flag); | 1876 | error = may_open(&nd, acc_mode, flag); |
1835 | if (error) | 1877 | if (error) { |
1878 | if (will_write) | ||
1879 | mnt_drop_write(nd.path.mnt); | ||
1836 | goto exit; | 1880 | goto exit; |
1837 | return nameidata_to_filp(&nd, open_flag); | 1881 | } |
1882 | filp = nameidata_to_filp(&nd, open_flag); | ||
1883 | /* | ||
1884 | * It is now safe to drop the mnt write | ||
1885 | * because the filp has had a write taken | ||
1886 | * on its behalf. | ||
1887 | */ | ||
1888 | if (will_write) | ||
1889 | mnt_drop_write(nd.path.mnt); | ||
1890 | return filp; | ||
1838 | 1891 | ||
1892 | exit_mutex_unlock: | ||
1893 | mutex_unlock(&dir->d_inode->i_mutex); | ||
1839 | exit_dput: | 1894 | exit_dput: |
1840 | path_put_conditional(&path, &nd); | 1895 | path_put_conditional(&path, &nd); |
1841 | exit: | 1896 | exit: |
diff --git a/fs/open.c b/fs/open.c --- a/fs/open.c +++ b/fs/open.c | |||
@@ -730,6 +730,35 @@ out: | |||
730 | return error; | 730 | return error; |
731 | } | 731 | } |
732 | 732 | ||
733 | /* | ||
734 | * You have to be very careful that these write | ||
735 | * counts get cleaned up in error cases and | ||
736 | * upon __fput(). This should probably never | ||
737 | * be called outside of __dentry_open(). | ||
738 | */ | ||
739 | static inline int __get_file_write_access(struct inode *inode, | ||
740 | struct vfsmount *mnt) | ||
741 | { | ||
742 | int error; | ||
743 | error = get_write_access(inode); | ||
744 | if (error) | ||
745 | return error; | ||
746 | /* | ||
747 | * Do not take mount writer counts on | ||
748 | * special files since no writes to | ||
749 | * the mount itself will occur. | ||
750 | */ | ||
751 | if (!special_file(inode->i_mode)) { | ||
752 | /* | ||
753 | * Balanced in __fput() | ||
754 | */ | ||
755 | error = mnt_want_write(mnt); | ||
756 | if (error) | ||
757 | put_write_access(inode); | ||
758 | } | ||
759 | return error; | ||
760 | } | ||
761 | |||
733 | static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, | 762 | static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, |
734 | int flags, struct file *f, | 763 | int flags, struct file *f, |
735 | int (*open)(struct inode *, struct file *)) | 764 | int (*open)(struct inode *, struct file *)) |
@@ -742,7 +771,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, | |||
742 | FMODE_PREAD | FMODE_PWRITE; | 771 | FMODE_PREAD | FMODE_PWRITE; |
743 | inode = dentry->d_inode; | 772 | inode = dentry->d_inode; |
744 | if (f->f_mode & FMODE_WRITE) { | 773 | if (f->f_mode & FMODE_WRITE) { |
745 | error = get_write_access(inode); | 774 | error = __get_file_write_access(inode, mnt); |
746 | if (error) | 775 | if (error) |
747 | goto cleanup_file; | 776 | goto cleanup_file; |
748 | } | 777 | } |
@@ -784,8 +813,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, | |||
784 | 813 | ||
785 | cleanup_all: | 814 | cleanup_all: |
786 | fops_put(f->f_op); | 815 | fops_put(f->f_op); |
787 | if (f->f_mode & FMODE_WRITE) | 816 | if (f->f_mode & FMODE_WRITE) { |
788 | put_write_access(inode); | 817 | put_write_access(inode); |
818 | if (!special_file(inode->i_mode)) | ||
819 | mnt_drop_write(mnt); | ||
820 | } | ||
789 | file_kill(f); | 821 | file_kill(f); |
790 | f->f_path.dentry = NULL; | 822 | f->f_path.dentry = NULL; |
791 | f->f_path.mnt = NULL; | 823 | f->f_path.mnt = NULL; |
diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 34262c11f480..94fd3b08fb77 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c | |||
@@ -598,6 +598,7 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry, | |||
598 | int oflag, mode_t mode, struct mq_attr __user *u_attr) | 598 | int oflag, mode_t mode, struct mq_attr __user *u_attr) |
599 | { | 599 | { |
600 | struct mq_attr attr; | 600 | struct mq_attr attr; |
601 | struct file *result; | ||
601 | int ret; | 602 | int ret; |
602 | 603 | ||
603 | if (u_attr) { | 604 | if (u_attr) { |
@@ -612,13 +613,24 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry, | |||
612 | } | 613 | } |
613 | 614 | ||
614 | mode &= ~current->fs->umask; | 615 | mode &= ~current->fs->umask; |
616 | ret = mnt_want_write(mqueue_mnt); | ||
617 | if (ret) | ||
618 | goto out; | ||
615 | ret = vfs_create(dir->d_inode, dentry, mode, NULL); | 619 | ret = vfs_create(dir->d_inode, dentry, mode, NULL); |
616 | dentry->d_fsdata = NULL; | 620 | dentry->d_fsdata = NULL; |
617 | if (ret) | 621 | if (ret) |
618 | goto out; | 622 | goto out_drop_write; |
619 | 623 | ||
620 | return dentry_open(dentry, mqueue_mnt, oflag); | 624 | result = dentry_open(dentry, mqueue_mnt, oflag); |
625 | /* | ||
626 | * dentry_open() took a persistent mnt_want_write(), | ||
627 | * so we can now drop this one. | ||
628 | */ | ||
629 | mnt_drop_write(mqueue_mnt); | ||
630 | return result; | ||
621 | 631 | ||
632 | out_drop_write: | ||
633 | mnt_drop_write(mqueue_mnt); | ||
622 | out: | 634 | out: |
623 | dput(dentry); | 635 | dput(dentry); |
624 | mntput(mqueue_mnt); | 636 | mntput(mqueue_mnt); |