diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-04-12 17:49:50 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-04-12 17:49:50 -0400 |
commit | 5166701b368caea89d57b14bf41cf39e819dad51 (patch) | |
tree | c73b9d4860809e3afa9359be9d03ba2d8d98a18e /fs | |
parent | 0a7418f5f569512e98789c439198eed4b507cce3 (diff) | |
parent | a786c06d9f2719203c00b3d97b21f9a96980d0b5 (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull vfs updates from Al Viro:
"The first vfs pile, with deep apologies for being very late in this
window.
Assorted cleanups and fixes, plus a large preparatory part of iov_iter
work. There's a lot more of that, but it'll probably go into the next
merge window - it *does* shape up nicely, removes a lot of
boilerplate, gets rid of locking inconsistencies between aio_write and
splice_write and I hope to get Kent's direct-io rewrite merged into
the same queue, but some of the stuff after this point is having
(mostly trivial) conflicts with the things already merged into
mainline and with some I want more testing.
This one passes LTP and xfstests without regressions, in addition to
usual beating. BTW, readahead02 in ltp syscalls testsuite has started
giving failures since "mm/readahead.c: fix readahead failure for
memoryless NUMA nodes and limit readahead pages" - might be a false
positive, might be a real regression..."
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (63 commits)
missing bits of "splice: fix racy pipe->buffers uses"
cifs: fix the race in cifs_writev()
ceph_sync_{,direct_}write: fix an oops on ceph_osdc_new_request() failure
kill generic_file_buffered_write()
ocfs2_file_aio_write(): switch to generic_perform_write()
ceph_aio_write(): switch to generic_perform_write()
xfs_file_buffered_aio_write(): switch to generic_perform_write()
export generic_perform_write(), start getting rid of generic_file_buffer_write()
generic_file_direct_write(): get rid of ppos argument
btrfs_file_aio_write(): get rid of ppos
kill the 5th argument of generic_file_buffered_write()
kill the 4th argument of __generic_file_aio_write()
lustre: don't open-code kernel_recvmsg()
ocfs2: don't open-code kernel_recvmsg()
drbd: don't open-code kernel_recvmsg()
constify blk_rq_map_user_iov() and friends
lustre: switch to kernel_sendmsg()
ocfs2: don't open-code kernel_sendmsg()
take iov_iter stuff to mm/iov_iter.c
process_vm_access: tidy up a bit
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/bio.c | 10 | ||||
-rw-r--r-- | fs/block_dev.c | 2 | ||||
-rw-r--r-- | fs/btrfs/file.c | 16 | ||||
-rw-r--r-- | fs/buffer.c | 6 | ||||
-rw-r--r-- | fs/cachefiles/bind.c | 1 | ||||
-rw-r--r-- | fs/cachefiles/namei.c | 3 | ||||
-rw-r--r-- | fs/ceph/file.c | 12 | ||||
-rw-r--r-- | fs/cifs/cifsfs.c | 1 | ||||
-rw-r--r-- | fs/cifs/file.c | 128 | ||||
-rw-r--r-- | fs/exec.c | 2 | ||||
-rw-r--r-- | fs/ext4/file.c | 2 | ||||
-rw-r--r-- | fs/file.c | 11 | ||||
-rw-r--r-- | fs/file_table.c | 43 | ||||
-rw-r--r-- | fs/fuse/dev.c | 14 | ||||
-rw-r--r-- | fs/fuse/file.c | 5 | ||||
-rw-r--r-- | fs/mount.h | 5 | ||||
-rw-r--r-- | fs/namei.c | 67 | ||||
-rw-r--r-- | fs/namespace.c | 56 | ||||
-rw-r--r-- | fs/ncpfs/inode.c | 50 | ||||
-rw-r--r-- | fs/ncpfs/ncp_fs_sb.h | 2 | ||||
-rw-r--r-- | fs/ntfs/inode.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/cluster/tcp.c | 49 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 9 | ||||
-rw-r--r-- | fs/open.c | 68 | ||||
-rw-r--r-- | fs/pipe.c | 133 | ||||
-rw-r--r-- | fs/pnode.c | 198 | ||||
-rw-r--r-- | fs/pnode.h | 3 | ||||
-rw-r--r-- | fs/proc/namespaces.c | 14 | ||||
-rw-r--r-- | fs/proc/self.c | 2 | ||||
-rw-r--r-- | fs/proc_namespace.c | 1 | ||||
-rw-r--r-- | fs/splice.c | 126 | ||||
-rw-r--r-- | fs/udf/file.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c | 13 | ||||
-rw-r--r-- | fs/xfs/xfs_ioctl.c | 28 |
34 files changed, 385 insertions, 699 deletions
@@ -1002,7 +1002,7 @@ struct bio_map_data { | |||
1002 | }; | 1002 | }; |
1003 | 1003 | ||
1004 | static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, | 1004 | static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, |
1005 | struct sg_iovec *iov, int iov_count, | 1005 | const struct sg_iovec *iov, int iov_count, |
1006 | int is_our_pages) | 1006 | int is_our_pages) |
1007 | { | 1007 | { |
1008 | memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); | 1008 | memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); |
@@ -1022,7 +1022,7 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, | |||
1022 | sizeof(struct sg_iovec) * iov_count, gfp_mask); | 1022 | sizeof(struct sg_iovec) * iov_count, gfp_mask); |
1023 | } | 1023 | } |
1024 | 1024 | ||
1025 | static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, | 1025 | static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, |
1026 | int to_user, int from_user, int do_free_page) | 1026 | int to_user, int from_user, int do_free_page) |
1027 | { | 1027 | { |
1028 | int ret = 0, i; | 1028 | int ret = 0, i; |
@@ -1120,7 +1120,7 @@ EXPORT_SYMBOL(bio_uncopy_user); | |||
1120 | */ | 1120 | */ |
1121 | struct bio *bio_copy_user_iov(struct request_queue *q, | 1121 | struct bio *bio_copy_user_iov(struct request_queue *q, |
1122 | struct rq_map_data *map_data, | 1122 | struct rq_map_data *map_data, |
1123 | struct sg_iovec *iov, int iov_count, | 1123 | const struct sg_iovec *iov, int iov_count, |
1124 | int write_to_vm, gfp_t gfp_mask) | 1124 | int write_to_vm, gfp_t gfp_mask) |
1125 | { | 1125 | { |
1126 | struct bio_map_data *bmd; | 1126 | struct bio_map_data *bmd; |
@@ -1259,7 +1259,7 @@ EXPORT_SYMBOL(bio_copy_user); | |||
1259 | 1259 | ||
1260 | static struct bio *__bio_map_user_iov(struct request_queue *q, | 1260 | static struct bio *__bio_map_user_iov(struct request_queue *q, |
1261 | struct block_device *bdev, | 1261 | struct block_device *bdev, |
1262 | struct sg_iovec *iov, int iov_count, | 1262 | const struct sg_iovec *iov, int iov_count, |
1263 | int write_to_vm, gfp_t gfp_mask) | 1263 | int write_to_vm, gfp_t gfp_mask) |
1264 | { | 1264 | { |
1265 | int i, j; | 1265 | int i, j; |
@@ -1407,7 +1407,7 @@ EXPORT_SYMBOL(bio_map_user); | |||
1407 | * device. Returns an error pointer in case of error. | 1407 | * device. Returns an error pointer in case of error. |
1408 | */ | 1408 | */ |
1409 | struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, | 1409 | struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, |
1410 | struct sg_iovec *iov, int iov_count, | 1410 | const struct sg_iovec *iov, int iov_count, |
1411 | int write_to_vm, gfp_t gfp_mask) | 1411 | int write_to_vm, gfp_t gfp_mask) |
1412 | { | 1412 | { |
1413 | struct bio *bio; | 1413 | struct bio *bio; |
diff --git a/fs/block_dev.c b/fs/block_dev.c index ba0d2b05bb78..552a8d13bc32 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -1518,7 +1518,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
1518 | BUG_ON(iocb->ki_pos != pos); | 1518 | BUG_ON(iocb->ki_pos != pos); |
1519 | 1519 | ||
1520 | blk_start_plug(&plug); | 1520 | blk_start_plug(&plug); |
1521 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 1521 | ret = __generic_file_aio_write(iocb, iov, nr_segs); |
1522 | if (ret > 0) { | 1522 | if (ret > 0) { |
1523 | ssize_t err; | 1523 | ssize_t err; |
1524 | 1524 | ||
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c5998477fe60..eb742c07e7a4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, | |||
425 | struct page *page = prepared_pages[pg]; | 425 | struct page *page = prepared_pages[pg]; |
426 | /* | 426 | /* |
427 | * Copy data from userspace to the current page | 427 | * Copy data from userspace to the current page |
428 | * | ||
429 | * Disable pagefault to avoid recursive lock since | ||
430 | * the pages are already locked | ||
431 | */ | 428 | */ |
432 | pagefault_disable(); | ||
433 | copied = iov_iter_copy_from_user_atomic(page, i, offset, count); | 429 | copied = iov_iter_copy_from_user_atomic(page, i, offset, count); |
434 | pagefault_enable(); | ||
435 | 430 | ||
436 | /* Flush processor's dcache for this page */ | 431 | /* Flush processor's dcache for this page */ |
437 | flush_dcache_page(page); | 432 | flush_dcache_page(page); |
@@ -1665,7 +1660,7 @@ again: | |||
1665 | static ssize_t __btrfs_direct_write(struct kiocb *iocb, | 1660 | static ssize_t __btrfs_direct_write(struct kiocb *iocb, |
1666 | const struct iovec *iov, | 1661 | const struct iovec *iov, |
1667 | unsigned long nr_segs, loff_t pos, | 1662 | unsigned long nr_segs, loff_t pos, |
1668 | loff_t *ppos, size_t count, size_t ocount) | 1663 | size_t count, size_t ocount) |
1669 | { | 1664 | { |
1670 | struct file *file = iocb->ki_filp; | 1665 | struct file *file = iocb->ki_filp; |
1671 | struct iov_iter i; | 1666 | struct iov_iter i; |
@@ -1674,7 +1669,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, | |||
1674 | loff_t endbyte; | 1669 | loff_t endbyte; |
1675 | int err; | 1670 | int err; |
1676 | 1671 | ||
1677 | written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, | 1672 | written = generic_file_direct_write(iocb, iov, &nr_segs, pos, |
1678 | count, ocount); | 1673 | count, ocount); |
1679 | 1674 | ||
1680 | if (written < 0 || written == count) | 1675 | if (written < 0 || written == count) |
@@ -1693,7 +1688,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, | |||
1693 | if (err) | 1688 | if (err) |
1694 | goto out; | 1689 | goto out; |
1695 | written += written_buffered; | 1690 | written += written_buffered; |
1696 | *ppos = pos + written_buffered; | 1691 | iocb->ki_pos = pos + written_buffered; |
1697 | invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, | 1692 | invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, |
1698 | endbyte >> PAGE_CACHE_SHIFT); | 1693 | endbyte >> PAGE_CACHE_SHIFT); |
1699 | out: | 1694 | out: |
@@ -1725,7 +1720,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1725 | struct file *file = iocb->ki_filp; | 1720 | struct file *file = iocb->ki_filp; |
1726 | struct inode *inode = file_inode(file); | 1721 | struct inode *inode = file_inode(file); |
1727 | struct btrfs_root *root = BTRFS_I(inode)->root; | 1722 | struct btrfs_root *root = BTRFS_I(inode)->root; |
1728 | loff_t *ppos = &iocb->ki_pos; | ||
1729 | u64 start_pos; | 1723 | u64 start_pos; |
1730 | u64 end_pos; | 1724 | u64 end_pos; |
1731 | ssize_t num_written = 0; | 1725 | ssize_t num_written = 0; |
@@ -1796,7 +1790,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1796 | 1790 | ||
1797 | if (unlikely(file->f_flags & O_DIRECT)) { | 1791 | if (unlikely(file->f_flags & O_DIRECT)) { |
1798 | num_written = __btrfs_direct_write(iocb, iov, nr_segs, | 1792 | num_written = __btrfs_direct_write(iocb, iov, nr_segs, |
1799 | pos, ppos, count, ocount); | 1793 | pos, count, ocount); |
1800 | } else { | 1794 | } else { |
1801 | struct iov_iter i; | 1795 | struct iov_iter i; |
1802 | 1796 | ||
@@ -1804,7 +1798,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1804 | 1798 | ||
1805 | num_written = __btrfs_buffered_write(file, &i, pos); | 1799 | num_written = __btrfs_buffered_write(file, &i, pos); |
1806 | if (num_written > 0) | 1800 | if (num_written > 0) |
1807 | *ppos = pos + num_written; | 1801 | iocb->ki_pos = pos + num_written; |
1808 | } | 1802 | } |
1809 | 1803 | ||
1810 | mutex_unlock(&inode->i_mutex); | 1804 | mutex_unlock(&inode->i_mutex); |
diff --git a/fs/buffer.c b/fs/buffer.c index 8c53a2b15ecb..9ddb9fc7d923 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -2114,8 +2114,8 @@ EXPORT_SYMBOL(generic_write_end); | |||
2114 | * Returns true if all buffers which correspond to a file portion | 2114 | * Returns true if all buffers which correspond to a file portion |
2115 | * we want to read are uptodate. | 2115 | * we want to read are uptodate. |
2116 | */ | 2116 | */ |
2117 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | 2117 | int block_is_partially_uptodate(struct page *page, unsigned long from, |
2118 | unsigned long from) | 2118 | unsigned long count) |
2119 | { | 2119 | { |
2120 | unsigned block_start, block_end, blocksize; | 2120 | unsigned block_start, block_end, blocksize; |
2121 | unsigned to; | 2121 | unsigned to; |
@@ -2127,7 +2127,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | |||
2127 | 2127 | ||
2128 | head = page_buffers(page); | 2128 | head = page_buffers(page); |
2129 | blocksize = head->b_size; | 2129 | blocksize = head->b_size; |
2130 | to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); | 2130 | to = min_t(unsigned, PAGE_CACHE_SIZE - from, count); |
2131 | to = from + to; | 2131 | to = from + to; |
2132 | if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) | 2132 | if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) |
2133 | return 0; | 2133 | return 0; |
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 622f4696e484..5b99bafc31d1 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c | |||
@@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) | |||
124 | /* check parameters */ | 124 | /* check parameters */ |
125 | ret = -EOPNOTSUPP; | 125 | ret = -EOPNOTSUPP; |
126 | if (!root->d_inode || | 126 | if (!root->d_inode || |
127 | !root->d_inode->i_op || | ||
128 | !root->d_inode->i_op->lookup || | 127 | !root->d_inode->i_op->lookup || |
129 | !root->d_inode->i_op->mkdir || | 128 | !root->d_inode->i_op->mkdir || |
130 | !root->d_inode->i_op->setxattr || | 129 | !root->d_inode->i_op->setxattr || |
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 6494d9f673aa..c0a681705104 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c | |||
@@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, | |||
779 | } | 779 | } |
780 | 780 | ||
781 | ret = -EPERM; | 781 | ret = -EPERM; |
782 | if (!subdir->d_inode->i_op || | 782 | if (!subdir->d_inode->i_op->setxattr || |
783 | !subdir->d_inode->i_op->setxattr || | ||
784 | !subdir->d_inode->i_op->getxattr || | 783 | !subdir->d_inode->i_op->getxattr || |
785 | !subdir->d_inode->i_op->lookup || | 784 | !subdir->d_inode->i_op->lookup || |
786 | !subdir->d_inode->i_op->mkdir || | 785 | !subdir->d_inode->i_op->mkdir || |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 66075a4ad979..39da1c2efa50 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -601,7 +601,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
601 | false); | 601 | false); |
602 | if (IS_ERR(req)) { | 602 | if (IS_ERR(req)) { |
603 | ret = PTR_ERR(req); | 603 | ret = PTR_ERR(req); |
604 | goto out; | 604 | break; |
605 | } | 605 | } |
606 | 606 | ||
607 | num_pages = calc_pages_for(page_align, len); | 607 | num_pages = calc_pages_for(page_align, len); |
@@ -719,7 +719,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, | |||
719 | false); | 719 | false); |
720 | if (IS_ERR(req)) { | 720 | if (IS_ERR(req)) { |
721 | ret = PTR_ERR(req); | 721 | ret = PTR_ERR(req); |
722 | goto out; | 722 | break; |
723 | } | 723 | } |
724 | 724 | ||
725 | /* | 725 | /* |
@@ -972,6 +972,7 @@ retry_snap: | |||
972 | } | 972 | } |
973 | } else { | 973 | } else { |
974 | loff_t old_size = inode->i_size; | 974 | loff_t old_size = inode->i_size; |
975 | struct iov_iter from; | ||
975 | /* | 976 | /* |
976 | * No need to acquire the i_truncate_mutex. Because | 977 | * No need to acquire the i_truncate_mutex. Because |
977 | * the MDS revokes Fwb caps before sending truncate | 978 | * the MDS revokes Fwb caps before sending truncate |
@@ -979,9 +980,10 @@ retry_snap: | |||
979 | * are pending vmtruncate. So write and vmtruncate | 980 | * are pending vmtruncate. So write and vmtruncate |
980 | * can not run at the same time | 981 | * can not run at the same time |
981 | */ | 982 | */ |
982 | written = generic_file_buffered_write(iocb, iov, nr_segs, | 983 | iov_iter_init(&from, iov, nr_segs, count, 0); |
983 | pos, &iocb->ki_pos, | 984 | written = generic_perform_write(file, &from, pos); |
984 | count, 0); | 985 | if (likely(written >= 0)) |
986 | iocb->ki_pos = pos + written; | ||
985 | if (inode->i_size > old_size) | 987 | if (inode->i_size > old_size) |
986 | ceph_fscache_update_objectsize(inode); | 988 | ceph_fscache_update_objectsize(inode); |
987 | mutex_unlock(&inode->i_mutex); | 989 | mutex_unlock(&inode->i_mutex); |
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2c70cbe35d39..df9c9141c099 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
@@ -850,7 +850,6 @@ const struct inode_operations cifs_file_inode_ops = { | |||
850 | /* revalidate:cifs_revalidate, */ | 850 | /* revalidate:cifs_revalidate, */ |
851 | .setattr = cifs_setattr, | 851 | .setattr = cifs_setattr, |
852 | .getattr = cifs_getattr, /* do we need this anymore? */ | 852 | .getattr = cifs_getattr, /* do we need this anymore? */ |
853 | .rename = cifs_rename, | ||
854 | .permission = cifs_permission, | 853 | .permission = cifs_permission, |
855 | #ifdef CONFIG_CIFS_XATTR | 854 | #ifdef CONFIG_CIFS_XATTR |
856 | .setxattr = cifs_setxattr, | 855 | .setxattr = cifs_setxattr, |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 216d7e99f921..8807442c94dd 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -2579,19 +2579,32 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, | |||
2579 | struct cifsInodeInfo *cinode = CIFS_I(inode); | 2579 | struct cifsInodeInfo *cinode = CIFS_I(inode); |
2580 | struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; | 2580 | struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; |
2581 | ssize_t rc = -EACCES; | 2581 | ssize_t rc = -EACCES; |
2582 | loff_t lock_pos = pos; | 2582 | loff_t lock_pos = iocb->ki_pos; |
2583 | 2583 | ||
2584 | if (file->f_flags & O_APPEND) | ||
2585 | lock_pos = i_size_read(inode); | ||
2586 | /* | 2584 | /* |
2587 | * We need to hold the sem to be sure nobody modifies lock list | 2585 | * We need to hold the sem to be sure nobody modifies lock list |
2588 | * with a brlock that prevents writing. | 2586 | * with a brlock that prevents writing. |
2589 | */ | 2587 | */ |
2590 | down_read(&cinode->lock_sem); | 2588 | down_read(&cinode->lock_sem); |
2589 | mutex_lock(&inode->i_mutex); | ||
2590 | if (file->f_flags & O_APPEND) | ||
2591 | lock_pos = i_size_read(inode); | ||
2591 | if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs), | 2592 | if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs), |
2592 | server->vals->exclusive_lock_type, NULL, | 2593 | server->vals->exclusive_lock_type, NULL, |
2593 | CIFS_WRITE_OP)) | 2594 | CIFS_WRITE_OP)) { |
2594 | rc = generic_file_aio_write(iocb, iov, nr_segs, pos); | 2595 | rc = __generic_file_aio_write(iocb, iov, nr_segs); |
2596 | mutex_unlock(&inode->i_mutex); | ||
2597 | |||
2598 | if (rc > 0) { | ||
2599 | ssize_t err; | ||
2600 | |||
2601 | err = generic_write_sync(file, iocb->ki_pos - rc, rc); | ||
2602 | if (rc < 0) | ||
2603 | rc = err; | ||
2604 | } | ||
2605 | } else { | ||
2606 | mutex_unlock(&inode->i_mutex); | ||
2607 | } | ||
2595 | up_read(&cinode->lock_sem); | 2608 | up_read(&cinode->lock_sem); |
2596 | return rc; | 2609 | return rc; |
2597 | } | 2610 | } |
@@ -2727,56 +2740,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata) | |||
2727 | /** | 2740 | /** |
2728 | * cifs_readdata_to_iov - copy data from pages in response to an iovec | 2741 | * cifs_readdata_to_iov - copy data from pages in response to an iovec |
2729 | * @rdata: the readdata response with list of pages holding data | 2742 | * @rdata: the readdata response with list of pages holding data |
2730 | * @iov: vector in which we should copy the data | 2743 | * @iter: destination for our data |
2731 | * @nr_segs: number of segments in vector | ||
2732 | * @offset: offset into file of the first iovec | ||
2733 | * @copied: used to return the amount of data copied to the iov | ||
2734 | * | 2744 | * |
2735 | * This function copies data from a list of pages in a readdata response into | 2745 | * This function copies data from a list of pages in a readdata response into |
2736 | * an array of iovecs. It will first calculate where the data should go | 2746 | * an array of iovecs. It will first calculate where the data should go |
2737 | * based on the info in the readdata and then copy the data into that spot. | 2747 | * based on the info in the readdata and then copy the data into that spot. |
2738 | */ | 2748 | */ |
2739 | static ssize_t | 2749 | static int |
2740 | cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov, | 2750 | cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) |
2741 | unsigned long nr_segs, loff_t offset, ssize_t *copied) | ||
2742 | { | 2751 | { |
2743 | int rc = 0; | 2752 | size_t remaining = rdata->bytes; |
2744 | struct iov_iter ii; | ||
2745 | size_t pos = rdata->offset - offset; | ||
2746 | ssize_t remaining = rdata->bytes; | ||
2747 | unsigned char *pdata; | ||
2748 | unsigned int i; | 2753 | unsigned int i; |
2749 | 2754 | ||
2750 | /* set up iov_iter and advance to the correct offset */ | ||
2751 | iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0); | ||
2752 | iov_iter_advance(&ii, pos); | ||
2753 | |||
2754 | *copied = 0; | ||
2755 | for (i = 0; i < rdata->nr_pages; i++) { | 2755 | for (i = 0; i < rdata->nr_pages; i++) { |
2756 | ssize_t copy; | ||
2757 | struct page *page = rdata->pages[i]; | 2756 | struct page *page = rdata->pages[i]; |
2758 | 2757 | size_t copy = min(remaining, PAGE_SIZE); | |
2759 | /* copy a whole page or whatever's left */ | 2758 | size_t written = copy_page_to_iter(page, 0, copy, iter); |
2760 | copy = min_t(ssize_t, remaining, PAGE_SIZE); | 2759 | remaining -= written; |
2761 | 2760 | if (written < copy && iov_iter_count(iter) > 0) | |
2762 | /* ...but limit it to whatever space is left in the iov */ | 2761 | break; |
2763 | copy = min_t(ssize_t, copy, iov_iter_count(&ii)); | ||
2764 | |||
2765 | /* go while there's data to be copied and no errors */ | ||
2766 | if (copy && !rc) { | ||
2767 | pdata = kmap(page); | ||
2768 | rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset, | ||
2769 | (int)copy); | ||
2770 | kunmap(page); | ||
2771 | if (!rc) { | ||
2772 | *copied += copy; | ||
2773 | remaining -= copy; | ||
2774 | iov_iter_advance(&ii, copy); | ||
2775 | } | ||
2776 | } | ||
2777 | } | 2762 | } |
2778 | 2763 | return remaining ? -EFAULT : 0; | |
2779 | return rc; | ||
2780 | } | 2764 | } |
2781 | 2765 | ||
2782 | static void | 2766 | static void |
@@ -2837,20 +2821,21 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, | |||
2837 | return total_read > 0 ? total_read : result; | 2821 | return total_read > 0 ? total_read : result; |
2838 | } | 2822 | } |
2839 | 2823 | ||
2840 | static ssize_t | 2824 | ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, |
2841 | cifs_iovec_read(struct file *file, const struct iovec *iov, | 2825 | unsigned long nr_segs, loff_t pos) |
2842 | unsigned long nr_segs, loff_t *poffset) | ||
2843 | { | 2826 | { |
2827 | struct file *file = iocb->ki_filp; | ||
2844 | ssize_t rc; | 2828 | ssize_t rc; |
2845 | size_t len, cur_len; | 2829 | size_t len, cur_len; |
2846 | ssize_t total_read = 0; | 2830 | ssize_t total_read = 0; |
2847 | loff_t offset = *poffset; | 2831 | loff_t offset = pos; |
2848 | unsigned int npages; | 2832 | unsigned int npages; |
2849 | struct cifs_sb_info *cifs_sb; | 2833 | struct cifs_sb_info *cifs_sb; |
2850 | struct cifs_tcon *tcon; | 2834 | struct cifs_tcon *tcon; |
2851 | struct cifsFileInfo *open_file; | 2835 | struct cifsFileInfo *open_file; |
2852 | struct cifs_readdata *rdata, *tmp; | 2836 | struct cifs_readdata *rdata, *tmp; |
2853 | struct list_head rdata_list; | 2837 | struct list_head rdata_list; |
2838 | struct iov_iter to; | ||
2854 | pid_t pid; | 2839 | pid_t pid; |
2855 | 2840 | ||
2856 | if (!nr_segs) | 2841 | if (!nr_segs) |
@@ -2860,6 +2845,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, | |||
2860 | if (!len) | 2845 | if (!len) |
2861 | return 0; | 2846 | return 0; |
2862 | 2847 | ||
2848 | iov_iter_init(&to, iov, nr_segs, len, 0); | ||
2849 | |||
2863 | INIT_LIST_HEAD(&rdata_list); | 2850 | INIT_LIST_HEAD(&rdata_list); |
2864 | cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); | 2851 | cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); |
2865 | open_file = file->private_data; | 2852 | open_file = file->private_data; |
@@ -2917,55 +2904,44 @@ error: | |||
2917 | if (!list_empty(&rdata_list)) | 2904 | if (!list_empty(&rdata_list)) |
2918 | rc = 0; | 2905 | rc = 0; |
2919 | 2906 | ||
2907 | len = iov_iter_count(&to); | ||
2920 | /* the loop below should proceed in the order of increasing offsets */ | 2908 | /* the loop below should proceed in the order of increasing offsets */ |
2921 | restart_loop: | ||
2922 | list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { | 2909 | list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { |
2910 | again: | ||
2923 | if (!rc) { | 2911 | if (!rc) { |
2924 | ssize_t copied; | ||
2925 | |||
2926 | /* FIXME: freezable sleep too? */ | 2912 | /* FIXME: freezable sleep too? */ |
2927 | rc = wait_for_completion_killable(&rdata->done); | 2913 | rc = wait_for_completion_killable(&rdata->done); |
2928 | if (rc) | 2914 | if (rc) |
2929 | rc = -EINTR; | 2915 | rc = -EINTR; |
2930 | else if (rdata->result) | 2916 | else if (rdata->result) { |
2931 | rc = rdata->result; | 2917 | rc = rdata->result; |
2932 | else { | 2918 | /* resend call if it's a retryable error */ |
2933 | rc = cifs_readdata_to_iov(rdata, iov, | 2919 | if (rc == -EAGAIN) { |
2934 | nr_segs, *poffset, | 2920 | rc = cifs_retry_async_readv(rdata); |
2935 | &copied); | 2921 | goto again; |
2936 | total_read += copied; | 2922 | } |
2923 | } else { | ||
2924 | rc = cifs_readdata_to_iov(rdata, &to); | ||
2937 | } | 2925 | } |
2938 | 2926 | ||
2939 | /* resend call if it's a retryable error */ | ||
2940 | if (rc == -EAGAIN) { | ||
2941 | rc = cifs_retry_async_readv(rdata); | ||
2942 | goto restart_loop; | ||
2943 | } | ||
2944 | } | 2927 | } |
2945 | list_del_init(&rdata->list); | 2928 | list_del_init(&rdata->list); |
2946 | kref_put(&rdata->refcount, cifs_uncached_readdata_release); | 2929 | kref_put(&rdata->refcount, cifs_uncached_readdata_release); |
2947 | } | 2930 | } |
2948 | 2931 | ||
2932 | total_read = len - iov_iter_count(&to); | ||
2933 | |||
2949 | cifs_stats_bytes_read(tcon, total_read); | 2934 | cifs_stats_bytes_read(tcon, total_read); |
2950 | *poffset += total_read; | ||
2951 | 2935 | ||
2952 | /* mask nodata case */ | 2936 | /* mask nodata case */ |
2953 | if (rc == -ENODATA) | 2937 | if (rc == -ENODATA) |
2954 | rc = 0; | 2938 | rc = 0; |
2955 | 2939 | ||
2956 | return total_read ? total_read : rc; | 2940 | if (total_read) { |
2957 | } | 2941 | iocb->ki_pos = pos + total_read; |
2958 | 2942 | return total_read; | |
2959 | ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, | 2943 | } |
2960 | unsigned long nr_segs, loff_t pos) | 2944 | return rc; |
2961 | { | ||
2962 | ssize_t read; | ||
2963 | |||
2964 | read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); | ||
2965 | if (read > 0) | ||
2966 | iocb->ki_pos = pos; | ||
2967 | |||
2968 | return read; | ||
2969 | } | 2945 | } |
2970 | 2946 | ||
2971 | ssize_t | 2947 | ssize_t |
@@ -813,7 +813,7 @@ EXPORT_SYMBOL(kernel_read); | |||
813 | 813 | ||
814 | ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) | 814 | ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) |
815 | { | 815 | { |
816 | ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); | 816 | ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); |
817 | if (res > 0) | 817 | if (res > 0) |
818 | flush_icache_range(addr, addr + len); | 818 | flush_icache_range(addr, addr + len); |
819 | return res; | 819 | return res; |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4e508fc83dcf..ca7502d89fde 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -146,7 +146,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, | |||
146 | overwrite = 1; | 146 | overwrite = 1; |
147 | } | 147 | } |
148 | 148 | ||
149 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 149 | ret = __generic_file_aio_write(iocb, iov, nr_segs); |
150 | mutex_unlock(&inode->i_mutex); | 150 | mutex_unlock(&inode->i_mutex); |
151 | 151 | ||
152 | if (ret > 0) { | 152 | if (ret > 0) { |
@@ -25,7 +25,10 @@ | |||
25 | 25 | ||
26 | int sysctl_nr_open __read_mostly = 1024*1024; | 26 | int sysctl_nr_open __read_mostly = 1024*1024; |
27 | int sysctl_nr_open_min = BITS_PER_LONG; | 27 | int sysctl_nr_open_min = BITS_PER_LONG; |
28 | int sysctl_nr_open_max = 1024 * 1024; /* raised later */ | 28 | /* our max() is unusable in constant expressions ;-/ */ |
29 | #define __const_max(x, y) ((x) < (y) ? (x) : (y)) | ||
30 | int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) & | ||
31 | -BITS_PER_LONG; | ||
29 | 32 | ||
30 | static void *alloc_fdmem(size_t size) | 33 | static void *alloc_fdmem(size_t size) |
31 | { | 34 | { |
@@ -429,12 +432,6 @@ void exit_files(struct task_struct *tsk) | |||
429 | } | 432 | } |
430 | } | 433 | } |
431 | 434 | ||
432 | void __init files_defer_init(void) | ||
433 | { | ||
434 | sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & | ||
435 | -BITS_PER_LONG; | ||
436 | } | ||
437 | |||
438 | struct files_struct init_files = { | 435 | struct files_struct init_files = { |
439 | .count = ATOMIC_INIT(1), | 436 | .count = ATOMIC_INIT(1), |
440 | .fdt = &init_files.fdtab, | 437 | .fdt = &init_files.fdtab, |
diff --git a/fs/file_table.c b/fs/file_table.c index 01071c4d752e..a374f5033e97 100644 --- a/fs/file_table.c +++ b/fs/file_table.c | |||
@@ -52,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head) | |||
52 | static inline void file_free(struct file *f) | 52 | static inline void file_free(struct file *f) |
53 | { | 53 | { |
54 | percpu_counter_dec(&nr_files); | 54 | percpu_counter_dec(&nr_files); |
55 | file_check_state(f); | ||
56 | call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); | 55 | call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); |
57 | } | 56 | } |
58 | 57 | ||
@@ -178,47 +177,12 @@ struct file *alloc_file(struct path *path, fmode_t mode, | |||
178 | file->f_mapping = path->dentry->d_inode->i_mapping; | 177 | file->f_mapping = path->dentry->d_inode->i_mapping; |
179 | file->f_mode = mode; | 178 | file->f_mode = mode; |
180 | file->f_op = fop; | 179 | file->f_op = fop; |
181 | |||
182 | /* | ||
183 | * These mounts don't really matter in practice | ||
184 | * for r/o bind mounts. They aren't userspace- | ||
185 | * visible. We do this for consistency, and so | ||
186 | * that we can do debugging checks at __fput() | ||
187 | */ | ||
188 | if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { | ||
189 | file_take_write(file); | ||
190 | WARN_ON(mnt_clone_write(path->mnt)); | ||
191 | } | ||
192 | if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) | 180 | if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) |
193 | i_readcount_inc(path->dentry->d_inode); | 181 | i_readcount_inc(path->dentry->d_inode); |
194 | return file; | 182 | return file; |
195 | } | 183 | } |
196 | EXPORT_SYMBOL(alloc_file); | 184 | EXPORT_SYMBOL(alloc_file); |
197 | 185 | ||
198 | /** | ||
199 | * drop_file_write_access - give up ability to write to a file | ||
200 | * @file: the file to which we will stop writing | ||
201 | * | ||
202 | * This is a central place which will give up the ability | ||
203 | * to write to @file, along with access to write through | ||
204 | * its vfsmount. | ||
205 | */ | ||
206 | static void drop_file_write_access(struct file *file) | ||
207 | { | ||
208 | struct vfsmount *mnt = file->f_path.mnt; | ||
209 | struct dentry *dentry = file->f_path.dentry; | ||
210 | struct inode *inode = dentry->d_inode; | ||
211 | |||
212 | put_write_access(inode); | ||
213 | |||
214 | if (special_file(inode->i_mode)) | ||
215 | return; | ||
216 | if (file_check_writeable(file) != 0) | ||
217 | return; | ||
218 | __mnt_drop_write(mnt); | ||
219 | file_release_write(file); | ||
220 | } | ||
221 | |||
222 | /* the real guts of fput() - releasing the last reference to file | 186 | /* the real guts of fput() - releasing the last reference to file |
223 | */ | 187 | */ |
224 | static void __fput(struct file *file) | 188 | static void __fput(struct file *file) |
@@ -253,8 +217,10 @@ static void __fput(struct file *file) | |||
253 | put_pid(file->f_owner.pid); | 217 | put_pid(file->f_owner.pid); |
254 | if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) | 218 | if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) |
255 | i_readcount_dec(inode); | 219 | i_readcount_dec(inode); |
256 | if (file->f_mode & FMODE_WRITE) | 220 | if (file->f_mode & FMODE_WRITER) { |
257 | drop_file_write_access(file); | 221 | put_write_access(inode); |
222 | __mnt_drop_write(mnt); | ||
223 | } | ||
258 | file->f_path.dentry = NULL; | 224 | file->f_path.dentry = NULL; |
259 | file->f_path.mnt = NULL; | 225 | file->f_path.mnt = NULL; |
260 | file->f_inode = NULL; | 226 | file->f_inode = NULL; |
@@ -359,6 +325,5 @@ void __init files_init(unsigned long mempages) | |||
359 | 325 | ||
360 | n = (mempages * (PAGE_SIZE / 1024)) / 10; | 326 | n = (mempages * (PAGE_SIZE / 1024)) / 10; |
361 | files_stat.max_files = max_t(unsigned long, n, NR_FILE); | 327 | files_stat.max_files = max_t(unsigned long, n, NR_FILE); |
362 | files_defer_init(); | ||
363 | percpu_counter_init(&nr_files, 0); | 328 | percpu_counter_init(&nr_files, 0); |
364 | } | 329 | } |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a648bb455ae..aac71ce373e4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -667,15 +667,15 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) | |||
667 | struct pipe_buffer *buf = cs->currbuf; | 667 | struct pipe_buffer *buf = cs->currbuf; |
668 | 668 | ||
669 | if (!cs->write) { | 669 | if (!cs->write) { |
670 | buf->ops->unmap(cs->pipe, buf, cs->mapaddr); | 670 | kunmap_atomic(cs->mapaddr); |
671 | } else { | 671 | } else { |
672 | kunmap(buf->page); | 672 | kunmap_atomic(cs->mapaddr); |
673 | buf->len = PAGE_SIZE - cs->len; | 673 | buf->len = PAGE_SIZE - cs->len; |
674 | } | 674 | } |
675 | cs->currbuf = NULL; | 675 | cs->currbuf = NULL; |
676 | cs->mapaddr = NULL; | 676 | cs->mapaddr = NULL; |
677 | } else if (cs->mapaddr) { | 677 | } else if (cs->mapaddr) { |
678 | kunmap(cs->pg); | 678 | kunmap_atomic(cs->mapaddr); |
679 | if (cs->write) { | 679 | if (cs->write) { |
680 | flush_dcache_page(cs->pg); | 680 | flush_dcache_page(cs->pg); |
681 | set_page_dirty_lock(cs->pg); | 681 | set_page_dirty_lock(cs->pg); |
@@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) | |||
706 | 706 | ||
707 | BUG_ON(!cs->nr_segs); | 707 | BUG_ON(!cs->nr_segs); |
708 | cs->currbuf = buf; | 708 | cs->currbuf = buf; |
709 | cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); | 709 | cs->mapaddr = kmap_atomic(buf->page); |
710 | cs->len = buf->len; | 710 | cs->len = buf->len; |
711 | cs->buf = cs->mapaddr + buf->offset; | 711 | cs->buf = cs->mapaddr + buf->offset; |
712 | cs->pipebufs++; | 712 | cs->pipebufs++; |
@@ -726,7 +726,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) | |||
726 | buf->len = 0; | 726 | buf->len = 0; |
727 | 727 | ||
728 | cs->currbuf = buf; | 728 | cs->currbuf = buf; |
729 | cs->mapaddr = kmap(page); | 729 | cs->mapaddr = kmap_atomic(page); |
730 | cs->buf = cs->mapaddr; | 730 | cs->buf = cs->mapaddr; |
731 | cs->len = PAGE_SIZE; | 731 | cs->len = PAGE_SIZE; |
732 | cs->pipebufs++; | 732 | cs->pipebufs++; |
@@ -745,7 +745,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) | |||
745 | return err; | 745 | return err; |
746 | BUG_ON(err != 1); | 746 | BUG_ON(err != 1); |
747 | offset = cs->addr % PAGE_SIZE; | 747 | offset = cs->addr % PAGE_SIZE; |
748 | cs->mapaddr = kmap(cs->pg); | 748 | cs->mapaddr = kmap_atomic(cs->pg); |
749 | cs->buf = cs->mapaddr + offset; | 749 | cs->buf = cs->mapaddr + offset; |
750 | cs->len = min(PAGE_SIZE - offset, cs->seglen); | 750 | cs->len = min(PAGE_SIZE - offset, cs->seglen); |
751 | cs->seglen -= cs->len; | 751 | cs->seglen -= cs->len; |
@@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) | |||
874 | out_fallback_unlock: | 874 | out_fallback_unlock: |
875 | unlock_page(newpage); | 875 | unlock_page(newpage); |
876 | out_fallback: | 876 | out_fallback: |
877 | cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); | 877 | cs->mapaddr = kmap_atomic(buf->page); |
878 | cs->buf = cs->mapaddr + buf->offset; | 878 | cs->buf = cs->mapaddr + buf->offset; |
879 | 879 | ||
880 | err = lock_request(cs->fc, cs->req); | 880 | err = lock_request(cs->fc, cs->req); |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 48992cac714b..13f8bdec5110 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
@@ -1086,9 +1086,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, | |||
1086 | if (mapping_writably_mapped(mapping)) | 1086 | if (mapping_writably_mapped(mapping)) |
1087 | flush_dcache_page(page); | 1087 | flush_dcache_page(page); |
1088 | 1088 | ||
1089 | pagefault_disable(); | ||
1090 | tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); | 1089 | tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); |
1091 | pagefault_enable(); | ||
1092 | flush_dcache_page(page); | 1090 | flush_dcache_page(page); |
1093 | 1091 | ||
1094 | mark_page_accessed(page); | 1092 | mark_page_accessed(page); |
@@ -1237,8 +1235,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
1237 | goto out; | 1235 | goto out; |
1238 | 1236 | ||
1239 | if (file->f_flags & O_DIRECT) { | 1237 | if (file->f_flags & O_DIRECT) { |
1240 | written = generic_file_direct_write(iocb, iov, &nr_segs, | 1238 | written = generic_file_direct_write(iocb, iov, &nr_segs, pos, |
1241 | pos, &iocb->ki_pos, | ||
1242 | count, ocount); | 1239 | count, ocount); |
1243 | if (written < 0 || written == count) | 1240 | if (written < 0 || written == count) |
1244 | goto out; | 1241 | goto out; |
diff --git a/fs/mount.h b/fs/mount.h index b29e42f05f34..d55297f2fa05 100644 --- a/fs/mount.h +++ b/fs/mount.h | |||
@@ -10,7 +10,7 @@ struct mnt_namespace { | |||
10 | struct user_namespace *user_ns; | 10 | struct user_namespace *user_ns; |
11 | u64 seq; /* Sequence number to prevent loops */ | 11 | u64 seq; /* Sequence number to prevent loops */ |
12 | wait_queue_head_t poll; | 12 | wait_queue_head_t poll; |
13 | int event; | 13 | u64 event; |
14 | }; | 14 | }; |
15 | 15 | ||
16 | struct mnt_pcp { | 16 | struct mnt_pcp { |
@@ -104,6 +104,9 @@ struct proc_mounts { | |||
104 | struct mnt_namespace *ns; | 104 | struct mnt_namespace *ns; |
105 | struct path root; | 105 | struct path root; |
106 | int (*show)(struct seq_file *, struct vfsmount *); | 106 | int (*show)(struct seq_file *, struct vfsmount *); |
107 | void *cached_mount; | ||
108 | u64 cached_event; | ||
109 | loff_t cached_index; | ||
107 | }; | 110 | }; |
108 | 111 | ||
109 | #define proc_mounts(p) (container_of((p), struct proc_mounts, m)) | 112 | #define proc_mounts(p) (container_of((p), struct proc_mounts, m)) |
diff --git a/fs/namei.c b/fs/namei.c index 88339f59efb5..c6157c894fce 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -358,6 +358,7 @@ int generic_permission(struct inode *inode, int mask) | |||
358 | 358 | ||
359 | return -EACCES; | 359 | return -EACCES; |
360 | } | 360 | } |
361 | EXPORT_SYMBOL(generic_permission); | ||
361 | 362 | ||
362 | /* | 363 | /* |
363 | * We _really_ want to just do "generic_permission()" without | 364 | * We _really_ want to just do "generic_permission()" without |
@@ -455,6 +456,7 @@ int inode_permission(struct inode *inode, int mask) | |||
455 | return retval; | 456 | return retval; |
456 | return __inode_permission(inode, mask); | 457 | return __inode_permission(inode, mask); |
457 | } | 458 | } |
459 | EXPORT_SYMBOL(inode_permission); | ||
458 | 460 | ||
459 | /** | 461 | /** |
460 | * path_get - get a reference to a path | 462 | * path_get - get a reference to a path |
@@ -924,6 +926,7 @@ int follow_up(struct path *path) | |||
924 | path->mnt = &parent->mnt; | 926 | path->mnt = &parent->mnt; |
925 | return 1; | 927 | return 1; |
926 | } | 928 | } |
929 | EXPORT_SYMBOL(follow_up); | ||
927 | 930 | ||
928 | /* | 931 | /* |
929 | * Perform an automount | 932 | * Perform an automount |
@@ -1085,6 +1088,7 @@ int follow_down_one(struct path *path) | |||
1085 | } | 1088 | } |
1086 | return 0; | 1089 | return 0; |
1087 | } | 1090 | } |
1091 | EXPORT_SYMBOL(follow_down_one); | ||
1088 | 1092 | ||
1089 | static inline bool managed_dentry_might_block(struct dentry *dentry) | 1093 | static inline bool managed_dentry_might_block(struct dentry *dentry) |
1090 | { | 1094 | { |
@@ -1223,6 +1227,7 @@ int follow_down(struct path *path) | |||
1223 | } | 1227 | } |
1224 | return 0; | 1228 | return 0; |
1225 | } | 1229 | } |
1230 | EXPORT_SYMBOL(follow_down); | ||
1226 | 1231 | ||
1227 | /* | 1232 | /* |
1228 | * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() | 1233 | * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() |
@@ -2025,6 +2030,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path) | |||
2025 | *path = nd.path; | 2030 | *path = nd.path; |
2026 | return res; | 2031 | return res; |
2027 | } | 2032 | } |
2033 | EXPORT_SYMBOL(kern_path); | ||
2028 | 2034 | ||
2029 | /** | 2035 | /** |
2030 | * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair | 2036 | * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair |
@@ -2049,6 +2055,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, | |||
2049 | *path = nd.path; | 2055 | *path = nd.path; |
2050 | return err; | 2056 | return err; |
2051 | } | 2057 | } |
2058 | EXPORT_SYMBOL(vfs_path_lookup); | ||
2052 | 2059 | ||
2053 | /* | 2060 | /* |
2054 | * Restricted form of lookup. Doesn't follow links, single-component only, | 2061 | * Restricted form of lookup. Doesn't follow links, single-component only, |
@@ -2111,6 +2118,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) | |||
2111 | 2118 | ||
2112 | return __lookup_hash(&this, base, 0); | 2119 | return __lookup_hash(&this, base, 0); |
2113 | } | 2120 | } |
2121 | EXPORT_SYMBOL(lookup_one_len); | ||
2114 | 2122 | ||
2115 | int user_path_at_empty(int dfd, const char __user *name, unsigned flags, | 2123 | int user_path_at_empty(int dfd, const char __user *name, unsigned flags, |
2116 | struct path *path, int *empty) | 2124 | struct path *path, int *empty) |
@@ -2135,6 +2143,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, | |||
2135 | { | 2143 | { |
2136 | return user_path_at_empty(dfd, name, flags, path, NULL); | 2144 | return user_path_at_empty(dfd, name, flags, path, NULL); |
2137 | } | 2145 | } |
2146 | EXPORT_SYMBOL(user_path_at); | ||
2138 | 2147 | ||
2139 | /* | 2148 | /* |
2140 | * NB: most callers don't do anything directly with the reference to the | 2149 | * NB: most callers don't do anything directly with the reference to the |
@@ -2477,6 +2486,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) | |||
2477 | mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); | 2486 | mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); |
2478 | return NULL; | 2487 | return NULL; |
2479 | } | 2488 | } |
2489 | EXPORT_SYMBOL(lock_rename); | ||
2480 | 2490 | ||
2481 | void unlock_rename(struct dentry *p1, struct dentry *p2) | 2491 | void unlock_rename(struct dentry *p1, struct dentry *p2) |
2482 | { | 2492 | { |
@@ -2486,6 +2496,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) | |||
2486 | mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); | 2496 | mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); |
2487 | } | 2497 | } |
2488 | } | 2498 | } |
2499 | EXPORT_SYMBOL(unlock_rename); | ||
2489 | 2500 | ||
2490 | int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, | 2501 | int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, |
2491 | bool want_excl) | 2502 | bool want_excl) |
@@ -2506,6 +2517,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
2506 | fsnotify_create(dir, dentry); | 2517 | fsnotify_create(dir, dentry); |
2507 | return error; | 2518 | return error; |
2508 | } | 2519 | } |
2520 | EXPORT_SYMBOL(vfs_create); | ||
2509 | 2521 | ||
2510 | static int may_open(struct path *path, int acc_mode, int flag) | 2522 | static int may_open(struct path *path, int acc_mode, int flag) |
2511 | { | 2523 | { |
@@ -3375,6 +3387,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
3375 | fsnotify_create(dir, dentry); | 3387 | fsnotify_create(dir, dentry); |
3376 | return error; | 3388 | return error; |
3377 | } | 3389 | } |
3390 | EXPORT_SYMBOL(vfs_mknod); | ||
3378 | 3391 | ||
3379 | static int may_mknod(umode_t mode) | 3392 | static int may_mknod(umode_t mode) |
3380 | { | 3393 | { |
@@ -3464,6 +3477,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
3464 | fsnotify_mkdir(dir, dentry); | 3477 | fsnotify_mkdir(dir, dentry); |
3465 | return error; | 3478 | return error; |
3466 | } | 3479 | } |
3480 | EXPORT_SYMBOL(vfs_mkdir); | ||
3467 | 3481 | ||
3468 | SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) | 3482 | SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) |
3469 | { | 3483 | { |
@@ -3518,6 +3532,7 @@ void dentry_unhash(struct dentry *dentry) | |||
3518 | __d_drop(dentry); | 3532 | __d_drop(dentry); |
3519 | spin_unlock(&dentry->d_lock); | 3533 | spin_unlock(&dentry->d_lock); |
3520 | } | 3534 | } |
3535 | EXPORT_SYMBOL(dentry_unhash); | ||
3521 | 3536 | ||
3522 | int vfs_rmdir(struct inode *dir, struct dentry *dentry) | 3537 | int vfs_rmdir(struct inode *dir, struct dentry *dentry) |
3523 | { | 3538 | { |
@@ -3555,6 +3570,7 @@ out: | |||
3555 | d_delete(dentry); | 3570 | d_delete(dentry); |
3556 | return error; | 3571 | return error; |
3557 | } | 3572 | } |
3573 | EXPORT_SYMBOL(vfs_rmdir); | ||
3558 | 3574 | ||
3559 | static long do_rmdir(int dfd, const char __user *pathname) | 3575 | static long do_rmdir(int dfd, const char __user *pathname) |
3560 | { | 3576 | { |
@@ -3672,6 +3688,7 @@ out: | |||
3672 | 3688 | ||
3673 | return error; | 3689 | return error; |
3674 | } | 3690 | } |
3691 | EXPORT_SYMBOL(vfs_unlink); | ||
3675 | 3692 | ||
3676 | /* | 3693 | /* |
3677 | * Make sure that the actual truncation of the file will occur outside its | 3694 | * Make sure that the actual truncation of the file will occur outside its |
@@ -3785,6 +3802,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) | |||
3785 | fsnotify_create(dir, dentry); | 3802 | fsnotify_create(dir, dentry); |
3786 | return error; | 3803 | return error; |
3787 | } | 3804 | } |
3805 | EXPORT_SYMBOL(vfs_symlink); | ||
3788 | 3806 | ||
3789 | SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, | 3807 | SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, |
3790 | int, newdfd, const char __user *, newname) | 3808 | int, newdfd, const char __user *, newname) |
@@ -3893,6 +3911,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de | |||
3893 | fsnotify_link(dir, inode, new_dentry); | 3911 | fsnotify_link(dir, inode, new_dentry); |
3894 | return error; | 3912 | return error; |
3895 | } | 3913 | } |
3914 | EXPORT_SYMBOL(vfs_link); | ||
3896 | 3915 | ||
3897 | /* | 3916 | /* |
3898 | * Hardlinks are often used in delicate situations. We avoid | 3917 | * Hardlinks are often used in delicate situations. We avoid |
@@ -4152,6 +4171,7 @@ out: | |||
4152 | 4171 | ||
4153 | return error; | 4172 | return error; |
4154 | } | 4173 | } |
4174 | EXPORT_SYMBOL(vfs_rename); | ||
4155 | 4175 | ||
4156 | SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, | 4176 | SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, |
4157 | int, newdfd, const char __user *, newname, unsigned int, flags) | 4177 | int, newdfd, const char __user *, newname, unsigned int, flags) |
@@ -4304,11 +4324,9 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna | |||
4304 | return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); | 4324 | return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); |
4305 | } | 4325 | } |
4306 | 4326 | ||
4307 | int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) | 4327 | int readlink_copy(char __user *buffer, int buflen, const char *link) |
4308 | { | 4328 | { |
4309 | int len; | 4329 | int len = PTR_ERR(link); |
4310 | |||
4311 | len = PTR_ERR(link); | ||
4312 | if (IS_ERR(link)) | 4330 | if (IS_ERR(link)) |
4313 | goto out; | 4331 | goto out; |
4314 | 4332 | ||
@@ -4320,6 +4338,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c | |||
4320 | out: | 4338 | out: |
4321 | return len; | 4339 | return len; |
4322 | } | 4340 | } |
4341 | EXPORT_SYMBOL(readlink_copy); | ||
4323 | 4342 | ||
4324 | /* | 4343 | /* |
4325 | * A helper for ->readlink(). This should be used *ONLY* for symlinks that | 4344 | * A helper for ->readlink(). This should be used *ONLY* for symlinks that |
@@ -4337,11 +4356,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) | |||
4337 | if (IS_ERR(cookie)) | 4356 | if (IS_ERR(cookie)) |
4338 | return PTR_ERR(cookie); | 4357 | return PTR_ERR(cookie); |
4339 | 4358 | ||
4340 | res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); | 4359 | res = readlink_copy(buffer, buflen, nd_get_link(&nd)); |
4341 | if (dentry->d_inode->i_op->put_link) | 4360 | if (dentry->d_inode->i_op->put_link) |
4342 | dentry->d_inode->i_op->put_link(dentry, &nd, cookie); | 4361 | dentry->d_inode->i_op->put_link(dentry, &nd, cookie); |
4343 | return res; | 4362 | return res; |
4344 | } | 4363 | } |
4364 | EXPORT_SYMBOL(generic_readlink); | ||
4345 | 4365 | ||
4346 | /* get the link contents into pagecache */ | 4366 | /* get the link contents into pagecache */ |
4347 | static char *page_getlink(struct dentry * dentry, struct page **ppage) | 4367 | static char *page_getlink(struct dentry * dentry, struct page **ppage) |
@@ -4361,14 +4381,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) | |||
4361 | int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) | 4381 | int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) |
4362 | { | 4382 | { |
4363 | struct page *page = NULL; | 4383 | struct page *page = NULL; |
4364 | char *s = page_getlink(dentry, &page); | 4384 | int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); |
4365 | int res = vfs_readlink(dentry,buffer,buflen,s); | ||
4366 | if (page) { | 4385 | if (page) { |
4367 | kunmap(page); | 4386 | kunmap(page); |
4368 | page_cache_release(page); | 4387 | page_cache_release(page); |
4369 | } | 4388 | } |
4370 | return res; | 4389 | return res; |
4371 | } | 4390 | } |
4391 | EXPORT_SYMBOL(page_readlink); | ||
4372 | 4392 | ||
4373 | void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) | 4393 | void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) |
4374 | { | 4394 | { |
@@ -4376,6 +4396,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) | |||
4376 | nd_set_link(nd, page_getlink(dentry, &page)); | 4396 | nd_set_link(nd, page_getlink(dentry, &page)); |
4377 | return page; | 4397 | return page; |
4378 | } | 4398 | } |
4399 | EXPORT_SYMBOL(page_follow_link_light); | ||
4379 | 4400 | ||
4380 | void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) | 4401 | void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) |
4381 | { | 4402 | { |
@@ -4386,6 +4407,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) | |||
4386 | page_cache_release(page); | 4407 | page_cache_release(page); |
4387 | } | 4408 | } |
4388 | } | 4409 | } |
4410 | EXPORT_SYMBOL(page_put_link); | ||
4389 | 4411 | ||
4390 | /* | 4412 | /* |
4391 | * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS | 4413 | * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS |
@@ -4423,45 +4445,18 @@ retry: | |||
4423 | fail: | 4445 | fail: |
4424 | return err; | 4446 | return err; |
4425 | } | 4447 | } |
4448 | EXPORT_SYMBOL(__page_symlink); | ||
4426 | 4449 | ||
4427 | int page_symlink(struct inode *inode, const char *symname, int len) | 4450 | int page_symlink(struct inode *inode, const char *symname, int len) |
4428 | { | 4451 | { |
4429 | return __page_symlink(inode, symname, len, | 4452 | return __page_symlink(inode, symname, len, |
4430 | !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); | 4453 | !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); |
4431 | } | 4454 | } |
4455 | EXPORT_SYMBOL(page_symlink); | ||
4432 | 4456 | ||
4433 | const struct inode_operations page_symlink_inode_operations = { | 4457 | const struct inode_operations page_symlink_inode_operations = { |
4434 | .readlink = generic_readlink, | 4458 | .readlink = generic_readlink, |
4435 | .follow_link = page_follow_link_light, | 4459 | .follow_link = page_follow_link_light, |
4436 | .put_link = page_put_link, | 4460 | .put_link = page_put_link, |
4437 | }; | 4461 | }; |
4438 | |||
4439 | EXPORT_SYMBOL(user_path_at); | ||
4440 | EXPORT_SYMBOL(follow_down_one); | ||
4441 | EXPORT_SYMBOL(follow_down); | ||
4442 | EXPORT_SYMBOL(follow_up); | ||
4443 | EXPORT_SYMBOL(get_write_access); /* nfsd */ | ||
4444 | EXPORT_SYMBOL(lock_rename); | ||
4445 | EXPORT_SYMBOL(lookup_one_len); | ||
4446 | EXPORT_SYMBOL(page_follow_link_light); | ||
4447 | EXPORT_SYMBOL(page_put_link); | ||
4448 | EXPORT_SYMBOL(page_readlink); | ||
4449 | EXPORT_SYMBOL(__page_symlink); | ||
4450 | EXPORT_SYMBOL(page_symlink); | ||
4451 | EXPORT_SYMBOL(page_symlink_inode_operations); | 4462 | EXPORT_SYMBOL(page_symlink_inode_operations); |
4452 | EXPORT_SYMBOL(kern_path); | ||
4453 | EXPORT_SYMBOL(vfs_path_lookup); | ||
4454 | EXPORT_SYMBOL(inode_permission); | ||
4455 | EXPORT_SYMBOL(unlock_rename); | ||
4456 | EXPORT_SYMBOL(vfs_create); | ||
4457 | EXPORT_SYMBOL(vfs_link); | ||
4458 | EXPORT_SYMBOL(vfs_mkdir); | ||
4459 | EXPORT_SYMBOL(vfs_mknod); | ||
4460 | EXPORT_SYMBOL(generic_permission); | ||
4461 | EXPORT_SYMBOL(vfs_readlink); | ||
4462 | EXPORT_SYMBOL(vfs_rename); | ||
4463 | EXPORT_SYMBOL(vfs_rmdir); | ||
4464 | EXPORT_SYMBOL(vfs_symlink); | ||
4465 | EXPORT_SYMBOL(vfs_unlink); | ||
4466 | EXPORT_SYMBOL(dentry_unhash); | ||
4467 | EXPORT_SYMBOL(generic_readlink); | ||
diff --git a/fs/namespace.c b/fs/namespace.c index 2ffc5a2905d4..182bc41cd887 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
@@ -52,7 +52,7 @@ static int __init set_mphash_entries(char *str) | |||
52 | } | 52 | } |
53 | __setup("mphash_entries=", set_mphash_entries); | 53 | __setup("mphash_entries=", set_mphash_entries); |
54 | 54 | ||
55 | static int event; | 55 | static u64 event; |
56 | static DEFINE_IDA(mnt_id_ida); | 56 | static DEFINE_IDA(mnt_id_ida); |
57 | static DEFINE_IDA(mnt_group_ida); | 57 | static DEFINE_IDA(mnt_group_ida); |
58 | static DEFINE_SPINLOCK(mnt_id_lock); | 58 | static DEFINE_SPINLOCK(mnt_id_lock); |
@@ -414,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); | |||
414 | */ | 414 | */ |
415 | int __mnt_want_write_file(struct file *file) | 415 | int __mnt_want_write_file(struct file *file) |
416 | { | 416 | { |
417 | struct inode *inode = file_inode(file); | 417 | if (!(file->f_mode & FMODE_WRITER)) |
418 | |||
419 | if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) | ||
420 | return __mnt_want_write(file->f_path.mnt); | 418 | return __mnt_want_write(file->f_path.mnt); |
421 | else | 419 | else |
422 | return mnt_clone_write(file->f_path.mnt); | 420 | return mnt_clone_write(file->f_path.mnt); |
@@ -570,13 +568,17 @@ int sb_prepare_remount_readonly(struct super_block *sb) | |||
570 | static void free_vfsmnt(struct mount *mnt) | 568 | static void free_vfsmnt(struct mount *mnt) |
571 | { | 569 | { |
572 | kfree(mnt->mnt_devname); | 570 | kfree(mnt->mnt_devname); |
573 | mnt_free_id(mnt); | ||
574 | #ifdef CONFIG_SMP | 571 | #ifdef CONFIG_SMP |
575 | free_percpu(mnt->mnt_pcp); | 572 | free_percpu(mnt->mnt_pcp); |
576 | #endif | 573 | #endif |
577 | kmem_cache_free(mnt_cache, mnt); | 574 | kmem_cache_free(mnt_cache, mnt); |
578 | } | 575 | } |
579 | 576 | ||
577 | static void delayed_free_vfsmnt(struct rcu_head *head) | ||
578 | { | ||
579 | free_vfsmnt(container_of(head, struct mount, mnt_rcu)); | ||
580 | } | ||
581 | |||
580 | /* call under rcu_read_lock */ | 582 | /* call under rcu_read_lock */ |
581 | bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) | 583 | bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) |
582 | { | 584 | { |
@@ -848,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void | |||
848 | 850 | ||
849 | root = mount_fs(type, flags, name, data); | 851 | root = mount_fs(type, flags, name, data); |
850 | if (IS_ERR(root)) { | 852 | if (IS_ERR(root)) { |
853 | mnt_free_id(mnt); | ||
851 | free_vfsmnt(mnt); | 854 | free_vfsmnt(mnt); |
852 | return ERR_CAST(root); | 855 | return ERR_CAST(root); |
853 | } | 856 | } |
@@ -885,7 +888,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, | |||
885 | goto out_free; | 888 | goto out_free; |
886 | } | 889 | } |
887 | 890 | ||
888 | mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; | 891 | mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); |
889 | /* Don't allow unprivileged users to change mount flags */ | 892 | /* Don't allow unprivileged users to change mount flags */ |
890 | if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) | 893 | if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) |
891 | mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; | 894 | mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; |
@@ -928,20 +931,11 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, | |||
928 | return mnt; | 931 | return mnt; |
929 | 932 | ||
930 | out_free: | 933 | out_free: |
934 | mnt_free_id(mnt); | ||
931 | free_vfsmnt(mnt); | 935 | free_vfsmnt(mnt); |
932 | return ERR_PTR(err); | 936 | return ERR_PTR(err); |
933 | } | 937 | } |
934 | 938 | ||
935 | static void delayed_free(struct rcu_head *head) | ||
936 | { | ||
937 | struct mount *mnt = container_of(head, struct mount, mnt_rcu); | ||
938 | kfree(mnt->mnt_devname); | ||
939 | #ifdef CONFIG_SMP | ||
940 | free_percpu(mnt->mnt_pcp); | ||
941 | #endif | ||
942 | kmem_cache_free(mnt_cache, mnt); | ||
943 | } | ||
944 | |||
945 | static void mntput_no_expire(struct mount *mnt) | 939 | static void mntput_no_expire(struct mount *mnt) |
946 | { | 940 | { |
947 | put_again: | 941 | put_again: |
@@ -991,7 +985,7 @@ put_again: | |||
991 | dput(mnt->mnt.mnt_root); | 985 | dput(mnt->mnt.mnt_root); |
992 | deactivate_super(mnt->mnt.mnt_sb); | 986 | deactivate_super(mnt->mnt.mnt_sb); |
993 | mnt_free_id(mnt); | 987 | mnt_free_id(mnt); |
994 | call_rcu(&mnt->mnt_rcu, delayed_free); | 988 | call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); |
995 | } | 989 | } |
996 | 990 | ||
997 | void mntput(struct vfsmount *mnt) | 991 | void mntput(struct vfsmount *mnt) |
@@ -1100,14 +1094,29 @@ static void *m_start(struct seq_file *m, loff_t *pos) | |||
1100 | struct proc_mounts *p = proc_mounts(m); | 1094 | struct proc_mounts *p = proc_mounts(m); |
1101 | 1095 | ||
1102 | down_read(&namespace_sem); | 1096 | down_read(&namespace_sem); |
1103 | return seq_list_start(&p->ns->list, *pos); | 1097 | if (p->cached_event == p->ns->event) { |
1098 | void *v = p->cached_mount; | ||
1099 | if (*pos == p->cached_index) | ||
1100 | return v; | ||
1101 | if (*pos == p->cached_index + 1) { | ||
1102 | v = seq_list_next(v, &p->ns->list, &p->cached_index); | ||
1103 | return p->cached_mount = v; | ||
1104 | } | ||
1105 | } | ||
1106 | |||
1107 | p->cached_event = p->ns->event; | ||
1108 | p->cached_mount = seq_list_start(&p->ns->list, *pos); | ||
1109 | p->cached_index = *pos; | ||
1110 | return p->cached_mount; | ||
1104 | } | 1111 | } |
1105 | 1112 | ||
1106 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) | 1113 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) |
1107 | { | 1114 | { |
1108 | struct proc_mounts *p = proc_mounts(m); | 1115 | struct proc_mounts *p = proc_mounts(m); |
1109 | 1116 | ||
1110 | return seq_list_next(v, &p->ns->list, pos); | 1117 | p->cached_mount = seq_list_next(v, &p->ns->list, pos); |
1118 | p->cached_index = *pos; | ||
1119 | return p->cached_mount; | ||
1111 | } | 1120 | } |
1112 | 1121 | ||
1113 | static void m_stop(struct seq_file *m, void *v) | 1122 | static void m_stop(struct seq_file *m, void *v) |
@@ -1661,9 +1670,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, | |||
1661 | if (err) | 1670 | if (err) |
1662 | goto out; | 1671 | goto out; |
1663 | err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); | 1672 | err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); |
1673 | lock_mount_hash(); | ||
1664 | if (err) | 1674 | if (err) |
1665 | goto out_cleanup_ids; | 1675 | goto out_cleanup_ids; |
1666 | lock_mount_hash(); | ||
1667 | for (p = source_mnt; p; p = next_mnt(p, source_mnt)) | 1676 | for (p = source_mnt; p; p = next_mnt(p, source_mnt)) |
1668 | set_mnt_shared(p); | 1677 | set_mnt_shared(p); |
1669 | } else { | 1678 | } else { |
@@ -1690,6 +1699,11 @@ static int attach_recursive_mnt(struct mount *source_mnt, | |||
1690 | return 0; | 1699 | return 0; |
1691 | 1700 | ||
1692 | out_cleanup_ids: | 1701 | out_cleanup_ids: |
1702 | while (!hlist_empty(&tree_list)) { | ||
1703 | child = hlist_entry(tree_list.first, struct mount, mnt_hash); | ||
1704 | umount_tree(child, 0); | ||
1705 | } | ||
1706 | unlock_mount_hash(); | ||
1693 | cleanup_group_ids(source_mnt, NULL); | 1707 | cleanup_group_ids(source_mnt, NULL); |
1694 | out: | 1708 | out: |
1695 | return err; | 1709 | return err; |
@@ -2044,7 +2058,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) | |||
2044 | struct mount *parent; | 2058 | struct mount *parent; |
2045 | int err; | 2059 | int err; |
2046 | 2060 | ||
2047 | mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT); | 2061 | mnt_flags &= ~MNT_INTERNAL_FLAGS; |
2048 | 2062 | ||
2049 | mp = lock_mount(path); | 2063 | mp = lock_mount(path); |
2050 | if (IS_ERR(mp)) | 2064 | if (IS_ERR(mp)) |
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 81b4f643ecef..e31e589369a4 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c | |||
@@ -470,9 +470,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) | |||
470 | { | 470 | { |
471 | struct ncp_mount_data_kernel data; | 471 | struct ncp_mount_data_kernel data; |
472 | struct ncp_server *server; | 472 | struct ncp_server *server; |
473 | struct file *ncp_filp; | ||
474 | struct inode *root_inode; | 473 | struct inode *root_inode; |
475 | struct inode *sock_inode; | ||
476 | struct socket *sock; | 474 | struct socket *sock; |
477 | int error; | 475 | int error; |
478 | int default_bufsize; | 476 | int default_bufsize; |
@@ -541,18 +539,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) | |||
541 | if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || | 539 | if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || |
542 | !gid_valid(data.gid)) | 540 | !gid_valid(data.gid)) |
543 | goto out; | 541 | goto out; |
544 | error = -EBADF; | 542 | sock = sockfd_lookup(data.ncp_fd, &error); |
545 | ncp_filp = fget(data.ncp_fd); | ||
546 | if (!ncp_filp) | ||
547 | goto out; | ||
548 | error = -ENOTSOCK; | ||
549 | sock_inode = file_inode(ncp_filp); | ||
550 | if (!S_ISSOCK(sock_inode->i_mode)) | ||
551 | goto out_fput; | ||
552 | sock = SOCKET_I(sock_inode); | ||
553 | if (!sock) | 543 | if (!sock) |
554 | goto out_fput; | 544 | goto out; |
555 | 545 | ||
556 | if (sock->type == SOCK_STREAM) | 546 | if (sock->type == SOCK_STREAM) |
557 | default_bufsize = 0xF000; | 547 | default_bufsize = 0xF000; |
558 | else | 548 | else |
@@ -574,27 +564,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) | |||
574 | if (error) | 564 | if (error) |
575 | goto out_fput; | 565 | goto out_fput; |
576 | 566 | ||
577 | server->ncp_filp = ncp_filp; | ||
578 | server->ncp_sock = sock; | 567 | server->ncp_sock = sock; |
579 | 568 | ||
580 | if (data.info_fd != -1) { | 569 | if (data.info_fd != -1) { |
581 | struct socket *info_sock; | 570 | struct socket *info_sock = sockfd_lookup(data.info_fd, &error); |
582 | |||
583 | error = -EBADF; | ||
584 | server->info_filp = fget(data.info_fd); | ||
585 | if (!server->info_filp) | ||
586 | goto out_bdi; | ||
587 | error = -ENOTSOCK; | ||
588 | sock_inode = file_inode(server->info_filp); | ||
589 | if (!S_ISSOCK(sock_inode->i_mode)) | ||
590 | goto out_fput2; | ||
591 | info_sock = SOCKET_I(sock_inode); | ||
592 | if (!info_sock) | 571 | if (!info_sock) |
593 | goto out_fput2; | 572 | goto out_bdi; |
573 | server->info_sock = info_sock; | ||
594 | error = -EBADFD; | 574 | error = -EBADFD; |
595 | if (info_sock->type != SOCK_STREAM) | 575 | if (info_sock->type != SOCK_STREAM) |
596 | goto out_fput2; | 576 | goto out_fput2; |
597 | server->info_sock = info_sock; | ||
598 | } | 577 | } |
599 | 578 | ||
600 | /* server->lock = 0; */ | 579 | /* server->lock = 0; */ |
@@ -766,17 +745,12 @@ out_nls: | |||
766 | mutex_destroy(&server->root_setup_lock); | 745 | mutex_destroy(&server->root_setup_lock); |
767 | mutex_destroy(&server->mutex); | 746 | mutex_destroy(&server->mutex); |
768 | out_fput2: | 747 | out_fput2: |
769 | if (server->info_filp) | 748 | if (server->info_sock) |
770 | fput(server->info_filp); | 749 | sockfd_put(server->info_sock); |
771 | out_bdi: | 750 | out_bdi: |
772 | bdi_destroy(&server->bdi); | 751 | bdi_destroy(&server->bdi); |
773 | out_fput: | 752 | out_fput: |
774 | /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: | 753 | sockfd_put(sock); |
775 | * | ||
776 | * The previously used put_filp(ncp_filp); was bogus, since | ||
777 | * it doesn't perform proper unlocking. | ||
778 | */ | ||
779 | fput(ncp_filp); | ||
780 | out: | 754 | out: |
781 | put_pid(data.wdog_pid); | 755 | put_pid(data.wdog_pid); |
782 | sb->s_fs_info = NULL; | 756 | sb->s_fs_info = NULL; |
@@ -809,9 +783,9 @@ static void ncp_put_super(struct super_block *sb) | |||
809 | mutex_destroy(&server->root_setup_lock); | 783 | mutex_destroy(&server->root_setup_lock); |
810 | mutex_destroy(&server->mutex); | 784 | mutex_destroy(&server->mutex); |
811 | 785 | ||
812 | if (server->info_filp) | 786 | if (server->info_sock) |
813 | fput(server->info_filp); | 787 | sockfd_put(server->info_sock); |
814 | fput(server->ncp_filp); | 788 | sockfd_put(server->ncp_sock); |
815 | kill_pid(server->m.wdog_pid, SIGTERM, 1); | 789 | kill_pid(server->m.wdog_pid, SIGTERM, 1); |
816 | put_pid(server->m.wdog_pid); | 790 | put_pid(server->m.wdog_pid); |
817 | 791 | ||
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index b81e97adc5a9..7fa17e459366 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h | |||
@@ -45,9 +45,7 @@ struct ncp_server { | |||
45 | 45 | ||
46 | __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; | 46 | __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; |
47 | 47 | ||
48 | struct file *ncp_filp; /* File pointer to ncp socket */ | ||
49 | struct socket *ncp_sock;/* ncp socket */ | 48 | struct socket *ncp_sock;/* ncp socket */ |
50 | struct file *info_filp; | ||
51 | struct socket *info_sock; | 49 | struct socket *info_sock; |
52 | 50 | ||
53 | u8 sequence; | 51 | u8 sequence; |
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9d8153ebacfb..f47af5e6e230 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c | |||
@@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) | |||
1704 | iput(bvi); | 1704 | iput(bvi); |
1705 | skip_large_index_stuff: | 1705 | skip_large_index_stuff: |
1706 | /* Setup the operations for this index inode. */ | 1706 | /* Setup the operations for this index inode. */ |
1707 | vi->i_op = NULL; | ||
1708 | vi->i_fop = NULL; | ||
1709 | vi->i_mapping->a_ops = &ntfs_mst_aops; | 1707 | vi->i_mapping->a_ops = &ntfs_mst_aops; |
1710 | vi->i_blocks = ni->allocated_size >> 9; | 1708 | vi->i_blocks = ni->allocated_size >> 9; |
1711 | /* | 1709 | /* |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index eb649d23a4de..dfda2ffdb16c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) | |||
916 | 916 | ||
917 | static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) | 917 | static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) |
918 | { | 918 | { |
919 | int ret; | 919 | struct kvec vec = { .iov_len = len, .iov_base = data, }; |
920 | mm_segment_t oldfs; | 920 | struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; |
921 | struct kvec vec = { | 921 | return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); |
922 | .iov_len = len, | ||
923 | .iov_base = data, | ||
924 | }; | ||
925 | struct msghdr msg = { | ||
926 | .msg_iovlen = 1, | ||
927 | .msg_iov = (struct iovec *)&vec, | ||
928 | .msg_flags = MSG_DONTWAIT, | ||
929 | }; | ||
930 | |||
931 | oldfs = get_fs(); | ||
932 | set_fs(get_ds()); | ||
933 | ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); | ||
934 | set_fs(oldfs); | ||
935 | |||
936 | return ret; | ||
937 | } | 922 | } |
938 | 923 | ||
939 | static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, | 924 | static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, |
940 | size_t veclen, size_t total) | 925 | size_t veclen, size_t total) |
941 | { | 926 | { |
942 | int ret; | 927 | int ret; |
943 | mm_segment_t oldfs; | 928 | struct msghdr msg; |
944 | struct msghdr msg = { | ||
945 | .msg_iov = (struct iovec *)vec, | ||
946 | .msg_iovlen = veclen, | ||
947 | }; | ||
948 | 929 | ||
949 | if (sock == NULL) { | 930 | if (sock == NULL) { |
950 | ret = -EINVAL; | 931 | ret = -EINVAL; |
951 | goto out; | 932 | goto out; |
952 | } | 933 | } |
953 | 934 | ||
954 | oldfs = get_fs(); | 935 | ret = kernel_sendmsg(sock, &msg, vec, veclen, total); |
955 | set_fs(get_ds()); | 936 | if (likely(ret == total)) |
956 | ret = sock_sendmsg(sock, &msg, total); | 937 | return 0; |
957 | set_fs(oldfs); | 938 | mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); |
958 | if (ret != total) { | 939 | if (ret >= 0) |
959 | mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, | 940 | ret = -EPIPE; /* should be smarter, I bet */ |
960 | total); | ||
961 | if (ret >= 0) | ||
962 | ret = -EPIPE; /* should be smarter, I bet */ | ||
963 | goto out; | ||
964 | } | ||
965 | |||
966 | ret = 0; | ||
967 | out: | 941 | out: |
968 | if (ret < 0) | 942 | mlog(0, "returning error: %d\n", ret); |
969 | mlog(0, "returning error: %d\n", ret); | ||
970 | return ret; | 943 | return ret; |
971 | } | 944 | } |
972 | 945 | ||
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ff33c5ef87f2..8970dcf74de5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -2367,15 +2367,18 @@ relock: | |||
2367 | 2367 | ||
2368 | if (direct_io) { | 2368 | if (direct_io) { |
2369 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, | 2369 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, |
2370 | ppos, count, ocount); | 2370 | count, ocount); |
2371 | if (written < 0) { | 2371 | if (written < 0) { |
2372 | ret = written; | 2372 | ret = written; |
2373 | goto out_dio; | 2373 | goto out_dio; |
2374 | } | 2374 | } |
2375 | } else { | 2375 | } else { |
2376 | struct iov_iter from; | ||
2377 | iov_iter_init(&from, iov, nr_segs, count, 0); | ||
2376 | current->backing_dev_info = file->f_mapping->backing_dev_info; | 2378 | current->backing_dev_info = file->f_mapping->backing_dev_info; |
2377 | written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, | 2379 | written = generic_perform_write(file, &from, *ppos); |
2378 | ppos, count, 0); | 2380 | if (likely(written >= 0)) |
2381 | iocb->ki_pos = *ppos + written; | ||
2379 | current->backing_dev_info = NULL; | 2382 | current->backing_dev_info = NULL; |
2380 | } | 2383 | } |
2381 | 2384 | ||
@@ -655,35 +655,6 @@ out: | |||
655 | return error; | 655 | return error; |
656 | } | 656 | } |
657 | 657 | ||
658 | /* | ||
659 | * You have to be very careful that these write | ||
660 | * counts get cleaned up in error cases and | ||
661 | * upon __fput(). This should probably never | ||
662 | * be called outside of __dentry_open(). | ||
663 | */ | ||
664 | static inline int __get_file_write_access(struct inode *inode, | ||
665 | struct vfsmount *mnt) | ||
666 | { | ||
667 | int error; | ||
668 | error = get_write_access(inode); | ||
669 | if (error) | ||
670 | return error; | ||
671 | /* | ||
672 | * Do not take mount writer counts on | ||
673 | * special files since no writes to | ||
674 | * the mount itself will occur. | ||
675 | */ | ||
676 | if (!special_file(inode->i_mode)) { | ||
677 | /* | ||
678 | * Balanced in __fput() | ||
679 | */ | ||
680 | error = __mnt_want_write(mnt); | ||
681 | if (error) | ||
682 | put_write_access(inode); | ||
683 | } | ||
684 | return error; | ||
685 | } | ||
686 | |||
687 | int open_check_o_direct(struct file *f) | 658 | int open_check_o_direct(struct file *f) |
688 | { | 659 | { |
689 | /* NB: we're sure to have correct a_ops only after f_op->open */ | 660 | /* NB: we're sure to have correct a_ops only after f_op->open */ |
@@ -708,26 +679,28 @@ static int do_dentry_open(struct file *f, | |||
708 | f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | | 679 | f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | |
709 | FMODE_PREAD | FMODE_PWRITE; | 680 | FMODE_PREAD | FMODE_PWRITE; |
710 | 681 | ||
711 | if (unlikely(f->f_flags & O_PATH)) | ||
712 | f->f_mode = FMODE_PATH; | ||
713 | |||
714 | path_get(&f->f_path); | 682 | path_get(&f->f_path); |
715 | inode = f->f_inode = f->f_path.dentry->d_inode; | 683 | inode = f->f_inode = f->f_path.dentry->d_inode; |
716 | if (f->f_mode & FMODE_WRITE) { | ||
717 | error = __get_file_write_access(inode, f->f_path.mnt); | ||
718 | if (error) | ||
719 | goto cleanup_file; | ||
720 | if (!special_file(inode->i_mode)) | ||
721 | file_take_write(f); | ||
722 | } | ||
723 | |||
724 | f->f_mapping = inode->i_mapping; | 684 | f->f_mapping = inode->i_mapping; |
725 | 685 | ||
726 | if (unlikely(f->f_mode & FMODE_PATH)) { | 686 | if (unlikely(f->f_flags & O_PATH)) { |
687 | f->f_mode = FMODE_PATH; | ||
727 | f->f_op = &empty_fops; | 688 | f->f_op = &empty_fops; |
728 | return 0; | 689 | return 0; |
729 | } | 690 | } |
730 | 691 | ||
692 | if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { | ||
693 | error = get_write_access(inode); | ||
694 | if (unlikely(error)) | ||
695 | goto cleanup_file; | ||
696 | error = __mnt_want_write(f->f_path.mnt); | ||
697 | if (unlikely(error)) { | ||
698 | put_write_access(inode); | ||
699 | goto cleanup_file; | ||
700 | } | ||
701 | f->f_mode |= FMODE_WRITER; | ||
702 | } | ||
703 | |||
731 | /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ | 704 | /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ |
732 | if (S_ISREG(inode->i_mode)) | 705 | if (S_ISREG(inode->i_mode)) |
733 | f->f_mode |= FMODE_ATOMIC_POS; | 706 | f->f_mode |= FMODE_ATOMIC_POS; |
@@ -764,18 +737,9 @@ static int do_dentry_open(struct file *f, | |||
764 | 737 | ||
765 | cleanup_all: | 738 | cleanup_all: |
766 | fops_put(f->f_op); | 739 | fops_put(f->f_op); |
767 | if (f->f_mode & FMODE_WRITE) { | 740 | if (f->f_mode & FMODE_WRITER) { |
768 | put_write_access(inode); | 741 | put_write_access(inode); |
769 | if (!special_file(inode->i_mode)) { | 742 | __mnt_drop_write(f->f_path.mnt); |
770 | /* | ||
771 | * We don't consider this a real | ||
772 | * mnt_want/drop_write() pair | ||
773 | * because it all happenend right | ||
774 | * here, so just reset the state. | ||
775 | */ | ||
776 | file_reset_write(f); | ||
777 | __mnt_drop_write(f->f_path.mnt); | ||
778 | } | ||
779 | } | 743 | } |
780 | cleanup_file: | 744 | cleanup_file: |
781 | path_put(&f->f_path); | 745 | path_put(&f->f_path); |
@@ -142,55 +142,6 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, | |||
142 | return 0; | 142 | return 0; |
143 | } | 143 | } |
144 | 144 | ||
145 | static int | ||
146 | pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, | ||
147 | int atomic) | ||
148 | { | ||
149 | unsigned long copy; | ||
150 | |||
151 | while (len > 0) { | ||
152 | while (!iov->iov_len) | ||
153 | iov++; | ||
154 | copy = min_t(unsigned long, len, iov->iov_len); | ||
155 | |||
156 | if (atomic) { | ||
157 | if (__copy_to_user_inatomic(iov->iov_base, from, copy)) | ||
158 | return -EFAULT; | ||
159 | } else { | ||
160 | if (copy_to_user(iov->iov_base, from, copy)) | ||
161 | return -EFAULT; | ||
162 | } | ||
163 | from += copy; | ||
164 | len -= copy; | ||
165 | iov->iov_base += copy; | ||
166 | iov->iov_len -= copy; | ||
167 | } | ||
168 | return 0; | ||
169 | } | ||
170 | |||
171 | /* | ||
172 | * Attempt to pre-fault in the user memory, so we can use atomic copies. | ||
173 | * Returns the number of bytes not faulted in. | ||
174 | */ | ||
175 | static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) | ||
176 | { | ||
177 | while (!iov->iov_len) | ||
178 | iov++; | ||
179 | |||
180 | while (len > 0) { | ||
181 | unsigned long this_len; | ||
182 | |||
183 | this_len = min_t(unsigned long, len, iov->iov_len); | ||
184 | if (fault_in_pages_writeable(iov->iov_base, this_len)) | ||
185 | break; | ||
186 | |||
187 | len -= this_len; | ||
188 | iov++; | ||
189 | } | ||
190 | |||
191 | return len; | ||
192 | } | ||
193 | |||
194 | /* | 145 | /* |
195 | * Pre-fault in the user memory, so we can use atomic copies. | 146 | * Pre-fault in the user memory, so we can use atomic copies. |
196 | */ | 147 | */ |
@@ -226,52 +177,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, | |||
226 | } | 177 | } |
227 | 178 | ||
228 | /** | 179 | /** |
229 | * generic_pipe_buf_map - virtually map a pipe buffer | ||
230 | * @pipe: the pipe that the buffer belongs to | ||
231 | * @buf: the buffer that should be mapped | ||
232 | * @atomic: whether to use an atomic map | ||
233 | * | ||
234 | * Description: | ||
235 | * This function returns a kernel virtual address mapping for the | ||
236 | * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided | ||
237 | * and the caller has to be careful not to fault before calling | ||
238 | * the unmap function. | ||
239 | * | ||
240 | * Note that this function calls kmap_atomic() if @atomic != 0. | ||
241 | */ | ||
242 | void *generic_pipe_buf_map(struct pipe_inode_info *pipe, | ||
243 | struct pipe_buffer *buf, int atomic) | ||
244 | { | ||
245 | if (atomic) { | ||
246 | buf->flags |= PIPE_BUF_FLAG_ATOMIC; | ||
247 | return kmap_atomic(buf->page); | ||
248 | } | ||
249 | |||
250 | return kmap(buf->page); | ||
251 | } | ||
252 | EXPORT_SYMBOL(generic_pipe_buf_map); | ||
253 | |||
254 | /** | ||
255 | * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer | ||
256 | * @pipe: the pipe that the buffer belongs to | ||
257 | * @buf: the buffer that should be unmapped | ||
258 | * @map_data: the data that the mapping function returned | ||
259 | * | ||
260 | * Description: | ||
261 | * This function undoes the mapping that ->map() provided. | ||
262 | */ | ||
263 | void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, | ||
264 | struct pipe_buffer *buf, void *map_data) | ||
265 | { | ||
266 | if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { | ||
267 | buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; | ||
268 | kunmap_atomic(map_data); | ||
269 | } else | ||
270 | kunmap(buf->page); | ||
271 | } | ||
272 | EXPORT_SYMBOL(generic_pipe_buf_unmap); | ||
273 | |||
274 | /** | ||
275 | * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer | 180 | * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer |
276 | * @pipe: the pipe that the buffer belongs to | 181 | * @pipe: the pipe that the buffer belongs to |
277 | * @buf: the buffer to attempt to steal | 182 | * @buf: the buffer to attempt to steal |
@@ -351,8 +256,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release); | |||
351 | 256 | ||
352 | static const struct pipe_buf_operations anon_pipe_buf_ops = { | 257 | static const struct pipe_buf_operations anon_pipe_buf_ops = { |
353 | .can_merge = 1, | 258 | .can_merge = 1, |
354 | .map = generic_pipe_buf_map, | ||
355 | .unmap = generic_pipe_buf_unmap, | ||
356 | .confirm = generic_pipe_buf_confirm, | 259 | .confirm = generic_pipe_buf_confirm, |
357 | .release = anon_pipe_buf_release, | 260 | .release = anon_pipe_buf_release, |
358 | .steal = generic_pipe_buf_steal, | 261 | .steal = generic_pipe_buf_steal, |
@@ -361,8 +264,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { | |||
361 | 264 | ||
362 | static const struct pipe_buf_operations packet_pipe_buf_ops = { | 265 | static const struct pipe_buf_operations packet_pipe_buf_ops = { |
363 | .can_merge = 0, | 266 | .can_merge = 0, |
364 | .map = generic_pipe_buf_map, | ||
365 | .unmap = generic_pipe_buf_unmap, | ||
366 | .confirm = generic_pipe_buf_confirm, | 267 | .confirm = generic_pipe_buf_confirm, |
367 | .release = anon_pipe_buf_release, | 268 | .release = anon_pipe_buf_release, |
368 | .steal = generic_pipe_buf_steal, | 269 | .steal = generic_pipe_buf_steal, |
@@ -379,12 +280,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, | |||
379 | ssize_t ret; | 280 | ssize_t ret; |
380 | struct iovec *iov = (struct iovec *)_iov; | 281 | struct iovec *iov = (struct iovec *)_iov; |
381 | size_t total_len; | 282 | size_t total_len; |
283 | struct iov_iter iter; | ||
382 | 284 | ||
383 | total_len = iov_length(iov, nr_segs); | 285 | total_len = iov_length(iov, nr_segs); |
384 | /* Null read succeeds. */ | 286 | /* Null read succeeds. */ |
385 | if (unlikely(total_len == 0)) | 287 | if (unlikely(total_len == 0)) |
386 | return 0; | 288 | return 0; |
387 | 289 | ||
290 | iov_iter_init(&iter, iov, nr_segs, total_len, 0); | ||
291 | |||
388 | do_wakeup = 0; | 292 | do_wakeup = 0; |
389 | ret = 0; | 293 | ret = 0; |
390 | __pipe_lock(pipe); | 294 | __pipe_lock(pipe); |
@@ -394,9 +298,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, | |||
394 | int curbuf = pipe->curbuf; | 298 | int curbuf = pipe->curbuf; |
395 | struct pipe_buffer *buf = pipe->bufs + curbuf; | 299 | struct pipe_buffer *buf = pipe->bufs + curbuf; |
396 | const struct pipe_buf_operations *ops = buf->ops; | 300 | const struct pipe_buf_operations *ops = buf->ops; |
397 | void *addr; | ||
398 | size_t chars = buf->len; | 301 | size_t chars = buf->len; |
399 | int error, atomic; | 302 | size_t written; |
303 | int error; | ||
400 | 304 | ||
401 | if (chars > total_len) | 305 | if (chars > total_len) |
402 | chars = total_len; | 306 | chars = total_len; |
@@ -408,21 +312,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, | |||
408 | break; | 312 | break; |
409 | } | 313 | } |
410 | 314 | ||
411 | atomic = !iov_fault_in_pages_write(iov, chars); | 315 | written = copy_page_to_iter(buf->page, buf->offset, chars, &iter); |
412 | redo: | 316 | if (unlikely(written < chars)) { |
413 | addr = ops->map(pipe, buf, atomic); | ||
414 | error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); | ||
415 | ops->unmap(pipe, buf, addr); | ||
416 | if (unlikely(error)) { | ||
417 | /* | ||
418 | * Just retry with the slow path if we failed. | ||
419 | */ | ||
420 | if (atomic) { | ||
421 | atomic = 0; | ||
422 | goto redo; | ||
423 | } | ||
424 | if (!ret) | 317 | if (!ret) |
425 | ret = error; | 318 | ret = -EFAULT; |
426 | break; | 319 | break; |
427 | } | 320 | } |
428 | ret += chars; | 321 | ret += chars; |
@@ -538,10 +431,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, | |||
538 | 431 | ||
539 | iov_fault_in_pages_read(iov, chars); | 432 | iov_fault_in_pages_read(iov, chars); |
540 | redo1: | 433 | redo1: |
541 | addr = ops->map(pipe, buf, atomic); | 434 | if (atomic) |
435 | addr = kmap_atomic(buf->page); | ||
436 | else | ||
437 | addr = kmap(buf->page); | ||
542 | error = pipe_iov_copy_from_user(offset + addr, iov, | 438 | error = pipe_iov_copy_from_user(offset + addr, iov, |
543 | chars, atomic); | 439 | chars, atomic); |
544 | ops->unmap(pipe, buf, addr); | 440 | if (atomic) |
441 | kunmap_atomic(addr); | ||
442 | else | ||
443 | kunmap(buf->page); | ||
545 | ret = error; | 444 | ret = error; |
546 | do_wakeup = 1; | 445 | do_wakeup = 1; |
547 | if (error) { | 446 | if (error) { |
diff --git a/fs/pnode.c b/fs/pnode.c index 88396df725b4..302bf22c4a30 100644 --- a/fs/pnode.c +++ b/fs/pnode.c | |||
@@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m, | |||
164 | } | 164 | } |
165 | } | 165 | } |
166 | 166 | ||
167 | /* | 167 | static struct mount *next_group(struct mount *m, struct mount *origin) |
168 | * return the source mount to be used for cloning | ||
169 | * | ||
170 | * @dest the current destination mount | ||
171 | * @last_dest the last seen destination mount | ||
172 | * @last_src the last seen source mount | ||
173 | * @type return CL_SLAVE if the new mount has to be | ||
174 | * cloned as a slave. | ||
175 | */ | ||
176 | static struct mount *get_source(struct mount *dest, | ||
177 | struct mount *last_dest, | ||
178 | struct mount *last_src, | ||
179 | int *type) | ||
180 | { | 168 | { |
181 | struct mount *p_last_src = NULL; | 169 | while (1) { |
182 | struct mount *p_last_dest = NULL; | 170 | while (1) { |
183 | 171 | struct mount *next; | |
184 | while (last_dest != dest->mnt_master) { | 172 | if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) |
185 | p_last_dest = last_dest; | 173 | return first_slave(m); |
186 | p_last_src = last_src; | 174 | next = next_peer(m); |
187 | last_dest = last_dest->mnt_master; | 175 | if (m->mnt_group_id == origin->mnt_group_id) { |
188 | last_src = last_src->mnt_master; | 176 | if (next == origin) |
177 | return NULL; | ||
178 | } else if (m->mnt_slave.next != &next->mnt_slave) | ||
179 | break; | ||
180 | m = next; | ||
181 | } | ||
182 | /* m is the last peer */ | ||
183 | while (1) { | ||
184 | struct mount *master = m->mnt_master; | ||
185 | if (m->mnt_slave.next != &master->mnt_slave_list) | ||
186 | return next_slave(m); | ||
187 | m = next_peer(master); | ||
188 | if (master->mnt_group_id == origin->mnt_group_id) | ||
189 | break; | ||
190 | if (master->mnt_slave.next == &m->mnt_slave) | ||
191 | break; | ||
192 | m = master; | ||
193 | } | ||
194 | if (m == origin) | ||
195 | return NULL; | ||
189 | } | 196 | } |
197 | } | ||
190 | 198 | ||
191 | if (p_last_dest) { | 199 | /* all accesses are serialized by namespace_sem */ |
192 | do { | 200 | static struct user_namespace *user_ns; |
193 | p_last_dest = next_peer(p_last_dest); | 201 | static struct mount *last_dest, *last_source, *dest_master; |
194 | } while (IS_MNT_NEW(p_last_dest)); | 202 | static struct mountpoint *mp; |
195 | /* is that a peer of the earlier? */ | 203 | static struct hlist_head *list; |
196 | if (dest == p_last_dest) { | 204 | |
197 | *type = CL_MAKE_SHARED; | 205 | static int propagate_one(struct mount *m) |
198 | return p_last_src; | 206 | { |
207 | struct mount *child; | ||
208 | int type; | ||
209 | /* skip ones added by this propagate_mnt() */ | ||
210 | if (IS_MNT_NEW(m)) | ||
211 | return 0; | ||
212 | /* skip if mountpoint isn't covered by it */ | ||
213 | if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) | ||
214 | return 0; | ||
215 | if (m->mnt_group_id == last_dest->mnt_group_id) { | ||
216 | type = CL_MAKE_SHARED; | ||
217 | } else { | ||
218 | struct mount *n, *p; | ||
219 | for (n = m; ; n = p) { | ||
220 | p = n->mnt_master; | ||
221 | if (p == dest_master || IS_MNT_MARKED(p)) { | ||
222 | while (last_dest->mnt_master != p) { | ||
223 | last_source = last_source->mnt_master; | ||
224 | last_dest = last_source->mnt_parent; | ||
225 | } | ||
226 | if (n->mnt_group_id != last_dest->mnt_group_id) { | ||
227 | last_source = last_source->mnt_master; | ||
228 | last_dest = last_source->mnt_parent; | ||
229 | } | ||
230 | break; | ||
231 | } | ||
199 | } | 232 | } |
233 | type = CL_SLAVE; | ||
234 | /* beginning of peer group among the slaves? */ | ||
235 | if (IS_MNT_SHARED(m)) | ||
236 | type |= CL_MAKE_SHARED; | ||
200 | } | 237 | } |
201 | /* slave of the earlier, then */ | 238 | |
202 | *type = CL_SLAVE; | 239 | /* Notice when we are propagating across user namespaces */ |
203 | /* beginning of peer group among the slaves? */ | 240 | if (m->mnt_ns->user_ns != user_ns) |
204 | if (IS_MNT_SHARED(dest)) | 241 | type |= CL_UNPRIVILEGED; |
205 | *type |= CL_MAKE_SHARED; | 242 | child = copy_tree(last_source, last_source->mnt.mnt_root, type); |
206 | return last_src; | 243 | if (IS_ERR(child)) |
244 | return PTR_ERR(child); | ||
245 | mnt_set_mountpoint(m, mp, child); | ||
246 | last_dest = m; | ||
247 | last_source = child; | ||
248 | if (m->mnt_master != dest_master) { | ||
249 | read_seqlock_excl(&mount_lock); | ||
250 | SET_MNT_MARK(m->mnt_master); | ||
251 | read_sequnlock_excl(&mount_lock); | ||
252 | } | ||
253 | hlist_add_head(&child->mnt_hash, list); | ||
254 | return 0; | ||
207 | } | 255 | } |
208 | 256 | ||
209 | /* | 257 | /* |
@@ -222,56 +270,48 @@ static struct mount *get_source(struct mount *dest, | |||
222 | int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, | 270 | int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, |
223 | struct mount *source_mnt, struct hlist_head *tree_list) | 271 | struct mount *source_mnt, struct hlist_head *tree_list) |
224 | { | 272 | { |
225 | struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; | 273 | struct mount *m, *n; |
226 | struct mount *m, *child; | ||
227 | int ret = 0; | 274 | int ret = 0; |
228 | struct mount *prev_dest_mnt = dest_mnt; | 275 | |
229 | struct mount *prev_src_mnt = source_mnt; | 276 | /* |
230 | HLIST_HEAD(tmp_list); | 277 | * we don't want to bother passing tons of arguments to |
231 | 278 | * propagate_one(); everything is serialized by namespace_sem, | |
232 | for (m = propagation_next(dest_mnt, dest_mnt); m; | 279 | * so globals will do just fine. |
233 | m = propagation_next(m, dest_mnt)) { | 280 | */ |
234 | int type; | 281 | user_ns = current->nsproxy->mnt_ns->user_ns; |
235 | struct mount *source; | 282 | last_dest = dest_mnt; |
236 | 283 | last_source = source_mnt; | |
237 | if (IS_MNT_NEW(m)) | 284 | mp = dest_mp; |
238 | continue; | 285 | list = tree_list; |
239 | 286 | dest_master = dest_mnt->mnt_master; | |
240 | source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); | 287 | |
241 | 288 | /* all peers of dest_mnt, except dest_mnt itself */ | |
242 | /* Notice when we are propagating across user namespaces */ | 289 | for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { |
243 | if (m->mnt_ns->user_ns != user_ns) | 290 | ret = propagate_one(n); |
244 | type |= CL_UNPRIVILEGED; | 291 | if (ret) |
245 | |||
246 | child = copy_tree(source, source->mnt.mnt_root, type); | ||
247 | if (IS_ERR(child)) { | ||
248 | ret = PTR_ERR(child); | ||
249 | tmp_list = *tree_list; | ||
250 | tmp_list.first->pprev = &tmp_list.first; | ||
251 | INIT_HLIST_HEAD(tree_list); | ||
252 | goto out; | 292 | goto out; |
253 | } | 293 | } |
254 | 294 | ||
255 | if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { | 295 | /* all slave groups */ |
256 | mnt_set_mountpoint(m, dest_mp, child); | 296 | for (m = next_group(dest_mnt, dest_mnt); m; |
257 | hlist_add_head(&child->mnt_hash, tree_list); | 297 | m = next_group(m, dest_mnt)) { |
258 | } else { | 298 | /* everything in that slave group */ |
259 | /* | 299 | n = m; |
260 | * This can happen if the parent mount was bind mounted | 300 | do { |
261 | * on some subdirectory of a shared/slave mount. | 301 | ret = propagate_one(n); |
262 | */ | 302 | if (ret) |
263 | hlist_add_head(&child->mnt_hash, &tmp_list); | 303 | goto out; |
264 | } | 304 | n = next_peer(n); |
265 | prev_dest_mnt = m; | 305 | } while (n != m); |
266 | prev_src_mnt = child; | ||
267 | } | 306 | } |
268 | out: | 307 | out: |
269 | lock_mount_hash(); | 308 | read_seqlock_excl(&mount_lock); |
270 | while (!hlist_empty(&tmp_list)) { | 309 | hlist_for_each_entry(n, tree_list, mnt_hash) { |
271 | child = hlist_entry(tmp_list.first, struct mount, mnt_hash); | 310 | m = n->mnt_parent; |
272 | umount_tree(child, 0); | 311 | if (m->mnt_master != dest_mnt->mnt_master) |
312 | CLEAR_MNT_MARK(m->mnt_master); | ||
273 | } | 313 | } |
274 | unlock_mount_hash(); | 314 | read_sequnlock_excl(&mount_lock); |
275 | return ret; | 315 | return ret; |
276 | } | 316 | } |
277 | 317 | ||
diff --git a/fs/pnode.h b/fs/pnode.h index fc28a27fa892..4a246358b031 100644 --- a/fs/pnode.h +++ b/fs/pnode.h | |||
@@ -16,6 +16,9 @@ | |||
16 | #define IS_MNT_NEW(m) (!(m)->mnt_ns) | 16 | #define IS_MNT_NEW(m) (!(m)->mnt_ns) |
17 | #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) | 17 | #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) |
18 | #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) | 18 | #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) |
19 | #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) | ||
20 | #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) | ||
21 | #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) | ||
19 | 22 | ||
20 | #define CL_EXPIRE 0x01 | 23 | #define CL_EXPIRE 0x01 |
21 | #define CL_SLAVE 0x02 | 24 | #define CL_SLAVE 0x02 |
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 9ae46b87470d..89026095f2b5 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c | |||
@@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl | |||
146 | struct task_struct *task; | 146 | struct task_struct *task; |
147 | void *ns; | 147 | void *ns; |
148 | char name[50]; | 148 | char name[50]; |
149 | int len = -EACCES; | 149 | int res = -EACCES; |
150 | 150 | ||
151 | task = get_proc_task(inode); | 151 | task = get_proc_task(inode); |
152 | if (!task) | 152 | if (!task) |
@@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl | |||
155 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 155 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
156 | goto out_put_task; | 156 | goto out_put_task; |
157 | 157 | ||
158 | len = -ENOENT; | 158 | res = -ENOENT; |
159 | ns = ns_ops->get(task); | 159 | ns = ns_ops->get(task); |
160 | if (!ns) | 160 | if (!ns) |
161 | goto out_put_task; | 161 | goto out_put_task; |
162 | 162 | ||
163 | snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); | 163 | snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); |
164 | len = strlen(name); | 164 | res = readlink_copy(buffer, buflen, name); |
165 | |||
166 | if (len > buflen) | ||
167 | len = buflen; | ||
168 | if (copy_to_user(buffer, name, len)) | ||
169 | len = -EFAULT; | ||
170 | |||
171 | ns_ops->put(ns); | 165 | ns_ops->put(ns); |
172 | out_put_task: | 166 | out_put_task: |
173 | put_task_struct(task); | 167 | put_task_struct(task); |
174 | out: | 168 | out: |
175 | return len; | 169 | return res; |
176 | } | 170 | } |
177 | 171 | ||
178 | static const struct inode_operations proc_ns_link_inode_operations = { | 172 | static const struct inode_operations proc_ns_link_inode_operations = { |
diff --git a/fs/proc/self.c b/fs/proc/self.c index ffeb202ec942..4348bb8907c2 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c | |||
@@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, | |||
16 | if (!tgid) | 16 | if (!tgid) |
17 | return -ENOENT; | 17 | return -ENOENT; |
18 | sprintf(tmp, "%d", tgid); | 18 | sprintf(tmp, "%d", tgid); |
19 | return vfs_readlink(dentry,buffer,buflen,tmp); | 19 | return readlink_copy(buffer, buflen, tmp); |
20 | } | 20 | } |
21 | 21 | ||
22 | static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) | 22 | static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) |
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 7be26f03a3f5..1a81373947f3 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c | |||
@@ -267,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, | |||
267 | p->root = root; | 267 | p->root = root; |
268 | p->m.poll_event = ns->event; | 268 | p->m.poll_event = ns->event; |
269 | p->show = show; | 269 | p->show = show; |
270 | p->cached_event = ~0ULL; | ||
270 | 271 | ||
271 | return 0; | 272 | return 0; |
272 | 273 | ||
diff --git a/fs/splice.c b/fs/splice.c index 12028fa41def..9bc07d2b53cf 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -136,8 +136,6 @@ error: | |||
136 | 136 | ||
137 | const struct pipe_buf_operations page_cache_pipe_buf_ops = { | 137 | const struct pipe_buf_operations page_cache_pipe_buf_ops = { |
138 | .can_merge = 0, | 138 | .can_merge = 0, |
139 | .map = generic_pipe_buf_map, | ||
140 | .unmap = generic_pipe_buf_unmap, | ||
141 | .confirm = page_cache_pipe_buf_confirm, | 139 | .confirm = page_cache_pipe_buf_confirm, |
142 | .release = page_cache_pipe_buf_release, | 140 | .release = page_cache_pipe_buf_release, |
143 | .steal = page_cache_pipe_buf_steal, | 141 | .steal = page_cache_pipe_buf_steal, |
@@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, | |||
156 | 154 | ||
157 | static const struct pipe_buf_operations user_page_pipe_buf_ops = { | 155 | static const struct pipe_buf_operations user_page_pipe_buf_ops = { |
158 | .can_merge = 0, | 156 | .can_merge = 0, |
159 | .map = generic_pipe_buf_map, | ||
160 | .unmap = generic_pipe_buf_unmap, | ||
161 | .confirm = generic_pipe_buf_confirm, | 157 | .confirm = generic_pipe_buf_confirm, |
162 | .release = page_cache_pipe_buf_release, | 158 | .release = page_cache_pipe_buf_release, |
163 | .steal = user_page_pipe_buf_steal, | 159 | .steal = user_page_pipe_buf_steal, |
@@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read); | |||
547 | 543 | ||
548 | static const struct pipe_buf_operations default_pipe_buf_ops = { | 544 | static const struct pipe_buf_operations default_pipe_buf_ops = { |
549 | .can_merge = 0, | 545 | .can_merge = 0, |
550 | .map = generic_pipe_buf_map, | ||
551 | .unmap = generic_pipe_buf_unmap, | ||
552 | .confirm = generic_pipe_buf_confirm, | 546 | .confirm = generic_pipe_buf_confirm, |
553 | .release = generic_pipe_buf_release, | 547 | .release = generic_pipe_buf_release, |
554 | .steal = generic_pipe_buf_steal, | 548 | .steal = generic_pipe_buf_steal, |
@@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, | |||
564 | /* Pipe buffer operations for a socket and similar. */ | 558 | /* Pipe buffer operations for a socket and similar. */ |
565 | const struct pipe_buf_operations nosteal_pipe_buf_ops = { | 559 | const struct pipe_buf_operations nosteal_pipe_buf_ops = { |
566 | .can_merge = 0, | 560 | .can_merge = 0, |
567 | .map = generic_pipe_buf_map, | ||
568 | .unmap = generic_pipe_buf_unmap, | ||
569 | .confirm = generic_pipe_buf_confirm, | 561 | .confirm = generic_pipe_buf_confirm, |
570 | .release = generic_pipe_buf_release, | 562 | .release = generic_pipe_buf_release, |
571 | .steal = generic_pipe_buf_nosteal, | 563 | .steal = generic_pipe_buf_nosteal, |
@@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | |||
767 | goto out; | 759 | goto out; |
768 | 760 | ||
769 | if (buf->page != page) { | 761 | if (buf->page != page) { |
770 | char *src = buf->ops->map(pipe, buf, 1); | 762 | char *src = kmap_atomic(buf->page); |
771 | char *dst = kmap_atomic(page); | 763 | char *dst = kmap_atomic(page); |
772 | 764 | ||
773 | memcpy(dst + offset, src + buf->offset, this_len); | 765 | memcpy(dst + offset, src + buf->offset, this_len); |
774 | flush_dcache_page(page); | 766 | flush_dcache_page(page); |
775 | kunmap_atomic(dst); | 767 | kunmap_atomic(dst); |
776 | buf->ops->unmap(pipe, buf, src); | 768 | kunmap_atomic(src); |
777 | } | 769 | } |
778 | ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, | 770 | ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, |
779 | page, fsdata); | 771 | page, fsdata); |
@@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | |||
1067 | void *data; | 1059 | void *data; |
1068 | loff_t tmp = sd->pos; | 1060 | loff_t tmp = sd->pos; |
1069 | 1061 | ||
1070 | data = buf->ops->map(pipe, buf, 0); | 1062 | data = kmap(buf->page); |
1071 | ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); | 1063 | ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); |
1072 | buf->ops->unmap(pipe, buf, data); | 1064 | kunmap(buf->page); |
1073 | 1065 | ||
1074 | return ret; | 1066 | return ret; |
1075 | } | 1067 | } |
@@ -1528,116 +1520,48 @@ static int get_iovec_page_array(const struct iovec __user *iov, | |||
1528 | static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | 1520 | static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, |
1529 | struct splice_desc *sd) | 1521 | struct splice_desc *sd) |
1530 | { | 1522 | { |
1531 | char *src; | 1523 | int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); |
1532 | int ret; | 1524 | return n == sd->len ? n : -EFAULT; |
1533 | |||
1534 | /* | ||
1535 | * See if we can use the atomic maps, by prefaulting in the | ||
1536 | * pages and doing an atomic copy | ||
1537 | */ | ||
1538 | if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { | ||
1539 | src = buf->ops->map(pipe, buf, 1); | ||
1540 | ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, | ||
1541 | sd->len); | ||
1542 | buf->ops->unmap(pipe, buf, src); | ||
1543 | if (!ret) { | ||
1544 | ret = sd->len; | ||
1545 | goto out; | ||
1546 | } | ||
1547 | } | ||
1548 | |||
1549 | /* | ||
1550 | * No dice, use slow non-atomic map and copy | ||
1551 | */ | ||
1552 | src = buf->ops->map(pipe, buf, 0); | ||
1553 | |||
1554 | ret = sd->len; | ||
1555 | if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) | ||
1556 | ret = -EFAULT; | ||
1557 | |||
1558 | buf->ops->unmap(pipe, buf, src); | ||
1559 | out: | ||
1560 | if (ret > 0) | ||
1561 | sd->u.userptr += ret; | ||
1562 | return ret; | ||
1563 | } | 1525 | } |
1564 | 1526 | ||
1565 | /* | 1527 | /* |
1566 | * For lack of a better implementation, implement vmsplice() to userspace | 1528 | * For lack of a better implementation, implement vmsplice() to userspace |
1567 | * as a simple copy of the pipes pages to the user iov. | 1529 | * as a simple copy of the pipes pages to the user iov. |
1568 | */ | 1530 | */ |
1569 | static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, | 1531 | static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, |
1570 | unsigned long nr_segs, unsigned int flags) | 1532 | unsigned long nr_segs, unsigned int flags) |
1571 | { | 1533 | { |
1572 | struct pipe_inode_info *pipe; | 1534 | struct pipe_inode_info *pipe; |
1573 | struct splice_desc sd; | 1535 | struct splice_desc sd; |
1574 | ssize_t size; | ||
1575 | int error; | ||
1576 | long ret; | 1536 | long ret; |
1537 | struct iovec iovstack[UIO_FASTIOV]; | ||
1538 | struct iovec *iov = iovstack; | ||
1539 | struct iov_iter iter; | ||
1540 | ssize_t count = 0; | ||
1577 | 1541 | ||
1578 | pipe = get_pipe_info(file); | 1542 | pipe = get_pipe_info(file); |
1579 | if (!pipe) | 1543 | if (!pipe) |
1580 | return -EBADF; | 1544 | return -EBADF; |
1581 | 1545 | ||
1582 | pipe_lock(pipe); | 1546 | ret = rw_copy_check_uvector(READ, uiov, nr_segs, |
1583 | 1547 | ARRAY_SIZE(iovstack), iovstack, &iov); | |
1584 | error = ret = 0; | 1548 | if (ret <= 0) |
1585 | while (nr_segs) { | 1549 | return ret; |
1586 | void __user *base; | ||
1587 | size_t len; | ||
1588 | |||
1589 | /* | ||
1590 | * Get user address base and length for this iovec. | ||
1591 | */ | ||
1592 | error = get_user(base, &iov->iov_base); | ||
1593 | if (unlikely(error)) | ||
1594 | break; | ||
1595 | error = get_user(len, &iov->iov_len); | ||
1596 | if (unlikely(error)) | ||
1597 | break; | ||
1598 | |||
1599 | /* | ||
1600 | * Sanity check this iovec. 0 read succeeds. | ||
1601 | */ | ||
1602 | if (unlikely(!len)) | ||
1603 | break; | ||
1604 | if (unlikely(!base)) { | ||
1605 | error = -EFAULT; | ||
1606 | break; | ||
1607 | } | ||
1608 | |||
1609 | if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { | ||
1610 | error = -EFAULT; | ||
1611 | break; | ||
1612 | } | ||
1613 | |||
1614 | sd.len = 0; | ||
1615 | sd.total_len = len; | ||
1616 | sd.flags = flags; | ||
1617 | sd.u.userptr = base; | ||
1618 | sd.pos = 0; | ||
1619 | |||
1620 | size = __splice_from_pipe(pipe, &sd, pipe_to_user); | ||
1621 | if (size < 0) { | ||
1622 | if (!ret) | ||
1623 | ret = size; | ||
1624 | |||
1625 | break; | ||
1626 | } | ||
1627 | |||
1628 | ret += size; | ||
1629 | 1550 | ||
1630 | if (size < len) | 1551 | iov_iter_init(&iter, iov, nr_segs, count, 0); |
1631 | break; | ||
1632 | 1552 | ||
1633 | nr_segs--; | 1553 | sd.len = 0; |
1634 | iov++; | 1554 | sd.total_len = count; |
1635 | } | 1555 | sd.flags = flags; |
1556 | sd.u.data = &iter; | ||
1557 | sd.pos = 0; | ||
1636 | 1558 | ||
1559 | pipe_lock(pipe); | ||
1560 | ret = __splice_from_pipe(pipe, &sd, pipe_to_user); | ||
1637 | pipe_unlock(pipe); | 1561 | pipe_unlock(pipe); |
1638 | 1562 | ||
1639 | if (!ret) | 1563 | if (iov != iovstack) |
1640 | ret = error; | 1564 | kfree(iov); |
1641 | 1565 | ||
1642 | return ret; | 1566 | return ret; |
1643 | } | 1567 | } |
diff --git a/fs/udf/file.c b/fs/udf/file.c index 1037637957c7..d2c170f8b035 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c | |||
@@ -171,7 +171,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
171 | } else | 171 | } else |
172 | up_write(&iinfo->i_data_sem); | 172 | up_write(&iinfo->i_data_sem); |
173 | 173 | ||
174 | retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 174 | retval = __generic_file_aio_write(iocb, iov, nr_segs); |
175 | mutex_unlock(&inode->i_mutex); | 175 | mutex_unlock(&inode->i_mutex); |
176 | 176 | ||
177 | if (retval > 0) { | 177 | if (retval > 0) { |
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 003c0051b62f..79e96ce98733 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
@@ -699,7 +699,7 @@ xfs_file_dio_aio_write( | |||
699 | 699 | ||
700 | trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); | 700 | trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); |
701 | ret = generic_file_direct_write(iocb, iovp, | 701 | ret = generic_file_direct_write(iocb, iovp, |
702 | &nr_segs, pos, &iocb->ki_pos, count, ocount); | 702 | &nr_segs, pos, count, ocount); |
703 | 703 | ||
704 | out: | 704 | out: |
705 | xfs_rw_iunlock(ip, iolock); | 705 | xfs_rw_iunlock(ip, iolock); |
@@ -715,7 +715,7 @@ xfs_file_buffered_aio_write( | |||
715 | const struct iovec *iovp, | 715 | const struct iovec *iovp, |
716 | unsigned long nr_segs, | 716 | unsigned long nr_segs, |
717 | loff_t pos, | 717 | loff_t pos, |
718 | size_t ocount) | 718 | size_t count) |
719 | { | 719 | { |
720 | struct file *file = iocb->ki_filp; | 720 | struct file *file = iocb->ki_filp; |
721 | struct address_space *mapping = file->f_mapping; | 721 | struct address_space *mapping = file->f_mapping; |
@@ -724,7 +724,7 @@ xfs_file_buffered_aio_write( | |||
724 | ssize_t ret; | 724 | ssize_t ret; |
725 | int enospc = 0; | 725 | int enospc = 0; |
726 | int iolock = XFS_IOLOCK_EXCL; | 726 | int iolock = XFS_IOLOCK_EXCL; |
727 | size_t count = ocount; | 727 | struct iov_iter from; |
728 | 728 | ||
729 | xfs_rw_ilock(ip, iolock); | 729 | xfs_rw_ilock(ip, iolock); |
730 | 730 | ||
@@ -732,14 +732,15 @@ xfs_file_buffered_aio_write( | |||
732 | if (ret) | 732 | if (ret) |
733 | goto out; | 733 | goto out; |
734 | 734 | ||
735 | iov_iter_init(&from, iovp, nr_segs, count, 0); | ||
735 | /* We can write back this queue in page reclaim */ | 736 | /* We can write back this queue in page reclaim */ |
736 | current->backing_dev_info = mapping->backing_dev_info; | 737 | current->backing_dev_info = mapping->backing_dev_info; |
737 | 738 | ||
738 | write_retry: | 739 | write_retry: |
739 | trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); | 740 | trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); |
740 | ret = generic_file_buffered_write(iocb, iovp, nr_segs, | 741 | ret = generic_perform_write(file, &from, pos); |
741 | pos, &iocb->ki_pos, count, 0); | 742 | if (likely(ret >= 0)) |
742 | 743 | iocb->ki_pos = pos + ret; | |
743 | /* | 744 | /* |
744 | * If we just got an ENOSPC, try to write back all dirty inodes to | 745 | * If we just got an ENOSPC, try to write back all dirty inodes to |
745 | * convert delalloc space to free up some of the excess reserved | 746 | * convert delalloc space to free up some of the excess reserved |
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bcfe61202115..0b18776b075e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c | |||
@@ -271,32 +271,6 @@ xfs_open_by_handle( | |||
271 | return error; | 271 | return error; |
272 | } | 272 | } |
273 | 273 | ||
274 | /* | ||
275 | * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's | ||
276 | * unused first argument. | ||
277 | */ | ||
278 | STATIC int | ||
279 | do_readlink( | ||
280 | char __user *buffer, | ||
281 | int buflen, | ||
282 | const char *link) | ||
283 | { | ||
284 | int len; | ||
285 | |||
286 | len = PTR_ERR(link); | ||
287 | if (IS_ERR(link)) | ||
288 | goto out; | ||
289 | |||
290 | len = strlen(link); | ||
291 | if (len > (unsigned) buflen) | ||
292 | len = buflen; | ||
293 | if (copy_to_user(buffer, link, len)) | ||
294 | len = -EFAULT; | ||
295 | out: | ||
296 | return len; | ||
297 | } | ||
298 | |||
299 | |||
300 | int | 274 | int |
301 | xfs_readlink_by_handle( | 275 | xfs_readlink_by_handle( |
302 | struct file *parfilp, | 276 | struct file *parfilp, |
@@ -334,7 +308,7 @@ xfs_readlink_by_handle( | |||
334 | error = -xfs_readlink(XFS_I(dentry->d_inode), link); | 308 | error = -xfs_readlink(XFS_I(dentry->d_inode), link); |
335 | if (error) | 309 | if (error) |
336 | goto out_kfree; | 310 | goto out_kfree; |
337 | error = do_readlink(hreq->ohandle, olen, link); | 311 | error = readlink_copy(hreq->ohandle, olen, link); |
338 | if (error) | 312 | if (error) |
339 | goto out_kfree; | 313 | goto out_kfree; |
340 | 314 | ||