66 files changed, 1446 insertions, 933 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index eb14e055ea83..ff1a5bac4200 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,7 +33,7 @@
 #include <linux/pagemap.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
diff --git a/fs/Makefile b/fs/Makefile
index a88ac4838c9e..cb92fd4c3172 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -118,6 +118,7 @@ obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)             += hppfs/
 obj-$(CONFIG_CACHEFILES)        += cachefiles/
 obj-$(CONFIG_DEBUG_FS)          += debugfs/
+obj-$(CONFIG_TRACING)           += tracefs/
 obj-$(CONFIG_OCFS2_FS)          += ocfs2/
 obj-$(CONFIG_BTRFS_FS)          += btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a91795e01a7f..3aa7eb66547e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -12,7 +12,7 @@
 *  affs regular file handling primitives
 */
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include "affs.h"
 static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c13cb08964ed..0714abcd7f32 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,7 +14,6 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
-#include <linux/aio.h>
 #include "internal.h"
 static int afs_write_back_from_locked_page(struct afs_writeback *wb,
diff --git a/fs/aio.c b/fs/aio.c
index a793f7023755..1ab60010cf6c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -151,6 +151,38 @@ struct kioctx {
        unsigned                id;
 };
+/*
+ * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ * cancelled or completed (this makes a certain amount of sense because
+ * successful cancellation - io_cancel() - does deliver the completion to
+ * userspace).
+ *
+ * And since most things don't implement kiocb cancellation and we'd really like
+ * kiocb completion to be lockless when possible, we use ki_cancel to
+ * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
+ */
+#define KIOCB_CANCELLED         ((void *) (~0ULL))
+/*
+ * aio-private request state wrapped around the generic struct kiocb.
+ * Code in this file that is handed a struct kiocb pointer recovers the
+ * wrapper with container_of(..., struct aio_kiocb, common).
+ */
+struct aio_kiocb {
+        /* generic kiocb; holds ki_filp, ki_pos, ki_complete and ki_flags */
+        struct kiocb            common;
+        /* kioctx read back by aio_complete() and kiocb_set_cancel_fn() */
+        struct kioctx           *ki_ctx;
+        /* cancel callback, or KIOCB_CANCELLED once kiocb_cancel() claimed it */
+        kiocb_cancel_fn         *ki_cancel;
+        struct iocb __user      *ki_user_iocb;  /* user's aiocb */
+        __u64                   ki_user_data;   /* user's data for completion */
+        struct list_head        ki_list;        /* the aio core uses this
+                                                 * for cancellation */
+        /*
+         * If the aio_resfd field of the userspace iocb is not zero,
+         * this is the underlying eventfd context to deliver events to.
+         */
+        struct eventfd_ctx      *ki_eventfd;
+};
 /*------ sysctl variables----*/
 static DEFINE_SPINLOCK(aio_nr_lock);
 unsigned long aio_nr;           /* current system wide number of aio requests */
@@ -220,7 +252,7 @@ static int __init aio_setup(void)
        if (IS_ERR(aio_mnt))
                panic("Failed to create aio fs mount.");
-        kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+        kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
@@ -484,8 +516,9 @@ static int aio_setup_ring(struct kioctx *ctx)
 #define AIO_EVENTS_FIRST_PAGE   ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET       (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 {
+        struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
        struct kioctx *ctx = req->ki_ctx;
        unsigned long flags;
@@ -500,7 +533,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
-static int kiocb_cancel(struct kiocb *kiocb)
+static int kiocb_cancel(struct aio_kiocb *kiocb)
 {
        kiocb_cancel_fn *old, *cancel;
@@ -518,7 +551,7 @@ static int kiocb_cancel(struct kiocb *kiocb)
                cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
        } while (cancel != old);
-        return cancel(kiocb);
+        return cancel(&kiocb->common);
 }
 static void free_ioctx(struct work_struct *work)
@@ -554,13 +587,13 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 static void free_ioctx_users(struct percpu_ref *ref)
 {
        struct kioctx *ctx = container_of(ref, struct kioctx, users);
-        struct kiocb *req;
+        struct aio_kiocb *req;
        spin_lock_irq(&ctx->ctx_lock);
        while (!list_empty(&ctx->active_reqs)) {
                req = list_first_entry(&ctx->active_reqs,
-                                       struct kiocb, ki_list);
+                                       struct aio_kiocb, ki_list);
                list_del_init(&req->ki_list);
                kiocb_cancel(req);
@@ -786,22 +819,6 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
        return 0;
 }
-/* wait_on_sync_kiocb:
- *      Waits on the given sync kiocb to complete.
- */
-ssize_t wait_on_sync_kiocb(struct kiocb *req)
-{
-        while (!req->ki_ctx) {
-                set_current_state(TASK_UNINTERRUPTIBLE);
-                if (req->ki_ctx)
-                        break;
-                io_schedule();
-        }
-        __set_current_state(TASK_RUNNING);
-        return req->ki_user_data;
-}
-EXPORT_SYMBOL(wait_on_sync_kiocb);
 /*
 * exit_aio: called when the last user of mm goes away.  At this point, there is
 * no way for any new requests to be submited or any of the io_* syscalls to be
@@ -956,9 +973,9 @@ static void user_refill_reqs_available(struct kioctx *ctx)
 *      Allocate a slot for an aio request.
 * Returns NULL if no requests are free.
 */
-static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 {
-        struct kiocb *req;
+        struct aio_kiocb *req;
        if (!get_reqs_available(ctx)) {
                user_refill_reqs_available(ctx);
@@ -979,10 +996,10 @@ out_put:
        return NULL;
 }
-static void kiocb_free(struct kiocb *req)
+static void kiocb_free(struct aio_kiocb *req)
 {
-        if (req->ki_filp)
+        if (req->common.ki_filp)
-                fput(req->ki_filp);
+                fput(req->common.ki_filp);
        if (req->ki_eventfd != NULL)
                eventfd_ctx_put(req->ki_eventfd);
        kmem_cache_free(kiocb_cachep, req);
@@ -1018,8 +1035,9 @@ out:
 /* aio_complete
 *      Called when the io request on the given iocb is complete.
 */
-void aio_complete(struct kiocb *iocb, long res, long res2)
+static void aio_complete(struct kiocb *kiocb, long res, long res2)
 {
+        struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
        struct kioctx   *ctx = iocb->ki_ctx;
        struct aio_ring *ring;
        struct io_event *ev_page, *event;
@@ -1033,13 +1051,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
         *    ref, no other paths have a way to get another ref
         *  - the sync task helpfully left a reference to itself in the iocb
         */
-        if (is_sync_kiocb(iocb)) {
+        BUG_ON(is_sync_kiocb(kiocb));
-                iocb->ki_user_data = res;
-                smp_wmb();
-                iocb->ki_ctx = ERR_PTR(-EXDEV);
-                wake_up_process(iocb->ki_obj.tsk);
-                return;
-        }
        if (iocb->ki_list.next) {
                unsigned long flags;
@@ -1065,7 +1077,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
        ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
        event = ev_page + pos % AIO_EVENTS_PER_PAGE;
-        event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+        event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
        event->data = iocb->ki_user_data;
        event->res = res;
        event->res2 = res2;
@@ -1074,7 +1086,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
        flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
        pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
-                 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+                 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
                 res, res2);
        /* after flagging the request as done, we
@@ -1121,7 +1133,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
        percpu_ref_put(&ctx->reqs);
 }
-EXPORT_SYMBOL(aio_complete);
 /* aio_read_events_ring
 *      Pull an event off of the ioctx's event ring.  Returns the number of
@@ -1349,46 +1360,19 @@ typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
                            unsigned long, loff_t);
 typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
-static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
+static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len,
-                                     int rw, char __user *buf,
+                                 struct iovec **iovec,
-                                     unsigned long *nr_segs,
+                                 bool compat,
-                                     struct iovec **iovec,
+                                 struct iov_iter *iter)
-                                     bool compat)
 {
-        ssize_t ret;
-        *nr_segs = kiocb->ki_nbytes;
 #ifdef CONFIG_COMPAT
        if (compat)
-                ret = compat_rw_copy_check_uvector(rw,
+                return compat_import_iovec(rw,
                                (struct compat_iovec __user *)buf,
-                                *nr_segs, UIO_FASTIOV, *iovec, iovec);
+                                len, UIO_FASTIOV, iovec, iter);
-        else
 #endif
-                ret = rw_copy_check_uvector(rw,
+        return import_iovec(rw, (struct iovec __user *)buf,
-                                (struct iovec __user *)buf,
+                                len, UIO_FASTIOV, iovec, iter);
-                                *nr_segs, UIO_FASTIOV, *iovec, iovec);
-        if (ret < 0)
-                return ret;
-        /* ki_nbytes now reflect bytes instead of segs */
-        kiocb->ki_nbytes = ret;
-        return 0;
-}
-static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
-                                       int rw, char __user *buf,
-                                       unsigned long *nr_segs,
-                                       struct iovec *iovec)
-{
-        if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
-                return -EFAULT;
-        iovec->iov_base = buf;
-        iovec->iov_len = kiocb->ki_nbytes;
-        *nr_segs = 1;
-        return 0;
 }
 /*
@@ -1396,11 +1380,10 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
 *      Performs the initial checks and io submission.
 */
 static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
-                            char __user *buf, bool compat)
+                            char __user *buf, size_t len, bool compat)
 {
        struct file *file = req->ki_filp;
        ssize_t ret;
-        unsigned long nr_segs;
        int rw;
        fmode_t mode;
        aio_rw_op *rw_op;
@@ -1431,21 +1414,22 @@ rw_common:
                if (!rw_op && !iter_op)
                        return -EINVAL;
-                ret = (opcode == IOCB_CMD_PREADV ||
+                if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV)
-                       opcode == IOCB_CMD_PWRITEV)
+                        ret = aio_setup_vectored_rw(rw, buf, len,
-                        ? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
+                                                &iovec, compat, &iter);
-                                                &iovec, compat)
+                else {
-                        : aio_setup_single_vector(req, rw, buf, &nr_segs,
+                        ret = import_single_range(rw, buf, len, iovec, &iter);
-                                                  iovec);
+                        iovec = NULL;
+                }
                if (!ret)
-                        ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+                        ret = rw_verify_area(rw, file, &req->ki_pos,
+                                             iov_iter_count(&iter));
                if (ret < 0) {
-                        if (iovec != inline_vecs)
+                        kfree(iovec);
-                                kfree(iovec);
                        return ret;
                }
-                req->ki_nbytes = ret;
+                len = ret;
                /* XXX: move/kill - rw_verify_area()? */
                /* This matches the pread()/pwrite() logic */
@@ -1458,14 +1442,14 @@ rw_common:
                        file_start_write(file);
                if (iter_op) {
-                        iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
                        ret = iter_op(req, &iter);
                } else {
-                        ret = rw_op(req, iovec, nr_segs, req->ki_pos);
+                        ret = rw_op(req, iter.iov, iter.nr_segs, req->ki_pos);
                }
                if (rw == WRITE)
                        file_end_write(file);
+                kfree(iovec);
                break;
        case IOCB_CMD_FDSYNC:
@@ -1487,9 +1471,6 @@ rw_common:
                return -EINVAL;
        }
-        if (iovec != inline_vecs)
-                kfree(iovec);
        if (ret != -EIOCBQUEUED) {
                /*
                 * There's no easy way to restart the syscall since other AIO's
@@ -1508,7 +1489,7 @@ rw_common:
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                         struct iocb *iocb, bool compat)
 {
-        struct kiocb *req;
+        struct aio_kiocb *req;
        ssize_t ret;
        /* enforce forwards compatibility on users */
@@ -1531,11 +1512,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
        if (unlikely(!req))
                return -EAGAIN;
-        req->ki_filp = fget(iocb->aio_fildes);
+        req->common.ki_filp = fget(iocb->aio_fildes);
-        if (unlikely(!req->ki_filp)) {
+        if (unlikely(!req->common.ki_filp)) {
                ret = -EBADF;
                goto out_put_req;
        }
+        req->common.ki_pos = iocb->aio_offset;
+        req->common.ki_complete = aio_complete;
+        req->common.ki_flags = 0;
        if (iocb->aio_flags & IOCB_FLAG_RESFD) {
                /*
@@ -1550,6 +1534,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                        req->ki_eventfd = NULL;
                        goto out_put_req;
                }
+                req->common.ki_flags |= IOCB_EVENTFD;
        }
        ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@ -1558,13 +1544,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                goto out_put_req;
        }
-        req->ki_obj.user = user_iocb;
+        req->ki_user_iocb = user_iocb;
        req->ki_user_data = iocb->aio_data;
-        req->ki_pos = iocb->aio_offset;
-        req->ki_nbytes = iocb->aio_nbytes;
-        ret = aio_run_iocb(req, iocb->aio_lio_opcode,
+        ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode,
                           (char __user *)(unsigned long)iocb->aio_buf,
+                           iocb->aio_nbytes,
                           compat);
        if (ret)
                goto out_put_req;
@@ -1651,10 +1636,10 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 /* lookup_kiocb
 *      Finds a given iocb for cancellation.
 */
-static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
+static struct aio_kiocb *
-                                  u32 key)
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
 {
-        struct list_head *pos;
+        struct aio_kiocb *kiocb;
        assert_spin_locked(&ctx->ctx_lock);
@@ -1662,9 +1647,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
                return NULL;
        /* TODO: use a hash or array, this sucks. */
-        list_for_each(pos, &ctx->active_reqs) {
+        list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
-                struct kiocb *kiocb = list_kiocb(pos);
+                if (kiocb->ki_user_iocb == iocb)
-                if (kiocb->ki_obj.user == iocb)
                        return kiocb;
        }
        return NULL;
@@ -1684,7 +1668,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
                struct io_event __user *, result)
 {
        struct kioctx *ctx;
-        struct kiocb *kiocb;
+        struct aio_kiocb *kiocb;
        u32 key;
        int ret;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 90bc079d9982..fdcb4d69f430 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
+#include <linux/uio.h>
 #include <asm/uaccess.h>
 #include "bfs.h"
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 975266be67d3..2e522aed6584 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -27,7 +27,6 @@
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
-#include <linux/aio.h>
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 30982bbd31c3..aee18f84e315 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,7 +24,6 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
-#include <linux/aio.h>
 #include <linux/falloc.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
@@ -32,6 +31,7 @@
 #include <linux/compat.h>
 #include <linux/slab.h>
 #include <linux/btrfs.h>
+#include <linux/uio.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d2e732d7af52..686331f22b15 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,7 +32,6 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
-#include <linux/aio.h>
 #include <linux/bit_spinlock.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
@@ -43,6 +42,7 @@
 #include <linux/btrfs.h>
 #include <linux/blkdev.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d533075a823d..139f2fea91a0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,7 +7,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/writeback.h>
-#include <linux/aio.h>
 #include <linux/falloc.h>
 #include "super.h"
@@ -808,7 +807,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
        struct file *filp = iocb->ki_filp;
        struct ceph_file_info *fi = filp->private_data;
-        size_t len = iocb->ki_nbytes;
+        size_t len = iov_iter_count(to);
        struct inode *inode = file_inode(filp);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct page *pinned_page = NULL;
diff --git a/fs/dcache.c b/fs/dcache.c
index c71e3732e53b..d99736a63e3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2690,7 +2690,7 @@ static int __d_unalias(struct inode *inode,
                struct dentry *dentry, struct dentry *alias)
 {
        struct mutex *m1 = NULL, *m2 = NULL;
-        int ret = -EBUSY;
+        int ret = -ESTALE;
        /* If alias and dentry share a parent, then no extra locks required */
        if (alias->d_parent == dentry->d_parent)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b2e297..6fb00e3f1059 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,7 +37,6 @@
 #include <linux/uio.h>
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
-#include <linux/aio.h>
 /*
 * How many user pages to map in one call to get_user_pages().  This determines
@@ -265,7 +264,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
                                ret = err;
                }
-                aio_complete(dio->iocb, ret, 0);
+                dio->iocb->ki_complete(dio->iocb, ret, 0);
        }
        kmem_cache_free(dio_cache, dio);
@@ -1056,7 +1055,7 @@ static inline int drop_refcount(struct dio *dio)
         * operation.  AIO can if it was a broken operation described above or
         * in fact if all the bios race to complete before we get here.  In
         * that case dio_complete() translates the EIOCBQUEUED into the proper
-         * return code that the caller will hand to aio_complete().
+         * return code that the caller will hand to ->ki_complete().
         *
         * This is managed by the bio_lock instead of being an atomic_t so that
         * completion paths can drop their ref and use the remaining count to
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index fd39bad6f1bd..79675089443d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
 #include <linux/security.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
-#include <linux/aio.h>
 #include "ecryptfs_kernel.h"
 /**
@@ -52,12 +51,6 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        rc = generic_file_read_iter(iocb, to);
-        /*
-         * Even though this is a async interface, we need to wait
-         * for IO to finish to update atime
-         */
-        if (-EIOCBQUEUED == rc)
-                rc = wait_on_sync_kiocb(iocb);
        if (rc >= 0) {
                path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
                touch_atime(path);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6434bc000125..df9d6afbc5d5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,7 +31,7 @@
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
 #include <linux/namei.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xattr.h"
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2c6ccc49ba27..db07ffbe7c85 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,7 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/namei.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include "ext3.h"
 #include "xattr.h"
 #include "acl.h"
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 33a09da16c9c..598abbbe6786 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,9 +23,9 @@
 #include <linux/jbd2.h>
 #include <linux/mount.h>
 #include <linux/path.h>
-#include <linux/aio.h>
 #include <linux/quotaops.h>
 #include <linux/pagevec.h>
+#include <linux/uio.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 45fe924f82bc..740c7871c117 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,9 +20,9 @@
 *      (sct@redhat.com), 1993, 1998
 */
-#include <linux/aio.h>
 #include "ext4_jbd2.h"
 #include "truncate.h"
+#include <linux/uio.h>
 #include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cb9a212b86f..a3f451370bef 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,7 +37,6 @@
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
-#include <linux/aio.h>
 #include <linux/bitops.h>
 #include "ext4_jbd2.h"
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b24a2541a9ba..464984261e69 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -18,7 +18,6 @@
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/namei.h>
-#include <linux/aio.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 985ed023a750..497f8515d205 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -12,12 +12,12 @@
 #include <linux/f2fs_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
-#include <linux/aio.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/prefetch.h>
+#include <linux/uio.h>
 #include "f2fs.h"
 #include "node.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 497c7c5263c7..8521207de229 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -19,7 +19,6 @@
 #include <linux/mpage.h>
 #include <linux/buffer_head.h>
 #include <linux/mount.h>
-#include <linux/aio.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
 #include <linux/uio.h>
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 28d0c7abba1c..b3fa05032234 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -38,7 +38,6 @@
 #include <linux/device.h>
 #include <linux/file.h>
 #include <linux/fs.h>
-#include <linux/aio.h>
 #include <linux/kdev_t.h>
 #include <linux/kthread.h>
 #include <linux/list.h>
@@ -48,6 +47,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/module.h>
+#include <linux/uio.h>
 #include "fuse_i.h"
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 39706c57ad3c..95a2797eef66 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,7 +19,6 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/swap.h>
 #include <linux/splice.h>
-#include <linux/aio.h>
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
 MODULE_ALIAS("devname:fuse");
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c01ec3bdcfd8..ff102cbf16ea 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -15,8 +15,8 @@
 #include <linux/module.h>
 #include <linux/compat.h>
 #include <linux/swap.h>
-#include <linux/aio.h>
 #include <linux/falloc.h>
+#include <linux/uio.h>
 static const struct file_operations fuse_direct_io_file_operations;
@@ -528,6 +528,17 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
        }
 }
+/*
+ * Work out the final result of the I/O described by @io.
+ *
+ * A recorded error (io->err) takes precedence.  Otherwise a write with
+ * io->bytes >= 0 is reported as -EIO.  In the remaining cases the result
+ * is io->size when io->bytes is negative, else io->bytes.
+ *
+ * NOTE(review): io->bytes >= 0 presumably marks a short transfer (it is
+ * set from 'pos' in fuse_aio_complete()) - confirm against the callers.
+ */
+static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
+{
+        if (io->err)
+                return io->err;
+        if (io->bytes >= 0 && io->write)
+                return -EIO;
+        return io->bytes < 0 ? io->size : io->bytes;
+}
 /**
 * In case of short read, the caller sets 'pos' to the position of
 * actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -546,6 +557,7 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 */
 static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 {
+        bool is_sync = is_sync_kiocb(io->iocb);
        int left;
        spin_lock(&io->lock);
@@ -555,30 +567,24 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
                io->bytes = pos;
        left = --io->reqs;
+        if (!left && is_sync)
+                complete(io->done);
        spin_unlock(&io->lock);
-        if (!left) {
+        if (!left && !is_sync) {
-                long res;
+                ssize_t res = fuse_get_res_by_io(io);
-                if (io->err)
+                if (res >= 0) {
-                        res = io->err;
+                        struct inode *inode = file_inode(io->iocb->ki_filp);
-                else if (io->bytes >= 0 && io->write)
+                        struct fuse_conn *fc = get_fuse_conn(inode);
-                        res = -EIO;
+                        struct fuse_inode *fi = get_fuse_inode(inode);
-                else {
-                        res = io->bytes < 0 ? io->size : io->bytes;
-                        if (!is_sync_kiocb(io->iocb)) {
+                        spin_lock(&fc->lock);
-                                struct inode *inode = file_inode(io->iocb->ki_filp);
+                        fi->attr_version = ++fc->attr_version;
-                                struct fuse_conn *fc = get_fuse_conn(inode);
+                        spin_unlock(&fc->lock);
-                                struct fuse_inode *fi = get_fuse_inode(inode);
-                                spin_lock(&fc->lock);
-                                fi->attr_version = ++fc->attr_version;
-                                spin_unlock(&fc->lock);
-                        }
                }
-                aio_complete(io->iocb, res, 0);
+                io->iocb->ki_complete(io->iocb, res, 0);
                kfree(io);
        }
 }
@@ -2801,6 +2807,7 @@ static ssize_t
 fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
                        loff_t offset)
 {
+        DECLARE_COMPLETION_ONSTACK(wait);
        ssize_t ret = 0;
        struct file *file = iocb->ki_filp;
        struct fuse_file *ff = file->private_data;
@@ -2852,6 +2859,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
        if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
                io->async = false;
+        if (io->async && is_sync_kiocb(iocb))
+                io->done = &wait;
        if (rw == WRITE)
                ret = __fuse_direct_write(io, iter, &pos);
        else
@@ -2864,11 +2874,12 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
                if (!is_sync_kiocb(iocb))
                        return -EIOCBQUEUED;
-                ret = wait_on_sync_kiocb(iocb);
+                wait_for_completion(&wait);
-        } else {
+                ret = fuse_get_res_by_io(io);
-                kfree(io);
        }
+        kfree(io);
        if (rw == WRITE) {
                if (ret > 0)
                        fuse_write_update_size(inode, pos);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1cdfb07c1376..7354dc142a50 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -263,6 +263,7 @@ struct fuse_io_priv {
        int err;
        struct kiocb *iocb;
        struct file *file;
+        struct completion *done;
 };
 /**
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7b3143064af1..1be3b061c05c 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -110,11 +110,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
        error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
        if (error)
                goto out;
+        set_cached_acl(inode, type, acl);
-        if (acl)
-                set_cached_acl(inode, type, acl);
-        else
-                forget_cached_acl(inode, type);
 out:
        kfree(data);
        return error;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4ad4f94edebe..a6e6990aea39 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,7 +20,7 @@
 #include <linux/swap.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/backing-dev.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include <trace/events/writeback.h>
 #include "gfs2.h"
@@ -671,12 +671,12 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        if (alloc_required) {
                struct gfs2_alloc_parms ap = { .aflags = 0, };
-                error = gfs2_quota_lock_check(ip);
+                requested = data_blocks + ind_blocks;
+                ap.target = requested;
+                error = gfs2_quota_lock_check(ip, &ap);
                if (error)
                        goto out_unlock;
-                requested = data_blocks + ind_blocks;
-                ap.target = requested;
                error = gfs2_inplace_reserve(ip, &ap);
                if (error)
                        goto out_qunlock;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index f0b945ab853e..61296ecbd0e2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1224,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size)
        if (gfs2_is_stuffed(ip) &&
            (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
-                error = gfs2_quota_lock_check(ip);
+                error = gfs2_quota_lock_check(ip, &ap);
                if (error)
                        return error;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 3e32bb8e2d7e..8ec43ab5babf 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,7 +25,6 @@
 #include <asm/uaccess.h>
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
-#include <linux/aio.h>
 #include <linux/delay.h>
 #include "gfs2.h"
@@ -429,11 +428,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (ret)
                goto out_unlock;
-        ret = gfs2_quota_lock_check(ip);
-        if (ret)
-                goto out_unlock;
        gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
        ap.target = data_blocks + ind_blocks;
+        ret = gfs2_quota_lock_check(ip, &ap);
+        if (ret)
+                goto out_unlock;
        ret = gfs2_inplace_reserve(ip, &ap);
        if (ret)
                goto out_quota_unlock;
@@ -765,22 +764,30 @@ out:
        brelse(dibh);
        return error;
 }
+/**
-static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+ * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of
-                            unsigned int *data_blocks, unsigned int *ind_blocks)
+ *                     blocks, determine how many bytes can be written.
+ * @ip:          The inode in question.
+ * @len:         Max cap of bytes. What we return in *len must be <= this.
+ * @data_blocks: Compute and return the number of data blocks needed
+ * @ind_blocks:  Compute and return the number of indirect blocks needed
+ * @max_blocks:  The total blocks available to work with.
+ *
+ * Returns: void, but @len, @data_blocks and @ind_blocks are filled in.
+ */
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len,
+                            unsigned int *data_blocks, unsigned int *ind_blocks,
+                            unsigned int max_blocks)
 {
+        loff_t max = *len;
        const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        unsigned int max_blocks = ip->i_rgd->rd_free_clone;
        unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
        for (tmp = max_data; tmp > sdp->sd_diptrs;) {
                tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
                max_data -= tmp;
        }
-        /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
-           so it might end up with fewer data blocks */
-        if (max_data <= *data_blocks)
-                return;
        *data_blocks = max_data;
        *ind_blocks = max_blocks - max_data;
        *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
@@ -797,7 +804,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_alloc_parms ap = { .aflags = 0, };
        unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
-        loff_t bytes, max_bytes;
+        loff_t bytes, max_bytes, max_blks = UINT_MAX;
        int error;
        const loff_t pos = offset;
        const loff_t count = len;
@@ -819,6 +826,9 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
        gfs2_size_hint(file, offset, len);
+        gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks);
+        ap.min_target = data_blocks + ind_blocks;
        while (len > 0) {
                if (len < bytes)
                        bytes = len;
@@ -827,27 +837,41 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
                        offset += bytes;
                        continue;
                }
-                error = gfs2_quota_lock_check(ip);
+                /* We need to determine how many bytes we can actually
+                 * fallocate without exceeding quota or going over the
+                 * end of the fs. We start off optimistically by assuming
+                 * we can write max_bytes */
+                max_bytes = (len > max_chunk_size) ? max_chunk_size : len;
+                /* Since max_bytes is most likely a theoretical max, we
+                 * calculate a more realistic 'bytes' to serve as a good
+                 * starting point for the number of bytes we may be able
+                 * to write */
+                gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+                ap.target = data_blocks + ind_blocks;
+                error = gfs2_quota_lock_check(ip, &ap);
                if (error)
                        return error;
-retry:
+                /* ap.allowed tells us how many blocks quota will allow
-                gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+                 * us to write. Check if this reduces max_blks */
+                if (ap.allowed && ap.allowed < max_blks)
+                        max_blks = ap.allowed;
-                ap.target = data_blocks + ind_blocks;
                error = gfs2_inplace_reserve(ip, &ap);
-                if (error) {
+                if (error)
-                        if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
-                                bytes >>= 1;
-                                bytes &= bsize_mask;
-                                if (bytes == 0)
-                                        bytes = sdp->sd_sb.sb_bsize;
-                                goto retry;
-                        }
                        goto out_qunlock;
-                }
-                max_bytes = bytes;
+                /* check if the selected rgrp limits our max_blks further */
-                calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
+                if (ap.allowed && ap.allowed < max_blks)
-                                &max_bytes, &data_blocks, &ind_blocks);
+                        max_blks = ap.allowed;
+                /* Almost done. Calculate bytes that can be written using
+                 * max_blks. We also recompute max_bytes, data_blocks and
+                 * ind_blocks */
+                calc_max_reserv(ip, &max_bytes, &data_blocks,
+                                &ind_blocks, max_blks);
                rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
                          RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks);
@@ -931,6 +955,22 @@ out_uninit:
        return ret;
 }
+static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
+                                      struct file *out, loff_t *ppos,
+                                      size_t len, unsigned int flags)
+{
+        int error;
+        struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
+        error = gfs2_rs_alloc(ip);
+        if (error)
+                return (ssize_t)error;
+        gfs2_size_hint(out, *ppos, len);
+        return iter_file_splice_write(pipe, out, ppos, len, flags);
+}
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
 /**
@@ -1077,7 +1117,7 @@ const struct file_operations gfs2_file_fops = {
        .lock           = gfs2_lock,
        .flock          = gfs2_flock,
        .splice_read    = generic_file_splice_read,
-        .splice_write   = iter_file_splice_write,
+        .splice_write   = gfs2_file_splice_write,
        .setlease       = simple_nosetlease,
        .fallocate      = gfs2_fallocate,
 };
@@ -1107,7 +1147,7 @@ const struct file_operations gfs2_file_fops_nolock = {
        .release        = gfs2_release,
        .fsync          = gfs2_fsync,
        .splice_read    = generic_file_splice_read,
-        .splice_write   = iter_file_splice_write,
+        .splice_write   = gfs2_file_splice_write,
        .setlease       = generic_setlease,
        .fallocate      = gfs2_fallocate,
 };
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f42dffba056a..0fa8062f85a7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2047,34 +2047,41 @@ static const struct file_operations gfs2_sbstats_fops = {
 int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
 {
-        sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
+        struct dentry *dent;
-        if (!sdp->debugfs_dir)
-                return -ENOMEM;
+        dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
-        sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
+        if (IS_ERR_OR_NULL(dent))
-                                                         S_IFREG | S_IRUGO,
+                goto fail;
-                                                         sdp->debugfs_dir, sdp,
+        sdp->debugfs_dir = dent;
-                                                         &gfs2_glocks_fops);
-        if (!sdp->debugfs_dentry_glocks)
+        dent = debugfs_create_file("glocks",
+                                   S_IFREG | S_IRUGO,
+                                   sdp->debugfs_dir, sdp,
+                                   &gfs2_glocks_fops);
+        if (IS_ERR_OR_NULL(dent))
                goto fail;
+        sdp->debugfs_dentry_glocks = dent;
-        sdp->debugfs_dentry_glstats = debugfs_create_file("glstats",
+        dent = debugfs_create_file("glstats",
-                                                        S_IFREG | S_IRUGO,
+                                   S_IFREG | S_IRUGO,
-                                                        sdp->debugfs_dir, sdp,
+                                   sdp->debugfs_dir, sdp,
-                                                        &gfs2_glstats_fops);
+                                   &gfs2_glstats_fops);
-        if (!sdp->debugfs_dentry_glstats)
+        if (IS_ERR_OR_NULL(dent))
                goto fail;
+        sdp->debugfs_dentry_glstats = dent;
-        sdp->debugfs_dentry_sbstats = debugfs_create_file("sbstats",
+        dent = debugfs_create_file("sbstats",
-                                                        S_IFREG | S_IRUGO,
+                                   S_IFREG | S_IRUGO,
-                                                        sdp->debugfs_dir, sdp,
+                                   sdp->debugfs_dir, sdp,
-                                                        &gfs2_sbstats_fops);
+                                   &gfs2_sbstats_fops);
-        if (!sdp->debugfs_dentry_sbstats)
+        if (IS_ERR_OR_NULL(dent))
                goto fail;
+        sdp->debugfs_dentry_sbstats = dent;
        return 0;
 fail:
        gfs2_delete_debugfs_file(sdp);
-        return -ENOMEM;
+        return dent ? PTR_ERR(dent) : -ENOMEM;
 }
 void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
@@ -2100,6 +2107,8 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
 int gfs2_register_debugfs(void)
 {
        gfs2_root = debugfs_create_dir("gfs2", NULL);
+        if (IS_ERR(gfs2_root))
+                return PTR_ERR(gfs2_root);
        return gfs2_root ? 0 : -ENOMEM;
 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 7a2dbbc0d634..58b75abf6ab2 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -301,8 +301,10 @@ struct gfs2_blkreserv {
 * to the allocation code.
 */
 struct gfs2_alloc_parms {
-        u32 target;
+        u64 target;
+        u32 min_target;
        u32 aflags;
+        u64 allowed;
 };
 enum {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 73c72253faac..08bc84d7e768 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -382,7 +382,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
        struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
        int error;
-        error = gfs2_quota_lock_check(ip);
+        error = gfs2_quota_lock_check(ip, &ap);
        if (error)
                goto out;
@@ -525,7 +525,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
        int error;
        if (da->nr_blocks) {
-                error = gfs2_quota_lock_check(dip);
+                error = gfs2_quota_lock_check(dip, &ap);
                if (error)
                        goto fail_quota_locks;
@@ -953,7 +953,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
        if (da.nr_blocks) {
                struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
-                error = gfs2_quota_lock_check(dip);
+                error = gfs2_quota_lock_check(dip, &ap);
                if (error)
                        goto out_gunlock;
@@ -1470,7 +1470,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
        if (da.nr_blocks) {
                struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
-                error = gfs2_quota_lock_check(ndip);
+                error = gfs2_quota_lock_check(ndip, &ap);
                if (error)
                        goto out_gunlock;
@@ -1669,6 +1669,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
        kuid_t ouid, nuid;
        kgid_t ogid, ngid;
        int error;
+        struct gfs2_alloc_parms ap;
        ouid = inode->i_uid;
        ogid = inode->i_gid;
@@ -1696,9 +1697,11 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
        if (error)
                goto out;
+        ap.target = gfs2_get_inode_blocks(&ip->i_inode);
        if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
            !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
-                error = gfs2_quota_check(ip, nuid, ngid);
+                error = gfs2_quota_check(ip, nuid, ngid, &ap);
                if (error)
                        goto out_gunlock_q;
        }
@@ -1713,9 +1716,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
        if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
            !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
-                u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
+                gfs2_quota_change(ip, -ap.target, ouid, ogid);
-                gfs2_quota_change(ip, -blocks, ouid, ogid);
+                gfs2_quota_change(ip, ap.target, nuid, ngid);
-                gfs2_quota_change(ip, blocks, nuid, ngid);
        }
 out_end_trans:
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3aa17d4d1cfc..5c27e48aa76f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -923,6 +923,9 @@ restart:
        if (error)
                return error;
+        if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
+                force_refresh = FORCE;
        qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
        if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
@@ -974,11 +977,8 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
             sizeof(struct gfs2_quota_data *), sort_qd, NULL);
        for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
-                int force = NO_FORCE;
                qd = ip->i_res->rs_qa_qd[x];
-                if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
+                error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]);
-                        force = FORCE;
-                error = do_glock(qd, force, &ip->i_res->rs_qa_qd_ghs[x]);
                if (error)
                        break;
        }
@@ -1094,14 +1094,33 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
        return 0;
 }
-int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
+/**
+ * gfs2_quota_check - check if allocating new blocks will exceed quota
+ * @ip:  The inode for which this check is being performed
+ * @uid: The uid to check against
+ * @gid: The gid to check against
+ * @ap:  The allocation parameters. ap->target contains the requested
+ *       blocks. ap->min_target, if set, contains the minimum blks
+ *       requested.
+ *
+ * Returns: 0 on success.
+ *                  min_req = ap->min_target ? ap->min_target : ap->target;
+ *                  quota must allow at least min_req blks for success and
+ *                  ap->allowed is set to the number of blocks allowed
+ *
+ *          -EDQUOT otherwise, quota violation. ap->allowed is set to number
+ *                  of blocks available.
+ */
+int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
+                     struct gfs2_alloc_parms *ap)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_quota_data *qd;
-        s64 value;
+        s64 value, warn, limit;
        unsigned int x;
        int error = 0;
+        ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
        if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
                return 0;
@@ -1115,30 +1134,37 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
                      qid_eq(qd->qd_id, make_kqid_gid(gid))))
                        continue;
+                warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn);
+                limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit);
                value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
                spin_lock(&qd_lock);
                value += qd->qd_change;
                spin_unlock(&qd_lock);
-                if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
+                if (limit > 0 && (limit - value) < ap->allowed)
-                        print_message(qd, "exceeded");
+                        ap->allowed = limit - value;
-                        quota_send_warning(qd->qd_id,
+                /* If we can't meet the target */
-                                           sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
+                if (limit && limit < (value + (s64)ap->target)) {
+                        /* If no min_target specified or we don't meet
-                        error = -EDQUOT;
+                         * min_target, return -EDQUOT */
-                        break;
+                        if (!ap->min_target || ap->min_target > ap->allowed) {
-                } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
+                                print_message(qd, "exceeded");
-                           (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value &&
+                                quota_send_warning(qd->qd_id,
+                                                   sdp->sd_vfs->s_dev,
+                                                   QUOTA_NL_BHARDWARN);
+                                error = -EDQUOT;
+                                break;
+                        }
+                } else if (warn && warn < value &&
                           time_after_eq(jiffies, qd->qd_last_warn +
-                                         gfs2_tune_get(sdp,
+                                         gfs2_tune_get(sdp, gt_quota_warn_period)
-                                                gt_quota_warn_period) * HZ)) {
+                                         * HZ)) {
                        quota_send_warning(qd->qd_id,
                                           sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
                        error = print_message(qd, "warning");
                        qd->qd_last_warn = jiffies;
                }
        }
        return error;
 }
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 55d506eb3c4a..ad04b3acae2b 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -24,7 +24,8 @@ extern void gfs2_quota_unhold(struct gfs2_inode *ip);
 extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
 extern void gfs2_quota_unlock(struct gfs2_inode *ip);
-extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
+                            struct gfs2_alloc_parms *ap);
 extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
                              kuid_t uid, kgid_t gid);
@@ -37,7 +38,8 @@ extern int gfs2_quotad(void *data);
 extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
-static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
+static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
+                                        struct gfs2_alloc_parms *ap)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        int ret;
@@ -48,7 +50,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
                return ret;
        if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
                return 0;
-        ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
+        ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
        if (ret)
                gfs2_quota_unlock(ip);
        return ret;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9150207f365c..6af2396a317c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1946,10 +1946,18 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
 * @ip: the inode to reserve space for
 * @ap: the allocation parameters
 *
- * Returns: errno
+ * We try our best to find an rgrp that has at least ap->target blocks
+ * available. After a couple of passes (loops == 2), the prospects of finding
+ * such an rgrp diminish. At this stage, we return the first rgrp that has
+ * at least ap->min_target blocks available. Either way, we set ap->allowed to
+ * the number of blocks available in the chosen rgrp.
+ *
+ * Returns: 0 on success,
+ *          -ENOSPC if a suitable rgrp can't be found
+ *          errno otherwise
 */
-int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap)
+int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrpd *begin = NULL;
@@ -2012,7 +2020,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
                /* Skip unuseable resource groups */
                if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
                                                 GFS2_RDF_ERROR)) ||
-                    (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
+                    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
                        goto skip_rgrp;
                if (sdp->sd_args.ar_rgrplvb)
@@ -2027,11 +2035,13 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
                        goto check_rgrp;
                /* If rgrp has enough free space, use it */
-                if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) {
+                if (rs->rs_rbm.rgd->rd_free_clone >= ap->target ||
+                    (loops == 2 && ap->min_target &&
+                     rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) {
                        ip->i_rgd = rs->rs_rbm.rgd;
+                        ap->allowed = ip->i_rgd->rd_free_clone;
                        return 0;
                }
 check_rgrp:
                /* Check for unlinked inodes which can be reclaimed */
                if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b104f4af3afd..68972ecfbb01 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -41,7 +41,8 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
 extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
 #define GFS2_AF_ORLOV 1
-extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap);
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip,
+                                struct gfs2_alloc_parms *ap);
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 0b81f783f787..fd260ce8869a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -732,7 +732,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
        if (error)
                return error;
-        error = gfs2_quota_lock_check(ip);
+        error = gfs2_quota_lock_check(ip, &ap);
        if (error)
                return error;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index d0929bc81782..98d4ea45bb70 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,7 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/sched.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include "hfs_fs.h"
 #include "btree.h"
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 0cf786f2d046..f541196d4ee9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,7 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/sched.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index d72817ac51f6..762c7a3cf43d 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -195,7 +195,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
        /* unchecked xdatum is chained with c->xattr_unchecked */
        list_del_init(&xd->xindex);
-        dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n",
+        dbg_xattr("success on verifying xdatum (xid=%u, version=%u)\n",
                  xd->xid, xd->version);
        return 0;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index bd3df1ca3c9b..3197aed10614 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,8 +22,8 @@
 #include <linux/buffer_head.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
+#include <linux/uio.h>
 #include <linux/writeback.h>
-#include <linux/aio.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
 #include "jfs_filsys.h"
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 5d30c56ae075..4cd9798f4948 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -102,7 +102,7 @@ void jfs_error(struct super_block *sb, const char *fmt, ...)
        vaf.fmt = fmt;
        vaf.va = &args;
-        pr_err("ERROR: (device %s): %pf: %pV\n",
+        pr_err("ERROR: (device %s): %ps: %pV\n",
               sb->s_id, __builtin_return_address(0), &vaf);
        va_end(args);
diff --git a/fs/namei.c b/fs/namei.c
index c83145af4bfc..76fb76a0818b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -119,15 +119,14 @@
 * PATH_MAX includes the nul terminator --RR.
 */
-#define EMBEDDED_NAME_MAX       (PATH_MAX - sizeof(struct filename))
+#define EMBEDDED_NAME_MAX       (PATH_MAX - offsetof(struct filename, iname))
 struct filename *
 getname_flags(const char __user *filename, int flags, int *empty)
 {
-        struct filename *result, *err;
+        struct filename *result;
-        int len;
-        long max;
        char *kname;
+        int len;
        result = audit_reusename(filename);
        if (result)
@@ -136,22 +135,18 @@ getname_flags(const char __user *filename, int flags, int *empty)
        result = __getname();
        if (unlikely(!result))
                return ERR_PTR(-ENOMEM);
-        result->refcnt = 1;
        /*
         * First, try to embed the struct filename inside the names_cache
         * allocation
         */
-        kname = (char *)result + sizeof(*result);
+        kname = (char *)result->iname;
        result->name = kname;
-        result->separate = false;
-        max = EMBEDDED_NAME_MAX;
-recopy:
+        len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
-        len = strncpy_from_user(kname, filename, max);
        if (unlikely(len < 0)) {
-                err = ERR_PTR(len);
+                __putname(result);
-                goto error;
+                return ERR_PTR(len);
        }
        /*
@@ -160,43 +155,49 @@ recopy:
         * names_cache allocation for the pathname, and re-do the copy from
         * userland.
         */
-        if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
+        if (unlikely(len == EMBEDDED_NAME_MAX)) {
+                const size_t size = offsetof(struct filename, iname[1]);
                kname = (char *)result;
-                result = kzalloc(sizeof(*result), GFP_KERNEL);
+                /*
-                if (!result) {
+                 * size is chosen this way to guarantee that
-                        err = ERR_PTR(-ENOMEM);
+                 * result->iname[0] is within the same object and that
-                        result = (struct filename *)kname;
+                 * kname can't be equal to result->iname, no matter what.
-                        goto error;
+                 */
+                result = kzalloc(size, GFP_KERNEL);
+                if (unlikely(!result)) {
+                        __putname(kname);
+                        return ERR_PTR(-ENOMEM);
                }
                result->name = kname;
-                result->separate = true;
+                len = strncpy_from_user(kname, filename, PATH_MAX);
-                result->refcnt = 1;
+                if (unlikely(len < 0)) {
-                max = PATH_MAX;
+                        __putname(kname);
-                goto recopy;
+                        kfree(result);
+                        return ERR_PTR(len);
+                }
+                if (unlikely(len == PATH_MAX)) {
+                        __putname(kname);
+                        kfree(result);
+                        return ERR_PTR(-ENAMETOOLONG);
+                }
        }
+        result->refcnt = 1;
        /* The empty path is special. */
        if (unlikely(!len)) {
                if (empty)
                        *empty = 1;
-                err = ERR_PTR(-ENOENT);
+                if (!(flags & LOOKUP_EMPTY)) {
-                if (!(flags & LOOKUP_EMPTY))
+                        putname(result);
-                        goto error;
+                        return ERR_PTR(-ENOENT);
+                }
        }
-        err = ERR_PTR(-ENAMETOOLONG);
-        if (unlikely(len >= PATH_MAX))
-                goto error;
        result->uptr = filename;
        result->aname = NULL;
        audit_getname(result);
        return result;
-error:
-        putname(result);
-        return err;
 }
 struct filename *
@@ -216,8 +217,7 @@ getname_kernel(const char * filename)
                return ERR_PTR(-ENOMEM);
        if (len <= EMBEDDED_NAME_MAX) {
-                result->name = (char *)(result) + sizeof(*result);
+                result->name = (char *)result->iname;
-                result->separate = false;
        } else if (len <= PATH_MAX) {
                struct filename *tmp;
@@ -227,7 +227,6 @@ getname_kernel(const char * filename)
                        return ERR_PTR(-ENOMEM);
                }
                tmp->name = (char *)result;
-                tmp->separate = true;
                result = tmp;
        } else {
                __putname(result);
@@ -249,7 +248,7 @@ void putname(struct filename *name)
        if (--name->refcnt > 0)
                return;
-        if (name->separate) {
+        if (name->name != name->iname) {
                __putname(name->name);
                kfree(name);
        } else
@@ -1851,10 +1850,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
        return err;
 }
-static int path_init(int dfd, const char *name, unsigned int flags,
+static int path_init(int dfd, const struct filename *name, unsigned int flags,
                     struct nameidata *nd)
 {
        int retval = 0;
+        const char *s = name->name;
        nd->last_type = LAST_ROOT; /* if there are only slashes... */
        nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
@@ -1863,7 +1863,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
        if (flags & LOOKUP_ROOT) {
                struct dentry *root = nd->root.dentry;
                struct inode *inode = root->d_inode;
-                if (*name) {
+                if (*s) {
                        if (!d_can_lookup(root))
                                return -ENOTDIR;
                        retval = inode_permission(inode, MAY_EXEC);
@@ -1885,7 +1885,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
        nd->root.mnt = NULL;
        nd->m_seq = read_seqbegin(&mount_lock);
-        if (*name=='/') {
+        if (*s == '/') {
                if (flags & LOOKUP_RCU) {
                        rcu_read_lock();
                        nd->seq = set_root_rcu(nd);
@@ -1919,7 +1919,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                dentry = f.file->f_path.dentry;
-                if (*name) {
+                if (*s) {
                        if (!d_can_lookup(dentry)) {
                                fdput(f);
                                return -ENOTDIR;
@@ -1949,7 +1949,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
        return -ECHILD;
 done:
        current->total_link_count = 0;
-        return link_path_walk(name, nd);
+        return link_path_walk(s, nd);
 }
 static void path_cleanup(struct nameidata *nd)
@@ -1972,7 +1972,7 @@ static inline int lookup_last(struct nameidata *nd, struct path *path)
 }
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int path_lookupat(int dfd, const char *name,
+static int path_lookupat(int dfd, const struct filename *name,
                                unsigned int flags, struct nameidata *nd)
 {
        struct path path;
@@ -2027,31 +2027,17 @@ static int path_lookupat(int dfd, const char *name,
 static int filename_lookup(int dfd, struct filename *name,
                                unsigned int flags, struct nameidata *nd)
 {
-        int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
+        int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
        if (unlikely(retval == -ECHILD))
-                retval = path_lookupat(dfd, name->name, flags, nd);
+                retval = path_lookupat(dfd, name, flags, nd);
        if (unlikely(retval == -ESTALE))
-                retval = path_lookupat(dfd, name->name,
+                retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
-                                                flags | LOOKUP_REVAL, nd);
        if (likely(!retval))
                audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
        return retval;
 }
-static int do_path_lookup(int dfd, const char *name,
-                                unsigned int flags, struct nameidata *nd)
-{
-        struct filename *filename = getname_kernel(name);
-        int retval = PTR_ERR(filename);
-        if (!IS_ERR(filename)) {
-                retval = filename_lookup(dfd, filename, flags, nd);
-                putname(filename);
-        }
-        return retval;
-}
 /* does lookup, returns the object with parent locked */
 struct dentry *kern_path_locked(const char *name, struct path *path)
 {
@@ -2089,9 +2075,15 @@ out:
 int kern_path(const char *name, unsigned int flags, struct path *path)
 {
        struct nameidata nd;
-        int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
+        struct filename *filename = getname_kernel(name);
-        if (!res)
+        int res = PTR_ERR(filename);
-                *path = nd.path;
+        if (!IS_ERR(filename)) {
+                res = filename_lookup(AT_FDCWD, filename, flags, &nd);
+                putname(filename);
+                if (!res)
+                        *path = nd.path;
+        }
        return res;
 }
 EXPORT_SYMBOL(kern_path);
@@ -2108,15 +2100,22 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct path *path)
 {
-        struct nameidata nd;
+        struct filename *filename = getname_kernel(name);
-        int err;
+        int err = PTR_ERR(filename);
-        nd.root.dentry = dentry;
-        nd.root.mnt = mnt;
        BUG_ON(flags & LOOKUP_PARENT);
-        /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
-        err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
+        /* the first argument of filename_lookup() is ignored with LOOKUP_ROOT */
-        if (!err)
+        if (!IS_ERR(filename)) {
-                *path = nd.path;
+                struct nameidata nd;
+                nd.root.dentry = dentry;
+                nd.root.mnt = mnt;
+                err = filename_lookup(AT_FDCWD, filename,
+                                      flags | LOOKUP_ROOT, &nd);
+                if (!err)
+                        *path = nd.path;
+                putname(filename);
+        }
        return err;
 }
 EXPORT_SYMBOL(vfs_path_lookup);
@@ -2138,9 +2137,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.  Also note that by using this function the
+ * not be called by generic code.
- * nameidata argument is passed to the filesystem methods and a filesystem
- * using this helper needs to be prepared for that.
 */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2341,7 +2338,8 @@ out:
 * Returns 0 and "path" will be valid on success; Returns error otherwise.
 */
 static int
-path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
+path_mountpoint(int dfd, const struct filename *name, struct path *path,
+                unsigned int flags)
 {
        struct nameidata nd;
        int err;
@@ -2370,20 +2368,20 @@ out:
 }
 static int
-filename_mountpoint(int dfd, struct filename *s, struct path *path,
+filename_mountpoint(int dfd, struct filename *name, struct path *path,
                        unsigned int flags)
 {
        int error;
-        if (IS_ERR(s))
+        if (IS_ERR(name))
-                return PTR_ERR(s);
+                return PTR_ERR(name);
-        error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
+        error = path_mountpoint(dfd, name, path, flags | LOOKUP_RCU);
        if (unlikely(error == -ECHILD))
-                error = path_mountpoint(dfd, s->name, path, flags);
+                error = path_mountpoint(dfd, name, path, flags);
        if (unlikely(error == -ESTALE))
-                error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
+                error = path_mountpoint(dfd, name, path, flags | LOOKUP_REVAL);
        if (likely(!error))
-                audit_inode(s, path->dentry, 0);
+                audit_inode(name, path->dentry, 0);
-        putname(s);
+        putname(name);
        return error;
 }
@@ -3156,7 +3154,7 @@ static int do_tmpfile(int dfd, struct filename *pathname,
        static const struct qstr name = QSTR_INIT("/", 1);
        struct dentry *dentry, *child;
        struct inode *dir;
-        int error = path_lookupat(dfd, pathname->name,
+        int error = path_lookupat(dfd, pathname,
                                  flags | LOOKUP_DIRECTORY, nd);
        if (unlikely(error))
                return error;
@@ -3229,7 +3227,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
                goto out;
        }
-        error = path_init(dfd, pathname->name, flags, nd);
+        error = path_init(dfd, pathname, flags, nd);
        if (unlikely(error))
                goto out;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e907c8cf732e..c3929fb2ab26 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -265,7 +265,7 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t
        return -EINVAL;
 #else
-        VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
+        VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
        if (rw == READ)
                return nfs_file_direct_read(iocb, iter, pos);
@@ -393,7 +393,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
                long res = (long) dreq->error;
                if (!res)
                        res = (long) dreq->count;
-                aio_complete(dreq->iocb, res, 0);
+                dreq->iocb->ki_complete(dreq->iocb, res, 0);
        }
        complete_all(&dreq->completion);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e679d24c39d3..37b15582e0de 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -26,7 +26,6 @@
 #include <linux/nfs_mount.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
-#include <linux/aio.h>
 #include <linux/gfp.h>
 #include <linux/swap.h>
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8b5969538f39..ab4987bc637f 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -26,7 +26,7 @@
 #include <linux/mpage.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include "nilfs.h"
 #include "btnode.h"
 #include "segment.h"
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 36ae529511c4..2ff263e6d363 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -8,7 +8,7 @@ ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
 ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-ccflags-y := -DNTFS_VERSION=\"2.1.31\"
+ccflags-y := -DNTFS_VERSION=\"2.1.32\"
 ccflags-$(CONFIG_NTFS_DEBUG)    += -DDEBUG
 ccflags-$(CONFIG_NTFS_RW)       += -DNTFS_RW
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1da9b2d184dc..c1da78dad1af 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
 * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
 *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
@@ -28,7 +28,6 @@
 #include <linux/swap.h>
 #include <linux/uio.h>
 #include <linux/writeback.h>
-#include <linux/aio.h>
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -329,62 +328,168 @@ err_out:
        return err;
 }
-/**
+static ssize_t ntfs_prepare_file_for_write(struct file *file, loff_t *ppos,
- * ntfs_fault_in_pages_readable -
+                size_t *count)
- *
- * Fault a number of userspace pages into pagetables.
- *
- * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
- * with more than two userspace pages as well as handling the single page case
- * elegantly.
- *
- * If you find this difficult to understand, then think of the while loop being
- * the following code, except that we do without the integer variable ret:
- *
- *      do {
- *              ret = __get_user(c, uaddr);
- *              uaddr += PAGE_SIZE;
- *      } while (!ret && uaddr < end);
- *
- * Note, the final __get_user() may well run out-of-bounds of the user buffer,
- * but _not_ out-of-bounds of the page the user buffer belongs to, and since
- * this is only a read and not a write, and since it is still in the same page,
- * it should not matter and this makes the code much simpler.
- */
-static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
-                int bytes)
 {
-        const char __user *end;
+        loff_t pos;
-        volatile char c;
+        s64 end, ll;
+        ssize_t err;
-        /* Set @end to the first byte outside the last page we care about. */
+        unsigned long flags;
-        end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
+        struct inode *vi = file_inode(file);
+        ntfs_inode *base_ni, *ni = NTFS_I(vi);
-        while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
+        ntfs_volume *vol = ni->vol;
-                ;
-}
-/**
- * ntfs_fault_in_pages_readable_iovec -
- *
- * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
- */
-static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
-                size_t iov_ofs, int bytes)
-{
-        do {
-                const char __user *buf;
-                unsigned len;
-                buf = iov->iov_base + iov_ofs;
+        ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
-                len = iov->iov_len - iov_ofs;
+                        "0x%llx, count 0x%lx.", vi->i_ino,
-                if (len > bytes)
+                        (unsigned)le32_to_cpu(ni->type),
-                        len = bytes;
+                        (unsigned long long)*ppos, (unsigned long)*count);
-                ntfs_fault_in_pages_readable(buf, len);
+        /* We can write back this queue in page reclaim. */
-                bytes -= len;
+        current->backing_dev_info = inode_to_bdi(vi);
-                iov++;
+        err = generic_write_checks(file, ppos, count, S_ISBLK(vi->i_mode));
-                iov_ofs = 0;
+        if (unlikely(err))
-        } while (bytes);
+                goto out;
+        /*
+         * All checks have passed.  Before we start doing any writing we want
+         * to abort any totally illegal writes.
+         */
+        BUG_ON(NInoMstProtected(ni));
+        BUG_ON(ni->type != AT_DATA);
+        /* If file is encrypted, deny access, just like NT4. */
+        if (NInoEncrypted(ni)) {
+                /* Only $DATA attributes can be encrypted. */
+                /*
+                 * Reminder for later: Encrypted files are _always_
+                 * non-resident so that the content can always be encrypted.
+                 */
+                ntfs_debug("Denying write access to encrypted file.");
+                err = -EACCES;
+                goto out;
+        }
+        if (NInoCompressed(ni)) {
+                /* Only unnamed $DATA attribute can be compressed. */
+                BUG_ON(ni->name_len);
+                /*
+                 * Reminder for later: If resident, the data is not actually
+                 * compressed.  Only on the switch to non-resident does
+                 * compression kick in.  This is in contrast to encrypted files
+                 * (see above).
+                 */
+                ntfs_error(vi->i_sb, "Writing to compressed files is not "
+                                "implemented yet.  Sorry.");
+                err = -EOPNOTSUPP;
+                goto out;
+        }
+        if (*count == 0)
+                goto out;
+        base_ni = ni;
+        if (NInoAttr(ni))
+                base_ni = ni->ext.base_ntfs_ino;
+        err = file_remove_suid(file);
+        if (unlikely(err))
+                goto out;
+        /*
+         * Our ->update_time method always succeeds thus file_update_time()
+         * cannot fail either so there is no need to check the return code.
+         */
+        file_update_time(file);
+        pos = *ppos;
+        /* The first byte after the last cluster being written to. */
+        end = (pos + *count + vol->cluster_size_mask) &
+                        ~(u64)vol->cluster_size_mask;
+        /*
+         * If the write goes beyond the allocated size, extend the allocation
+         * to cover the whole of the write, rounded up to the nearest cluster.
+         */
+        read_lock_irqsave(&ni->size_lock, flags);
+        ll = ni->allocated_size;
+        read_unlock_irqrestore(&ni->size_lock, flags);
+        if (end > ll) {
+                /*
+                 * Extend the allocation without changing the data size.
+                 *
+                 * Note we ensure the allocation is big enough to at least
+                 * write some data but we do not require the allocation to be
+                 * complete, i.e. it may be partial.
+                 */
+                ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
+                if (likely(ll >= 0)) {
+                        BUG_ON(pos >= ll);
+                        /* If the extension was partial truncate the write. */
+                        if (end > ll) {
+                                ntfs_debug("Truncating write to inode 0x%lx, "
+                                                "attribute type 0x%x, because "
+                                                "the allocation was only "
+                                                "partially extended.",
+                                                vi->i_ino, (unsigned)
+                                                le32_to_cpu(ni->type));
+                                *count = ll - pos;
+                        }
+                } else {
+                        err = ll;
+                        read_lock_irqsave(&ni->size_lock, flags);
+                        ll = ni->allocated_size;
+                        read_unlock_irqrestore(&ni->size_lock, flags);
+                        /* Perform a partial write if possible or fail. */
+                        if (pos < ll) {
+                                ntfs_debug("Truncating write to inode 0x%lx "
+                                                "attribute type 0x%x, because "
+                                                "extending the allocation "
+                                                "failed (error %d).",
+                                                vi->i_ino, (unsigned)
+                                                le32_to_cpu(ni->type),
+                                                (int)-err);
+                                *count = ll - pos;
+                        } else {
+                                if (err != -ENOSPC)
+                                        ntfs_error(vi->i_sb, "Cannot perform "
+                                                        "write to inode "
+                                                        "0x%lx, attribute "
+                                                        "type 0x%x, because "
+                                                        "extending the "
+                                                        "allocation failed "
+                                                        "(error %ld).",
+                                                        vi->i_ino, (unsigned)
+                                                        le32_to_cpu(ni->type),
+                                                        (long)-err);
+                                else
+                                        ntfs_debug("Cannot perform write to "
+                                                        "inode 0x%lx, "
+                                                        "attribute type 0x%x, "
+                                                        "because there is not "
+                                                        "space left.",
+                                                        vi->i_ino, (unsigned)
+                                                        le32_to_cpu(ni->type));
+                                goto out;
+                        }
+                }
+        }
+        /*
+         * If the write starts beyond the initialized size, extend it up to the
+         * beginning of the write and initialize all non-sparse space between
+         * the old initialized size and the new one.  This automatically also
+         * increments the vfs inode->i_size to keep it above or equal to the
+         * initialized_size.
+         */
+        read_lock_irqsave(&ni->size_lock, flags);
+        ll = ni->initialized_size;
+        read_unlock_irqrestore(&ni->size_lock, flags);
+        if (pos > ll) {
+                /*
+                 * Wait for ongoing direct i/o to complete before proceeding.
+                 * New direct i/o cannot start as we hold i_mutex.
+                 */
+                inode_dio_wait(vi);
+                err = ntfs_attr_extend_initialized(ni, pos);
+                if (unlikely(err < 0))
+                        ntfs_error(vi->i_sb, "Cannot perform write to inode "
+                                        "0x%lx, attribute type 0x%x, because "
+                                        "extending the initialized size "
+                                        "failed (error %d).", vi->i_ino,
+                                        (unsigned)le32_to_cpu(ni->type),
+                                        (int)-err);
+        }
+out:
+        return err;
 }
 /**
@@ -421,8 +526,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
                                        goto err_out;
                                }
                        }
-                        err = add_to_page_cache_lru(*cached_page, mapping, index,
+                        err = add_to_page_cache_lru(*cached_page, mapping,
-                                        GFP_KERNEL);
+                                        index, GFP_KERNEL);
                        if (unlikely(err)) {
                                if (err == -EEXIST)
                                        continue;
@@ -1268,180 +1373,6 @@ rl_not_mapped_enoent:
        return err;
 }
-/*
- * Copy as much as we can into the pages and return the number of bytes which
- * were successfully copied.  If a fault is encountered then clear the pages
- * out to (ofs + bytes) and return the number of bytes which were copied.
- */
-static inline size_t ntfs_copy_from_user(struct page **pages,
-                unsigned nr_pages, unsigned ofs, const char __user *buf,
-                size_t bytes)
-{
-        struct page **last_page = pages + nr_pages;
-        char *addr;
-        size_t total = 0;
-        unsigned len;
-        int left;
-        do {
-                len = PAGE_CACHE_SIZE - ofs;
-                if (len > bytes)
-                        len = bytes;
-                addr = kmap_atomic(*pages);
-                left = __copy_from_user_inatomic(addr + ofs, buf, len);
-                kunmap_atomic(addr);
-                if (unlikely(left)) {
-                        /* Do it the slow way. */
-                        addr = kmap(*pages);
-                        left = __copy_from_user(addr + ofs, buf, len);
-                        kunmap(*pages);
-                        if (unlikely(left))
-                                goto err_out;
-                }
-                total += len;
-                bytes -= len;
-                if (!bytes)
-                        break;
-                buf += len;
-                ofs = 0;
-        } while (++pages < last_page);
-out:
-        return total;
-err_out:
-        total += len - left;
-        /* Zero the rest of the target like __copy_from_user(). */
-        while (++pages < last_page) {
-                bytes -= len;
-                if (!bytes)
-                        break;
-                len = PAGE_CACHE_SIZE;
-                if (len > bytes)
-                        len = bytes;
-                zero_user(*pages, 0, len);
-        }
-        goto out;
-}
-static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
-                const struct iovec *iov, size_t iov_ofs, size_t bytes)
-{
-        size_t total = 0;
-        while (1) {
-                const char __user *buf = iov->iov_base + iov_ofs;
-                unsigned len;
-                size_t left;
-                len = iov->iov_len - iov_ofs;
-                if (len > bytes)
-                        len = bytes;
-                left = __copy_from_user_inatomic(vaddr, buf, len);
-                total += len;
-                bytes -= len;
-                vaddr += len;
-                if (unlikely(left)) {
-                        total -= left;
-                        break;
-                }
-                if (!bytes)
-                        break;
-                iov++;
-                iov_ofs = 0;
-        }
-        return total;
-}
-static inline void ntfs_set_next_iovec(const struct iovec **iovp,
-                size_t *iov_ofsp, size_t bytes)
-{
-        const struct iovec *iov = *iovp;
-        size_t iov_ofs = *iov_ofsp;
-        while (bytes) {
-                unsigned len;
-                len = iov->iov_len - iov_ofs;
-                if (len > bytes)
-                        len = bytes;
-                bytes -= len;
-                iov_ofs += len;
-                if (iov->iov_len == iov_ofs) {
-                        iov++;
-                        iov_ofs = 0;
-                }
-        }
-        *iovp = iov;
-        *iov_ofsp = iov_ofs;
-}
-/*
- * This has the same side-effects and return value as ntfs_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
- * single-segment behaviour.
- *
- * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
- * atomic and when not atomic.  This is ok because it calls
- * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
- * fact, the only difference between __copy_from_user_inatomic() and
- * __copy_from_user() is that the latter calls might_sleep() and the former
- * should not zero the tail of the buffer on error.  And on many architectures
- * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
- * makes no difference at all on those architectures.
- */
-static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
-                unsigned nr_pages, unsigned ofs, const struct iovec **iov,
-                size_t *iov_ofs, size_t bytes)
-{
-        struct page **last_page = pages + nr_pages;
-        char *addr;
-        size_t copied, len, total = 0;
-        do {
-                len = PAGE_CACHE_SIZE - ofs;
-                if (len > bytes)
-                        len = bytes;
-                addr = kmap_atomic(*pages);
-                copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
-                                *iov, *iov_ofs, len);
-                kunmap_atomic(addr);
-                if (unlikely(copied != len)) {
-                        /* Do it the slow way. */
-                        addr = kmap(*pages);
-                        copied = __ntfs_copy_from_user_iovec_inatomic(addr +
-                                        ofs, *iov, *iov_ofs, len);
-                        if (unlikely(copied != len))
-                                goto err_out;
-                        kunmap(*pages);
-                }
-                total += len;
-                ntfs_set_next_iovec(iov, iov_ofs, len);
-                bytes -= len;
-                if (!bytes)
-                        break;
-                ofs = 0;
-        } while (++pages < last_page);
-out:
-        return total;
-err_out:
-        BUG_ON(copied > len);
-        /* Zero the rest of the target like __copy_from_user(). */
-        memset(addr + ofs + copied, 0, len - copied);
-        kunmap(*pages);
-        total += copied;
-        ntfs_set_next_iovec(iov, iov_ofs, copied);
-        while (++pages < last_page) {
-                bytes -= len;
-                if (!bytes)
-                        break;
-                len = PAGE_CACHE_SIZE;
-                if (len > bytes)
-                        len = bytes;
-                zero_user(*pages, 0, len);
-        }
-        goto out;
-}
 static inline void ntfs_flush_dcache_pages(struct page **pages,
                unsigned nr_pages)
 {
@@ -1762,86 +1693,83 @@ err_out:
        return err;
 }
-static void ntfs_write_failed(struct address_space *mapping, loff_t to)
+/*
+ * Copy as much as we can into the pages and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then clear the pages
+ * out to (ofs + bytes) and return the number of bytes which were copied.
+ */
+static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
+                unsigned ofs, struct iov_iter *i, size_t bytes)
 {
-        struct inode *inode = mapping->host;
+        struct page **last_page = pages + nr_pages;
+        size_t total = 0;
+        struct iov_iter data = *i;
+        unsigned len, copied;
-        if (to > inode->i_size) {
+        do {
-                truncate_pagecache(inode, inode->i_size);
+                len = PAGE_CACHE_SIZE - ofs;
-                ntfs_truncate_vfs(inode);
+                if (len > bytes)
-        }
+                        len = bytes;
+                copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
+                                len);
+                total += copied;
+                bytes -= copied;
+                if (!bytes)
+                        break;
+                iov_iter_advance(&data, copied);
+                if (copied < len)
+                        goto err;
+                ofs = 0;
+        } while (++pages < last_page);
+out:
+        return total;
+err:
+        /* Zero the rest of the target like __copy_from_user(). */
+        len = PAGE_CACHE_SIZE - copied;
+        do {
+                if (len > bytes)
+                        len = bytes;
+                zero_user(*pages, copied, len);
+                bytes -= len;
+                copied = 0;
+                len = PAGE_CACHE_SIZE;
+        } while (++pages < last_page);
+        goto out;
 }
 /**
- * ntfs_file_buffered_write -
+ * ntfs_perform_write - perform buffered write to a file
- *
+ * @file:       file to write to
- * Locking: The vfs is holding ->i_mutex on the inode.
+ * @i:          iov_iter with data to write
+ * @pos:        byte offset in file at which to begin writing to
 */
-static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
+static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
-                const struct iovec *iov, unsigned long nr_segs,
+                loff_t pos)
-                loff_t pos, loff_t *ppos, size_t count)
 {
-        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *vi = mapping->host;
        ntfs_inode *ni = NTFS_I(vi);
        ntfs_volume *vol = ni->vol;
        struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
        struct page *cached_page = NULL;
-        char __user *buf = NULL;
-        s64 end, ll;
        VCN last_vcn;
        LCN lcn;
-        unsigned long flags;
+        size_t bytes;
-        size_t bytes, iov_ofs = 0;      /* Offset in the current iovec. */
+        ssize_t status, written = 0;
-        ssize_t status, written;
        unsigned nr_pages;
-        int err;
-        ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+        ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
-                        "pos 0x%llx, count 0x%lx.",
+                        "0x%llx, count 0x%lx.", vi->i_ino,
-                        vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+                        (unsigned)le32_to_cpu(ni->type),
-                        (unsigned long long)pos, (unsigned long)count);
+                        (unsigned long long)pos,
-        if (unlikely(!count))
+                        (unsigned long)iov_iter_count(i));
-                return 0;
-        BUG_ON(NInoMstProtected(ni));
-        /*
-         * If the attribute is not an index root and it is encrypted or
-         * compressed, we cannot write to it yet.  Note we need to check for
-         * AT_INDEX_ALLOCATION since this is the type of both directory and
-         * index inodes.
-         */
-        if (ni->type != AT_INDEX_ALLOCATION) {
-                /* If file is encrypted, deny access, just like NT4. */
-                if (NInoEncrypted(ni)) {
-                        /*
-                         * Reminder for later: Encrypted files are _always_
-                         * non-resident so that the content can always be
-                         * encrypted.
-                         */
-                        ntfs_debug("Denying write access to encrypted file.");
-                        return -EACCES;
-                }
-                if (NInoCompressed(ni)) {
-                        /* Only unnamed $DATA attribute can be compressed. */
-                        BUG_ON(ni->type != AT_DATA);
-                        BUG_ON(ni->name_len);
-                        /*
-                         * Reminder for later: If resident, the data is not
-                         * actually compressed.  Only on the switch to non-
-                         * resident does compression kick in.  This is in
-                         * contrast to encrypted files (see above).
-                         */
-                        ntfs_error(vi->i_sb, "Writing to compressed files is "
-                                        "not implemented yet.  Sorry.");
-                        return -EOPNOTSUPP;
-                }
-        }
        /*
         * If a previous ntfs_truncate() failed, repeat it and abort if it
         * fails again.
         */
        if (unlikely(NInoTruncateFailed(ni))) {
+                int err;
                inode_dio_wait(vi);
                err = ntfs_truncate(vi);
                if (err || NInoTruncateFailed(ni)) {
@@ -1855,81 +1783,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
                        return err;
                }
        }
-        /* The first byte after the write. */
-        end = pos + count;
-        /*
-         * If the write goes beyond the allocated size, extend the allocation
-         * to cover the whole of the write, rounded up to the nearest cluster.
-         */
-        read_lock_irqsave(&ni->size_lock, flags);
-        ll = ni->allocated_size;
-        read_unlock_irqrestore(&ni->size_lock, flags);
-        if (end > ll) {
-                /* Extend the allocation without changing the data size. */
-                ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
-                if (likely(ll >= 0)) {
-                        BUG_ON(pos >= ll);
-                        /* If the extension was partial truncate the write. */
-                        if (end > ll) {
-                                ntfs_debug("Truncating write to inode 0x%lx, "
-                                                "attribute type 0x%x, because "
-                                                "the allocation was only "
-                                                "partially extended.",
-                                                vi->i_ino, (unsigned)
-                                                le32_to_cpu(ni->type));
-                                end = ll;
-                                count = ll - pos;
-                        }
-                } else {
-                        err = ll;
-                        read_lock_irqsave(&ni->size_lock, flags);
-                        ll = ni->allocated_size;
-                        read_unlock_irqrestore(&ni->size_lock, flags);
-                        /* Perform a partial write if possible or fail. */
-                        if (pos < ll) {
-                                ntfs_debug("Truncating write to inode 0x%lx, "
-                                                "attribute type 0x%x, because "
-                                                "extending the allocation "
-                                                "failed (error code %i).",
-                                                vi->i_ino, (unsigned)
-                                                le32_to_cpu(ni->type), err);
-                                end = ll;
-                                count = ll - pos;
-                        } else {
-                                ntfs_error(vol->sb, "Cannot perform write to "
-                                                "inode 0x%lx, attribute type "
-                                                "0x%x, because extending the "
-                                                "allocation failed (error "
-                                                "code %i).", vi->i_ino,
-                                                (unsigned)
-                                                le32_to_cpu(ni->type), err);
-                                return err;
-                        }
-                }
-        }
-        written = 0;
-        /*
-         * If the write starts beyond the initialized size, extend it up to the
-         * beginning of the write and initialize all non-sparse space between
-         * the old initialized size and the new one.  This automatically also
-         * increments the vfs inode->i_size to keep it above or equal to the
-         * initialized_size.
-         */
-        read_lock_irqsave(&ni->size_lock, flags);
-        ll = ni->initialized_size;
-        read_unlock_irqrestore(&ni->size_lock, flags);
-        if (pos > ll) {
-                err = ntfs_attr_extend_initialized(ni, pos);
-                if (err < 0) {
-                        ntfs_error(vol->sb, "Cannot perform write to inode "
-                                        "0x%lx, attribute type 0x%x, because "
-                                        "extending the initialized size "
-                                        "failed (error code %i).", vi->i_ino,
-                                        (unsigned)le32_to_cpu(ni->type), err);
-                        status = err;
-                        goto err_out;
-                }
-        }
        /*
         * Determine the number of pages per cluster for non-resident
         * attributes.
@@ -1937,10 +1790,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
        nr_pages = 1;
        if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
                nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
-        /* Finally, perform the actual write. */
        last_vcn = -1;
-        if (likely(nr_segs == 1))
-                buf = iov->iov_base;
        do {
                VCN vcn;
                pgoff_t idx, start_idx;
@@ -1965,10 +1815,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
                                                vol->cluster_size_bits, false);
                                up_read(&ni->runlist.lock);
                                if (unlikely(lcn < LCN_HOLE)) {
-                                        status = -EIO;
                                        if (lcn == LCN_ENOMEM)
                                                status = -ENOMEM;
-                                        else
+                                        else {
+                                                status = -EIO;
                                                ntfs_error(vol->sb, "Cannot "
                                                        "perform write to "
                                                        "inode 0x%lx, "
@@ -1977,6 +1827,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
                                                        "is corrupt.",
                                                        vi->i_ino, (unsigned)
                                                        le32_to_cpu(ni->type));
+                                        }
                                        break;
                                }
                                if (lcn == LCN_HOLE) {
@@ -1989,8 +1840,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
                                }
                        }
                }
-                if (bytes > count)
+                if (bytes > iov_iter_count(i))
-                        bytes = count;
+                        bytes = iov_iter_count(i);
+again:
                /*
                 * Bring in the user page(s) that we will copy from _first_.
                 * Otherwise there is a nasty deadlock on copying from the same
@@ -1999,10 +1851,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
                 * pages being swapped out between us bringing them into memory
                 * and doing the actual copying.
                 */
-                if (likely(nr_segs == 1))
+                if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
-                        ntfs_fault_in_pages_readable(buf, bytes);
+                        status = -EFAULT;
-                else
+                        break;
-                        ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
+                }
                /* Get and lock @do_pages starting at index @start_idx. */
                status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
                                pages, &cached_page);
@@ -2018,56 +1870,57 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
                        status = ntfs_prepare_pages_for_non_resident_write(
                                        pages, do_pages, pos, bytes);
                        if (unlikely(status)) {
-                                loff_t i_size;
                                do {
                                        unlock_page(pages[--do_pages]);
                                        page_cache_release(pages[do_pages]);
                                } while (do_pages);
-                                /*
-                                 * The write preparation may have instantiated
-                                 * allocated space outside i_size.  Trim this
-                                 * off again.  We can ignore any errors in this
-                                 * case as we will just be waisting a bit of
-                                 * allocated space, which is not a disaster.
-                                 */
-                                i_size = i_size_read(vi);
-                                if (pos + bytes > i_size) {
-                                        ntfs_write_failed(mapping, pos + bytes);
-                                }
                                break;
                        }
                }
                u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
-                if (likely(nr_segs == 1)) {
+                copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
-                        copied = ntfs_copy_from_user(pages + u, do_pages - u,
+                                        i, bytes);
-                                        ofs, buf, bytes);
-                        buf += copied;
-                } else
-                        copied = ntfs_copy_from_user_iovec(pages + u,
-                                        do_pages - u, ofs, &iov, &iov_ofs,
-                                        bytes);
                ntfs_flush_dcache_pages(pages + u, do_pages - u);
-                status = ntfs_commit_pages_after_write(pages, do_pages, pos,
+                status = 0;
-                                bytes);
+                if (likely(copied == bytes)) {
-                if (likely(!status)) {
+                        status = ntfs_commit_pages_after_write(pages, do_pages,
-                        written += copied;
+                                        pos, bytes);
-                        count -= copied;
+                        if (!status)
-                        pos += copied;
+                                status = bytes;
-                        if (unlikely(copied != bytes))
-                                status = -EFAULT;
                }
                do {
                        unlock_page(pages[--do_pages]);
                        page_cache_release(pages[do_pages]);
                } while (do_pages);
-                if (unlikely(status))
+                if (unlikely(status < 0))
                        break;
-                balance_dirty_pages_ratelimited(mapping);
+                copied = status;
                cond_resched();
-        } while (count);
+                if (unlikely(!copied)) {
-err_out:
+                        size_t sc;
-        *ppos = pos;
+                        /*
+                         * We failed to copy anything.  Fall back to single
+                         * segment length write.
+                         *
+                         * This is needed to avoid possible livelock in the
+                         * case that all segments in the iov cannot be copied
+                         * at once without a pagefault.
+                         */
+                        sc = iov_iter_single_seg_count(i);
+                        if (bytes > sc)
+                                bytes = sc;
+                        goto again;
+                }
+                iov_iter_advance(i, copied);
+                pos += copied;
+                written += copied;
+                balance_dirty_pages_ratelimited(mapping);
+                if (fatal_signal_pending(current)) {
+                        status = -EINTR;
+                        break;
+                }
+        } while (iov_iter_count(i));
        if (cached_page)
                page_cache_release(cached_page);
        ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
@@ -2077,59 +1930,56 @@ err_out:
 }
 /**
- * ntfs_file_aio_write_nolock -
+ * ntfs_file_write_iter_nolock - write data to a file
+ * @iocb:       IO state structure (file, offset, etc.)
+ * @from:       iov_iter with data to write
+ *
+ * Basically the same as __generic_file_write_iter() except that it ends
+ * up calling ntfs_perform_write() instead of generic_perform_write() and that
+ * O_DIRECT is not implemented.
 */
-static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
+static ssize_t ntfs_file_write_iter_nolock(struct kiocb *iocb,
-                const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
+                struct iov_iter *from)
 {
        struct file *file = iocb->ki_filp;
-        struct address_space *mapping = file->f_mapping;
+        loff_t pos = iocb->ki_pos;
-        struct inode *inode = mapping->host;
+        ssize_t written = 0;
-        loff_t pos;
+        ssize_t err;
-        size_t count;           /* after file limit checks */
+        size_t count = iov_iter_count(from);
-        ssize_t written, err;
-        count = iov_length(iov, nr_segs);
+        err = ntfs_prepare_file_for_write(file, &pos, &count);
-        pos = *ppos;
+        if (count && !err) {
-        /* We can write back this queue in page reclaim. */
+                iov_iter_truncate(from, count);
-        current->backing_dev_info = inode_to_bdi(inode);
+                written = ntfs_perform_write(file, from, pos);
-        written = 0;
+                if (likely(written >= 0))
-        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+                        iocb->ki_pos = pos + written;
-        if (err)
+        }
-                goto out;
-        if (!count)
-                goto out;
-        err = file_remove_suid(file);
-        if (err)
-                goto out;
-        err = file_update_time(file);
-        if (err)
-                goto out;
-        written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
-                        count);
-out:
        current->backing_dev_info = NULL;
        return written ? written : err;
 }
 /**
- * ntfs_file_aio_write -
+ * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
+ * @iocb:       IO state structure
+ * @from:       iov_iter with data to write
+ *
+ * Basically the same as generic_file_write_iter() except that it ends up
+ * calling ntfs_file_write_iter_nolock() instead of
+ * __generic_file_write_iter().
 */
-static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-                unsigned long nr_segs, loff_t pos)
 {
        struct file *file = iocb->ki_filp;
-        struct address_space *mapping = file->f_mapping;
+        struct inode *vi = file_inode(file);
-        struct inode *inode = mapping->host;
        ssize_t ret;
-        BUG_ON(iocb->ki_pos != pos);
+        mutex_lock(&vi->i_mutex);
+        ret = ntfs_file_write_iter_nolock(iocb, from);
-        mutex_lock(&inode->i_mutex);
+        mutex_unlock(&vi->i_mutex);
-        ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
-        mutex_unlock(&inode->i_mutex);
        if (ret > 0) {
-                int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+                ssize_t err;
+                err = generic_write_sync(file, iocb->ki_pos - ret, ret);
                if (err < 0)
                        ret = err;
        }
@@ -2197,37 +2047,17 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
 #endif /* NTFS_RW */
 const struct file_operations ntfs_file_ops = {
-        .llseek         = generic_file_llseek,   /* Seek inside file. */
+        .llseek         = generic_file_llseek,
-        .read           = new_sync_read,         /* Read from file. */
+        .read           = new_sync_read,
-        .read_iter      = generic_file_read_iter, /* Async read from file. */
+        .read_iter      = generic_file_read_iter,
 #ifdef NTFS_RW
-        .write          = do_sync_write,         /* Write to file. */
+        .write          = new_sync_write,
-        .aio_write      = ntfs_file_aio_write,   /* Async write to file. */
+        .write_iter     = ntfs_file_write_iter,
-        /*.release      = ,*/                    /* Last file is closed.  See
+        .fsync          = ntfs_file_fsync,
-                                                    fs/ext2/file.c::
-                                                    ext2_release_file() for
-                                                    how to use this to discard
-                                                    preallocated space for
-                                                    write opened files. */
-        .fsync          = ntfs_file_fsync,       /* Sync a file to disk. */
-        /*.aio_fsync    = ,*/                    /* Sync all outstanding async
-                                                    i/o operations on a
-                                                    kiocb. */
 #endif /* NTFS_RW */
-        /*.ioctl        = ,*/                    /* Perform function on the
+        .mmap           = generic_file_mmap,
-                                                    mounted filesystem. */
+        .open           = ntfs_file_open,
-        .mmap           = generic_file_mmap,     /* Mmap file. */
+        .splice_read    = generic_file_splice_read,
-        .open           = ntfs_file_open,        /* Open file. */
-        .splice_read    = generic_file_splice_read /* Zero-copy data send with
-                                                    the data source being on
-                                                    the ntfs partition.  We do
-                                                    not need to care about the
-                                                    data destination. */
-        /*.sendpage     = ,*/                    /* Zero-copy data send with
-                                                    the data destination being
-                                                    on the ntfs partition.  We
-                                                    do not need to care about
-                                                    the data source. */
 };
 const struct inode_operations ntfs_file_inode_ops = {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 898b9949d363..1d0c21df0d80 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -28,7 +28,6 @@
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/log2.h>
-#include <linux/aio.h>
 #include "aops.h"
 #include "attrib.h"
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1b0463a92b17..8d2bc840c288 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,6 +29,7 @@
 #include <linux/mpage.h>
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
+#include <linux/uio.h>
 #include <cluster/masklog.h>
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 6cae155d54df..dd59599b022d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,7 +22,7 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
-#include <linux/aio.h>
+#include <linux/fs.h>
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                         struct page *page,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ba1790e52ff2..91f03ce98108 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2280,7 +2280,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                file->f_path.dentry->d_name.name,
                (unsigned int)from->nr_segs);   /* GRRRRR */
-        if (iocb->ki_nbytes == 0)
+        if (count == 0)
                return 0;
        appending = file->f_flags & O_APPEND ? 1 : 0;
@@ -2330,8 +2330,7 @@ relock:
        }
        can_do_direct = direct_io;
-        ret = ocfs2_prepare_inode_for_write(file, ppos,
+        ret = ocfs2_prepare_inode_for_write(file, ppos, count, appending,
-                                            iocb->ki_nbytes, appending,
                                            &can_do_direct, &has_refcount);
        if (ret < 0) {
                mlog_errno(ret);
@@ -2339,8 +2338,7 @@ relock:
        }
        if (direct_io && !is_sync_kiocb(iocb))
-                unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes,
+                unaligned_dio = ocfs2_is_io_unaligned(inode, count, *ppos);
-                                                      *ppos);
        /*
         * We can't complete the direct I/O as requested, fall back to
diff --git a/fs/open.c b/fs/open.c
index 33f9cbf2610b..6a83c47d5904 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -570,6 +570,7 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
        uid = make_kuid(current_user_ns(), user);
        gid = make_kgid(current_user_ns(), group);
+retry_deleg:
        newattrs.ia_valid =  ATTR_CTIME;
        if (user != (uid_t) -1) {
                if (!uid_valid(uid))
@@ -586,7 +587,6 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
        if (!S_ISDIR(inode->i_mode))
                newattrs.ia_valid |=
                        ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
-retry_deleg:
        mutex_lock(&inode->i_mutex);
        error = security_path_chown(path, uid, gid);
        if (!error)
@@ -988,9 +988,6 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
                return ERR_PTR(err);
        if (flags & O_CREAT)
                return ERR_PTR(-EINVAL);
-        if (!filename && (flags & O_DIRECTORY))
-                if (!dentry->d_inode->i_op->lookup)
-                        return ERR_PTR(-ENOTDIR);
        return do_file_open_root(dentry, mnt, filename, &op);
 }
 EXPORT_SYMBOL(file_open_root);
diff --git a/fs/pipe.c b/fs/pipe.c
index 21981e58e2a6..2d084f2d0b83 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,7 +21,6 @@
 #include <linux/audit.h>
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
-#include <linux/aio.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 39d1373128e9..44a549beeafa 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -539,6 +539,9 @@ static int ramoops_probe(struct platform_device *pdev)
        mem_address = pdata->mem_address;
        record_size = pdata->record_size;
        dump_oops = pdata->dump_oops;
+        ramoops_console_size = pdata->console_size;
+        ramoops_pmsg_size = pdata->pmsg_size;
+        ramoops_ftrace_size = pdata->ftrace_size;
        pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n",
                cxt->size, (unsigned long long)cxt->phys_addr,
diff --git a/fs/read_write.c b/fs/read_write.c
index 8e1b68786d66..69128b378646 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
 #include <linux/fcntl.h>
 #include <linux/file.h>
 #include <linux/uio.h>
-#include <linux/aio.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include <linux/export.h>
@@ -343,13 +342,10 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = *ppos;
-        kiocb.ki_nbytes = iov_iter_count(iter);
        iter->type |= READ;
        ret = file->f_op->read_iter(&kiocb, iter);
-        if (ret == -EIOCBQUEUED)
+        BUG_ON(ret == -EIOCBQUEUED);
-                ret = wait_on_sync_kiocb(&kiocb);
        if (ret > 0)
                *ppos = kiocb.ki_pos;
        return ret;
@@ -366,13 +362,10 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = *ppos;
-        kiocb.ki_nbytes = iov_iter_count(iter);
        iter->type |= WRITE;
        ret = file->f_op->write_iter(&kiocb, iter);
-        if (ret == -EIOCBQUEUED)
+        BUG_ON(ret == -EIOCBQUEUED);
-                ret = wait_on_sync_kiocb(&kiocb);
        if (ret > 0)
                *ppos = kiocb.ki_pos;
        return ret;
@@ -426,11 +419,9 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
-        kiocb.ki_nbytes = len;
        ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
-        if (-EIOCBQUEUED == ret)
+        BUG_ON(ret == -EIOCBQUEUED);
-                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
        return ret;
 }
@@ -446,12 +437,10 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p
        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
-        kiocb.ki_nbytes = len;
        iov_iter_init(&iter, READ, &iov, 1, len);
        ret = filp->f_op->read_iter(&kiocb, &iter);
-        if (-EIOCBQUEUED == ret)
+        BUG_ON(ret == -EIOCBQUEUED);
-                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
        return ret;
 }
@@ -510,11 +499,9 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
-        kiocb.ki_nbytes = len;
        ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
-        if (-EIOCBQUEUED == ret)
+        BUG_ON(ret == -EIOCBQUEUED);
-                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
        return ret;
 }
@@ -530,12 +517,10 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo
        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
-        kiocb.ki_nbytes = len;
        iov_iter_init(&iter, WRITE, &iov, 1, len);
        ret = filp->f_op->write_iter(&kiocb, &iter);
-        if (-EIOCBQUEUED == ret)
+        BUG_ON(ret == -EIOCBQUEUED);
-                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
        return ret;
 }
@@ -710,60 +695,47 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 }
 EXPORT_SYMBOL(iov_shorten);
-static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
+static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
-                unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
+                loff_t *ppos, iter_fn_t fn)
 {
        struct kiocb kiocb;
-        struct iov_iter iter;
        ssize_t ret;
        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
-        kiocb.ki_nbytes = len;
-        iov_iter_init(&iter, rw, iov, nr_segs, len);
+        ret = fn(&kiocb, iter);
-        ret = fn(&kiocb, &iter);
+        BUG_ON(ret == -EIOCBQUEUED);
-        if (ret == -EIOCBQUEUED)
-                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
        return ret;
 }
-static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
+static ssize_t do_sync_readv_writev(struct file *filp, struct iov_iter *iter,
-                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
+                loff_t *ppos, iov_fn_t fn)
 {
        struct kiocb kiocb;
        ssize_t ret;
        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
-        kiocb.ki_nbytes = len;
-        ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
+        ret = fn(&kiocb, iter->iov, iter->nr_segs, kiocb.ki_pos);
-        if (ret == -EIOCBQUEUED)
+        BUG_ON(ret == -EIOCBQUEUED);
-                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
        return ret;
 }
 /* Do it by hand, with file-ops */
-static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
+static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
-                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
+                loff_t *ppos, io_fn_t fn)
 {
-        struct iovec *vector = iov;
        ssize_t ret = 0;
-        while (nr_segs > 0) {
+        while (iov_iter_count(iter)) {
-                void __user *base;
+                struct iovec iovec = iov_iter_iovec(iter);
-                size_t len;
                ssize_t nr;
-                base = vector->iov_base;
+                nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
-                len = vector->iov_len;
-                vector++;
-                nr_segs--;
-                nr = fn(filp, base, len, ppos);
                if (nr < 0) {
                        if (!ret)
@@ -771,8 +743,9 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
                        break;
                }
                ret += nr;
-                if (nr != len)
+                if (nr != iovec.iov_len)
                        break;
+                iov_iter_advance(iter, nr);
        }
        return ret;
@@ -863,17 +836,20 @@ static ssize_t do_readv_writev(int type, struct file *file,
        size_t tot_len;
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
+        struct iov_iter iter;
        ssize_t ret;
        io_fn_t fn;
        iov_fn_t fnv;
        iter_fn_t iter_fn;
-        ret = rw_copy_check_uvector(type, uvector, nr_segs,
+        ret = import_iovec(type, uvector, nr_segs,
-                                    ARRAY_SIZE(iovstack), iovstack, &iov);
+                           ARRAY_SIZE(iovstack), &iov, &iter);
-        if (ret <= 0)
+        if (ret < 0)
-                goto out;
+                return ret;
-        tot_len = ret;
+        tot_len = iov_iter_count(&iter);
+        if (!tot_len)
+                goto out;
        ret = rw_verify_area(type, file, pos, tot_len);
        if (ret < 0)
                goto out;
@@ -891,20 +867,17 @@ static ssize_t do_readv_writev(int type, struct file *file,
        }
        if (iter_fn)
-                ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
+                ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
-                                                pos, iter_fn);
        else if (fnv)
-                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+                ret = do_sync_readv_writev(file, &iter, pos, fnv);
-                                                pos, fnv);
        else
-                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+                ret = do_loop_readv_writev(file, &iter, pos, fn);
        if (type != READ)
                file_end_write(file);
 out:
-        if (iov != iovstack)
+        kfree(iov);
-                kfree(iov);
        if ((ret + (type == READ)) > 0) {
                if (type == READ)
                        fsnotify_access(file);
@@ -1043,17 +1016,20 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
        compat_ssize_t tot_len;
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
+        struct iov_iter iter;
        ssize_t ret;
        io_fn_t fn;
        iov_fn_t fnv;
        iter_fn_t iter_fn;
-        ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
+        ret = compat_import_iovec(type, uvector, nr_segs,
-                                               UIO_FASTIOV, iovstack, &iov);
+                                  UIO_FASTIOV, &iov, &iter);
-        if (ret <= 0)
+        if (ret < 0)
-                goto out;
+                return ret;
-        tot_len = ret;
+        tot_len = iov_iter_count(&iter);
+        if (!tot_len)
+                goto out;
        ret = rw_verify_area(type, file, pos, tot_len);
        if (ret < 0)
                goto out;
@@ -1071,20 +1047,17 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
        }
        if (iter_fn)
-                ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
+                ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
-                                                pos, iter_fn);
        else if (fnv)
-                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+                ret = do_sync_readv_writev(file, &iter, pos, fnv);
-                                                pos, fnv);
        else
-                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+                ret = do_loop_readv_writev(file, &iter, pos, fn);
        if (type != READ)
                file_end_write(file);
 out:
-        if (iov != iovstack)
+        kfree(iov);
-                kfree(iov);
        if ((ret + (type == READ)) > 0) {
                if (type == READ)
                        fsnotify_access(file);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index e72401e1f995..9312b7842e03 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,7 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/quotaops.h>
 #include <linux/swap.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
diff --git a/fs/splice.c b/fs/splice.c
index 7968da96bebb..41cbb16299e0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -32,7 +32,6 @@
 #include <linux/gfp.h>
 #include <linux/socket.h>
 #include <linux/compat.h>
-#include <linux/aio.h>
 #include "internal.h"
 /*
@@ -1534,34 +1533,29 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
-        ssize_t count;
        pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;
-        ret = rw_copy_check_uvector(READ, uiov, nr_segs,
+        ret = import_iovec(READ, uiov, nr_segs,
-                                    ARRAY_SIZE(iovstack), iovstack, &iov);
+                           ARRAY_SIZE(iovstack), &iov, &iter);
-        if (ret <= 0)
+        if (ret < 0)
-                goto out;
+                return ret;
-        count = ret;
-        iov_iter_init(&iter, READ, iov, nr_segs, count);
+        sd.total_len = iov_iter_count(&iter);
        sd.len = 0;
-        sd.total_len = count;
        sd.flags = flags;
        sd.u.data = &iter;
        sd.pos = 0;
-        pipe_lock(pipe);
+        if (sd.total_len) {
-        ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
+                pipe_lock(pipe);
-        pipe_unlock(pipe);
+                ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
+                pipe_unlock(pipe);
-out:
+        }
-        if (iov != iovstack)
-                kfree(iov);
+        kfree(iov);
        return ret;
 }
diff --git a/fs/stat.c b/fs/stat.c
index ae0c3cef9927..19636af5e75c 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -66,7 +66,7 @@ int vfs_getattr(struct path *path, struct kstat *stat)
 {
        int retval;
-        retval = security_inode_getattr(path->mnt, path->dentry);
+        retval = security_inode_getattr(path);
        if (retval)
                return retval;
        return vfs_getattr_nosec(path, stat);
diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
new file mode 100644
index 000000000000..82fa35b656c4
--- /dev/null
+++ b/fs/tracefs/Makefile
@@ -0,0 +1,4 @@
+tracefs-objs    := inode.o
+obj-$(CONFIG_TRACING)   += tracefs.o
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
new file mode 100644
index 000000000000..d92bdf3b079a
--- /dev/null
+++ b/fs/tracefs/inode.c
@@ -0,0 +1,650 @@
+/*
+ *  inode.c - part of tracefs, a pseudo file system for activating tracing
+ *
+ * Based on debugfs by: Greg Kroah-Hartman <greg@kroah.com>
+ *
+ *  Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License version
+ *      2 as published by the Free Software Foundation.
+ *
+ * tracefs is the file system that is used by the tracing infrastructure.
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/kobject.h>
+#include <linux/namei.h>
+#include <linux/tracefs.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+/* Permission bits used for the tracefs root when no "mode=" option is parsed. */
+#define TRACEFS_DEFAULT_MODE    0700
+/* Mount and pin count handed to simple_pin_fs()/simple_release_fs(). */
+static struct vfsmount *tracefs_mount;
+static int tracefs_mount_count;
+/* Set by tracefs_init() once register_filesystem() has succeeded. */
+static bool tracefs_registered;
+/* Fallback ->read handler: there is never any data, so report 0 bytes read. */
+static ssize_t
+default_read_file(struct file *file, char __user *buf, size_t count,
+                  loff_t *ppos)
+{
+        return 0;
+}
+/* Fallback ->write handler: ignore the data but claim all @count bytes. */
+static ssize_t
+default_write_file(struct file *file, const char __user *buf, size_t count,
+                   loff_t *ppos)
+{
+        return count;
+}
+/* File operations installed by tracefs_create_file() when no fops is given. */
+static const struct file_operations tracefs_file_operations = {
+        .open   = simple_open,
+        .llseek = noop_llseek,
+        .read   = default_read_file,
+        .write  = default_write_file,
+};
+/*
+ * Callbacks invoked by tracefs_syscall_mkdir() and tracefs_syscall_rmdir()
+ * with the name of the directory being created or removed. Both pointers
+ * are filled in by tracefs_create_instance_dir().
+ */
+static struct tracefs_dir_ops {
+        int (*mkdir)(const char *name);
+        int (*rmdir)(const char *name);
+} tracefs_ops;
+/*
+ * Return a NUL-terminated kmalloc()ed copy of @dentry's name, or NULL when
+ * the allocation fails. The caller owns the copy and must kfree() it.
+ */
+static char *get_dname(struct dentry *dentry)
+{
+        int n = dentry->d_name.len;
+        const char *src = dentry->d_name.name;
+        char *copy = kmalloc(n + 1, GFP_KERNEL);
+        if (copy) {
+                memcpy(copy, src, n);
+                copy[n] = 0;
+        }
+        return copy;
+}
+/*
+ * ->mkdir of tracefs_dir_inode_operations: hand the new directory's name to
+ * the registered tracefs_ops.mkdir() callback and return its result, or
+ * -ENOMEM if the name could not be copied.
+ */
+static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode)
+{
+        char *dname = get_dname(dentry);
+        int err;
+        if (!dname)
+                return -ENOMEM;
+        /*
+         * The callback may use the generic tracefs helpers to create files,
+         * so the parent's i_mutex is dropped around the call and retaken
+         * afterwards. Handling the resulting races is the callback's job.
+         */
+        mutex_unlock(&inode->i_mutex);
+        err = tracefs_ops.mkdir(dname);
+        mutex_lock(&inode->i_mutex);
+        kfree(dname);
+        return err;
+}
+/*
+ * ->rmdir of tracefs_dir_inode_operations: pass the name of the directory
+ * being removed to the registered tracefs_ops.rmdir() callback and return
+ * its result. Returns -ENOMEM if the name cannot be copied.
+ */
+static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
+{
+        char *name;
+        int ret;
+        name = get_dname(dentry);
+        if (!name)
+                return -ENOMEM;
+        /*
+         * The rmdir call can call the generic functions that create
+         * the files within the tracefs system. It is up to the individual
+         * rmdir routine to handle races.
+         * This time we need to unlock not only the parent (inode) but
+         * also the directory that is being deleted.
+         */
+        mutex_unlock(&inode->i_mutex);
+        mutex_unlock(&dentry->d_inode->i_mutex);
+        ret = tracefs_ops.rmdir(name);
+        /* Retake both mutexes, parent first, before returning. */
+        mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+        mutex_lock(&dentry->d_inode->i_mutex);
+        kfree(name);
+        return ret;
+}
+/* Directory inode operations that route mkdir/rmdir to tracefs_ops. */
+static const struct inode_operations tracefs_dir_inode_operations = {
+        .mkdir  = tracefs_syscall_mkdir,
+        .rmdir  = tracefs_syscall_rmdir,
+        .lookup = simple_lookup,
+};
+/*
+ * Allocate an inode on @sb, give it a fresh inode number and set all three
+ * timestamps to the current time. Returns NULL if new_inode() fails.
+ */
+static struct inode *tracefs_get_inode(struct super_block *sb)
+{
+        struct inode *ino = new_inode(sb);
+        if (!ino)
+                return NULL;
+        ino->i_ino = get_next_ino();
+        ino->i_atime = ino->i_mtime = ino->i_ctime = CURRENT_TIME;
+        return ino;
+}
+/*
+ * Parsed mount options. tracefs_apply_options() copies these onto the
+ * root inode of the superblock.
+ */
+struct tracefs_mount_opts {
+        kuid_t uid;
+        kgid_t gid;
+        umode_t mode;
+};
+/* Token ids used in the tokens match table and in tracefs_parse_options(). */
+enum {
+        Opt_uid,
+        Opt_gid,
+        Opt_mode,
+        Opt_err
+};
+/* match_token() patterns for the "uid=", "gid=" and "mode=" mount options. */
+static const match_table_t tokens = {
+        {Opt_uid, "uid=%u"},
+        {Opt_gid, "gid=%u"},
+        {Opt_mode, "mode=%o"},
+        {Opt_err, NULL}
+};
+/* Per-superblock private data; trace_fill_super() stores it in sb->s_fs_info. */
+struct tracefs_fs_info {
+        struct tracefs_mount_opts mount_opts;
+};
+/*
+ * Parse the comma-separated option string @data into @opts.
+ *
+ * @opts->mode is reset to TRACEFS_DEFAULT_MODE first; uid and gid are only
+ * written when the matching option is present. Returns 0 on success or
+ * -EINVAL when a recognised option carries an unparsable or invalid value.
+ * Unrecognised options are ignored.
+ */
+static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
+{
+        substring_t args[MAX_OPT_ARGS];
+        char *cur;
+        int val;
+        opts->mode = TRACEFS_DEFAULT_MODE;
+        for (cur = strsep(&data, ","); cur != NULL; cur = strsep(&data, ",")) {
+                int tok;
+                if (!*cur)
+                        continue;
+                tok = match_token(cur, tokens, args);
+                if (tok == Opt_uid) {
+                        kuid_t uid;
+                        if (match_int(&args[0], &val))
+                                return -EINVAL;
+                        uid = make_kuid(current_user_ns(), val);
+                        if (!uid_valid(uid))
+                                return -EINVAL;
+                        opts->uid = uid;
+                } else if (tok == Opt_gid) {
+                        kgid_t gid;
+                        if (match_int(&args[0], &val))
+                                return -EINVAL;
+                        gid = make_kgid(current_user_ns(), val);
+                        if (!gid_valid(gid))
+                                return -EINVAL;
+                        opts->gid = gid;
+                } else if (tok == Opt_mode) {
+                        if (match_octal(&args[0], &val))
+                                return -EINVAL;
+                        opts->mode = val & S_IALLUGO;
+                }
+                /*
+                 * We might like to report bad mount options here;
+                 * but traditionally tracefs has ignored all mount options
+                 */
+        }
+        return 0;
+}
+/*
+ * Copy the parsed mode, uid and gid from the superblock's private data onto
+ * the root inode of @sb. Always returns 0.
+ */
+static int tracefs_apply_options(struct super_block *sb)
+{
+        struct tracefs_fs_info *info = sb->s_fs_info;
+        struct inode *root = sb->s_root->d_inode;
+        struct tracefs_mount_opts *mo = &info->mount_opts;
+        /* Replace only the permission bits; the file type bits stay. */
+        root->i_mode = (root->i_mode & ~S_IALLUGO) | mo->mode;
+        root->i_uid = mo->uid;
+        root->i_gid = mo->gid;
+        return 0;
+}
+/*
+ * ->remount_fs: re-parse @data into the superblock's mount options and, if
+ * parsing succeeded, apply them to the root inode. Returns the parser's
+ * result.
+ */
+static int tracefs_remount(struct super_block *sb, int *flags, char *data)
+{
+        struct tracefs_fs_info *info = sb->s_fs_info;
+        int rc;
+        sync_filesystem(sb);
+        rc = tracefs_parse_options(data, &info->mount_opts);
+        if (!rc)
+                tracefs_apply_options(sb);
+        return rc;
+}
+/*
+ * ->show_options: print each mount option that differs from its default
+ * (root uid, root gid, TRACEFS_DEFAULT_MODE) to @m. Always returns 0.
+ */
+static int tracefs_show_options(struct seq_file *m, struct dentry *root)
+{
+        struct tracefs_fs_info *info = root->d_sb->s_fs_info;
+        struct tracefs_mount_opts *mo = &info->mount_opts;
+        if (!uid_eq(mo->uid, GLOBAL_ROOT_UID)) {
+                uid_t shown_uid = from_kuid_munged(&init_user_ns, mo->uid);
+                seq_printf(m, ",uid=%u", shown_uid);
+        }
+        if (!gid_eq(mo->gid, GLOBAL_ROOT_GID)) {
+                gid_t shown_gid = from_kgid_munged(&init_user_ns, mo->gid);
+                seq_printf(m, ",gid=%u", shown_gid);
+        }
+        if (mo->mode != TRACEFS_DEFAULT_MODE)
+                seq_printf(m, ",mode=%o", mo->mode);
+        return 0;
+}
+/* Superblock operations installed by trace_fill_super(). */
+static const struct super_operations tracefs_super_operations = {
+        .remount_fs     = tracefs_remount,
+        .show_options   = tracefs_show_options,
+        .statfs         = simple_statfs,
+};
+/*
+ * Fill callback passed to mount_single(): allocate the per-superblock info,
+ * parse the mount options in @data, populate @sb via simple_fill_super() and
+ * apply the options to the root inode.
+ *
+ * Returns 0 on success. On any failure the info structure is freed,
+ * sb->s_fs_info is cleared and the error is returned.
+ */
+static int trace_fill_super(struct super_block *sb, void *data, int silent)
+{
+        static struct tree_descr trace_files[] = {{""}};
+        struct tracefs_fs_info *info;
+        int rc = -ENOMEM;
+        save_mount_options(sb, data);
+        info = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL);
+        sb->s_fs_info = info;
+        if (!info)
+                goto out_free;
+        rc = tracefs_parse_options(data, &info->mount_opts);
+        if (!rc)
+                rc = simple_fill_super(sb, TRACEFS_MAGIC, trace_files);
+        if (rc)
+                goto out_free;
+        sb->s_op = &tracefs_super_operations;
+        tracefs_apply_options(sb);
+        return 0;
+out_free:
+        kfree(info);
+        sb->s_fs_info = NULL;
+        return rc;
+}
+/* ->mount: delegate to mount_single() with trace_fill_super() as fill callback. */
+static struct dentry *
+trace_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
+            void *data)
+{
+        return mount_single(fs_type, flags, data, trace_fill_super);
+}
+/* Filesystem type registered by tracefs_init() and pinned by start_creating(). */
+static struct file_system_type trace_fs_type = {
+        .name           = "tracefs",
+        .owner          = THIS_MODULE,
+        .mount          = trace_mount,
+        .kill_sb        = kill_litter_super,
+};
+MODULE_ALIAS_FS("tracefs");
+/*
+ * Begin creating @name under @parent (or under the tracefs root when @parent
+ * is NULL).
+ *
+ * Pins the tracefs mount, takes the parent's i_mutex and looks the name up.
+ * On success the negative dentry is returned with the parent's i_mutex held
+ * and the mount pinned; finish with end_creating() or failed_creating().
+ *
+ * On failure an ERR_PTR() is returned (-EEXIST if the name already exists),
+ * the parent's i_mutex has been dropped and the mount pin has been released,
+ * so callers have nothing to clean up.
+ */
+static struct dentry *start_creating(const char *name, struct dentry *parent)
+{
+        struct dentry *dentry;
+        int error;
+        pr_debug("tracefs: creating file '%s'\n", name);
+        error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
+                              &tracefs_mount_count);
+        if (error)
+                return ERR_PTR(error);
+        /* If the parent is not specified, we create it in the root.
+         * We need the root dentry to do this, which is in the super
+         * block. A pointer to that is in the struct vfsmount that we
+         * have around.
+         */
+        if (!parent)
+                parent = tracefs_mount->mnt_root;
+        mutex_lock(&parent->d_inode->i_mutex);
+        dentry = lookup_one_len(name, parent, strlen(name));
+        if (!IS_ERR(dentry) && dentry->d_inode) {
+                dput(dentry);
+                dentry = ERR_PTR(-EEXIST);
+        }
+        if (IS_ERR(dentry)) {
+                mutex_unlock(&parent->d_inode->i_mutex);
+                /*
+                 * Callers only see the error and return; drop the pin taken
+                 * above here or the mount count is leaked on every failed
+                 * creation.
+                 */
+                simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+        }
+        return dentry;
+}
+/*
+ * Undo a successful start_creating(): unlock the parent directory, drop the
+ * dentry reference and release the mount pin. Always returns NULL so callers
+ * can return the result directly.
+ */
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+        struct inode *dir = dentry->d_parent->d_inode;
+        mutex_unlock(&dir->i_mutex);
+        dput(dentry);
+        simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+        return NULL;
+}
+/*
+ * Finish a successful creation: unlock the parent directory that
+ * start_creating() locked and hand @dentry back to the caller.
+ */
+static struct dentry *end_creating(struct dentry *dentry)
+{
+        struct inode *dir = dentry->d_parent->d_inode;
+        mutex_unlock(&dir->i_mutex);
+        return dentry;
+}
+/**
+ * tracefs_create_file - create a file in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.  If no file type bits
+ *        are set, S_IFREG is assumed; any other file type triggers BUG_ON().
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is NULL, then the
+ *          file will be created in the root of the tracefs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ *        on.  It is stored in the inode's i_private field.
+ * @fops: a pointer to a struct file_operations that should be used for
+ *        this file.  If NULL, a default set of operations is installed.
+ *
+ * This is the basic "create a file" function for tracefs.  Directories are
+ * created with tracefs_create_dir() instead.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * NOTE(review): the -%ENODEV return for kernels built without tracefs is
+ * presumably provided by a stub outside this file - confirm in the header.
+ */
+struct dentry *tracefs_create_file(const char *name, umode_t mode,
+                                   struct dentry *parent, void *data,
+                                   const struct file_operations *fops)
+{
+        struct dentry *entry;
+        struct inode *node;
+        if ((mode & S_IFMT) == 0)
+                mode |= S_IFREG;
+        BUG_ON(!S_ISREG(mode));
+        entry = start_creating(name, parent);
+        if (IS_ERR(entry))
+                return NULL;
+        node = tracefs_get_inode(entry->d_sb);
+        if (unlikely(!node))
+                return failed_creating(entry);
+        node->i_mode = mode;
+        if (fops)
+                node->i_fop = fops;
+        else
+                node->i_fop = &tracefs_file_operations;
+        node->i_private = data;
+        d_instantiate(entry, node);
+        fsnotify_create(entry->d_parent->d_inode, entry);
+        return end_creating(entry);
+}
+/*
+ * Create directory @name under @parent using @ops as its inode operations.
+ * Returns the new dentry, or NULL on any failure.
+ */
+static struct dentry *__create_dir(const char *name, struct dentry *parent,
+                                   const struct inode_operations *ops)
+{
+        struct inode *dir_inode;
+        struct dentry *dir = start_creating(name, parent);
+        if (IS_ERR(dir))
+                return NULL;
+        dir_inode = tracefs_get_inode(dir->d_sb);
+        if (unlikely(!dir_inode))
+                return failed_creating(dir);
+        dir_inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+        dir_inode->i_fop = &simple_dir_operations;
+        dir_inode->i_op = ops;
+        /* directory inodes start off with i_nlink == 2 (for "." entry) */
+        inc_nlink(dir_inode);
+        d_instantiate(dir, dir_inode);
+        /* The parent gains a link as well. */
+        inc_nlink(dir->d_parent->d_inode);
+        fsnotify_mkdir(dir->d_parent->d_inode, dir);
+        return end_creating(dir);
+}
+/**
+ * tracefs_create_dir - create a directory in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the directory to
+ *        create.
+ * @parent: a pointer to the parent dentry for this directory.  This should
+ *          be a directory dentry if set.  If this parameter is NULL, then
+ *          the directory will be created in the root of the tracefs
+ *          filesystem.
+ *
+ * This function creates a directory in tracefs with the given name, using
+ * the generic simple_dir_inode_operations.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the tracefs_remove() function when the
+ * directory is to be removed. If an error occurs, %NULL will be returned.
+ *
+ * NOTE(review): the -%ENODEV return for kernels built without tracing is
+ * presumably provided by a stub outside this file - confirm in the header.
+ */
+struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
+{
+        const struct inode_operations *ops = &simple_dir_inode_operations;
+        return __create_dir(name, parent, ops);
+}
+/**
+ * tracefs_create_instance_dir - create the tracing instances directory
+ * @name: The name of the instances directory to create
+ * @parent: The parent directory that the instances directory will exist in
+ * @mkdir: The function to call when a mkdir is performed.
+ * @rmdir: The function to call when a rmdir is performed.
+ *
+ * Only one instances directory is allowed.
+ *
+ * The instances directory is special as it allows for mkdir and rmdir to
+ * be done by userspace. When a mkdir or rmdir is performed, the inode
+ * locks are released and the methods passed in (@mkdir and @rmdir) are
+ * called without locks and with the name of the directory being created
+ * or removed within the instances directory.
+ *
+ * Returns the dentry of the instances directory, or %NULL if it could not
+ * be created or if an instances directory has already been set up.
+ */
+struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
+                                          int (*mkdir)(const char *name),
+                                          int (*rmdir)(const char *name))
+{
+        struct dentry *dir;
+        /* Only allow one instance of the instances directory. */
+        if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
+                return NULL;
+        dir = __create_dir(name, parent, &tracefs_dir_inode_operations);
+        if (dir) {
+                tracefs_ops.mkdir = mkdir;
+                tracefs_ops.rmdir = rmdir;
+        }
+        return dir;
+}
+/* Non-zero when @dentry has an inode and is still hashed. */
+static inline int tracefs_positive(struct dentry *dentry)
+{
+        if (!dentry->d_inode)
+                return 0;
+        return !d_unhashed(dentry);
+}
+/*
+ * Remove @dentry from @parent: directories go through simple_rmdir(),
+ * everything else through simple_unlink().
+ *
+ * Returns 0 if the dentry was removed or was not positive to begin with,
+ * otherwise the error from simple_rmdir().
+ */
+static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
+{
+        int err = 0;
+        if (!tracefs_positive(dentry) || !dentry->d_inode)
+                return 0;
+        /* Hold a reference across the removal. */
+        dget(dentry);
+        if ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR)
+                err = simple_rmdir(parent->d_inode, dentry);
+        else
+                simple_unlink(parent->d_inode, dentry);
+        if (!err)
+                d_delete(dentry);
+        dput(dentry);
+        return err;
+}
+/**
+ * tracefs_remove - removes a file or directory from the tracefs filesystem
+ * @dentry: a pointer to the dentry of the file or directory to be
+ *          removed.  NULL and error pointers are ignored.
+ *
+ * This function removes a file or directory in tracefs that was previously
+ * created with a call to another tracefs function (like
+ * tracefs_create_file() or variants thereof.)  On successful removal the
+ * mount pin taken at creation time is released.
+ */
+void tracefs_remove(struct dentry *dentry)
+{
+        struct dentry *dir;
+        int failed;
+        if (IS_ERR_OR_NULL(dentry))
+                return;
+        dir = dentry->d_parent;
+        if (!dir || !dir->d_inode)
+                return;
+        mutex_lock(&dir->d_inode->i_mutex);
+        failed = __tracefs_remove(dentry, dir);
+        mutex_unlock(&dir->d_inode->i_mutex);
+        if (failed)
+                return;
+        simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+}
+/**
+ * tracefs_remove_recursive - recursively removes a directory
+ * @dentry: a pointer to the dentry of the directory to be removed.
+ *          NULL and error pointers are ignored.
+ *
+ * This function recursively removes a directory tree in tracefs that
+ * was previously created with a call to another tracefs function
+ * (like tracefs_create_file() or variants thereof.)
+ */
+void tracefs_remove_recursive(struct dentry *dentry)
+{
+        struct dentry *child, *parent;
+        if (IS_ERR_OR_NULL(dentry))
+                return;
+        parent = dentry->d_parent;
+        if (!parent || !parent->d_inode)
+                return;
+        /* From here on, parent is the directory currently being emptied. */
+        parent = dentry;
+ down:
+        mutex_lock(&parent->d_inode->i_mutex);
+ loop:
+        /*
+         * The parent->d_subdirs is protected by the d_lock. Outside that
+         * lock, the child can be unlinked and set to be freed which can
+         * use the d_u.d_child as the rcu head and corrupt this list.
+         */
+        spin_lock(&parent->d_lock);
+        list_for_each_entry(child, &parent->d_subdirs, d_child) {
+                if (!tracefs_positive(child))
+                        continue;
+                /* perhaps simple_empty(child) makes more sense */
+                if (!list_empty(&child->d_subdirs)) {
+                        /* Child has entries of its own: descend into it. */
+                        spin_unlock(&parent->d_lock);
+                        mutex_unlock(&parent->d_inode->i_mutex);
+                        parent = child;
+                        goto down;
+                }
+                spin_unlock(&parent->d_lock);
+                /* Release one mount pin for every entry actually removed. */
+                if (!__tracefs_remove(child, parent))
+                        simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+                /*
+                 * The parent->d_lock protects against child from unlinking
+                 * from d_subdirs. When releasing the parent->d_lock we can
+                 * no longer trust that the next pointer is valid.
+                 * Restart the loop. We'll skip this one with the
+                 * tracefs_positive() check.
+                 */
+                goto loop;
+        }
+        spin_unlock(&parent->d_lock);
+        /* No positive children remain here: step back up one level. */
+        mutex_unlock(&parent->d_inode->i_mutex);
+        child = parent;
+        parent = parent->d_parent;
+        mutex_lock(&parent->d_inode->i_mutex);
+        if (child != dentry)
+                /* go up */
+                goto loop;
+        /* Back at the starting directory: remove it from its own parent. */
+        if (!__tracefs_remove(child, parent))
+                simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+        mutex_unlock(&parent->d_inode->i_mutex);
+}
+/**
+ * tracefs_initialized - Tells whether tracefs has been registered
+ *
+ * Returns true once tracefs_init() has successfully registered the tracefs
+ * filesystem type, false otherwise.
+ */
+bool tracefs_initialized(void)
+{
+        return tracefs_registered;
+}
+/* The "tracing" kobject created under kernel_kobj by tracefs_init(). */
+static struct kobject *trace_kobj;
+/*
+ * Create the "tracing" kobject and register the tracefs filesystem type.
+ *
+ * Returns 0 on success, -EINVAL if the kobject cannot be created, or the
+ * error from register_filesystem(). On a registration failure the kobject
+ * is released again so nothing is left behind.
+ */
+static int __init tracefs_init(void)
+{
+        int retval;
+        trace_kobj = kobject_create_and_add("tracing", kernel_kobj);
+        if (!trace_kobj)
+                return -EINVAL;
+        retval = register_filesystem(&trace_fs_type);
+        if (retval) {
+                /* Drop the reference taken above instead of leaking it. */
+                kobject_put(trace_kobj);
+                trace_kobj = NULL;
+                return retval;
+        }
+        tracefs_registered = true;
+        return 0;
+}
+core_initcall(tracefs_init);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e627c0acf626..c3d15fe83403 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -50,7 +50,6 @@
 */
 #include "ubifs.h"
-#include <linux/aio.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/slab.h>
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 08f3555fbeac..7f885cc8b0b7 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include "udf_i.h"
 #include "udf_sb.h"
@@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        int err, pos;
-        size_t count = iocb->ki_nbytes;
+        size_t count = iov_iter_count(from);
        struct udf_inode_info *iinfo = UDF_I(inode);
        mutex_lock(&inode->i_mutex);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a445d599098d..9c1fbd23913d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,7 +38,7 @@
 #include <linux/slab.h>
 #include <linux/crc-itu-t.h>
 #include <linux/mpage.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
 #include "udf_i.h"
 #include "udf_sb.h"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a9b7a1b8704..4f8cdc59bc38 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,7 +31,6 @@
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
-#include <linux/aio.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a2e1cb8a568b..f44212fae653 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,7 +38,6 @@
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
-#include <linux/aio.h>
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>