aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c2
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/file.c21
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c212
-rw-r--r--fs/bfs/inode.c1
-rw-r--r--fs/binfmt_elf.c31
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/btrfs/ctree.h5
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c35
-rw-r--r--fs/btrfs/extent_io.c6
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/inode.c114
-rw-r--r--fs/btrfs/qgroup.c2
-rw-r--r--fs/btrfs/tests/inode-tests.c197
-rw-r--r--fs/btrfs/transaction.c39
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/ceph/file.c3
-rw-r--r--fs/cifs/cifsencrypt.c6
-rw-r--r--fs/cifs/connect.c19
-rw-r--r--fs/cifs/file.c1
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/cifs/smb2misc.c2
-rw-r--r--fs/cifs/smb2ops.c3
-rw-r--r--fs/cifs/smb2pdu.c17
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/debugfs/inode.c3
-rw-r--r--fs/direct-io.c5
-rw-r--r--fs/ecryptfs/file.c7
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inode.c1
-rw-r--r--fs/ext4/page-io.c1
-rw-r--r--fs/f2fs/data.c2
-rw-r--r--fs/fat/inode.c1
-rw-r--r--fs/fs-writeback.c93
-rw-r--r--fs/fuse/cuse.c2
-rw-r--r--fs/fuse/dev.c20
-rw-r--r--fs/fuse/file.c55
-rw-r--r--fs/fuse/fuse_i.h1
-rw-r--r--fs/gfs2/acl.c6
-rw-r--r--fs/gfs2/aops.c8
-rw-r--r--fs/gfs2/bmap.c2
-rw-r--r--fs/gfs2/file.c102
-rw-r--r--fs/gfs2/glock.c47
-rw-r--r--fs/gfs2/incore.h4
-rw-r--r--fs/gfs2/inode.c18
-rw-r--r--fs/gfs2/quota.c62
-rw-r--r--fs/gfs2/quota.h8
-rw-r--r--fs/gfs2/rgrp.c20
-rw-r--r--fs/gfs2/rgrp.h3
-rw-r--r--fs/gfs2/xattr.c2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/brec.c20
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/kernfs/file.c1
-rw-r--r--fs/locks.c5
-rw-r--r--fs/namei.c168
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/file.c1
-rw-r--r--fs/nfs/write.c5
-rw-r--r--fs/nfsd/blocklayout.c2
-rw-r--r--fs/nfsd/blocklayoutxdr.c6
-rw-r--r--fs/nfsd/nfs4layouts.c12
-rw-r--r--fs/nfsd/nfs4proc.c2
-rw-r--r--fs/nfsd/nfs4state.c4
-rw-r--r--fs/nfsd/nfs4xdr.c20
-rw-r--r--fs/nfsd/nfscache.c6
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/ntfs/Makefile2
-rw-r--r--fs/ntfs/file.c784
-rw-r--r--fs/ntfs/inode.c1
-rw-r--r--fs/ocfs2/alloc.c48
-rw-r--r--fs/ocfs2/aops.c156
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c42
-rw-r--r--fs/ocfs2/cluster/masklog.h5
-rw-r--r--fs/ocfs2/dir.c15
-rw-r--r--fs/ocfs2/dlmglue.c7
-rw-r--r--fs/ocfs2/export.c2
-rw-r--r--fs/ocfs2/file.c25
-rw-r--r--fs/ocfs2/inode.c4
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/namei.c6
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/stack_user.c8
-rw-r--r--fs/ocfs2/suballoc.c2
-rw-r--r--fs/ocfs2/super.c46
-rw-r--r--fs/ocfs2/xattr.c8
-rw-r--r--fs/open.c5
-rw-r--r--fs/overlayfs/super.c33
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/pstore/ram.c3
-rw-r--r--fs/read_write.c117
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/splice.c28
-rw-r--r--fs/stat.c2
-rw-r--r--fs/super.c2
-rw-r--r--fs/sysfs/group.c11
-rw-r--r--fs/tracefs/Makefile4
-rw-r--r--fs/tracefs/inode.c650
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/udf/file.c4
-rw-r--r--fs/udf/inode.c2
-rw-r--r--fs/xfs/xfs_aops.c1
-rw-r--r--fs/xfs/xfs_file.c1
117 files changed, 2336 insertions, 1194 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index eb14e055ea83..ff1a5bac4200 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,7 +33,7 @@
33#include <linux/pagemap.h> 33#include <linux/pagemap.h>
34#include <linux/idr.h> 34#include <linux/idr.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/aio.h> 36#include <linux/uio.h>
37#include <net/9p/9p.h> 37#include <net/9p/9p.h>
38#include <net/9p/client.h> 38#include <net/9p/client.h>
39 39
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 270c48148f79..2d0cbbd14cfc 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,9 +27,6 @@ config COMPAT_BINFMT_ELF
27 bool 27 bool
28 depends on COMPAT && BINFMT_ELF 28 depends on COMPAT && BINFMT_ELF
29 29
30config ARCH_BINFMT_ELF_RANDOMIZE_PIE
31 bool
32
33config ARCH_BINFMT_ELF_STATE 30config ARCH_BINFMT_ELF_STATE
34 bool 31 bool
35 32
diff --git a/fs/Makefile b/fs/Makefile
index a88ac4838c9e..cb92fd4c3172 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -118,6 +118,7 @@ obj-$(CONFIG_HOSTFS) += hostfs/
118obj-$(CONFIG_HPPFS) += hppfs/ 118obj-$(CONFIG_HPPFS) += hppfs/
119obj-$(CONFIG_CACHEFILES) += cachefiles/ 119obj-$(CONFIG_CACHEFILES) += cachefiles/
120obj-$(CONFIG_DEBUG_FS) += debugfs/ 120obj-$(CONFIG_DEBUG_FS) += debugfs/
121obj-$(CONFIG_TRACING) += tracefs/
121obj-$(CONFIG_OCFS2_FS) += ocfs2/ 122obj-$(CONFIG_OCFS2_FS) += ocfs2/
122obj-$(CONFIG_BTRFS_FS) += btrfs/ 123obj-$(CONFIG_BTRFS_FS) += btrfs/
123obj-$(CONFIG_GFS2_FS) += gfs2/ 124obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/affs/file.c b/fs/affs/file.c
index d2468bf95669..3aa7eb66547e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -12,7 +12,7 @@
12 * affs regular file handling primitives 12 * affs regular file handling primitives
13 */ 13 */
14 14
15#include <linux/aio.h> 15#include <linux/uio.h>
16#include "affs.h" 16#include "affs.h"
17 17
18static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); 18static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
@@ -699,8 +699,10 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
699 boff = tmp % bsize; 699 boff = tmp % bsize;
700 if (boff) { 700 if (boff) {
701 bh = affs_bread_ino(inode, bidx, 0); 701 bh = affs_bread_ino(inode, bidx, 0);
702 if (IS_ERR(bh)) 702 if (IS_ERR(bh)) {
703 return PTR_ERR(bh); 703 written = PTR_ERR(bh);
704 goto err_first_bh;
705 }
704 tmp = min(bsize - boff, to - from); 706 tmp = min(bsize - boff, to - from);
705 BUG_ON(boff + tmp > bsize || tmp > bsize); 707 BUG_ON(boff + tmp > bsize || tmp > bsize);
706 memcpy(AFFS_DATA(bh) + boff, data + from, tmp); 708 memcpy(AFFS_DATA(bh) + boff, data + from, tmp);
@@ -712,14 +714,16 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
712 bidx++; 714 bidx++;
713 } else if (bidx) { 715 } else if (bidx) {
714 bh = affs_bread_ino(inode, bidx - 1, 0); 716 bh = affs_bread_ino(inode, bidx - 1, 0);
715 if (IS_ERR(bh)) 717 if (IS_ERR(bh)) {
716 return PTR_ERR(bh); 718 written = PTR_ERR(bh);
719 goto err_first_bh;
720 }
717 } 721 }
718 while (from + bsize <= to) { 722 while (from + bsize <= to) {
719 prev_bh = bh; 723 prev_bh = bh;
720 bh = affs_getemptyblk_ino(inode, bidx); 724 bh = affs_getemptyblk_ino(inode, bidx);
721 if (IS_ERR(bh)) 725 if (IS_ERR(bh))
722 goto out; 726 goto err_bh;
723 memcpy(AFFS_DATA(bh), data + from, bsize); 727 memcpy(AFFS_DATA(bh), data + from, bsize);
724 if (buffer_new(bh)) { 728 if (buffer_new(bh)) {
725 AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); 729 AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
@@ -751,7 +755,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
751 prev_bh = bh; 755 prev_bh = bh;
752 bh = affs_bread_ino(inode, bidx, 1); 756 bh = affs_bread_ino(inode, bidx, 1);
753 if (IS_ERR(bh)) 757 if (IS_ERR(bh))
754 goto out; 758 goto err_bh;
755 tmp = min(bsize, to - from); 759 tmp = min(bsize, to - from);
756 BUG_ON(tmp > bsize); 760 BUG_ON(tmp > bsize);
757 memcpy(AFFS_DATA(bh), data + from, tmp); 761 memcpy(AFFS_DATA(bh), data + from, tmp);
@@ -790,12 +794,13 @@ done:
790 if (tmp > inode->i_size) 794 if (tmp > inode->i_size)
791 inode->i_size = AFFS_I(inode)->mmu_private = tmp; 795 inode->i_size = AFFS_I(inode)->mmu_private = tmp;
792 796
797err_first_bh:
793 unlock_page(page); 798 unlock_page(page);
794 page_cache_release(page); 799 page_cache_release(page);
795 800
796 return written; 801 return written;
797 802
798out: 803err_bh:
799 bh = prev_bh; 804 bh = prev_bh;
800 if (!written) 805 if (!written)
801 written = PTR_ERR(bh); 806 written = PTR_ERR(bh);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c13cb08964ed..0714abcd7f32 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,7 +14,6 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/writeback.h> 15#include <linux/writeback.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/aio.h>
18#include "internal.h" 17#include "internal.h"
19 18
20static int afs_write_back_from_locked_page(struct afs_writeback *wb, 19static int afs_write_back_from_locked_page(struct afs_writeback *wb,
diff --git a/fs/aio.c b/fs/aio.c
index f8e52a1854c1..1ab60010cf6c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -151,6 +151,38 @@ struct kioctx {
151 unsigned id; 151 unsigned id;
152}; 152};
153 153
154/*
155 * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
156 * cancelled or completed (this makes a certain amount of sense because
157 * successful cancellation - io_cancel() - does deliver the completion to
158 * userspace).
159 *
160 * And since most things don't implement kiocb cancellation and we'd really like
161 * kiocb completion to be lockless when possible, we use ki_cancel to
162 * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
163 * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
164 */
165#define KIOCB_CANCELLED ((void *) (~0ULL))
166
167struct aio_kiocb {
168 struct kiocb common;
169
170 struct kioctx *ki_ctx;
171 kiocb_cancel_fn *ki_cancel;
172
173 struct iocb __user *ki_user_iocb; /* user's aiocb */
174 __u64 ki_user_data; /* user's data for completion */
175
176 struct list_head ki_list; /* the aio core uses this
177 * for cancellation */
178
179 /*
180 * If the aio_resfd field of the userspace iocb is not zero,
181 * this is the underlying eventfd context to deliver events to.
182 */
183 struct eventfd_ctx *ki_eventfd;
184};
185
154/*------ sysctl variables----*/ 186/*------ sysctl variables----*/
155static DEFINE_SPINLOCK(aio_nr_lock); 187static DEFINE_SPINLOCK(aio_nr_lock);
156unsigned long aio_nr; /* current system wide number of aio requests */ 188unsigned long aio_nr; /* current system wide number of aio requests */
@@ -220,7 +252,7 @@ static int __init aio_setup(void)
220 if (IS_ERR(aio_mnt)) 252 if (IS_ERR(aio_mnt))
221 panic("Failed to create aio fs mount."); 253 panic("Failed to create aio fs mount.");
222 254
223 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 255 kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
224 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 256 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
225 257
226 pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); 258 pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
@@ -278,11 +310,11 @@ static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
278 return 0; 310 return 0;
279} 311}
280 312
281static void aio_ring_remap(struct file *file, struct vm_area_struct *vma) 313static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
282{ 314{
283 struct mm_struct *mm = vma->vm_mm; 315 struct mm_struct *mm = vma->vm_mm;
284 struct kioctx_table *table; 316 struct kioctx_table *table;
285 int i; 317 int i, res = -EINVAL;
286 318
287 spin_lock(&mm->ioctx_lock); 319 spin_lock(&mm->ioctx_lock);
288 rcu_read_lock(); 320 rcu_read_lock();
@@ -292,13 +324,17 @@ static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
292 324
293 ctx = table->table[i]; 325 ctx = table->table[i];
294 if (ctx && ctx->aio_ring_file == file) { 326 if (ctx && ctx->aio_ring_file == file) {
295 ctx->user_id = ctx->mmap_base = vma->vm_start; 327 if (!atomic_read(&ctx->dead)) {
328 ctx->user_id = ctx->mmap_base = vma->vm_start;
329 res = 0;
330 }
296 break; 331 break;
297 } 332 }
298 } 333 }
299 334
300 rcu_read_unlock(); 335 rcu_read_unlock();
301 spin_unlock(&mm->ioctx_lock); 336 spin_unlock(&mm->ioctx_lock);
337 return res;
302} 338}
303 339
304static const struct file_operations aio_ring_fops = { 340static const struct file_operations aio_ring_fops = {
@@ -480,8 +516,9 @@ static int aio_setup_ring(struct kioctx *ctx)
480#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) 516#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
481#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) 517#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
482 518
483void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) 519void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
484{ 520{
521 struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
485 struct kioctx *ctx = req->ki_ctx; 522 struct kioctx *ctx = req->ki_ctx;
486 unsigned long flags; 523 unsigned long flags;
487 524
@@ -496,7 +533,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
496} 533}
497EXPORT_SYMBOL(kiocb_set_cancel_fn); 534EXPORT_SYMBOL(kiocb_set_cancel_fn);
498 535
499static int kiocb_cancel(struct kiocb *kiocb) 536static int kiocb_cancel(struct aio_kiocb *kiocb)
500{ 537{
501 kiocb_cancel_fn *old, *cancel; 538 kiocb_cancel_fn *old, *cancel;
502 539
@@ -514,7 +551,7 @@ static int kiocb_cancel(struct kiocb *kiocb)
514 cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); 551 cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
515 } while (cancel != old); 552 } while (cancel != old);
516 553
517 return cancel(kiocb); 554 return cancel(&kiocb->common);
518} 555}
519 556
520static void free_ioctx(struct work_struct *work) 557static void free_ioctx(struct work_struct *work)
@@ -550,13 +587,13 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
550static void free_ioctx_users(struct percpu_ref *ref) 587static void free_ioctx_users(struct percpu_ref *ref)
551{ 588{
552 struct kioctx *ctx = container_of(ref, struct kioctx, users); 589 struct kioctx *ctx = container_of(ref, struct kioctx, users);
553 struct kiocb *req; 590 struct aio_kiocb *req;
554 591
555 spin_lock_irq(&ctx->ctx_lock); 592 spin_lock_irq(&ctx->ctx_lock);
556 593
557 while (!list_empty(&ctx->active_reqs)) { 594 while (!list_empty(&ctx->active_reqs)) {
558 req = list_first_entry(&ctx->active_reqs, 595 req = list_first_entry(&ctx->active_reqs,
559 struct kiocb, ki_list); 596 struct aio_kiocb, ki_list);
560 597
561 list_del_init(&req->ki_list); 598 list_del_init(&req->ki_list);
562 kiocb_cancel(req); 599 kiocb_cancel(req);
@@ -727,6 +764,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
727err_cleanup: 764err_cleanup:
728 aio_nr_sub(ctx->max_reqs); 765 aio_nr_sub(ctx->max_reqs);
729err_ctx: 766err_ctx:
767 atomic_set(&ctx->dead, 1);
768 if (ctx->mmap_size)
769 vm_munmap(ctx->mmap_base, ctx->mmap_size);
730 aio_free_ring(ctx); 770 aio_free_ring(ctx);
731err: 771err:
732 mutex_unlock(&ctx->ring_lock); 772 mutex_unlock(&ctx->ring_lock);
@@ -748,11 +788,12 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
748{ 788{
749 struct kioctx_table *table; 789 struct kioctx_table *table;
750 790
751 if (atomic_xchg(&ctx->dead, 1)) 791 spin_lock(&mm->ioctx_lock);
792 if (atomic_xchg(&ctx->dead, 1)) {
793 spin_unlock(&mm->ioctx_lock);
752 return -EINVAL; 794 return -EINVAL;
795 }
753 796
754
755 spin_lock(&mm->ioctx_lock);
756 table = rcu_dereference_raw(mm->ioctx_table); 797 table = rcu_dereference_raw(mm->ioctx_table);
757 WARN_ON(ctx != table->table[ctx->id]); 798 WARN_ON(ctx != table->table[ctx->id]);
758 table->table[ctx->id] = NULL; 799 table->table[ctx->id] = NULL;
@@ -778,22 +819,6 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
778 return 0; 819 return 0;
779} 820}
780 821
781/* wait_on_sync_kiocb:
782 * Waits on the given sync kiocb to complete.
783 */
784ssize_t wait_on_sync_kiocb(struct kiocb *req)
785{
786 while (!req->ki_ctx) {
787 set_current_state(TASK_UNINTERRUPTIBLE);
788 if (req->ki_ctx)
789 break;
790 io_schedule();
791 }
792 __set_current_state(TASK_RUNNING);
793 return req->ki_user_data;
794}
795EXPORT_SYMBOL(wait_on_sync_kiocb);
796
797/* 822/*
798 * exit_aio: called when the last user of mm goes away. At this point, there is 823 * exit_aio: called when the last user of mm goes away. At this point, there is
799 * no way for any new requests to be submited or any of the io_* syscalls to be 824 * no way for any new requests to be submited or any of the io_* syscalls to be
@@ -948,9 +973,9 @@ static void user_refill_reqs_available(struct kioctx *ctx)
948 * Allocate a slot for an aio request. 973 * Allocate a slot for an aio request.
949 * Returns NULL if no requests are free. 974 * Returns NULL if no requests are free.
950 */ 975 */
951static inline struct kiocb *aio_get_req(struct kioctx *ctx) 976static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
952{ 977{
953 struct kiocb *req; 978 struct aio_kiocb *req;
954 979
955 if (!get_reqs_available(ctx)) { 980 if (!get_reqs_available(ctx)) {
956 user_refill_reqs_available(ctx); 981 user_refill_reqs_available(ctx);
@@ -971,10 +996,10 @@ out_put:
971 return NULL; 996 return NULL;
972} 997}
973 998
974static void kiocb_free(struct kiocb *req) 999static void kiocb_free(struct aio_kiocb *req)
975{ 1000{
976 if (req->ki_filp) 1001 if (req->common.ki_filp)
977 fput(req->ki_filp); 1002 fput(req->common.ki_filp);
978 if (req->ki_eventfd != NULL) 1003 if (req->ki_eventfd != NULL)
979 eventfd_ctx_put(req->ki_eventfd); 1004 eventfd_ctx_put(req->ki_eventfd);
980 kmem_cache_free(kiocb_cachep, req); 1005 kmem_cache_free(kiocb_cachep, req);
@@ -1010,8 +1035,9 @@ out:
1010/* aio_complete 1035/* aio_complete
1011 * Called when the io request on the given iocb is complete. 1036 * Called when the io request on the given iocb is complete.
1012 */ 1037 */
1013void aio_complete(struct kiocb *iocb, long res, long res2) 1038static void aio_complete(struct kiocb *kiocb, long res, long res2)
1014{ 1039{
1040 struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
1015 struct kioctx *ctx = iocb->ki_ctx; 1041 struct kioctx *ctx = iocb->ki_ctx;
1016 struct aio_ring *ring; 1042 struct aio_ring *ring;
1017 struct io_event *ev_page, *event; 1043 struct io_event *ev_page, *event;
@@ -1025,13 +1051,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1025 * ref, no other paths have a way to get another ref 1051 * ref, no other paths have a way to get another ref
1026 * - the sync task helpfully left a reference to itself in the iocb 1052 * - the sync task helpfully left a reference to itself in the iocb
1027 */ 1053 */
1028 if (is_sync_kiocb(iocb)) { 1054 BUG_ON(is_sync_kiocb(kiocb));
1029 iocb->ki_user_data = res;
1030 smp_wmb();
1031 iocb->ki_ctx = ERR_PTR(-EXDEV);
1032 wake_up_process(iocb->ki_obj.tsk);
1033 return;
1034 }
1035 1055
1036 if (iocb->ki_list.next) { 1056 if (iocb->ki_list.next) {
1037 unsigned long flags; 1057 unsigned long flags;
@@ -1057,7 +1077,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1057 ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); 1077 ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
1058 event = ev_page + pos % AIO_EVENTS_PER_PAGE; 1078 event = ev_page + pos % AIO_EVENTS_PER_PAGE;
1059 1079
1060 event->obj = (u64)(unsigned long)iocb->ki_obj.user; 1080 event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
1061 event->data = iocb->ki_user_data; 1081 event->data = iocb->ki_user_data;
1062 event->res = res; 1082 event->res = res;
1063 event->res2 = res2; 1083 event->res2 = res2;
@@ -1066,7 +1086,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1066 flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); 1086 flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
1067 1087
1068 pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", 1088 pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
1069 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, 1089 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
1070 res, res2); 1090 res, res2);
1071 1091
1072 /* after flagging the request as done, we 1092 /* after flagging the request as done, we
@@ -1113,7 +1133,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1113 1133
1114 percpu_ref_put(&ctx->reqs); 1134 percpu_ref_put(&ctx->reqs);
1115} 1135}
1116EXPORT_SYMBOL(aio_complete);
1117 1136
1118/* aio_read_events_ring 1137/* aio_read_events_ring
1119 * Pull an event off of the ioctx's event ring. Returns the number of 1138 * Pull an event off of the ioctx's event ring. Returns the number of
@@ -1341,46 +1360,19 @@ typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
1341 unsigned long, loff_t); 1360 unsigned long, loff_t);
1342typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); 1361typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
1343 1362
1344static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, 1363static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len,
1345 int rw, char __user *buf, 1364 struct iovec **iovec,
1346 unsigned long *nr_segs, 1365 bool compat,
1347 struct iovec **iovec, 1366 struct iov_iter *iter)
1348 bool compat)
1349{ 1367{
1350 ssize_t ret;
1351
1352 *nr_segs = kiocb->ki_nbytes;
1353
1354#ifdef CONFIG_COMPAT 1368#ifdef CONFIG_COMPAT
1355 if (compat) 1369 if (compat)
1356 ret = compat_rw_copy_check_uvector(rw, 1370 return compat_import_iovec(rw,
1357 (struct compat_iovec __user *)buf, 1371 (struct compat_iovec __user *)buf,
1358 *nr_segs, UIO_FASTIOV, *iovec, iovec); 1372 len, UIO_FASTIOV, iovec, iter);
1359 else
1360#endif 1373#endif
1361 ret = rw_copy_check_uvector(rw, 1374 return import_iovec(rw, (struct iovec __user *)buf,
1362 (struct iovec __user *)buf, 1375 len, UIO_FASTIOV, iovec, iter);
1363 *nr_segs, UIO_FASTIOV, *iovec, iovec);
1364 if (ret < 0)
1365 return ret;
1366
1367 /* ki_nbytes now reflect bytes instead of segs */
1368 kiocb->ki_nbytes = ret;
1369 return 0;
1370}
1371
1372static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
1373 int rw, char __user *buf,
1374 unsigned long *nr_segs,
1375 struct iovec *iovec)
1376{
1377 if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
1378 return -EFAULT;
1379
1380 iovec->iov_base = buf;
1381 iovec->iov_len = kiocb->ki_nbytes;
1382 *nr_segs = 1;
1383 return 0;
1384} 1376}
1385 1377
1386/* 1378/*
@@ -1388,11 +1380,10 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
1388 * Performs the initial checks and io submission. 1380 * Performs the initial checks and io submission.
1389 */ 1381 */
1390static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, 1382static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
1391 char __user *buf, bool compat) 1383 char __user *buf, size_t len, bool compat)
1392{ 1384{
1393 struct file *file = req->ki_filp; 1385 struct file *file = req->ki_filp;
1394 ssize_t ret; 1386 ssize_t ret;
1395 unsigned long nr_segs;
1396 int rw; 1387 int rw;
1397 fmode_t mode; 1388 fmode_t mode;
1398 aio_rw_op *rw_op; 1389 aio_rw_op *rw_op;
@@ -1423,21 +1414,22 @@ rw_common:
1423 if (!rw_op && !iter_op) 1414 if (!rw_op && !iter_op)
1424 return -EINVAL; 1415 return -EINVAL;
1425 1416
1426 ret = (opcode == IOCB_CMD_PREADV || 1417 if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV)
1427 opcode == IOCB_CMD_PWRITEV) 1418 ret = aio_setup_vectored_rw(rw, buf, len,
1428 ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, 1419 &iovec, compat, &iter);
1429 &iovec, compat) 1420 else {
1430 : aio_setup_single_vector(req, rw, buf, &nr_segs, 1421 ret = import_single_range(rw, buf, len, iovec, &iter);
1431 iovec); 1422 iovec = NULL;
1423 }
1432 if (!ret) 1424 if (!ret)
1433 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); 1425 ret = rw_verify_area(rw, file, &req->ki_pos,
1426 iov_iter_count(&iter));
1434 if (ret < 0) { 1427 if (ret < 0) {
1435 if (iovec != inline_vecs) 1428 kfree(iovec);
1436 kfree(iovec);
1437 return ret; 1429 return ret;
1438 } 1430 }
1439 1431
1440 req->ki_nbytes = ret; 1432 len = ret;
1441 1433
1442 /* XXX: move/kill - rw_verify_area()? */ 1434 /* XXX: move/kill - rw_verify_area()? */
1443 /* This matches the pread()/pwrite() logic */ 1435 /* This matches the pread()/pwrite() logic */
@@ -1450,14 +1442,14 @@ rw_common:
1450 file_start_write(file); 1442 file_start_write(file);
1451 1443
1452 if (iter_op) { 1444 if (iter_op) {
1453 iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
1454 ret = iter_op(req, &iter); 1445 ret = iter_op(req, &iter);
1455 } else { 1446 } else {
1456 ret = rw_op(req, iovec, nr_segs, req->ki_pos); 1447 ret = rw_op(req, iter.iov, iter.nr_segs, req->ki_pos);
1457 } 1448 }
1458 1449
1459 if (rw == WRITE) 1450 if (rw == WRITE)
1460 file_end_write(file); 1451 file_end_write(file);
1452 kfree(iovec);
1461 break; 1453 break;
1462 1454
1463 case IOCB_CMD_FDSYNC: 1455 case IOCB_CMD_FDSYNC:
@@ -1479,9 +1471,6 @@ rw_common:
1479 return -EINVAL; 1471 return -EINVAL;
1480 } 1472 }
1481 1473
1482 if (iovec != inline_vecs)
1483 kfree(iovec);
1484
1485 if (ret != -EIOCBQUEUED) { 1474 if (ret != -EIOCBQUEUED) {
1486 /* 1475 /*
1487 * There's no easy way to restart the syscall since other AIO's 1476 * There's no easy way to restart the syscall since other AIO's
@@ -1500,7 +1489,7 @@ rw_common:
1500static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1489static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1501 struct iocb *iocb, bool compat) 1490 struct iocb *iocb, bool compat)
1502{ 1491{
1503 struct kiocb *req; 1492 struct aio_kiocb *req;
1504 ssize_t ret; 1493 ssize_t ret;
1505 1494
1506 /* enforce forwards compatibility on users */ 1495 /* enforce forwards compatibility on users */
@@ -1523,11 +1512,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1523 if (unlikely(!req)) 1512 if (unlikely(!req))
1524 return -EAGAIN; 1513 return -EAGAIN;
1525 1514
1526 req->ki_filp = fget(iocb->aio_fildes); 1515 req->common.ki_filp = fget(iocb->aio_fildes);
1527 if (unlikely(!req->ki_filp)) { 1516 if (unlikely(!req->common.ki_filp)) {
1528 ret = -EBADF; 1517 ret = -EBADF;
1529 goto out_put_req; 1518 goto out_put_req;
1530 } 1519 }
1520 req->common.ki_pos = iocb->aio_offset;
1521 req->common.ki_complete = aio_complete;
1522 req->common.ki_flags = 0;
1531 1523
1532 if (iocb->aio_flags & IOCB_FLAG_RESFD) { 1524 if (iocb->aio_flags & IOCB_FLAG_RESFD) {
1533 /* 1525 /*
@@ -1542,6 +1534,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1542 req->ki_eventfd = NULL; 1534 req->ki_eventfd = NULL;
1543 goto out_put_req; 1535 goto out_put_req;
1544 } 1536 }
1537
1538 req->common.ki_flags |= IOCB_EVENTFD;
1545 } 1539 }
1546 1540
1547 ret = put_user(KIOCB_KEY, &user_iocb->aio_key); 1541 ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@ -1550,13 +1544,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1550 goto out_put_req; 1544 goto out_put_req;
1551 } 1545 }
1552 1546
1553 req->ki_obj.user = user_iocb; 1547 req->ki_user_iocb = user_iocb;
1554 req->ki_user_data = iocb->aio_data; 1548 req->ki_user_data = iocb->aio_data;
1555 req->ki_pos = iocb->aio_offset;
1556 req->ki_nbytes = iocb->aio_nbytes;
1557 1549
1558 ret = aio_run_iocb(req, iocb->aio_lio_opcode, 1550 ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode,
1559 (char __user *)(unsigned long)iocb->aio_buf, 1551 (char __user *)(unsigned long)iocb->aio_buf,
1552 iocb->aio_nbytes,
1560 compat); 1553 compat);
1561 if (ret) 1554 if (ret)
1562 goto out_put_req; 1555 goto out_put_req;
@@ -1643,10 +1636,10 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1643/* lookup_kiocb 1636/* lookup_kiocb
1644 * Finds a given iocb for cancellation. 1637 * Finds a given iocb for cancellation.
1645 */ 1638 */
1646static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, 1639static struct aio_kiocb *
1647 u32 key) 1640lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
1648{ 1641{
1649 struct list_head *pos; 1642 struct aio_kiocb *kiocb;
1650 1643
1651 assert_spin_locked(&ctx->ctx_lock); 1644 assert_spin_locked(&ctx->ctx_lock);
1652 1645
@@ -1654,9 +1647,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
1654 return NULL; 1647 return NULL;
1655 1648
1656 /* TODO: use a hash or array, this sucks. */ 1649 /* TODO: use a hash or array, this sucks. */
1657 list_for_each(pos, &ctx->active_reqs) { 1650 list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
1658 struct kiocb *kiocb = list_kiocb(pos); 1651 if (kiocb->ki_user_iocb == iocb)
1659 if (kiocb->ki_obj.user == iocb)
1660 return kiocb; 1652 return kiocb;
1661 } 1653 }
1662 return NULL; 1654 return NULL;
@@ -1676,7 +1668,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
1676 struct io_event __user *, result) 1668 struct io_event __user *, result)
1677{ 1669{
1678 struct kioctx *ctx; 1670 struct kioctx *ctx;
1679 struct kiocb *kiocb; 1671 struct aio_kiocb *kiocb;
1680 u32 key; 1672 u32 key;
1681 int ret; 1673 int ret;
1682 1674
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 90bc079d9982..fdcb4d69f430 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/vfs.h> 16#include <linux/vfs.h>
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/uio.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include "bfs.h" 20#include "bfs.h"
20 21
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 995986b8e36b..241ef68d2893 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/elf.h> 33#include <linux/elf.h>
34#include <linux/elf-randomize.h>
34#include <linux/utsname.h> 35#include <linux/utsname.h>
35#include <linux/coredump.h> 36#include <linux/coredump.h>
36#include <linux/sched.h> 37#include <linux/sched.h>
@@ -862,6 +863,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
862 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 863 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
863 int elf_prot = 0, elf_flags; 864 int elf_prot = 0, elf_flags;
864 unsigned long k, vaddr; 865 unsigned long k, vaddr;
866 unsigned long total_size = 0;
865 867
866 if (elf_ppnt->p_type != PT_LOAD) 868 if (elf_ppnt->p_type != PT_LOAD)
867 continue; 869 continue;
@@ -909,25 +911,20 @@ static int load_elf_binary(struct linux_binprm *bprm)
909 * default mmap base, as well as whatever program they 911 * default mmap base, as well as whatever program they
910 * might try to exec. This is because the brk will 912 * might try to exec. This is because the brk will
911 * follow the loader, and is not movable. */ 913 * follow the loader, and is not movable. */
912#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE 914 load_bias = ELF_ET_DYN_BASE - vaddr;
913 /* Memory randomization might have been switched off
914 * in runtime via sysctl or explicit setting of
915 * personality flags.
916 * If that is the case, retain the original non-zero
917 * load_bias value in order to establish proper
918 * non-randomized mappings.
919 */
920 if (current->flags & PF_RANDOMIZE) 915 if (current->flags & PF_RANDOMIZE)
921 load_bias = 0; 916 load_bias += arch_mmap_rnd();
922 else 917 load_bias = ELF_PAGESTART(load_bias);
923 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 918 total_size = total_mapping_size(elf_phdata,
924#else 919 loc->elf_ex.e_phnum);
925 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 920 if (!total_size) {
926#endif 921 error = -EINVAL;
922 goto out_free_dentry;
923 }
927 } 924 }
928 925
929 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, 926 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
930 elf_prot, elf_flags, 0); 927 elf_prot, elf_flags, total_size);
931 if (BAD_ADDR(error)) { 928 if (BAD_ADDR(error)) {
932 retval = IS_ERR((void *)error) ? 929 retval = IS_ERR((void *)error) ?
933 PTR_ERR((void*)error) : -EINVAL; 930 PTR_ERR((void*)error) : -EINVAL;
@@ -1053,15 +1050,13 @@ static int load_elf_binary(struct linux_binprm *bprm)
1053 current->mm->end_data = end_data; 1050 current->mm->end_data = end_data;
1054 current->mm->start_stack = bprm->p; 1051 current->mm->start_stack = bprm->p;
1055 1052
1056#ifdef arch_randomize_brk
1057 if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { 1053 if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
1058 current->mm->brk = current->mm->start_brk = 1054 current->mm->brk = current->mm->start_brk =
1059 arch_randomize_brk(current->mm); 1055 arch_randomize_brk(current->mm);
1060#ifdef CONFIG_COMPAT_BRK 1056#ifdef compat_brk_randomized
1061 current->brk_randomized = 1; 1057 current->brk_randomized = 1;
1062#endif 1058#endif
1063 } 1059 }
1064#endif
1065 1060
1066 if (current->personality & MMAP_PAGE_ZERO) { 1061 if (current->personality & MMAP_PAGE_ZERO) {
1067 /* Why this, you ask??? Well SVr4 maps page 0 as read-only, 1062 /* Why this, you ask??? Well SVr4 maps page 0 as read-only,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 975266be67d3..2e522aed6584 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/cleancache.h> 29#include <linux/cleancache.h>
30#include <linux/aio.h>
31#include <asm/uaccess.h> 30#include <asm/uaccess.h>
32#include "internal.h" 31#include "internal.h"
33 32
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84c3b00f3de8..f9c89cae39ee 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3387,6 +3387,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
3387 3387
3388int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3388int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3389 struct btrfs_root *root); 3389 struct btrfs_root *root);
3390int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3391 struct btrfs_root *root);
3390int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); 3392int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
3391int btrfs_free_block_groups(struct btrfs_fs_info *info); 3393int btrfs_free_block_groups(struct btrfs_fs_info *info);
3392int btrfs_read_block_groups(struct btrfs_root *root); 3394int btrfs_read_block_groups(struct btrfs_root *root);
@@ -3909,6 +3911,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
3909 loff_t actual_len, u64 *alloc_hint); 3911 loff_t actual_len, u64 *alloc_hint);
3910int btrfs_inode_check_errors(struct inode *inode); 3912int btrfs_inode_check_errors(struct inode *inode);
3911extern const struct dentry_operations btrfs_dentry_operations; 3913extern const struct dentry_operations btrfs_dentry_operations;
3914#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3915void btrfs_test_inode_set_ops(struct inode *inode);
3916#endif
3912 3917
3913/* ioctl.c */ 3918/* ioctl.c */
3914long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 3919long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f79f38542a73..639f2663ed3f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3921,7 +3921,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3921 } 3921 }
3922 if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) 3922 if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
3923 + sizeof(struct btrfs_chunk)) { 3923 + sizeof(struct btrfs_chunk)) {
3924 printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n", 3924 printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
3925 btrfs_super_sys_array_size(sb), 3925 btrfs_super_sys_array_size(sb),
3926 sizeof(struct btrfs_disk_key) 3926 sizeof(struct btrfs_disk_key)
3927 + sizeof(struct btrfs_chunk)); 3927 + sizeof(struct btrfs_chunk));
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6f080451fcb1..8b353ad02f03 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3325,6 +3325,32 @@ out:
3325 return ret; 3325 return ret;
3326} 3326}
3327 3327
3328int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3329 struct btrfs_root *root)
3330{
3331 struct btrfs_block_group_cache *cache, *tmp;
3332 struct btrfs_transaction *cur_trans = trans->transaction;
3333 struct btrfs_path *path;
3334
3335 if (list_empty(&cur_trans->dirty_bgs) ||
3336 !btrfs_test_opt(root, SPACE_CACHE))
3337 return 0;
3338
3339 path = btrfs_alloc_path();
3340 if (!path)
3341 return -ENOMEM;
3342
3343 /* Could add new block groups, use _safe just in case */
3344 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3345 dirty_list) {
3346 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3347 cache_save_setup(cache, trans, path);
3348 }
3349
3350 btrfs_free_path(path);
3351 return 0;
3352}
3353
3328int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3354int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3329 struct btrfs_root *root) 3355 struct btrfs_root *root)
3330{ 3356{
@@ -5110,7 +5136,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5110 num_bytes = ALIGN(num_bytes, root->sectorsize); 5136 num_bytes = ALIGN(num_bytes, root->sectorsize);
5111 5137
5112 spin_lock(&BTRFS_I(inode)->lock); 5138 spin_lock(&BTRFS_I(inode)->lock);
5113 BTRFS_I(inode)->outstanding_extents++; 5139 nr_extents = (unsigned)div64_u64(num_bytes +
5140 BTRFS_MAX_EXTENT_SIZE - 1,
5141 BTRFS_MAX_EXTENT_SIZE);
5142 BTRFS_I(inode)->outstanding_extents += nr_extents;
5143 nr_extents = 0;
5114 5144
5115 if (BTRFS_I(inode)->outstanding_extents > 5145 if (BTRFS_I(inode)->outstanding_extents >
5116 BTRFS_I(inode)->reserved_extents) 5146 BTRFS_I(inode)->reserved_extents)
@@ -5255,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5255 if (dropped > 0) 5285 if (dropped > 0)
5256 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5286 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5257 5287
5288 if (btrfs_test_is_dummy_root(root))
5289 return;
5290
5258 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5291 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5259 btrfs_ino(inode), to_free, 0); 5292 btrfs_ino(inode), to_free, 0);
5260 if (root->fs_info->quota_enabled) { 5293 if (root->fs_info->quota_enabled) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c7233ff1d533..d688cfe5d496 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4968,6 +4968,12 @@ static int release_extent_buffer(struct extent_buffer *eb)
4968 4968
4969 /* Should be safe to release our pages at this point */ 4969 /* Should be safe to release our pages at this point */
4970 btrfs_release_extent_buffer_page(eb); 4970 btrfs_release_extent_buffer_page(eb);
4971#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4972 if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
4973 __free_extent_buffer(eb);
4974 return 1;
4975 }
4976#endif
4971 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4977 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4972 return 1; 4978 return 1;
4973 } 4979 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 30982bbd31c3..aee18f84e315 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,7 +24,6 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/aio.h>
28#include <linux/falloc.h> 27#include <linux/falloc.h>
29#include <linux/swap.h> 28#include <linux/swap.h>
30#include <linux/writeback.h> 29#include <linux/writeback.h>
@@ -32,6 +31,7 @@
32#include <linux/compat.h> 31#include <linux/compat.h>
33#include <linux/slab.h> 32#include <linux/slab.h>
34#include <linux/btrfs.h> 33#include <linux/btrfs.h>
34#include <linux/uio.h>
35#include "ctree.h" 35#include "ctree.h"
36#include "disk-io.h" 36#include "disk-io.h"
37#include "transaction.h" 37#include "transaction.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index da828cf5e8f8..686331f22b15 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,7 +32,6 @@
32#include <linux/writeback.h> 32#include <linux/writeback.h>
33#include <linux/statfs.h> 33#include <linux/statfs.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/aio.h>
36#include <linux/bit_spinlock.h> 35#include <linux/bit_spinlock.h>
37#include <linux/xattr.h> 36#include <linux/xattr.h>
38#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
@@ -43,6 +42,7 @@
43#include <linux/btrfs.h> 42#include <linux/btrfs.h>
44#include <linux/blkdev.h> 43#include <linux/blkdev.h>
45#include <linux/posix_acl_xattr.h> 44#include <linux/posix_acl_xattr.h>
45#include <linux/uio.h>
46#include "ctree.h" 46#include "ctree.h"
47#include "disk-io.h" 47#include "disk-io.h"
48#include "transaction.h" 48#include "transaction.h"
@@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
108 108
109static int btrfs_dirty_inode(struct inode *inode); 109static int btrfs_dirty_inode(struct inode *inode);
110 110
111#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
112void btrfs_test_inode_set_ops(struct inode *inode)
113{
114 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
115}
116#endif
117
111static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 118static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
112 struct inode *inode, struct inode *dir, 119 struct inode *inode, struct inode *dir,
113 const struct qstr *qstr) 120 const struct qstr *qstr)
@@ -1542,30 +1549,17 @@ static void btrfs_split_extent_hook(struct inode *inode,
1542 u64 new_size; 1549 u64 new_size;
1543 1550
1544 /* 1551 /*
1545 * We need the largest size of the remaining extent to see if we 1552 * See the explanation in btrfs_merge_extent_hook, the same
1546 * need to add a new outstanding extent. Think of the following 1553 * applies here, just in reverse.
1547 * case
1548 *
1549 * [MEAX_EXTENT_SIZEx2 - 4k][4k]
1550 *
1551 * The new_size would just be 4k and we'd think we had enough
1552 * outstanding extents for this if we only took one side of the
1553 * split, same goes for the other direction. We need to see if
1554 * the larger size still is the same amount of extents as the
1555 * original size, because if it is we need to add a new
1556 * outstanding extent. But if we split up and the larger size
1557 * is less than the original then we are good to go since we've
1558 * already accounted for the extra extent in our original
1559 * accounting.
1560 */ 1554 */
1561 new_size = orig->end - split + 1; 1555 new_size = orig->end - split + 1;
1562 if ((split - orig->start) > new_size) 1556 num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1563 new_size = split - orig->start;
1564
1565 num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
1566 BTRFS_MAX_EXTENT_SIZE); 1557 BTRFS_MAX_EXTENT_SIZE);
1567 if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, 1558 new_size = split - orig->start;
1568 BTRFS_MAX_EXTENT_SIZE) < num_extents) 1559 num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1560 BTRFS_MAX_EXTENT_SIZE);
1561 if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
1562 BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1569 return; 1563 return;
1570 } 1564 }
1571 1565
@@ -1591,8 +1585,10 @@ static void btrfs_merge_extent_hook(struct inode *inode,
1591 if (!(other->state & EXTENT_DELALLOC)) 1585 if (!(other->state & EXTENT_DELALLOC))
1592 return; 1586 return;
1593 1587
1594 old_size = other->end - other->start + 1; 1588 if (new->start > other->start)
1595 new_size = old_size + (new->end - new->start + 1); 1589 new_size = new->end - other->start + 1;
1590 else
1591 new_size = other->end - new->start + 1;
1596 1592
1597 /* we're not bigger than the max, unreserve the space and go */ 1593 /* we're not bigger than the max, unreserve the space and go */
1598 if (new_size <= BTRFS_MAX_EXTENT_SIZE) { 1594 if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
@@ -1603,13 +1599,32 @@ static void btrfs_merge_extent_hook(struct inode *inode,
1603 } 1599 }
1604 1600
1605 /* 1601 /*
1606 * If we grew by another max_extent, just return, we want to keep that 1602 * We have to add up either side to figure out how many extents were
1607 * reserved amount. 1603 * accounted for before we merged into one big extent. If the number of
1604 * extents we accounted for is <= the amount we need for the new range
1605 * then we can return, otherwise drop. Think of it like this
1606 *
1607 * [ 4k][MAX_SIZE]
1608 *
1609 * So we've grown the extent by a MAX_SIZE extent, this would mean we
1610 * need 2 outstanding extents, on one side we have 1 and the other side
1611 * we have 1 so they are == and we can return. But in this case
1612 *
1613 * [MAX_SIZE+4k][MAX_SIZE+4k]
1614 *
1615 * Each range on their own accounts for 2 extents, but merged together
1616 * they are only 3 extents worth of accounting, so we need to drop in
1617 * this case.
1608 */ 1618 */
1619 old_size = other->end - other->start + 1;
1609 num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, 1620 num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1610 BTRFS_MAX_EXTENT_SIZE); 1621 BTRFS_MAX_EXTENT_SIZE);
1622 old_size = new->end - new->start + 1;
1623 num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1624 BTRFS_MAX_EXTENT_SIZE);
1625
1611 if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, 1626 if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1612 BTRFS_MAX_EXTENT_SIZE) > num_extents) 1627 BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1613 return; 1628 return;
1614 1629
1615 spin_lock(&BTRFS_I(inode)->lock); 1630 spin_lock(&BTRFS_I(inode)->lock);
@@ -1686,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode,
1686 spin_unlock(&BTRFS_I(inode)->lock); 1701 spin_unlock(&BTRFS_I(inode)->lock);
1687 } 1702 }
1688 1703
1704 /* For sanity tests */
1705 if (btrfs_test_is_dummy_root(root))
1706 return;
1707
1689 __percpu_counter_add(&root->fs_info->delalloc_bytes, len, 1708 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1690 root->fs_info->delalloc_batch); 1709 root->fs_info->delalloc_batch);
1691 spin_lock(&BTRFS_I(inode)->lock); 1710 spin_lock(&BTRFS_I(inode)->lock);
@@ -1741,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1741 root != root->fs_info->tree_root) 1760 root != root->fs_info->tree_root)
1742 btrfs_delalloc_release_metadata(inode, len); 1761 btrfs_delalloc_release_metadata(inode, len);
1743 1762
1763 /* For sanity tests. */
1764 if (btrfs_test_is_dummy_root(root))
1765 return;
1766
1744 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1767 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1745 && do_list && !(state->state & EXTENT_NORESERVE)) 1768 && do_list && !(state->state & EXTENT_NORESERVE))
1746 btrfs_free_reserved_data_space(inode, len); 1769 btrfs_free_reserved_data_space(inode, len);
@@ -7213,7 +7236,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7213 u64 start = iblock << inode->i_blkbits; 7236 u64 start = iblock << inode->i_blkbits;
7214 u64 lockstart, lockend; 7237 u64 lockstart, lockend;
7215 u64 len = bh_result->b_size; 7238 u64 len = bh_result->b_size;
7216 u64 orig_len = len; 7239 u64 *outstanding_extents = NULL;
7217 int unlock_bits = EXTENT_LOCKED; 7240 int unlock_bits = EXTENT_LOCKED;
7218 int ret = 0; 7241 int ret = 0;
7219 7242
@@ -7225,6 +7248,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7225 lockstart = start; 7248 lockstart = start;
7226 lockend = start + len - 1; 7249 lockend = start + len - 1;
7227 7250
7251 if (current->journal_info) {
7252 /*
7253 * Need to pull our outstanding extents and set journal_info to NULL so
7254 * that anything that needs to check if there's a transction doesn't get
7255 * confused.
7256 */
7257 outstanding_extents = current->journal_info;
7258 current->journal_info = NULL;
7259 }
7260
7228 /* 7261 /*
7229 * If this errors out it's because we couldn't invalidate pagecache for 7262 * If this errors out it's because we couldn't invalidate pagecache for
7230 * this range and we need to fallback to buffered. 7263 * this range and we need to fallback to buffered.
@@ -7348,11 +7381,20 @@ unlock:
7348 if (start + len > i_size_read(inode)) 7381 if (start + len > i_size_read(inode))
7349 i_size_write(inode, start + len); 7382 i_size_write(inode, start + len);
7350 7383
7351 if (len < orig_len) { 7384 /*
7385 * If we have an outstanding_extents count still set then we're
7386 * within our reservation, otherwise we need to adjust our inode
7387 * counter appropriately.
7388 */
7389 if (*outstanding_extents) {
7390 (*outstanding_extents)--;
7391 } else {
7352 spin_lock(&BTRFS_I(inode)->lock); 7392 spin_lock(&BTRFS_I(inode)->lock);
7353 BTRFS_I(inode)->outstanding_extents++; 7393 BTRFS_I(inode)->outstanding_extents++;
7354 spin_unlock(&BTRFS_I(inode)->lock); 7394 spin_unlock(&BTRFS_I(inode)->lock);
7355 } 7395 }
7396
7397 current->journal_info = outstanding_extents;
7356 btrfs_free_reserved_data_space(inode, len); 7398 btrfs_free_reserved_data_space(inode, len);
7357 } 7399 }
7358 7400
@@ -7376,6 +7418,8 @@ unlock:
7376unlock_err: 7418unlock_err:
7377 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7419 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7378 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 7420 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
7421 if (outstanding_extents)
7422 current->journal_info = outstanding_extents;
7379 return ret; 7423 return ret;
7380} 7424}
7381 7425
@@ -8075,6 +8119,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
8075{ 8119{
8076 struct file *file = iocb->ki_filp; 8120 struct file *file = iocb->ki_filp;
8077 struct inode *inode = file->f_mapping->host; 8121 struct inode *inode = file->f_mapping->host;
8122 u64 outstanding_extents = 0;
8078 size_t count = 0; 8123 size_t count = 0;
8079 int flags = 0; 8124 int flags = 0;
8080 bool wakeup = true; 8125 bool wakeup = true;
@@ -8112,6 +8157,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
8112 ret = btrfs_delalloc_reserve_space(inode, count); 8157 ret = btrfs_delalloc_reserve_space(inode, count);
8113 if (ret) 8158 if (ret)
8114 goto out; 8159 goto out;
8160 outstanding_extents = div64_u64(count +
8161 BTRFS_MAX_EXTENT_SIZE - 1,
8162 BTRFS_MAX_EXTENT_SIZE);
8163
8164 /*
8165 * We need to know how many extents we reserved so that we can
8166 * do the accounting properly if we go over the number we
8167 * originally calculated. Abuse current->journal_info for this.
8168 */
8169 current->journal_info = &outstanding_extents;
8115 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, 8170 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8116 &BTRFS_I(inode)->runtime_flags)) { 8171 &BTRFS_I(inode)->runtime_flags)) {
8117 inode_dio_done(inode); 8172 inode_dio_done(inode);
@@ -8124,6 +8179,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
8124 iter, offset, btrfs_get_blocks_direct, NULL, 8179 iter, offset, btrfs_get_blocks_direct, NULL,
8125 btrfs_submit_direct, flags); 8180 btrfs_submit_direct, flags);
8126 if (rw & WRITE) { 8181 if (rw & WRITE) {
8182 current->journal_info = NULL;
8127 if (ret < 0 && ret != -EIOCBQUEUED) 8183 if (ret < 0 && ret != -EIOCBQUEUED)
8128 btrfs_delalloc_release_space(inode, count); 8184 btrfs_delalloc_release_space(inode, count);
8129 else if (ret >= 0 && (size_t)ret < count) 8185 else if (ret >= 0 && (size_t)ret < count)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 97159a8e91d4..058c79eecbfb 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1259,7 +1259,7 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1,
1259 if (oper1->seq < oper2->seq) 1259 if (oper1->seq < oper2->seq)
1260 return -1; 1260 return -1;
1261 if (oper1->seq > oper2->seq) 1261 if (oper1->seq > oper2->seq)
1262 return -1; 1262 return 1;
1263 if (oper1->ref_root < oper2->ref_root) 1263 if (oper1->ref_root < oper2->ref_root)
1264 return -1; 1264 return -1;
1265 if (oper1->ref_root > oper2->ref_root) 1265 if (oper1->ref_root > oper2->ref_root)
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index a116b55ce788..054fc0d97131 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -911,6 +911,197 @@ out:
911 return ret; 911 return ret;
912} 912}
913 913
914static int test_extent_accounting(void)
915{
916 struct inode *inode = NULL;
917 struct btrfs_root *root = NULL;
918 int ret = -ENOMEM;
919
920 inode = btrfs_new_test_inode();
921 if (!inode) {
922 test_msg("Couldn't allocate inode\n");
923 return ret;
924 }
925
926 root = btrfs_alloc_dummy_root();
927 if (IS_ERR(root)) {
928 test_msg("Couldn't allocate root\n");
929 goto out;
930 }
931
932 root->fs_info = btrfs_alloc_dummy_fs_info();
933 if (!root->fs_info) {
934 test_msg("Couldn't allocate dummy fs info\n");
935 goto out;
936 }
937
938 BTRFS_I(inode)->root = root;
939 btrfs_test_inode_set_ops(inode);
940
941 /* [BTRFS_MAX_EXTENT_SIZE] */
942 BTRFS_I(inode)->outstanding_extents++;
943 ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
944 NULL);
945 if (ret) {
946 test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
947 goto out;
948 }
949 if (BTRFS_I(inode)->outstanding_extents != 1) {
950 ret = -EINVAL;
951 test_msg("Miscount, wanted 1, got %u\n",
952 BTRFS_I(inode)->outstanding_extents);
953 goto out;
954 }
955
956 /* [BTRFS_MAX_EXTENT_SIZE][4k] */
957 BTRFS_I(inode)->outstanding_extents++;
958 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
959 BTRFS_MAX_EXTENT_SIZE + 4095, NULL);
960 if (ret) {
961 test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
962 goto out;
963 }
964 if (BTRFS_I(inode)->outstanding_extents != 2) {
965 ret = -EINVAL;
966 test_msg("Miscount, wanted 2, got %u\n",
967 BTRFS_I(inode)->outstanding_extents);
968 goto out;
969 }
970
971 /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */
972 ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
973 BTRFS_MAX_EXTENT_SIZE >> 1,
974 (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
975 EXTENT_DELALLOC | EXTENT_DIRTY |
976 EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
977 NULL, GFP_NOFS);
978 if (ret) {
979 test_msg("clear_extent_bit returned %d\n", ret);
980 goto out;
981 }
982 if (BTRFS_I(inode)->outstanding_extents != 2) {
983 ret = -EINVAL;
984 test_msg("Miscount, wanted 2, got %u\n",
985 BTRFS_I(inode)->outstanding_extents);
986 goto out;
987 }
988
989 /* [BTRFS_MAX_EXTENT_SIZE][4K] */
990 BTRFS_I(inode)->outstanding_extents++;
991 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
992 (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
993 NULL);
994 if (ret) {
995 test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
996 goto out;
997 }
998 if (BTRFS_I(inode)->outstanding_extents != 2) {
999 ret = -EINVAL;
1000 test_msg("Miscount, wanted 2, got %u\n",
1001 BTRFS_I(inode)->outstanding_extents);
1002 goto out;
1003 }
1004
1005 /*
1006 * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K]
1007 *
1008 * I'm artificially adding 2 to outstanding_extents because in the
1009 * buffered IO case we'd add things up as we go, but I don't feel like
1010 * doing that here, this isn't the interesting case we want to test.
1011 */
1012 BTRFS_I(inode)->outstanding_extents += 2;
1013 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192,
1014 (BTRFS_MAX_EXTENT_SIZE << 1) + 12287,
1015 NULL);
1016 if (ret) {
1017 test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
1018 goto out;
1019 }
1020 if (BTRFS_I(inode)->outstanding_extents != 4) {
1021 ret = -EINVAL;
1022 test_msg("Miscount, wanted 4, got %u\n",
1023 BTRFS_I(inode)->outstanding_extents);
1024 goto out;
1025 }
1026
1027 /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */
1028 BTRFS_I(inode)->outstanding_extents++;
1029 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
1030 BTRFS_MAX_EXTENT_SIZE+8191, NULL);
1031 if (ret) {
1032 test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
1033 goto out;
1034 }
1035 if (BTRFS_I(inode)->outstanding_extents != 3) {
1036 ret = -EINVAL;
1037 test_msg("Miscount, wanted 3, got %u\n",
1038 BTRFS_I(inode)->outstanding_extents);
1039 goto out;
1040 }
1041
1042 /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
1043 ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
1044 BTRFS_MAX_EXTENT_SIZE+4096,
1045 BTRFS_MAX_EXTENT_SIZE+8191,
1046 EXTENT_DIRTY | EXTENT_DELALLOC |
1047 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
1048 NULL, GFP_NOFS);
1049 if (ret) {
1050 test_msg("clear_extent_bit returned %d\n", ret);
1051 goto out;
1052 }
1053 if (BTRFS_I(inode)->outstanding_extents != 4) {
1054 ret = -EINVAL;
1055 test_msg("Miscount, wanted 4, got %u\n",
1056 BTRFS_I(inode)->outstanding_extents);
1057 goto out;
1058 }
1059
1060 /*
1061 * Refill the hole again just for good measure, because I thought it
1062 * might fail and I'd rather satisfy my paranoia at this point.
1063 */
1064 BTRFS_I(inode)->outstanding_extents++;
1065 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
1066 BTRFS_MAX_EXTENT_SIZE+8191, NULL);
1067 if (ret) {
1068 test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
1069 goto out;
1070 }
1071 if (BTRFS_I(inode)->outstanding_extents != 3) {
1072 ret = -EINVAL;
1073 test_msg("Miscount, wanted 3, got %u\n",
1074 BTRFS_I(inode)->outstanding_extents);
1075 goto out;
1076 }
1077
1078 /* Empty */
1079 ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
1080 EXTENT_DIRTY | EXTENT_DELALLOC |
1081 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
1082 NULL, GFP_NOFS);
1083 if (ret) {
1084 test_msg("clear_extent_bit returned %d\n", ret);
1085 goto out;
1086 }
1087 if (BTRFS_I(inode)->outstanding_extents) {
1088 ret = -EINVAL;
1089 test_msg("Miscount, wanted 0, got %u\n",
1090 BTRFS_I(inode)->outstanding_extents);
1091 goto out;
1092 }
1093 ret = 0;
1094out:
1095 if (ret)
1096 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
1097 EXTENT_DIRTY | EXTENT_DELALLOC |
1098 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
1099 NULL, GFP_NOFS);
1100 iput(inode);
1101 btrfs_free_dummy_root(root);
1102 return ret;
1103}
1104
914int btrfs_test_inodes(void) 1105int btrfs_test_inodes(void)
915{ 1106{
916 int ret; 1107 int ret;
@@ -924,5 +1115,9 @@ int btrfs_test_inodes(void)
924 if (ret) 1115 if (ret)
925 return ret; 1116 return ret;
926 test_msg("Running hole first btrfs_get_extent test\n"); 1117 test_msg("Running hole first btrfs_get_extent test\n");
927 return test_hole_first(); 1118 ret = test_hole_first();
1119 if (ret)
1120 return ret;
1121 test_msg("Running outstanding_extents tests\n");
1122 return test_extent_accounting();
928} 1123}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 88e51aded6bd..8be4278e25e8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1023,17 +1023,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
1023 u64 old_root_bytenr; 1023 u64 old_root_bytenr;
1024 u64 old_root_used; 1024 u64 old_root_used;
1025 struct btrfs_root *tree_root = root->fs_info->tree_root; 1025 struct btrfs_root *tree_root = root->fs_info->tree_root;
1026 bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
1027 1026
1028 old_root_used = btrfs_root_used(&root->root_item); 1027 old_root_used = btrfs_root_used(&root->root_item);
1029 btrfs_write_dirty_block_groups(trans, root);
1030 1028
1031 while (1) { 1029 while (1) {
1032 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 1030 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
1033 if (old_root_bytenr == root->node->start && 1031 if (old_root_bytenr == root->node->start &&
1034 old_root_used == btrfs_root_used(&root->root_item) && 1032 old_root_used == btrfs_root_used(&root->root_item))
1035 (!extent_root ||
1036 list_empty(&trans->transaction->dirty_bgs)))
1037 break; 1033 break;
1038 1034
1039 btrfs_set_root_node(&root->root_item, root->node); 1035 btrfs_set_root_node(&root->root_item, root->node);
@@ -1044,14 +1040,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
1044 return ret; 1040 return ret;
1045 1041
1046 old_root_used = btrfs_root_used(&root->root_item); 1042 old_root_used = btrfs_root_used(&root->root_item);
1047 if (extent_root) {
1048 ret = btrfs_write_dirty_block_groups(trans, root);
1049 if (ret)
1050 return ret;
1051 }
1052 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1053 if (ret)
1054 return ret;
1055 } 1043 }
1056 1044
1057 return 0; 1045 return 0;
@@ -1068,6 +1056,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
1068 struct btrfs_root *root) 1056 struct btrfs_root *root)
1069{ 1057{
1070 struct btrfs_fs_info *fs_info = root->fs_info; 1058 struct btrfs_fs_info *fs_info = root->fs_info;
1059 struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
1071 struct list_head *next; 1060 struct list_head *next;
1072 struct extent_buffer *eb; 1061 struct extent_buffer *eb;
1073 int ret; 1062 int ret;
@@ -1095,11 +1084,15 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
1095 if (ret) 1084 if (ret)
1096 return ret; 1085 return ret;
1097 1086
1087 ret = btrfs_setup_space_cache(trans, root);
1088 if (ret)
1089 return ret;
1090
1098 /* run_qgroups might have added some more refs */ 1091 /* run_qgroups might have added some more refs */
1099 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1092 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1100 if (ret) 1093 if (ret)
1101 return ret; 1094 return ret;
1102 1095again:
1103 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 1096 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
1104 next = fs_info->dirty_cowonly_roots.next; 1097 next = fs_info->dirty_cowonly_roots.next;
1105 list_del_init(next); 1098 list_del_init(next);
@@ -1112,8 +1105,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
1112 ret = update_cowonly_root(trans, root); 1105 ret = update_cowonly_root(trans, root);
1113 if (ret) 1106 if (ret)
1114 return ret; 1107 return ret;
1108 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1109 if (ret)
1110 return ret;
1115 } 1111 }
1116 1112
1113 while (!list_empty(dirty_bgs)) {
1114 ret = btrfs_write_dirty_block_groups(trans, root);
1115 if (ret)
1116 return ret;
1117 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1118 if (ret)
1119 return ret;
1120 }
1121
1122 if (!list_empty(&fs_info->dirty_cowonly_roots))
1123 goto again;
1124
1117 list_add_tail(&fs_info->extent_root->dirty_list, 1125 list_add_tail(&fs_info->extent_root->dirty_list,
1118 &trans->transaction->switch_commits); 1126 &trans->transaction->switch_commits);
1119 btrfs_after_dev_replace_commit(fs_info); 1127 btrfs_after_dev_replace_commit(fs_info);
@@ -1811,6 +1819,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1811 1819
1812 wait_for_commit(root, cur_trans); 1820 wait_for_commit(root, cur_trans);
1813 1821
1822 if (unlikely(cur_trans->aborted))
1823 ret = cur_trans->aborted;
1824
1814 btrfs_put_transaction(cur_trans); 1825 btrfs_put_transaction(cur_trans);
1815 1826
1816 return ret; 1827 return ret;
diff --git a/fs/buffer.c b/fs/buffer.c
index 20805db2c987..c7a5602d01ee 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3243,8 +3243,8 @@ int try_to_free_buffers(struct page *page)
3243 * to synchronise against __set_page_dirty_buffers and prevent the 3243 * to synchronise against __set_page_dirty_buffers and prevent the
3244 * dirty bit from being lost. 3244 * dirty bit from being lost.
3245 */ 3245 */
3246 if (ret) 3246 if (ret && TestClearPageDirty(page))
3247 cancel_dirty_page(page, PAGE_CACHE_SIZE); 3247 account_page_cleaned(page, mapping);
3248 spin_unlock(&mapping->private_lock); 3248 spin_unlock(&mapping->private_lock);
3249out: 3249out:
3250 if (buffers_to_free) { 3250 if (buffers_to_free) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d533075a823d..139f2fea91a0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,7 +7,6 @@
7#include <linux/mount.h> 7#include <linux/mount.h>
8#include <linux/namei.h> 8#include <linux/namei.h>
9#include <linux/writeback.h> 9#include <linux/writeback.h>
10#include <linux/aio.h>
11#include <linux/falloc.h> 10#include <linux/falloc.h>
12 11
13#include "super.h" 12#include "super.h"
@@ -808,7 +807,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
808{ 807{
809 struct file *filp = iocb->ki_filp; 808 struct file *filp = iocb->ki_filp;
810 struct ceph_file_info *fi = filp->private_data; 809 struct ceph_file_info *fi = filp->private_data;
811 size_t len = iocb->ki_nbytes; 810 size_t len = iov_iter_count(to);
812 struct inode *inode = file_inode(filp); 811 struct inode *inode = file_inode(filp);
813 struct ceph_inode_info *ci = ceph_inode(inode); 812 struct ceph_inode_info *ci = ceph_inode(inode);
814 struct page *pinned_page = NULL; 813 struct page *pinned_page = NULL;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 4ac7445e6ec7..aa0dc2573374 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,6 +1,9 @@
1/* 1/*
2 * fs/cifs/cifsencrypt.c 2 * fs/cifs/cifsencrypt.c
3 * 3 *
4 * Encryption and hashing operations relating to NTLM, NTLMv2. See MS-NLMP
5 * for more detailed information
6 *
4 * Copyright (C) International Business Machines Corp., 2005,2013 7 * Copyright (C) International Business Machines Corp., 2005,2013
5 * Author(s): Steve French (sfrench@us.ibm.com) 8 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 9 *
@@ -515,7 +518,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
515 __func__); 518 __func__);
516 return rc; 519 return rc;
517 } 520 }
518 } else if (ses->serverName) { 521 } else {
522 /* We use ses->serverName if no domain name available */
519 len = strlen(ses->serverName); 523 len = strlen(ses->serverName);
520 524
521 server = kmalloc(2 + (len * 2), GFP_KERNEL); 525 server = kmalloc(2 + (len * 2), GFP_KERNEL);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d3aa999ab785..f3bfe08e177b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -773,8 +773,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
773 773
774 length = atomic_dec_return(&tcpSesAllocCount); 774 length = atomic_dec_return(&tcpSesAllocCount);
775 if (length > 0) 775 if (length > 0)
776 mempool_resize(cifs_req_poolp, length + cifs_min_rcv, 776 mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
777 GFP_KERNEL);
778} 777}
779 778
780static int 779static int
@@ -848,8 +847,7 @@ cifs_demultiplex_thread(void *p)
848 847
849 length = atomic_inc_return(&tcpSesAllocCount); 848 length = atomic_inc_return(&tcpSesAllocCount);
850 if (length > 1) 849 if (length > 1)
851 mempool_resize(cifs_req_poolp, length + cifs_min_rcv, 850 mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
852 GFP_KERNEL);
853 851
854 set_freezable(); 852 set_freezable();
855 while (server->tcpStatus != CifsExiting) { 853 while (server->tcpStatus != CifsExiting) {
@@ -1599,6 +1597,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1599 pr_warn("CIFS: username too long\n"); 1597 pr_warn("CIFS: username too long\n");
1600 goto cifs_parse_mount_err; 1598 goto cifs_parse_mount_err;
1601 } 1599 }
1600
1601 kfree(vol->username);
1602 vol->username = kstrdup(string, GFP_KERNEL); 1602 vol->username = kstrdup(string, GFP_KERNEL);
1603 if (!vol->username) 1603 if (!vol->username)
1604 goto cifs_parse_mount_err; 1604 goto cifs_parse_mount_err;
@@ -1700,6 +1700,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1700 goto cifs_parse_mount_err; 1700 goto cifs_parse_mount_err;
1701 } 1701 }
1702 1702
1703 kfree(vol->domainname);
1703 vol->domainname = kstrdup(string, GFP_KERNEL); 1704 vol->domainname = kstrdup(string, GFP_KERNEL);
1704 if (!vol->domainname) { 1705 if (!vol->domainname) {
1705 pr_warn("CIFS: no memory for domainname\n"); 1706 pr_warn("CIFS: no memory for domainname\n");
@@ -1731,6 +1732,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1731 } 1732 }
1732 1733
1733 if (strncasecmp(string, "default", 7) != 0) { 1734 if (strncasecmp(string, "default", 7) != 0) {
1735 kfree(vol->iocharset);
1734 vol->iocharset = kstrdup(string, 1736 vol->iocharset = kstrdup(string,
1735 GFP_KERNEL); 1737 GFP_KERNEL);
1736 if (!vol->iocharset) { 1738 if (!vol->iocharset) {
@@ -2913,8 +2915,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
2913 * calling name ends in null (byte 16) from old smb 2915 * calling name ends in null (byte 16) from old smb
2914 * convention. 2916 * convention.
2915 */ 2917 */
2916 if (server->workstation_RFC1001_name && 2918 if (server->workstation_RFC1001_name[0] != 0)
2917 server->workstation_RFC1001_name[0] != 0)
2918 rfc1002mangle(ses_init_buf->trailer. 2919 rfc1002mangle(ses_init_buf->trailer.
2919 session_req.calling_name, 2920 session_req.calling_name,
2920 server->workstation_RFC1001_name, 2921 server->workstation_RFC1001_name,
@@ -3692,6 +3693,12 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3692#endif /* CIFS_WEAK_PW_HASH */ 3693#endif /* CIFS_WEAK_PW_HASH */
3693 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, 3694 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
3694 bcc_ptr, nls_codepage); 3695 bcc_ptr, nls_codepage);
3696 if (rc) {
3697 cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n",
3698 __func__, rc);
3699 cifs_buf_release(smb_buffer);
3700 return rc;
3701 }
3695 3702
3696 bcc_ptr += CIFS_AUTH_RESP_SIZE; 3703 bcc_ptr += CIFS_AUTH_RESP_SIZE;
3697 if (ses->capabilities & CAP_UNICODE) { 3704 if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a94b3e673182..ca30c391a894 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1823,6 +1823,7 @@ refind_writable:
1823 cifsFileInfo_put(inv_file); 1823 cifsFileInfo_put(inv_file);
1824 spin_lock(&cifs_file_list_lock); 1824 spin_lock(&cifs_file_list_lock);
1825 ++refind; 1825 ++refind;
1826 inv_file = NULL;
1826 goto refind_writable; 1827 goto refind_writable;
1827 } 1828 }
1828 } 1829 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 2d4f37235ed0..3e126d7bb2ea 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -771,6 +771,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
771 cifs_buf_release(srchinf->ntwrk_buf_start); 771 cifs_buf_release(srchinf->ntwrk_buf_start);
772 } 772 }
773 kfree(srchinf); 773 kfree(srchinf);
774 if (rc)
775 goto cgii_exit;
774 } else 776 } else
775 goto cgii_exit; 777 goto cgii_exit;
776 778
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 689f035915cf..22dfdf17d065 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -322,7 +322,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
322 322
323 /* return pointer to beginning of data area, ie offset from SMB start */ 323 /* return pointer to beginning of data area, ie offset from SMB start */
324 if ((*off != 0) && (*len != 0)) 324 if ((*off != 0) && (*len != 0))
325 return hdr->ProtocolId + *off; 325 return (char *)(&hdr->ProtocolId[0]) + *off;
326 else 326 else
327 return NULL; 327 return NULL;
328} 328}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 96b5d40a2ece..eab05e1aa587 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -684,7 +684,8 @@ smb2_clone_range(const unsigned int xid,
684 684
685 /* No need to change MaxChunks since already set to 1 */ 685 /* No need to change MaxChunks since already set to 1 */
686 chunk_sizes_updated = true; 686 chunk_sizes_updated = true;
687 } 687 } else
688 goto cchunk_out;
688 } 689 }
689 690
690cchunk_out: 691cchunk_out:
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 3417340bf89e..65cd7a84c8bc 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1218,7 +1218,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1218 struct smb2_ioctl_req *req; 1218 struct smb2_ioctl_req *req;
1219 struct smb2_ioctl_rsp *rsp; 1219 struct smb2_ioctl_rsp *rsp;
1220 struct TCP_Server_Info *server; 1220 struct TCP_Server_Info *server;
1221 struct cifs_ses *ses = tcon->ses; 1221 struct cifs_ses *ses;
1222 struct kvec iov[2]; 1222 struct kvec iov[2];
1223 int resp_buftype; 1223 int resp_buftype;
1224 int num_iovecs; 1224 int num_iovecs;
@@ -1233,6 +1233,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1233 if (plen) 1233 if (plen)
1234 *plen = 0; 1234 *plen = 0;
1235 1235
1236 if (tcon)
1237 ses = tcon->ses;
1238 else
1239 return -EIO;
1240
1236 if (ses && (ses->server)) 1241 if (ses && (ses->server))
1237 server = ses->server; 1242 server = ses->server;
1238 else 1243 else
@@ -1296,14 +1301,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1296 rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; 1301 rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
1297 1302
1298 if ((rc != 0) && (rc != -EINVAL)) { 1303 if ((rc != 0) && (rc != -EINVAL)) {
1299 if (tcon) 1304 cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
1300 cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
1301 goto ioctl_exit; 1305 goto ioctl_exit;
1302 } else if (rc == -EINVAL) { 1306 } else if (rc == -EINVAL) {
1303 if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) && 1307 if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) &&
1304 (opcode != FSCTL_SRV_COPYCHUNK)) { 1308 (opcode != FSCTL_SRV_COPYCHUNK)) {
1305 if (tcon) 1309 cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
1306 cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
1307 goto ioctl_exit; 1310 goto ioctl_exit;
1308 } 1311 }
1309 } 1312 }
@@ -1629,7 +1632,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1629 1632
1630 rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); 1633 rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
1631 1634
1632 if ((rc != 0) && tcon) 1635 if (rc != 0)
1633 cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); 1636 cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
1634 1637
1635 free_rsp_buf(resp_buftype, iov[0].iov_base); 1638 free_rsp_buf(resp_buftype, iov[0].iov_base);
@@ -2114,7 +2117,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
2114 struct kvec iov[2]; 2117 struct kvec iov[2];
2115 int rc = 0; 2118 int rc = 0;
2116 int len; 2119 int len;
2117 int resp_buftype; 2120 int resp_buftype = CIFS_NO_BUFFER;
2118 unsigned char *bufptr; 2121 unsigned char *bufptr;
2119 struct TCP_Server_Info *server; 2122 struct TCP_Server_Info *server;
2120 struct cifs_ses *ses = tcon->ses; 2123 struct cifs_ses *ses = tcon->ses;
diff --git a/fs/dcache.c b/fs/dcache.c
index c71e3732e53b..d99736a63e3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2690,7 +2690,7 @@ static int __d_unalias(struct inode *inode,
2690 struct dentry *dentry, struct dentry *alias) 2690 struct dentry *dentry, struct dentry *alias)
2691{ 2691{
2692 struct mutex *m1 = NULL, *m2 = NULL; 2692 struct mutex *m1 = NULL, *m2 = NULL;
2693 int ret = -EBUSY; 2693 int ret = -ESTALE;
2694 2694
2695 /* If alias and dentry share a parent, then no extra locks required */ 2695 /* If alias and dentry share a parent, then no extra locks required */
2696 if (alias->d_parent == dentry->d_parent) 2696 if (alias->d_parent == dentry->d_parent)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 96400ab42d13..61e72d44cf94 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -254,6 +254,9 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
254 254
255 pr_debug("debugfs: creating file '%s'\n",name); 255 pr_debug("debugfs: creating file '%s'\n",name);
256 256
257 if (IS_ERR(parent))
258 return parent;
259
257 error = simple_pin_fs(&debug_fs_type, &debugfs_mount, 260 error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
258 &debugfs_mount_count); 261 &debugfs_mount_count);
259 if (error) 262 if (error)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b2e297..6fb00e3f1059 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,7 +37,6 @@
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/atomic.h> 38#include <linux/atomic.h>
39#include <linux/prefetch.h> 39#include <linux/prefetch.h>
40#include <linux/aio.h>
41 40
42/* 41/*
43 * How many user pages to map in one call to get_user_pages(). This determines 42 * How many user pages to map in one call to get_user_pages(). This determines
@@ -265,7 +264,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
265 ret = err; 264 ret = err;
266 } 265 }
267 266
268 aio_complete(dio->iocb, ret, 0); 267 dio->iocb->ki_complete(dio->iocb, ret, 0);
269 } 268 }
270 269
271 kmem_cache_free(dio_cache, dio); 270 kmem_cache_free(dio_cache, dio);
@@ -1056,7 +1055,7 @@ static inline int drop_refcount(struct dio *dio)
1056 * operation. AIO can if it was a broken operation described above or 1055 * operation. AIO can if it was a broken operation described above or
1057 * in fact if all the bios race to complete before we get here. In 1056 * in fact if all the bios race to complete before we get here. In
1058 * that case dio_complete() translates the EIOCBQUEUED into the proper 1057 * that case dio_complete() translates the EIOCBQUEUED into the proper
1059 * return code that the caller will hand to aio_complete(). 1058 * return code that the caller will hand to ->complete().
1060 * 1059 *
1061 * This is managed by the bio_lock instead of being an atomic_t so that 1060 * This is managed by the bio_lock instead of being an atomic_t so that
1062 * completion paths can drop their ref and use the remaining count to 1061 * completion paths can drop their ref and use the remaining count to
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index fd39bad6f1bd..79675089443d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/compat.h> 32#include <linux/compat.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/aio.h>
35#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
36 35
37/** 36/**
@@ -52,12 +51,6 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
52 struct file *file = iocb->ki_filp; 51 struct file *file = iocb->ki_filp;
53 52
54 rc = generic_file_read_iter(iocb, to); 53 rc = generic_file_read_iter(iocb, to);
55 /*
56 * Even though this is a async interface, we need to wait
57 * for IO to finish to update atime
58 */
59 if (-EIOCBQUEUED == rc)
60 rc = wait_on_sync_kiocb(iocb);
61 if (rc >= 0) { 54 if (rc >= 0) {
62 path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); 55 path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
63 touch_atime(path); 56 touch_atime(path);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6434bc000125..df9d6afbc5d5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,7 +31,7 @@
31#include <linux/mpage.h> 31#include <linux/mpage.h>
32#include <linux/fiemap.h> 32#include <linux/fiemap.h>
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/aio.h> 34#include <linux/uio.h>
35#include "ext2.h" 35#include "ext2.h"
36#include "acl.h" 36#include "acl.h"
37#include "xattr.h" 37#include "xattr.h"
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2c6ccc49ba27..db07ffbe7c85 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,7 +27,7 @@
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/mpage.h> 28#include <linux/mpage.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/aio.h> 30#include <linux/uio.h>
31#include "ext3.h" 31#include "ext3.h"
32#include "xattr.h" 32#include "xattr.h"
33#include "acl.h" 33#include "acl.h"
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 33a09da16c9c..598abbbe6786 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,9 +23,9 @@
23#include <linux/jbd2.h> 23#include <linux/jbd2.h>
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/path.h> 25#include <linux/path.h>
26#include <linux/aio.h>
27#include <linux/quotaops.h> 26#include <linux/quotaops.h>
28#include <linux/pagevec.h> 27#include <linux/pagevec.h>
28#include <linux/uio.h>
29#include "ext4.h" 29#include "ext4.h"
30#include "ext4_jbd2.h" 30#include "ext4_jbd2.h"
31#include "xattr.h" 31#include "xattr.h"
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 45fe924f82bc..740c7871c117 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,9 +20,9 @@
20 * (sct@redhat.com), 1993, 1998 20 * (sct@redhat.com), 1993, 1998
21 */ 21 */
22 22
23#include <linux/aio.h>
24#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
25#include "truncate.h" 24#include "truncate.h"
25#include <linux/uio.h>
26 26
27#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
28 28
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cb9a212b86f..a3f451370bef 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,7 +37,6 @@
37#include <linux/printk.h> 37#include <linux/printk.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/aio.h>
41#include <linux/bitops.h> 40#include <linux/bitops.h>
42 41
43#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b24a2541a9ba..464984261e69 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -18,7 +18,6 @@
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/mpage.h> 19#include <linux/mpage.h>
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/aio.h>
22#include <linux/uio.h> 21#include <linux/uio.h>
23#include <linux/bio.h> 22#include <linux/bio.h>
24#include <linux/workqueue.h> 23#include <linux/workqueue.h>
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 985ed023a750..497f8515d205 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -12,12 +12,12 @@
12#include <linux/f2fs_fs.h> 12#include <linux/f2fs_fs.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/mpage.h> 14#include <linux/mpage.h>
15#include <linux/aio.h>
16#include <linux/writeback.h> 15#include <linux/writeback.h>
17#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
18#include <linux/blkdev.h> 17#include <linux/blkdev.h>
19#include <linux/bio.h> 18#include <linux/bio.h>
20#include <linux/prefetch.h> 19#include <linux/prefetch.h>
20#include <linux/uio.h>
21 21
22#include "f2fs.h" 22#include "f2fs.h"
23#include "node.h" 23#include "node.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 497c7c5263c7..8521207de229 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -19,7 +19,6 @@
19#include <linux/mpage.h> 19#include <linux/mpage.h>
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/mount.h> 21#include <linux/mount.h>
22#include <linux/aio.h>
23#include <linux/vfs.h> 22#include <linux/vfs.h>
24#include <linux/parser.h> 23#include <linux/parser.h>
25#include <linux/uio.h> 24#include <linux/uio.h>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e907052eeadb..32a8bbd7a9ad 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,6 +53,18 @@ struct wb_writeback_work {
53 struct completion *done; /* set if the caller waits */ 53 struct completion *done; /* set if the caller waits */
54}; 54};
55 55
56/*
57 * If an inode is constantly having its pages dirtied, but then the
58 * updates stop dirtytime_expire_interval seconds in the past, it's
59 * possible for the worst case time between when an inode has its
60 * timestamps updated and when they finally get written out to be two
61 * dirtytime_expire_intervals. We set the default to 12 hours (in
62 * seconds), which means most of the time inodes will have their
63 * timestamps written to disk after 12 hours, but in the worst case a
 64 * few inodes might not have their timestamps updated for 24 hours.
65 */
66unsigned int dirtytime_expire_interval = 12 * 60 * 60;
67
56/** 68/**
57 * writeback_in_progress - determine whether there is writeback in progress 69 * writeback_in_progress - determine whether there is writeback in progress
58 * @bdi: the device's backing_dev_info structure. 70 * @bdi: the device's backing_dev_info structure.
@@ -275,8 +287,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
275 287
276 if ((flags & EXPIRE_DIRTY_ATIME) == 0) 288 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
277 older_than_this = work->older_than_this; 289 older_than_this = work->older_than_this;
278 else if ((work->reason == WB_REASON_SYNC) == 0) { 290 else if (!work->for_sync) {
279 expire_time = jiffies - (HZ * 86400); 291 expire_time = jiffies - (dirtytime_expire_interval * HZ);
280 older_than_this = &expire_time; 292 older_than_this = &expire_time;
281 } 293 }
282 while (!list_empty(delaying_queue)) { 294 while (!list_empty(delaying_queue)) {
@@ -458,6 +470,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
458 */ 470 */
459 redirty_tail(inode, wb); 471 redirty_tail(inode, wb);
460 } else if (inode->i_state & I_DIRTY_TIME) { 472 } else if (inode->i_state & I_DIRTY_TIME) {
473 inode->dirtied_when = jiffies;
461 list_move(&inode->i_wb_list, &wb->b_dirty_time); 474 list_move(&inode->i_wb_list, &wb->b_dirty_time);
462 } else { 475 } else {
463 /* The inode is clean. Remove from writeback lists. */ 476 /* The inode is clean. Remove from writeback lists. */
@@ -505,12 +518,17 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
505 spin_lock(&inode->i_lock); 518 spin_lock(&inode->i_lock);
506 519
507 dirty = inode->i_state & I_DIRTY; 520 dirty = inode->i_state & I_DIRTY;
508 if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) && 521 if (inode->i_state & I_DIRTY_TIME) {
509 (inode->i_state & I_DIRTY_TIME)) || 522 if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
510 (inode->i_state & I_DIRTY_TIME_EXPIRED)) { 523 unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
511 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; 524 unlikely(time_after(jiffies,
512 trace_writeback_lazytime(inode); 525 (inode->dirtied_time_when +
513 } 526 dirtytime_expire_interval * HZ)))) {
527 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
528 trace_writeback_lazytime(inode);
529 }
530 } else
531 inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
514 inode->i_state &= ~dirty; 532 inode->i_state &= ~dirty;
515 533
516 /* 534 /*
@@ -1131,6 +1149,56 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
1131 rcu_read_unlock(); 1149 rcu_read_unlock();
1132} 1150}
1133 1151
1152/*
 1153 * Wake up bdi's periodically to make sure dirtytime inodes get
1154 * written back periodically. We deliberately do *not* check the
1155 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
1156 * kernel to be constantly waking up once there are any dirtytime
1157 * inodes on the system. So instead we define a separate delayed work
1158 * function which gets called much more rarely. (By default, only
1159 * once every 12 hours.)
1160 *
1161 * If there is any other write activity going on in the file system,
1162 * this function won't be necessary. But if the only thing that has
1163 * happened on the file system is a dirtytime inode caused by an atime
1164 * update, we need this infrastructure below to make sure that inode
1165 * eventually gets pushed out to disk.
1166 */
1167static void wakeup_dirtytime_writeback(struct work_struct *w);
1168static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
1169
1170static void wakeup_dirtytime_writeback(struct work_struct *w)
1171{
1172 struct backing_dev_info *bdi;
1173
1174 rcu_read_lock();
1175 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1176 if (list_empty(&bdi->wb.b_dirty_time))
1177 continue;
1178 bdi_wakeup_thread(bdi);
1179 }
1180 rcu_read_unlock();
1181 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
1182}
1183
1184static int __init start_dirtytime_writeback(void)
1185{
1186 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
1187 return 0;
1188}
1189__initcall(start_dirtytime_writeback);
1190
1191int dirtytime_interval_handler(struct ctl_table *table, int write,
1192 void __user *buffer, size_t *lenp, loff_t *ppos)
1193{
1194 int ret;
1195
1196 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
1197 if (ret == 0 && write)
1198 mod_delayed_work(system_wq, &dirtytime_work, 0);
1199 return ret;
1200}
1201
1134static noinline void block_dump___mark_inode_dirty(struct inode *inode) 1202static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1135{ 1203{
1136 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 1204 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1269,8 +1337,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1269 } 1337 }
1270 1338
1271 inode->dirtied_when = jiffies; 1339 inode->dirtied_when = jiffies;
1272 list_move(&inode->i_wb_list, dirtytime ? 1340 if (dirtytime)
1273 &bdi->wb.b_dirty_time : &bdi->wb.b_dirty); 1341 inode->dirtied_time_when = jiffies;
1342 if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
1343 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1344 else
1345 list_move(&inode->i_wb_list,
1346 &bdi->wb.b_dirty_time);
1274 spin_unlock(&bdi->wb.list_lock); 1347 spin_unlock(&bdi->wb.list_lock);
1275 trace_writeback_dirty_inode_enqueue(inode); 1348 trace_writeback_dirty_inode_enqueue(inode);
1276 1349
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 28d0c7abba1c..b3fa05032234 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -38,7 +38,6 @@
38#include <linux/device.h> 38#include <linux/device.h>
39#include <linux/file.h> 39#include <linux/file.h>
40#include <linux/fs.h> 40#include <linux/fs.h>
41#include <linux/aio.h>
42#include <linux/kdev_t.h> 41#include <linux/kdev_t.h>
43#include <linux/kthread.h> 42#include <linux/kthread.h>
44#include <linux/list.h> 43#include <linux/list.h>
@@ -48,6 +47,7 @@
48#include <linux/slab.h> 47#include <linux/slab.h>
49#include <linux/stat.h> 48#include <linux/stat.h>
50#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/uio.h>
51 51
52#include "fuse_i.h" 52#include "fuse_i.h"
53 53
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ed19a7d622fa..95a2797eef66 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,7 +19,6 @@
19#include <linux/pipe_fs_i.h> 19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h> 20#include <linux/swap.h>
21#include <linux/splice.h> 21#include <linux/splice.h>
22#include <linux/aio.h>
23 22
24MODULE_ALIAS_MISCDEV(FUSE_MINOR); 23MODULE_ALIAS_MISCDEV(FUSE_MINOR);
25MODULE_ALIAS("devname:fuse"); 24MODULE_ALIAS("devname:fuse");
@@ -890,8 +889,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
890 889
891 newpage = buf->page; 890 newpage = buf->page;
892 891
893 if (WARN_ON(!PageUptodate(newpage))) 892 if (!PageUptodate(newpage))
894 return -EIO; 893 SetPageUptodate(newpage);
895 894
896 ClearPageMappedToDisk(newpage); 895 ClearPageMappedToDisk(newpage);
897 896
@@ -1353,6 +1352,17 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
1353 return err; 1352 return err;
1354} 1353}
1355 1354
1355static int fuse_dev_open(struct inode *inode, struct file *file)
1356{
1357 /*
1358 * The fuse device's file's private_data is used to hold
1359 * the fuse_conn(ection) when it is mounted, and is used to
1360 * keep track of whether the file has been mounted already.
1361 */
1362 file->private_data = NULL;
1363 return 0;
1364}
1365
1356static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, 1366static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1357 unsigned long nr_segs, loff_t pos) 1367 unsigned long nr_segs, loff_t pos)
1358{ 1368{
@@ -1797,6 +1807,9 @@ copy_finish:
1797static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, 1807static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1798 unsigned int size, struct fuse_copy_state *cs) 1808 unsigned int size, struct fuse_copy_state *cs)
1799{ 1809{
1810 /* Don't try to move pages (yet) */
1811 cs->move_pages = 0;
1812
1800 switch (code) { 1813 switch (code) {
1801 case FUSE_NOTIFY_POLL: 1814 case FUSE_NOTIFY_POLL:
1802 return fuse_notify_poll(fc, size, cs); 1815 return fuse_notify_poll(fc, size, cs);
@@ -2217,6 +2230,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
2217 2230
2218const struct file_operations fuse_dev_operations = { 2231const struct file_operations fuse_dev_operations = {
2219 .owner = THIS_MODULE, 2232 .owner = THIS_MODULE,
2233 .open = fuse_dev_open,
2220 .llseek = no_llseek, 2234 .llseek = no_llseek,
2221 .read = do_sync_read, 2235 .read = do_sync_read,
2222 .aio_read = fuse_dev_read, 2236 .aio_read = fuse_dev_read,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c01ec3bdcfd8..ff102cbf16ea 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -15,8 +15,8 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/compat.h> 16#include <linux/compat.h>
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/aio.h>
19#include <linux/falloc.h> 18#include <linux/falloc.h>
19#include <linux/uio.h>
20 20
21static const struct file_operations fuse_direct_io_file_operations; 21static const struct file_operations fuse_direct_io_file_operations;
22 22
@@ -528,6 +528,17 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
528 } 528 }
529} 529}
530 530
531static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
532{
533 if (io->err)
534 return io->err;
535
536 if (io->bytes >= 0 && io->write)
537 return -EIO;
538
539 return io->bytes < 0 ? io->size : io->bytes;
540}
541
531/** 542/**
532 * In case of short read, the caller sets 'pos' to the position of 543 * In case of short read, the caller sets 'pos' to the position of
533 * actual end of fuse request in IO request. Otherwise, if bytes_requested 544 * actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -546,6 +557,7 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
546 */ 557 */
547static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) 558static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
548{ 559{
560 bool is_sync = is_sync_kiocb(io->iocb);
549 int left; 561 int left;
550 562
551 spin_lock(&io->lock); 563 spin_lock(&io->lock);
@@ -555,30 +567,24 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
555 io->bytes = pos; 567 io->bytes = pos;
556 568
557 left = --io->reqs; 569 left = --io->reqs;
570 if (!left && is_sync)
571 complete(io->done);
558 spin_unlock(&io->lock); 572 spin_unlock(&io->lock);
559 573
560 if (!left) { 574 if (!left && !is_sync) {
561 long res; 575 ssize_t res = fuse_get_res_by_io(io);
562 576
563 if (io->err) 577 if (res >= 0) {
564 res = io->err; 578 struct inode *inode = file_inode(io->iocb->ki_filp);
565 else if (io->bytes >= 0 && io->write) 579 struct fuse_conn *fc = get_fuse_conn(inode);
566 res = -EIO; 580 struct fuse_inode *fi = get_fuse_inode(inode);
567 else {
568 res = io->bytes < 0 ? io->size : io->bytes;
569 581
570 if (!is_sync_kiocb(io->iocb)) { 582 spin_lock(&fc->lock);
571 struct inode *inode = file_inode(io->iocb->ki_filp); 583 fi->attr_version = ++fc->attr_version;
572 struct fuse_conn *fc = get_fuse_conn(inode); 584 spin_unlock(&fc->lock);
573 struct fuse_inode *fi = get_fuse_inode(inode);
574
575 spin_lock(&fc->lock);
576 fi->attr_version = ++fc->attr_version;
577 spin_unlock(&fc->lock);
578 }
579 } 585 }
580 586
581 aio_complete(io->iocb, res, 0); 587 io->iocb->ki_complete(io->iocb, res, 0);
582 kfree(io); 588 kfree(io);
583 } 589 }
584} 590}
@@ -2801,6 +2807,7 @@ static ssize_t
2801fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, 2807fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
2802 loff_t offset) 2808 loff_t offset)
2803{ 2809{
2810 DECLARE_COMPLETION_ONSTACK(wait);
2804 ssize_t ret = 0; 2811 ssize_t ret = 0;
2805 struct file *file = iocb->ki_filp; 2812 struct file *file = iocb->ki_filp;
2806 struct fuse_file *ff = file->private_data; 2813 struct fuse_file *ff = file->private_data;
@@ -2852,6 +2859,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
2852 if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) 2859 if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
2853 io->async = false; 2860 io->async = false;
2854 2861
2862 if (io->async && is_sync_kiocb(iocb))
2863 io->done = &wait;
2864
2855 if (rw == WRITE) 2865 if (rw == WRITE)
2856 ret = __fuse_direct_write(io, iter, &pos); 2866 ret = __fuse_direct_write(io, iter, &pos);
2857 else 2867 else
@@ -2864,11 +2874,12 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
2864 if (!is_sync_kiocb(iocb)) 2874 if (!is_sync_kiocb(iocb))
2865 return -EIOCBQUEUED; 2875 return -EIOCBQUEUED;
2866 2876
2867 ret = wait_on_sync_kiocb(iocb); 2877 wait_for_completion(&wait);
2868 } else { 2878 ret = fuse_get_res_by_io(io);
2869 kfree(io);
2870 } 2879 }
2871 2880
2881 kfree(io);
2882
2872 if (rw == WRITE) { 2883 if (rw == WRITE) {
2873 if (ret > 0) 2884 if (ret > 0)
2874 fuse_write_update_size(inode, pos); 2885 fuse_write_update_size(inode, pos);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1cdfb07c1376..7354dc142a50 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -263,6 +263,7 @@ struct fuse_io_priv {
263 int err; 263 int err;
264 struct kiocb *iocb; 264 struct kiocb *iocb;
265 struct file *file; 265 struct file *file;
266 struct completion *done;
266}; 267};
267 268
268/** 269/**
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7b3143064af1..1be3b061c05c 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -110,11 +110,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
110 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); 110 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
111 if (error) 111 if (error)
112 goto out; 112 goto out;
113 113 set_cached_acl(inode, type, acl);
114 if (acl)
115 set_cached_acl(inode, type, acl);
116 else
117 forget_cached_acl(inode, type);
118out: 114out:
119 kfree(data); 115 kfree(data);
120 return error; 116 return error;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4ad4f94edebe..a6e6990aea39 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,7 +20,7 @@
20#include <linux/swap.h> 20#include <linux/swap.h>
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/aio.h> 23#include <linux/uio.h>
24#include <trace/events/writeback.h> 24#include <trace/events/writeback.h>
25 25
26#include "gfs2.h" 26#include "gfs2.h"
@@ -671,12 +671,12 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
671 671
672 if (alloc_required) { 672 if (alloc_required) {
673 struct gfs2_alloc_parms ap = { .aflags = 0, }; 673 struct gfs2_alloc_parms ap = { .aflags = 0, };
674 error = gfs2_quota_lock_check(ip); 674 requested = data_blocks + ind_blocks;
675 ap.target = requested;
676 error = gfs2_quota_lock_check(ip, &ap);
675 if (error) 677 if (error)
676 goto out_unlock; 678 goto out_unlock;
677 679
678 requested = data_blocks + ind_blocks;
679 ap.target = requested;
680 error = gfs2_inplace_reserve(ip, &ap); 680 error = gfs2_inplace_reserve(ip, &ap);
681 if (error) 681 if (error)
682 goto out_qunlock; 682 goto out_qunlock;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index f0b945ab853e..61296ecbd0e2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1224,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size)
1224 1224
1225 if (gfs2_is_stuffed(ip) && 1225 if (gfs2_is_stuffed(ip) &&
1226 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { 1226 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1227 error = gfs2_quota_lock_check(ip); 1227 error = gfs2_quota_lock_check(ip, &ap);
1228 if (error) 1228 if (error)
1229 return error; 1229 return error;
1230 1230
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 3e32bb8e2d7e..8ec43ab5babf 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,7 +25,6 @@
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <linux/dlm.h> 26#include <linux/dlm.h>
27#include <linux/dlm_plock.h> 27#include <linux/dlm_plock.h>
28#include <linux/aio.h>
29#include <linux/delay.h> 28#include <linux/delay.h>
30 29
31#include "gfs2.h" 30#include "gfs2.h"
@@ -429,11 +428,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
429 if (ret) 428 if (ret)
430 goto out_unlock; 429 goto out_unlock;
431 430
432 ret = gfs2_quota_lock_check(ip);
433 if (ret)
434 goto out_unlock;
435 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); 431 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
436 ap.target = data_blocks + ind_blocks; 432 ap.target = data_blocks + ind_blocks;
433 ret = gfs2_quota_lock_check(ip, &ap);
434 if (ret)
435 goto out_unlock;
437 ret = gfs2_inplace_reserve(ip, &ap); 436 ret = gfs2_inplace_reserve(ip, &ap);
438 if (ret) 437 if (ret)
439 goto out_quota_unlock; 438 goto out_quota_unlock;
@@ -765,22 +764,30 @@ out:
765 brelse(dibh); 764 brelse(dibh);
766 return error; 765 return error;
767} 766}
768 767/**
769static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, 768 * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of
770 unsigned int *data_blocks, unsigned int *ind_blocks) 769 * blocks, determine how many bytes can be written.
770 * @ip: The inode in question.
771 * @len: Max cap of bytes. What we return in *len must be <= this.
772 * @data_blocks: Compute and return the number of data blocks needed
773 * @ind_blocks: Compute and return the number of indirect blocks needed
774 * @max_blocks: The total blocks available to work with.
775 *
776 * Returns: void, but @len, @data_blocks and @ind_blocks are filled in.
777 */
778static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len,
779 unsigned int *data_blocks, unsigned int *ind_blocks,
780 unsigned int max_blocks)
771{ 781{
782 loff_t max = *len;
772 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 783 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
773 unsigned int max_blocks = ip->i_rgd->rd_free_clone;
774 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); 784 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
775 785
776 for (tmp = max_data; tmp > sdp->sd_diptrs;) { 786 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
777 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); 787 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
778 max_data -= tmp; 788 max_data -= tmp;
779 } 789 }
780 /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, 790
781 so it might end up with fewer data blocks */
782 if (max_data <= *data_blocks)
783 return;
784 *data_blocks = max_data; 791 *data_blocks = max_data;
785 *ind_blocks = max_blocks - max_data; 792 *ind_blocks = max_blocks - max_data;
786 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; 793 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
@@ -797,7 +804,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
797 struct gfs2_inode *ip = GFS2_I(inode); 804 struct gfs2_inode *ip = GFS2_I(inode);
798 struct gfs2_alloc_parms ap = { .aflags = 0, }; 805 struct gfs2_alloc_parms ap = { .aflags = 0, };
799 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 806 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
800 loff_t bytes, max_bytes; 807 loff_t bytes, max_bytes, max_blks = UINT_MAX;
801 int error; 808 int error;
802 const loff_t pos = offset; 809 const loff_t pos = offset;
803 const loff_t count = len; 810 const loff_t count = len;
@@ -819,6 +826,9 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
819 826
820 gfs2_size_hint(file, offset, len); 827 gfs2_size_hint(file, offset, len);
821 828
829 gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks);
830 ap.min_target = data_blocks + ind_blocks;
831
822 while (len > 0) { 832 while (len > 0) {
823 if (len < bytes) 833 if (len < bytes)
824 bytes = len; 834 bytes = len;
@@ -827,27 +837,41 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
827 offset += bytes; 837 offset += bytes;
828 continue; 838 continue;
829 } 839 }
830 error = gfs2_quota_lock_check(ip); 840
841 /* We need to determine how many bytes we can actually
842 * fallocate without exceeding quota or going over the
843 * end of the fs. We start off optimistically by assuming
844 * we can write max_bytes */
845 max_bytes = (len > max_chunk_size) ? max_chunk_size : len;
846
847 /* Since max_bytes is most likely a theoretical max, we
848 * calculate a more realistic 'bytes' to serve as a good
849 * starting point for the number of bytes we may be able
850 * to write */
851 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
852 ap.target = data_blocks + ind_blocks;
853
854 error = gfs2_quota_lock_check(ip, &ap);
831 if (error) 855 if (error)
832 return error; 856 return error;
833retry: 857 /* ap.allowed tells us how many blocks quota will allow
834 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); 858 * us to write. Check if this reduces max_blks */
859 if (ap.allowed && ap.allowed < max_blks)
860 max_blks = ap.allowed;
835 861
836 ap.target = data_blocks + ind_blocks;
837 error = gfs2_inplace_reserve(ip, &ap); 862 error = gfs2_inplace_reserve(ip, &ap);
838 if (error) { 863 if (error)
839 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
840 bytes >>= 1;
841 bytes &= bsize_mask;
842 if (bytes == 0)
843 bytes = sdp->sd_sb.sb_bsize;
844 goto retry;
845 }
846 goto out_qunlock; 864 goto out_qunlock;
847 } 865
848 max_bytes = bytes; 866 /* check if the selected rgrp limits our max_blks further */
849 calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len, 867 if (ap.allowed && ap.allowed < max_blks)
850 &max_bytes, &data_blocks, &ind_blocks); 868 max_blks = ap.allowed;
869
870 /* Almost done. Calculate bytes that can be written using
871 * max_blks. We also recompute max_bytes, data_blocks and
872 * ind_blocks */
873 calc_max_reserv(ip, &max_bytes, &data_blocks,
874 &ind_blocks, max_blks);
851 875
852 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + 876 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
853 RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks); 877 RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks);
@@ -931,6 +955,22 @@ out_uninit:
931 return ret; 955 return ret;
932} 956}
933 957
958static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
959 struct file *out, loff_t *ppos,
960 size_t len, unsigned int flags)
961{
962 int error;
963 struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
964
965 error = gfs2_rs_alloc(ip);
966 if (error)
967 return (ssize_t)error;
968
969 gfs2_size_hint(out, *ppos, len);
970
971 return iter_file_splice_write(pipe, out, ppos, len, flags);
972}
973
934#ifdef CONFIG_GFS2_FS_LOCKING_DLM 974#ifdef CONFIG_GFS2_FS_LOCKING_DLM
935 975
936/** 976/**
@@ -1077,7 +1117,7 @@ const struct file_operations gfs2_file_fops = {
1077 .lock = gfs2_lock, 1117 .lock = gfs2_lock,
1078 .flock = gfs2_flock, 1118 .flock = gfs2_flock,
1079 .splice_read = generic_file_splice_read, 1119 .splice_read = generic_file_splice_read,
1080 .splice_write = iter_file_splice_write, 1120 .splice_write = gfs2_file_splice_write,
1081 .setlease = simple_nosetlease, 1121 .setlease = simple_nosetlease,
1082 .fallocate = gfs2_fallocate, 1122 .fallocate = gfs2_fallocate,
1083}; 1123};
@@ -1107,7 +1147,7 @@ const struct file_operations gfs2_file_fops_nolock = {
1107 .release = gfs2_release, 1147 .release = gfs2_release,
1108 .fsync = gfs2_fsync, 1148 .fsync = gfs2_fsync,
1109 .splice_read = generic_file_splice_read, 1149 .splice_read = generic_file_splice_read,
1110 .splice_write = iter_file_splice_write, 1150 .splice_write = gfs2_file_splice_write,
1111 .setlease = generic_setlease, 1151 .setlease = generic_setlease,
1112 .fallocate = gfs2_fallocate, 1152 .fallocate = gfs2_fallocate,
1113}; 1153};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f42dffba056a..0fa8062f85a7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2047,34 +2047,41 @@ static const struct file_operations gfs2_sbstats_fops = {
2047 2047
2048int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) 2048int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
2049{ 2049{
2050 sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); 2050 struct dentry *dent;
2051 if (!sdp->debugfs_dir) 2051
2052 return -ENOMEM; 2052 dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
2053 sdp->debugfs_dentry_glocks = debugfs_create_file("glocks", 2053 if (IS_ERR_OR_NULL(dent))
2054 S_IFREG | S_IRUGO, 2054 goto fail;
2055 sdp->debugfs_dir, sdp, 2055 sdp->debugfs_dir = dent;
2056 &gfs2_glocks_fops); 2056
2057 if (!sdp->debugfs_dentry_glocks) 2057 dent = debugfs_create_file("glocks",
2058 S_IFREG | S_IRUGO,
2059 sdp->debugfs_dir, sdp,
2060 &gfs2_glocks_fops);
2061 if (IS_ERR_OR_NULL(dent))
2058 goto fail; 2062 goto fail;
2063 sdp->debugfs_dentry_glocks = dent;
2059 2064
2060 sdp->debugfs_dentry_glstats = debugfs_create_file("glstats", 2065 dent = debugfs_create_file("glstats",
2061 S_IFREG | S_IRUGO, 2066 S_IFREG | S_IRUGO,
2062 sdp->debugfs_dir, sdp, 2067 sdp->debugfs_dir, sdp,
2063 &gfs2_glstats_fops); 2068 &gfs2_glstats_fops);
2064 if (!sdp->debugfs_dentry_glstats) 2069 if (IS_ERR_OR_NULL(dent))
2065 goto fail; 2070 goto fail;
2071 sdp->debugfs_dentry_glstats = dent;
2066 2072
2067 sdp->debugfs_dentry_sbstats = debugfs_create_file("sbstats", 2073 dent = debugfs_create_file("sbstats",
2068 S_IFREG | S_IRUGO, 2074 S_IFREG | S_IRUGO,
2069 sdp->debugfs_dir, sdp, 2075 sdp->debugfs_dir, sdp,
2070 &gfs2_sbstats_fops); 2076 &gfs2_sbstats_fops);
2071 if (!sdp->debugfs_dentry_sbstats) 2077 if (IS_ERR_OR_NULL(dent))
2072 goto fail; 2078 goto fail;
2079 sdp->debugfs_dentry_sbstats = dent;
2073 2080
2074 return 0; 2081 return 0;
2075fail: 2082fail:
2076 gfs2_delete_debugfs_file(sdp); 2083 gfs2_delete_debugfs_file(sdp);
2077 return -ENOMEM; 2084 return dent ? PTR_ERR(dent) : -ENOMEM;
2078} 2085}
2079 2086
2080void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) 2087void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
@@ -2100,6 +2107,8 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
2100int gfs2_register_debugfs(void) 2107int gfs2_register_debugfs(void)
2101{ 2108{
2102 gfs2_root = debugfs_create_dir("gfs2", NULL); 2109 gfs2_root = debugfs_create_dir("gfs2", NULL);
2110 if (IS_ERR(gfs2_root))
2111 return PTR_ERR(gfs2_root);
2103 return gfs2_root ? 0 : -ENOMEM; 2112 return gfs2_root ? 0 : -ENOMEM;
2104} 2113}
2105 2114
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 7a2dbbc0d634..58b75abf6ab2 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -301,8 +301,10 @@ struct gfs2_blkreserv {
301 * to the allocation code. 301 * to the allocation code.
302 */ 302 */
303struct gfs2_alloc_parms { 303struct gfs2_alloc_parms {
304 u32 target; 304 u64 target;
305 u32 min_target;
305 u32 aflags; 306 u32 aflags;
307 u64 allowed;
306}; 308};
307 309
308enum { 310enum {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 73c72253faac..08bc84d7e768 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -382,7 +382,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
382 struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; 382 struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
383 int error; 383 int error;
384 384
385 error = gfs2_quota_lock_check(ip); 385 error = gfs2_quota_lock_check(ip, &ap);
386 if (error) 386 if (error)
387 goto out; 387 goto out;
388 388
@@ -525,7 +525,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
525 int error; 525 int error;
526 526
527 if (da->nr_blocks) { 527 if (da->nr_blocks) {
528 error = gfs2_quota_lock_check(dip); 528 error = gfs2_quota_lock_check(dip, &ap);
529 if (error) 529 if (error)
530 goto fail_quota_locks; 530 goto fail_quota_locks;
531 531
@@ -953,7 +953,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
953 953
954 if (da.nr_blocks) { 954 if (da.nr_blocks) {
955 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; 955 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
956 error = gfs2_quota_lock_check(dip); 956 error = gfs2_quota_lock_check(dip, &ap);
957 if (error) 957 if (error)
958 goto out_gunlock; 958 goto out_gunlock;
959 959
@@ -1470,7 +1470,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1470 1470
1471 if (da.nr_blocks) { 1471 if (da.nr_blocks) {
1472 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; 1472 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
1473 error = gfs2_quota_lock_check(ndip); 1473 error = gfs2_quota_lock_check(ndip, &ap);
1474 if (error) 1474 if (error)
1475 goto out_gunlock; 1475 goto out_gunlock;
1476 1476
@@ -1669,6 +1669,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1669 kuid_t ouid, nuid; 1669 kuid_t ouid, nuid;
1670 kgid_t ogid, ngid; 1670 kgid_t ogid, ngid;
1671 int error; 1671 int error;
1672 struct gfs2_alloc_parms ap;
1672 1673
1673 ouid = inode->i_uid; 1674 ouid = inode->i_uid;
1674 ogid = inode->i_gid; 1675 ogid = inode->i_gid;
@@ -1696,9 +1697,11 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1696 if (error) 1697 if (error)
1697 goto out; 1698 goto out;
1698 1699
1700 ap.target = gfs2_get_inode_blocks(&ip->i_inode);
1701
1699 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || 1702 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
1700 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { 1703 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
1701 error = gfs2_quota_check(ip, nuid, ngid); 1704 error = gfs2_quota_check(ip, nuid, ngid, &ap);
1702 if (error) 1705 if (error)
1703 goto out_gunlock_q; 1706 goto out_gunlock_q;
1704 } 1707 }
@@ -1713,9 +1716,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1713 1716
1714 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || 1717 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
1715 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { 1718 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
1716 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); 1719 gfs2_quota_change(ip, -ap.target, ouid, ogid);
1717 gfs2_quota_change(ip, -blocks, ouid, ogid); 1720 gfs2_quota_change(ip, ap.target, nuid, ngid);
1718 gfs2_quota_change(ip, blocks, nuid, ngid);
1719 } 1721 }
1720 1722
1721out_end_trans: 1723out_end_trans:
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3aa17d4d1cfc..5c27e48aa76f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -923,6 +923,9 @@ restart:
923 if (error) 923 if (error)
924 return error; 924 return error;
925 925
926 if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
927 force_refresh = FORCE;
928
926 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; 929 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
927 930
928 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { 931 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
@@ -974,11 +977,8 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
974 sizeof(struct gfs2_quota_data *), sort_qd, NULL); 977 sizeof(struct gfs2_quota_data *), sort_qd, NULL);
975 978
976 for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { 979 for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
977 int force = NO_FORCE;
978 qd = ip->i_res->rs_qa_qd[x]; 980 qd = ip->i_res->rs_qa_qd[x];
979 if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) 981 error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]);
980 force = FORCE;
981 error = do_glock(qd, force, &ip->i_res->rs_qa_qd_ghs[x]);
982 if (error) 982 if (error)
983 break; 983 break;
984 } 984 }
@@ -1094,14 +1094,33 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
1094 return 0; 1094 return 0;
1095} 1095}
1096 1096
1097int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) 1097/**
1098 * gfs2_quota_check - check if allocating new blocks will exceed quota
1099 * @ip: The inode for which this check is being performed
1100 * @uid: The uid to check against
1101 * @gid: The gid to check against
1102 * @ap: The allocation parameters. ap->target contains the requested
1103 * blocks. ap->min_target, if set, contains the minimum blks
1104 * requested.
1105 *
1106 * Returns: 0 on success.
1107 * min_req = ap->min_target ? ap->min_target : ap->target;
1108 * quota must allow atleast min_req blks for success and
1109 * ap->allowed is set to the number of blocks allowed
1110 *
1111 * -EDQUOT otherwise, quota violation. ap->allowed is set to number
1112 * of blocks available.
1113 */
1114int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
1115 struct gfs2_alloc_parms *ap)
1098{ 1116{
1099 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1117 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1100 struct gfs2_quota_data *qd; 1118 struct gfs2_quota_data *qd;
1101 s64 value; 1119 s64 value, warn, limit;
1102 unsigned int x; 1120 unsigned int x;
1103 int error = 0; 1121 int error = 0;
1104 1122
1123 ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
1105 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) 1124 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
1106 return 0; 1125 return 0;
1107 1126
@@ -1115,30 +1134,37 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
1115 qid_eq(qd->qd_id, make_kqid_gid(gid)))) 1134 qid_eq(qd->qd_id, make_kqid_gid(gid))))
1116 continue; 1135 continue;
1117 1136
1137 warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn);
1138 limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit);
1118 value = (s64)be64_to_cpu(qd->qd_qb.qb_value); 1139 value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
1119 spin_lock(&qd_lock); 1140 spin_lock(&qd_lock);
1120 value += qd->qd_change; 1141 value += qd->qd_change;
1121 spin_unlock(&qd_lock); 1142 spin_unlock(&qd_lock);
1122 1143
1123 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { 1144 if (limit > 0 && (limit - value) < ap->allowed)
1124 print_message(qd, "exceeded"); 1145 ap->allowed = limit - value;
1125 quota_send_warning(qd->qd_id, 1146 /* If we can't meet the target */
1126 sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); 1147 if (limit && limit < (value + (s64)ap->target)) {
1127 1148 /* If no min_target specified or we don't meet
1128 error = -EDQUOT; 1149 * min_target, return -EDQUOT */
1129 break; 1150 if (!ap->min_target || ap->min_target > ap->allowed) {
1130 } else if (be64_to_cpu(qd->qd_qb.qb_warn) && 1151 print_message(qd, "exceeded");
1131 (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value && 1152 quota_send_warning(qd->qd_id,
1153 sdp->sd_vfs->s_dev,
1154 QUOTA_NL_BHARDWARN);
1155 error = -EDQUOT;
1156 break;
1157 }
1158 } else if (warn && warn < value &&
1132 time_after_eq(jiffies, qd->qd_last_warn + 1159 time_after_eq(jiffies, qd->qd_last_warn +
1133 gfs2_tune_get(sdp, 1160 gfs2_tune_get(sdp, gt_quota_warn_period)
1134 gt_quota_warn_period) * HZ)) { 1161 * HZ)) {
1135 quota_send_warning(qd->qd_id, 1162 quota_send_warning(qd->qd_id,
1136 sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); 1163 sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
1137 error = print_message(qd, "warning"); 1164 error = print_message(qd, "warning");
1138 qd->qd_last_warn = jiffies; 1165 qd->qd_last_warn = jiffies;
1139 } 1166 }
1140 } 1167 }
1141
1142 return error; 1168 return error;
1143} 1169}
1144 1170
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 55d506eb3c4a..ad04b3acae2b 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -24,7 +24,8 @@ extern void gfs2_quota_unhold(struct gfs2_inode *ip);
24extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); 24extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
25extern void gfs2_quota_unlock(struct gfs2_inode *ip); 25extern void gfs2_quota_unlock(struct gfs2_inode *ip);
26 26
27extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); 27extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
28 struct gfs2_alloc_parms *ap);
28extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 29extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
29 kuid_t uid, kgid_t gid); 30 kuid_t uid, kgid_t gid);
30 31
@@ -37,7 +38,8 @@ extern int gfs2_quotad(void *data);
37 38
38extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); 39extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
39 40
40static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) 41static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
42 struct gfs2_alloc_parms *ap)
41{ 43{
42 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 44 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
43 int ret; 45 int ret;
@@ -48,7 +50,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
48 return ret; 50 return ret;
49 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) 51 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
50 return 0; 52 return 0;
51 ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); 53 ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
52 if (ret) 54 if (ret)
53 gfs2_quota_unlock(ip); 55 gfs2_quota_unlock(ip);
54 return ret; 56 return ret;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9150207f365c..6af2396a317c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1946,10 +1946,18 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1946 * @ip: the inode to reserve space for 1946 * @ip: the inode to reserve space for
1947 * @ap: the allocation parameters 1947 * @ap: the allocation parameters
1948 * 1948 *
1949 * Returns: errno 1949 * We try our best to find an rgrp that has at least ap->target blocks
1950 * available. After a couple of passes (loops == 2), the prospects of finding
1951 * such an rgrp diminish. At this stage, we return the first rgrp that has
1952 * atleast ap->min_target blocks available. Either way, we set ap->allowed to
1953 * the number of blocks available in the chosen rgrp.
1954 *
1955 * Returns: 0 on success,
1956 * -ENOMEM if a suitable rgrp can't be found
1957 * errno otherwise
1950 */ 1958 */
1951 1959
1952int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap) 1960int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
1953{ 1961{
1954 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1962 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1955 struct gfs2_rgrpd *begin = NULL; 1963 struct gfs2_rgrpd *begin = NULL;
@@ -2012,7 +2020,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
2012 /* Skip unuseable resource groups */ 2020 /* Skip unuseable resource groups */
2013 if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | 2021 if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2014 GFS2_RDF_ERROR)) || 2022 GFS2_RDF_ERROR)) ||
2015 (ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) 2023 (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2016 goto skip_rgrp; 2024 goto skip_rgrp;
2017 2025
2018 if (sdp->sd_args.ar_rgrplvb) 2026 if (sdp->sd_args.ar_rgrplvb)
@@ -2027,11 +2035,13 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
2027 goto check_rgrp; 2035 goto check_rgrp;
2028 2036
2029 /* If rgrp has enough free space, use it */ 2037 /* If rgrp has enough free space, use it */
2030 if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) { 2038 if (rs->rs_rbm.rgd->rd_free_clone >= ap->target ||
2039 (loops == 2 && ap->min_target &&
2040 rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) {
2031 ip->i_rgd = rs->rs_rbm.rgd; 2041 ip->i_rgd = rs->rs_rbm.rgd;
2042 ap->allowed = ip->i_rgd->rd_free_clone;
2032 return 0; 2043 return 0;
2033 } 2044 }
2034
2035check_rgrp: 2045check_rgrp:
2036 /* Check for unlinked inodes which can be reclaimed */ 2046 /* Check for unlinked inodes which can be reclaimed */
2037 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) 2047 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b104f4af3afd..68972ecfbb01 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -41,7 +41,8 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
41extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 41extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
42 42
43#define GFS2_AF_ORLOV 1 43#define GFS2_AF_ORLOV 1
44extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap); 44extern int gfs2_inplace_reserve(struct gfs2_inode *ip,
45 struct gfs2_alloc_parms *ap);
45extern void gfs2_inplace_release(struct gfs2_inode *ip); 46extern void gfs2_inplace_release(struct gfs2_inode *ip);
46 47
47extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, 48extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 0b81f783f787..fd260ce8869a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -732,7 +732,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
732 if (error) 732 if (error)
733 return error; 733 return error;
734 734
735 error = gfs2_quota_lock_check(ip); 735 error = gfs2_quota_lock_check(ip, &ap);
736 if (error) 736 if (error)
737 return error; 737 return error;
738 738
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index d0929bc81782..98d4ea45bb70 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,7 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/mpage.h> 15#include <linux/mpage.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/aio.h> 17#include <linux/uio.h>
18 18
19#include "hfs_fs.h" 19#include "hfs_fs.h"
20#include "btree.h" 20#include "btree.h"
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 6e560d56094b..754fdf8c6356 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -131,13 +131,16 @@ skip:
131 hfs_bnode_write(node, entry, data_off + key_len, entry_len); 131 hfs_bnode_write(node, entry, data_off + key_len, entry_len);
132 hfs_bnode_dump(node); 132 hfs_bnode_dump(node);
133 133
134 if (new_node) { 134 /*
135 /* update parent key if we inserted a key 135 * update parent key if we inserted a key
136 * at the start of the first node 136 * at the start of the node and it is not the new node
137 */ 137 */
138 if (!rec && new_node != node) 138 if (!rec && new_node != node) {
139 hfs_brec_update_parent(fd); 139 hfs_bnode_read_key(node, fd->search_key, data_off + size);
140 hfs_brec_update_parent(fd);
141 }
140 142
143 if (new_node) {
141 hfs_bnode_put(fd->bnode); 144 hfs_bnode_put(fd->bnode);
142 if (!new_node->parent) { 145 if (!new_node->parent) {
143 hfs_btree_inc_height(tree); 146 hfs_btree_inc_height(tree);
@@ -168,9 +171,6 @@ skip:
168 goto again; 171 goto again;
169 } 172 }
170 173
171 if (!rec)
172 hfs_brec_update_parent(fd);
173
174 return 0; 174 return 0;
175} 175}
176 176
@@ -370,6 +370,8 @@ again:
370 if (IS_ERR(parent)) 370 if (IS_ERR(parent))
371 return PTR_ERR(parent); 371 return PTR_ERR(parent);
372 __hfs_brec_find(parent, fd, hfs_find_rec_by_key); 372 __hfs_brec_find(parent, fd, hfs_find_rec_by_key);
373 if (fd->record < 0)
374 return -ENOENT;
373 hfs_bnode_dump(parent); 375 hfs_bnode_dump(parent);
374 rec = fd->record; 376 rec = fd->record;
375 377
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 0cf786f2d046..f541196d4ee9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,7 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/mpage.h> 15#include <linux/mpage.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/aio.h> 17#include <linux/uio.h>
18 18
19#include "hfsplus_fs.h" 19#include "hfsplus_fs.h"
20#include "hfsplus_raw.h" 20#include "hfsplus_raw.h"
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c274aca8e8dc..db76cec3ce21 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -319,7 +319,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
319 319
320static void truncate_huge_page(struct page *page) 320static void truncate_huge_page(struct page *page)
321{ 321{
322 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 322 ClearPageDirty(page);
323 ClearPageUptodate(page); 323 ClearPageUptodate(page);
324 delete_from_page_cache(page); 324 delete_from_page_cache(page);
325} 325}
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index d72817ac51f6..762c7a3cf43d 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -195,7 +195,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
195 /* unchecked xdatum is chained with c->xattr_unchecked */ 195 /* unchecked xdatum is chained with c->xattr_unchecked */
196 list_del_init(&xd->xindex); 196 list_del_init(&xd->xindex);
197 197
198 dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n", 198 dbg_xattr("success on verifying xdatum (xid=%u, version=%u)\n",
199 xd->xid, xd->version); 199 xd->xid, xd->version);
200 200
201 return 0; 201 return 0;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index bd3df1ca3c9b..3197aed10614 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,8 +22,8 @@
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/quotaops.h> 24#include <linux/quotaops.h>
25#include <linux/uio.h>
25#include <linux/writeback.h> 26#include <linux/writeback.h>
26#include <linux/aio.h>
27#include "jfs_incore.h" 27#include "jfs_incore.h"
28#include "jfs_inode.h" 28#include "jfs_inode.h"
29#include "jfs_filsys.h" 29#include "jfs_filsys.h"
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 5d30c56ae075..4cd9798f4948 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -102,7 +102,7 @@ void jfs_error(struct super_block *sb, const char *fmt, ...)
102 vaf.fmt = fmt; 102 vaf.fmt = fmt;
103 vaf.va = &args; 103 vaf.va = &args;
104 104
105 pr_err("ERROR: (device %s): %pf: %pV\n", 105 pr_err("ERROR: (device %s): %ps: %pV\n",
106 sb->s_id, __builtin_return_address(0), &vaf); 106 sb->s_id, __builtin_return_address(0), &vaf);
107 107
108 va_end(args); 108 va_end(args);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index b684e8a132e6..2bacb9988566 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -207,6 +207,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
207 goto out_free; 207 goto out_free;
208 } 208 }
209 209
210 of->event = atomic_read(&of->kn->attr.open->event);
210 ops = kernfs_ops(of->kn); 211 ops = kernfs_ops(of->kn);
211 if (ops->read) 212 if (ops->read)
212 len = ops->read(of, buf, len, *ppos); 213 len = ops->read(of, buf, len, *ppos);
diff --git a/fs/locks.c b/fs/locks.c
index 528fedfda15e..40bc384728c0 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1388,9 +1388,8 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
1388int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) 1388int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1389{ 1389{
1390 int error = 0; 1390 int error = 0;
1391 struct file_lock *new_fl;
1392 struct file_lock_context *ctx = inode->i_flctx; 1391 struct file_lock_context *ctx = inode->i_flctx;
1393 struct file_lock *fl; 1392 struct file_lock *new_fl, *fl, *tmp;
1394 unsigned long break_time; 1393 unsigned long break_time;
1395 int want_write = (mode & O_ACCMODE) != O_RDONLY; 1394 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1396 LIST_HEAD(dispose); 1395 LIST_HEAD(dispose);
@@ -1420,7 +1419,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1420 break_time++; /* so that 0 means no break time */ 1419 break_time++; /* so that 0 means no break time */
1421 } 1420 }
1422 1421
1423 list_for_each_entry(fl, &ctx->flc_lease, fl_list) { 1422 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
1424 if (!leases_conflict(fl, new_fl)) 1423 if (!leases_conflict(fl, new_fl))
1425 continue; 1424 continue;
1426 if (want_write) { 1425 if (want_write) {
diff --git a/fs/namei.c b/fs/namei.c
index c83145af4bfc..76fb76a0818b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -119,15 +119,14 @@
119 * PATH_MAX includes the nul terminator --RR. 119 * PATH_MAX includes the nul terminator --RR.
120 */ 120 */
121 121
122#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) 122#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
123 123
124struct filename * 124struct filename *
125getname_flags(const char __user *filename, int flags, int *empty) 125getname_flags(const char __user *filename, int flags, int *empty)
126{ 126{
127 struct filename *result, *err; 127 struct filename *result;
128 int len;
129 long max;
130 char *kname; 128 char *kname;
129 int len;
131 130
132 result = audit_reusename(filename); 131 result = audit_reusename(filename);
133 if (result) 132 if (result)
@@ -136,22 +135,18 @@ getname_flags(const char __user *filename, int flags, int *empty)
136 result = __getname(); 135 result = __getname();
137 if (unlikely(!result)) 136 if (unlikely(!result))
138 return ERR_PTR(-ENOMEM); 137 return ERR_PTR(-ENOMEM);
139 result->refcnt = 1;
140 138
141 /* 139 /*
142 * First, try to embed the struct filename inside the names_cache 140 * First, try to embed the struct filename inside the names_cache
143 * allocation 141 * allocation
144 */ 142 */
145 kname = (char *)result + sizeof(*result); 143 kname = (char *)result->iname;
146 result->name = kname; 144 result->name = kname;
147 result->separate = false;
148 max = EMBEDDED_NAME_MAX;
149 145
150recopy: 146 len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
151 len = strncpy_from_user(kname, filename, max);
152 if (unlikely(len < 0)) { 147 if (unlikely(len < 0)) {
153 err = ERR_PTR(len); 148 __putname(result);
154 goto error; 149 return ERR_PTR(len);
155 } 150 }
156 151
157 /* 152 /*
@@ -160,43 +155,49 @@ recopy:
160 * names_cache allocation for the pathname, and re-do the copy from 155 * names_cache allocation for the pathname, and re-do the copy from
161 * userland. 156 * userland.
162 */ 157 */
163 if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) { 158 if (unlikely(len == EMBEDDED_NAME_MAX)) {
159 const size_t size = offsetof(struct filename, iname[1]);
164 kname = (char *)result; 160 kname = (char *)result;
165 161
166 result = kzalloc(sizeof(*result), GFP_KERNEL); 162 /*
167 if (!result) { 163 * size is chosen that way we to guarantee that
168 err = ERR_PTR(-ENOMEM); 164 * result->iname[0] is within the same object and that
169 result = (struct filename *)kname; 165 * kname can't be equal to result->iname, no matter what.
170 goto error; 166 */
167 result = kzalloc(size, GFP_KERNEL);
168 if (unlikely(!result)) {
169 __putname(kname);
170 return ERR_PTR(-ENOMEM);
171 } 171 }
172 result->name = kname; 172 result->name = kname;
173 result->separate = true; 173 len = strncpy_from_user(kname, filename, PATH_MAX);
174 result->refcnt = 1; 174 if (unlikely(len < 0)) {
175 max = PATH_MAX; 175 __putname(kname);
176 goto recopy; 176 kfree(result);
177 return ERR_PTR(len);
178 }
179 if (unlikely(len == PATH_MAX)) {
180 __putname(kname);
181 kfree(result);
182 return ERR_PTR(-ENAMETOOLONG);
183 }
177 } 184 }
178 185
186 result->refcnt = 1;
179 /* The empty path is special. */ 187 /* The empty path is special. */
180 if (unlikely(!len)) { 188 if (unlikely(!len)) {
181 if (empty) 189 if (empty)
182 *empty = 1; 190 *empty = 1;
183 err = ERR_PTR(-ENOENT); 191 if (!(flags & LOOKUP_EMPTY)) {
184 if (!(flags & LOOKUP_EMPTY)) 192 putname(result);
185 goto error; 193 return ERR_PTR(-ENOENT);
194 }
186 } 195 }
187 196
188 err = ERR_PTR(-ENAMETOOLONG);
189 if (unlikely(len >= PATH_MAX))
190 goto error;
191
192 result->uptr = filename; 197 result->uptr = filename;
193 result->aname = NULL; 198 result->aname = NULL;
194 audit_getname(result); 199 audit_getname(result);
195 return result; 200 return result;
196
197error:
198 putname(result);
199 return err;
200} 201}
201 202
202struct filename * 203struct filename *
@@ -216,8 +217,7 @@ getname_kernel(const char * filename)
216 return ERR_PTR(-ENOMEM); 217 return ERR_PTR(-ENOMEM);
217 218
218 if (len <= EMBEDDED_NAME_MAX) { 219 if (len <= EMBEDDED_NAME_MAX) {
219 result->name = (char *)(result) + sizeof(*result); 220 result->name = (char *)result->iname;
220 result->separate = false;
221 } else if (len <= PATH_MAX) { 221 } else if (len <= PATH_MAX) {
222 struct filename *tmp; 222 struct filename *tmp;
223 223
@@ -227,7 +227,6 @@ getname_kernel(const char * filename)
227 return ERR_PTR(-ENOMEM); 227 return ERR_PTR(-ENOMEM);
228 } 228 }
229 tmp->name = (char *)result; 229 tmp->name = (char *)result;
230 tmp->separate = true;
231 result = tmp; 230 result = tmp;
232 } else { 231 } else {
233 __putname(result); 232 __putname(result);
@@ -249,7 +248,7 @@ void putname(struct filename *name)
249 if (--name->refcnt > 0) 248 if (--name->refcnt > 0)
250 return; 249 return;
251 250
252 if (name->separate) { 251 if (name->name != name->iname) {
253 __putname(name->name); 252 __putname(name->name);
254 kfree(name); 253 kfree(name);
255 } else 254 } else
@@ -1851,10 +1850,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1851 return err; 1850 return err;
1852} 1851}
1853 1852
1854static int path_init(int dfd, const char *name, unsigned int flags, 1853static int path_init(int dfd, const struct filename *name, unsigned int flags,
1855 struct nameidata *nd) 1854 struct nameidata *nd)
1856{ 1855{
1857 int retval = 0; 1856 int retval = 0;
1857 const char *s = name->name;
1858 1858
1859 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1859 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1860 nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; 1860 nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
@@ -1863,7 +1863,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1863 if (flags & LOOKUP_ROOT) { 1863 if (flags & LOOKUP_ROOT) {
1864 struct dentry *root = nd->root.dentry; 1864 struct dentry *root = nd->root.dentry;
1865 struct inode *inode = root->d_inode; 1865 struct inode *inode = root->d_inode;
1866 if (*name) { 1866 if (*s) {
1867 if (!d_can_lookup(root)) 1867 if (!d_can_lookup(root))
1868 return -ENOTDIR; 1868 return -ENOTDIR;
1869 retval = inode_permission(inode, MAY_EXEC); 1869 retval = inode_permission(inode, MAY_EXEC);
@@ -1885,7 +1885,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1885 nd->root.mnt = NULL; 1885 nd->root.mnt = NULL;
1886 1886
1887 nd->m_seq = read_seqbegin(&mount_lock); 1887 nd->m_seq = read_seqbegin(&mount_lock);
1888 if (*name=='/') { 1888 if (*s == '/') {
1889 if (flags & LOOKUP_RCU) { 1889 if (flags & LOOKUP_RCU) {
1890 rcu_read_lock(); 1890 rcu_read_lock();
1891 nd->seq = set_root_rcu(nd); 1891 nd->seq = set_root_rcu(nd);
@@ -1919,7 +1919,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1919 1919
1920 dentry = f.file->f_path.dentry; 1920 dentry = f.file->f_path.dentry;
1921 1921
1922 if (*name) { 1922 if (*s) {
1923 if (!d_can_lookup(dentry)) { 1923 if (!d_can_lookup(dentry)) {
1924 fdput(f); 1924 fdput(f);
1925 return -ENOTDIR; 1925 return -ENOTDIR;
@@ -1949,7 +1949,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1949 return -ECHILD; 1949 return -ECHILD;
1950done: 1950done:
1951 current->total_link_count = 0; 1951 current->total_link_count = 0;
1952 return link_path_walk(name, nd); 1952 return link_path_walk(s, nd);
1953} 1953}
1954 1954
1955static void path_cleanup(struct nameidata *nd) 1955static void path_cleanup(struct nameidata *nd)
@@ -1972,7 +1972,7 @@ static inline int lookup_last(struct nameidata *nd, struct path *path)
1972} 1972}
1973 1973
1974/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ 1974/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1975static int path_lookupat(int dfd, const char *name, 1975static int path_lookupat(int dfd, const struct filename *name,
1976 unsigned int flags, struct nameidata *nd) 1976 unsigned int flags, struct nameidata *nd)
1977{ 1977{
1978 struct path path; 1978 struct path path;
@@ -2027,31 +2027,17 @@ static int path_lookupat(int dfd, const char *name,
2027static int filename_lookup(int dfd, struct filename *name, 2027static int filename_lookup(int dfd, struct filename *name,
2028 unsigned int flags, struct nameidata *nd) 2028 unsigned int flags, struct nameidata *nd)
2029{ 2029{
2030 int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd); 2030 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
2031 if (unlikely(retval == -ECHILD)) 2031 if (unlikely(retval == -ECHILD))
2032 retval = path_lookupat(dfd, name->name, flags, nd); 2032 retval = path_lookupat(dfd, name, flags, nd);
2033 if (unlikely(retval == -ESTALE)) 2033 if (unlikely(retval == -ESTALE))
2034 retval = path_lookupat(dfd, name->name, 2034 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
2035 flags | LOOKUP_REVAL, nd);
2036 2035
2037 if (likely(!retval)) 2036 if (likely(!retval))
2038 audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT); 2037 audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
2039 return retval; 2038 return retval;
2040} 2039}
2041 2040
2042static int do_path_lookup(int dfd, const char *name,
2043 unsigned int flags, struct nameidata *nd)
2044{
2045 struct filename *filename = getname_kernel(name);
2046 int retval = PTR_ERR(filename);
2047
2048 if (!IS_ERR(filename)) {
2049 retval = filename_lookup(dfd, filename, flags, nd);
2050 putname(filename);
2051 }
2052 return retval;
2053}
2054
2055/* does lookup, returns the object with parent locked */ 2041/* does lookup, returns the object with parent locked */
2056struct dentry *kern_path_locked(const char *name, struct path *path) 2042struct dentry *kern_path_locked(const char *name, struct path *path)
2057{ 2043{
@@ -2089,9 +2075,15 @@ out:
2089int kern_path(const char *name, unsigned int flags, struct path *path) 2075int kern_path(const char *name, unsigned int flags, struct path *path)
2090{ 2076{
2091 struct nameidata nd; 2077 struct nameidata nd;
2092 int res = do_path_lookup(AT_FDCWD, name, flags, &nd); 2078 struct filename *filename = getname_kernel(name);
2093 if (!res) 2079 int res = PTR_ERR(filename);
2094 *path = nd.path; 2080
2081 if (!IS_ERR(filename)) {
2082 res = filename_lookup(AT_FDCWD, filename, flags, &nd);
2083 putname(filename);
2084 if (!res)
2085 *path = nd.path;
2086 }
2095 return res; 2087 return res;
2096} 2088}
2097EXPORT_SYMBOL(kern_path); 2089EXPORT_SYMBOL(kern_path);
@@ -2108,15 +2100,22 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2108 const char *name, unsigned int flags, 2100 const char *name, unsigned int flags,
2109 struct path *path) 2101 struct path *path)
2110{ 2102{
2111 struct nameidata nd; 2103 struct filename *filename = getname_kernel(name);
2112 int err; 2104 int err = PTR_ERR(filename);
2113 nd.root.dentry = dentry; 2105
2114 nd.root.mnt = mnt;
2115 BUG_ON(flags & LOOKUP_PARENT); 2106 BUG_ON(flags & LOOKUP_PARENT);
2116 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ 2107
2117 err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd); 2108 /* the first argument of filename_lookup() is ignored with LOOKUP_ROOT */
2118 if (!err) 2109 if (!IS_ERR(filename)) {
2119 *path = nd.path; 2110 struct nameidata nd;
2111 nd.root.dentry = dentry;
2112 nd.root.mnt = mnt;
2113 err = filename_lookup(AT_FDCWD, filename,
2114 flags | LOOKUP_ROOT, &nd);
2115 if (!err)
2116 *path = nd.path;
2117 putname(filename);
2118 }
2120 return err; 2119 return err;
2121} 2120}
2122EXPORT_SYMBOL(vfs_path_lookup); 2121EXPORT_SYMBOL(vfs_path_lookup);
@@ -2138,9 +2137,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
2138 * @len: maximum length @len should be interpreted to 2137 * @len: maximum length @len should be interpreted to
2139 * 2138 *
2140 * Note that this routine is purely a helper for filesystem usage and should 2139 * Note that this routine is purely a helper for filesystem usage and should
2141 * not be called by generic code. Also note that by using this function the 2140 * not be called by generic code.
2142 * nameidata argument is passed to the filesystem methods and a filesystem
2143 * using this helper needs to be prepared for that.
2144 */ 2141 */
2145struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 2142struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2146{ 2143{
@@ -2341,7 +2338,8 @@ out:
2341 * Returns 0 and "path" will be valid on success; Returns error otherwise. 2338 * Returns 0 and "path" will be valid on success; Returns error otherwise.
2342 */ 2339 */
2343static int 2340static int
2344path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) 2341path_mountpoint(int dfd, const struct filename *name, struct path *path,
2342 unsigned int flags)
2345{ 2343{
2346 struct nameidata nd; 2344 struct nameidata nd;
2347 int err; 2345 int err;
@@ -2370,20 +2368,20 @@ out:
2370} 2368}
2371 2369
2372static int 2370static int
2373filename_mountpoint(int dfd, struct filename *s, struct path *path, 2371filename_mountpoint(int dfd, struct filename *name, struct path *path,
2374 unsigned int flags) 2372 unsigned int flags)
2375{ 2373{
2376 int error; 2374 int error;
2377 if (IS_ERR(s)) 2375 if (IS_ERR(name))
2378 return PTR_ERR(s); 2376 return PTR_ERR(name);
2379 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); 2377 error = path_mountpoint(dfd, name, path, flags | LOOKUP_RCU);
2380 if (unlikely(error == -ECHILD)) 2378 if (unlikely(error == -ECHILD))
2381 error = path_mountpoint(dfd, s->name, path, flags); 2379 error = path_mountpoint(dfd, name, path, flags);
2382 if (unlikely(error == -ESTALE)) 2380 if (unlikely(error == -ESTALE))
2383 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); 2381 error = path_mountpoint(dfd, name, path, flags | LOOKUP_REVAL);
2384 if (likely(!error)) 2382 if (likely(!error))
2385 audit_inode(s, path->dentry, 0); 2383 audit_inode(name, path->dentry, 0);
2386 putname(s); 2384 putname(name);
2387 return error; 2385 return error;
2388} 2386}
2389 2387
@@ -3156,7 +3154,7 @@ static int do_tmpfile(int dfd, struct filename *pathname,
3156 static const struct qstr name = QSTR_INIT("/", 1); 3154 static const struct qstr name = QSTR_INIT("/", 1);
3157 struct dentry *dentry, *child; 3155 struct dentry *dentry, *child;
3158 struct inode *dir; 3156 struct inode *dir;
3159 int error = path_lookupat(dfd, pathname->name, 3157 int error = path_lookupat(dfd, pathname,
3160 flags | LOOKUP_DIRECTORY, nd); 3158 flags | LOOKUP_DIRECTORY, nd);
3161 if (unlikely(error)) 3159 if (unlikely(error))
3162 return error; 3160 return error;
@@ -3229,7 +3227,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
3229 goto out; 3227 goto out;
3230 } 3228 }
3231 3229
3232 error = path_init(dfd, pathname->name, flags, nd); 3230 error = path_init(dfd, pathname, flags, nd);
3233 if (unlikely(error)) 3231 if (unlikely(error))
3234 goto out; 3232 goto out;
3235 3233
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e907c8cf732e..c3929fb2ab26 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -265,7 +265,7 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t
265 265
266 return -EINVAL; 266 return -EINVAL;
267#else 267#else
268 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); 268 VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
269 269
270 if (rw == READ) 270 if (rw == READ)
271 return nfs_file_direct_read(iocb, iter, pos); 271 return nfs_file_direct_read(iocb, iter, pos);
@@ -393,7 +393,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
393 long res = (long) dreq->error; 393 long res = (long) dreq->error;
394 if (!res) 394 if (!res)
395 res = (long) dreq->count; 395 res = (long) dreq->count;
396 aio_complete(dreq->iocb, res, 0); 396 dreq->iocb->ki_complete(dreq->iocb, res, 0);
397 } 397 }
398 398
399 complete_all(&dreq->completion); 399 complete_all(&dreq->completion);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e679d24c39d3..37b15582e0de 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -26,7 +26,6 @@
26#include <linux/nfs_mount.h> 26#include <linux/nfs_mount.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/pagemap.h> 28#include <linux/pagemap.h>
29#include <linux/aio.h>
30#include <linux/gfp.h> 29#include <linux/gfp.h>
31#include <linux/swap.h> 30#include <linux/swap.h>
32 31
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 849ed784d6ac..759931088094 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1876,11 +1876,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1876 * request from the inode / page_private pointer and 1876 * request from the inode / page_private pointer and
1877 * release it */ 1877 * release it */
1878 nfs_inode_remove_request(req); 1878 nfs_inode_remove_request(req);
1879 /*
1880 * In case nfs_inode_remove_request has marked the
1881 * page as being dirty
1882 */
1883 cancel_dirty_page(page, PAGE_CACHE_SIZE);
1884 nfs_unlock_and_release_request(req); 1879 nfs_unlock_and_release_request(req);
1885 } 1880 }
1886 1881
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index cdbc78c72542..03d647bf195d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -137,7 +137,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
137 seg->offset = iomap.offset; 137 seg->offset = iomap.offset;
138 seg->length = iomap.length; 138 seg->length = iomap.length;
139 139
140 dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es); 140 dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
141 return 0; 141 return 0;
142 142
143out_error: 143out_error:
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 9da89fddab33..9aa2796da90d 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -122,19 +122,19 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
122 122
123 p = xdr_decode_hyper(p, &bex.foff); 123 p = xdr_decode_hyper(p, &bex.foff);
124 if (bex.foff & (block_size - 1)) { 124 if (bex.foff & (block_size - 1)) {
125 dprintk("%s: unaligned offset %lld\n", 125 dprintk("%s: unaligned offset 0x%llx\n",
126 __func__, bex.foff); 126 __func__, bex.foff);
127 goto fail; 127 goto fail;
128 } 128 }
129 p = xdr_decode_hyper(p, &bex.len); 129 p = xdr_decode_hyper(p, &bex.len);
130 if (bex.len & (block_size - 1)) { 130 if (bex.len & (block_size - 1)) {
131 dprintk("%s: unaligned length %lld\n", 131 dprintk("%s: unaligned length 0x%llx\n",
132 __func__, bex.foff); 132 __func__, bex.foff);
133 goto fail; 133 goto fail;
134 } 134 }
135 p = xdr_decode_hyper(p, &bex.soff); 135 p = xdr_decode_hyper(p, &bex.soff);
136 if (bex.soff & (block_size - 1)) { 136 if (bex.soff & (block_size - 1)) {
137 dprintk("%s: unaligned disk offset %lld\n", 137 dprintk("%s: unaligned disk offset 0x%llx\n",
138 __func__, bex.soff); 138 __func__, bex.soff);
139 goto fail; 139 goto fail;
140 } 140 }
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 3c1bfa155571..6904213a4363 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -118,7 +118,7 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
118{ 118{
119 struct super_block *sb = exp->ex_path.mnt->mnt_sb; 119 struct super_block *sb = exp->ex_path.mnt->mnt_sb;
120 120
121 if (exp->ex_flags & NFSEXP_NOPNFS) 121 if (!(exp->ex_flags & NFSEXP_PNFS))
122 return; 122 return;
123 123
124 if (sb->s_export_op->get_uuid && 124 if (sb->s_export_op->get_uuid &&
@@ -440,15 +440,14 @@ nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
440 list_move_tail(&lp->lo_perstate, reaplist); 440 list_move_tail(&lp->lo_perstate, reaplist);
441 return; 441 return;
442 } 442 }
443 end = seg->offset; 443 lo->offset = layout_end(seg);
444 } else { 444 } else {
445 /* retain the whole layout segment on a split. */ 445 /* retain the whole layout segment on a split. */
446 if (layout_end(seg) < end) { 446 if (layout_end(seg) < end) {
447 dprintk("%s: split not supported\n", __func__); 447 dprintk("%s: split not supported\n", __func__);
448 return; 448 return;
449 } 449 }
450 450 end = seg->offset;
451 lo->offset = layout_end(seg);
452 } 451 }
453 452
454 layout_update_len(lo, end); 453 layout_update_len(lo, end);
@@ -513,6 +512,9 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp,
513 512
514 spin_lock(&clp->cl_lock); 513 spin_lock(&clp->cl_lock);
515 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { 514 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
515 if (ls->ls_layout_type != lrp->lr_layout_type)
516 continue;
517
516 if (lrp->lr_return_type == RETURN_FSID && 518 if (lrp->lr_return_type == RETURN_FSID &&
517 !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, 519 !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
518 &cstate->current_fh.fh_handle)) 520 &cstate->current_fh.fh_handle))
@@ -587,7 +589,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
587 589
588 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); 590 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
589 591
590 nfsd4_cb_layout_fail(ls); 592 trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
591 593
592 printk(KERN_WARNING 594 printk(KERN_WARNING
593 "nfsd: client %s failed to respond to layout recall. " 595 "nfsd: client %s failed to respond to layout recall. "
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d30bea8d0277..92b9d97aff4f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1237,8 +1237,8 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
1237 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); 1237 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
1238 1238
1239 gdp->gd_notify_types &= ops->notify_types; 1239 gdp->gd_notify_types &= ops->notify_types;
1240 exp_put(exp);
1241out: 1240out:
1241 exp_put(exp);
1242 return nfserr; 1242 return nfserr;
1243} 1243}
1244 1244
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d2f2c37dc2db..8ba1d888f1e6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3221,7 +3221,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
3221 } else 3221 } else
3222 nfs4_free_openowner(&oo->oo_owner); 3222 nfs4_free_openowner(&oo->oo_owner);
3223 spin_unlock(&clp->cl_lock); 3223 spin_unlock(&clp->cl_lock);
3224 return oo; 3224 return ret;
3225} 3225}
3226 3226
3227static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 3227static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
@@ -5062,7 +5062,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
5062 } else 5062 } else
5063 nfs4_free_lockowner(&lo->lo_owner); 5063 nfs4_free_lockowner(&lo->lo_owner);
5064 spin_unlock(&clp->cl_lock); 5064 spin_unlock(&clp->cl_lock);
5065 return lo; 5065 return ret;
5066} 5066}
5067 5067
5068static void 5068static void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index df5e66caf100..5fb7e78169a6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1562,7 +1562,11 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
1562 p = xdr_decode_hyper(p, &lgp->lg_seg.offset); 1562 p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
1563 p = xdr_decode_hyper(p, &lgp->lg_seg.length); 1563 p = xdr_decode_hyper(p, &lgp->lg_seg.length);
1564 p = xdr_decode_hyper(p, &lgp->lg_minlength); 1564 p = xdr_decode_hyper(p, &lgp->lg_minlength);
1565 nfsd4_decode_stateid(argp, &lgp->lg_sid); 1565
1566 status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
1567 if (status)
1568 return status;
1569
1566 READ_BUF(4); 1570 READ_BUF(4);
1567 lgp->lg_maxcount = be32_to_cpup(p++); 1571 lgp->lg_maxcount = be32_to_cpup(p++);
1568 1572
@@ -1580,7 +1584,11 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
1580 p = xdr_decode_hyper(p, &lcp->lc_seg.offset); 1584 p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
1581 p = xdr_decode_hyper(p, &lcp->lc_seg.length); 1585 p = xdr_decode_hyper(p, &lcp->lc_seg.length);
1582 lcp->lc_reclaim = be32_to_cpup(p++); 1586 lcp->lc_reclaim = be32_to_cpup(p++);
1583 nfsd4_decode_stateid(argp, &lcp->lc_sid); 1587
1588 status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
1589 if (status)
1590 return status;
1591
1584 READ_BUF(4); 1592 READ_BUF(4);
1585 lcp->lc_newoffset = be32_to_cpup(p++); 1593 lcp->lc_newoffset = be32_to_cpup(p++);
1586 if (lcp->lc_newoffset) { 1594 if (lcp->lc_newoffset) {
@@ -1628,7 +1636,11 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
1628 READ_BUF(16); 1636 READ_BUF(16);
1629 p = xdr_decode_hyper(p, &lrp->lr_seg.offset); 1637 p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
1630 p = xdr_decode_hyper(p, &lrp->lr_seg.length); 1638 p = xdr_decode_hyper(p, &lrp->lr_seg.length);
1631 nfsd4_decode_stateid(argp, &lrp->lr_sid); 1639
1640 status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
1641 if (status)
1642 return status;
1643
1632 READ_BUF(4); 1644 READ_BUF(4);
1633 lrp->lrf_body_len = be32_to_cpup(p++); 1645 lrp->lrf_body_len = be32_to_cpup(p++);
1634 if (lrp->lrf_body_len > 0) { 1646 if (lrp->lrf_body_len > 0) {
@@ -4123,7 +4135,7 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
4123 return nfserr_resource; 4135 return nfserr_resource;
4124 *p++ = cpu_to_be32(lrp->lrs_present); 4136 *p++ = cpu_to_be32(lrp->lrs_present);
4125 if (lrp->lrs_present) 4137 if (lrp->lrs_present)
4126 nfsd4_encode_stateid(xdr, &lrp->lr_sid); 4138 return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
4127 return nfs_ok; 4139 return nfs_ok;
4128} 4140}
4129#endif /* CONFIG_NFSD_PNFS */ 4141#endif /* CONFIG_NFSD_PNFS */
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 83a9694ec485..46ec934f5dee 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -165,13 +165,17 @@ int nfsd_reply_cache_init(void)
165{ 165{
166 unsigned int hashsize; 166 unsigned int hashsize;
167 unsigned int i; 167 unsigned int i;
168 int status = 0;
168 169
169 max_drc_entries = nfsd_cache_size_limit(); 170 max_drc_entries = nfsd_cache_size_limit();
170 atomic_set(&num_drc_entries, 0); 171 atomic_set(&num_drc_entries, 0);
171 hashsize = nfsd_hashsize(max_drc_entries); 172 hashsize = nfsd_hashsize(max_drc_entries);
172 maskbits = ilog2(hashsize); 173 maskbits = ilog2(hashsize);
173 174
174 register_shrinker(&nfsd_reply_cache_shrinker); 175 status = register_shrinker(&nfsd_reply_cache_shrinker);
176 if (status)
177 return status;
178
175 drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), 179 drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
176 0, 0, NULL); 180 0, 0, NULL);
177 if (!drc_slab) 181 if (!drc_slab)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8b5969538f39..ab4987bc637f 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -26,7 +26,7 @@
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/aio.h> 29#include <linux/uio.h>
30#include "nilfs.h" 30#include "nilfs.h"
31#include "btnode.h" 31#include "btnode.h"
32#include "segment.h" 32#include "segment.h"
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 36ae529511c4..2ff263e6d363 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -8,7 +8,7 @@ ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
8 8
9ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o 9ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
10 10
11ccflags-y := -DNTFS_VERSION=\"2.1.31\" 11ccflags-y := -DNTFS_VERSION=\"2.1.32\"
12ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG 12ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
13ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW 13ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
14 14
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1da9b2d184dc..c1da78dad1af 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. 4 * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -28,7 +28,6 @@
28#include <linux/swap.h> 28#include <linux/swap.h>
29#include <linux/uio.h> 29#include <linux/uio.h>
30#include <linux/writeback.h> 30#include <linux/writeback.h>
31#include <linux/aio.h>
32 31
33#include <asm/page.h> 32#include <asm/page.h>
34#include <asm/uaccess.h> 33#include <asm/uaccess.h>
@@ -329,62 +328,168 @@ err_out:
329 return err; 328 return err;
330} 329}
331 330
332/** 331static ssize_t ntfs_prepare_file_for_write(struct file *file, loff_t *ppos,
333 * ntfs_fault_in_pages_readable - 332 size_t *count)
334 *
335 * Fault a number of userspace pages into pagetables.
336 *
337 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
338 * with more than two userspace pages as well as handling the single page case
339 * elegantly.
340 *
341 * If you find this difficult to understand, then think of the while loop being
342 * the following code, except that we do without the integer variable ret:
343 *
344 * do {
345 * ret = __get_user(c, uaddr);
346 * uaddr += PAGE_SIZE;
347 * } while (!ret && uaddr < end);
348 *
349 * Note, the final __get_user() may well run out-of-bounds of the user buffer,
350 * but _not_ out-of-bounds of the page the user buffer belongs to, and since
351 * this is only a read and not a write, and since it is still in the same page,
352 * it should not matter and this makes the code much simpler.
353 */
354static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
355 int bytes)
356{ 333{
357 const char __user *end; 334 loff_t pos;
358 volatile char c; 335 s64 end, ll;
359 336 ssize_t err;
360 /* Set @end to the first byte outside the last page we care about. */ 337 unsigned long flags;
361 end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes); 338 struct inode *vi = file_inode(file);
362 339 ntfs_inode *base_ni, *ni = NTFS_I(vi);
363 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end)) 340 ntfs_volume *vol = ni->vol;
364 ;
365}
366
367/**
368 * ntfs_fault_in_pages_readable_iovec -
369 *
370 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
371 */
372static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
373 size_t iov_ofs, int bytes)
374{
375 do {
376 const char __user *buf;
377 unsigned len;
378 341
379 buf = iov->iov_base + iov_ofs; 342 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
380 len = iov->iov_len - iov_ofs; 343 "0x%llx, count 0x%lx.", vi->i_ino,
381 if (len > bytes) 344 (unsigned)le32_to_cpu(ni->type),
382 len = bytes; 345 (unsigned long long)*ppos, (unsigned long)*count);
383 ntfs_fault_in_pages_readable(buf, len); 346 /* We can write back this queue in page reclaim. */
384 bytes -= len; 347 current->backing_dev_info = inode_to_bdi(vi);
385 iov++; 348 err = generic_write_checks(file, ppos, count, S_ISBLK(vi->i_mode));
386 iov_ofs = 0; 349 if (unlikely(err))
387 } while (bytes); 350 goto out;
351 /*
352 * All checks have passed. Before we start doing any writing we want
353 * to abort any totally illegal writes.
354 */
355 BUG_ON(NInoMstProtected(ni));
356 BUG_ON(ni->type != AT_DATA);
357 /* If file is encrypted, deny access, just like NT4. */
358 if (NInoEncrypted(ni)) {
359 /* Only $DATA attributes can be encrypted. */
360 /*
361 * Reminder for later: Encrypted files are _always_
362 * non-resident so that the content can always be encrypted.
363 */
364 ntfs_debug("Denying write access to encrypted file.");
365 err = -EACCES;
366 goto out;
367 }
368 if (NInoCompressed(ni)) {
369 /* Only unnamed $DATA attribute can be compressed. */
370 BUG_ON(ni->name_len);
371 /*
372 * Reminder for later: If resident, the data is not actually
373 * compressed. Only on the switch to non-resident does
374 * compression kick in. This is in contrast to encrypted files
375 * (see above).
376 */
377 ntfs_error(vi->i_sb, "Writing to compressed files is not "
378 "implemented yet. Sorry.");
379 err = -EOPNOTSUPP;
380 goto out;
381 }
382 if (*count == 0)
383 goto out;
384 base_ni = ni;
385 if (NInoAttr(ni))
386 base_ni = ni->ext.base_ntfs_ino;
387 err = file_remove_suid(file);
388 if (unlikely(err))
389 goto out;
390 /*
391 * Our ->update_time method always succeeds thus file_update_time()
392 * cannot fail either so there is no need to check the return code.
393 */
394 file_update_time(file);
395 pos = *ppos;
396 /* The first byte after the last cluster being written to. */
397 end = (pos + *count + vol->cluster_size_mask) &
398 ~(u64)vol->cluster_size_mask;
399 /*
400 * If the write goes beyond the allocated size, extend the allocation
401 * to cover the whole of the write, rounded up to the nearest cluster.
402 */
403 read_lock_irqsave(&ni->size_lock, flags);
404 ll = ni->allocated_size;
405 read_unlock_irqrestore(&ni->size_lock, flags);
406 if (end > ll) {
407 /*
408 * Extend the allocation without changing the data size.
409 *
410 * Note we ensure the allocation is big enough to at least
411 * write some data but we do not require the allocation to be
412 * complete, i.e. it may be partial.
413 */
414 ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
415 if (likely(ll >= 0)) {
416 BUG_ON(pos >= ll);
417 /* If the extension was partial truncate the write. */
418 if (end > ll) {
419 ntfs_debug("Truncating write to inode 0x%lx, "
420 "attribute type 0x%x, because "
421 "the allocation was only "
422 "partially extended.",
423 vi->i_ino, (unsigned)
424 le32_to_cpu(ni->type));
425 *count = ll - pos;
426 }
427 } else {
428 err = ll;
429 read_lock_irqsave(&ni->size_lock, flags);
430 ll = ni->allocated_size;
431 read_unlock_irqrestore(&ni->size_lock, flags);
432 /* Perform a partial write if possible or fail. */
433 if (pos < ll) {
434 ntfs_debug("Truncating write to inode 0x%lx "
435 "attribute type 0x%x, because "
436 "extending the allocation "
437 "failed (error %d).",
438 vi->i_ino, (unsigned)
439 le32_to_cpu(ni->type),
440 (int)-err);
441 *count = ll - pos;
442 } else {
443 if (err != -ENOSPC)
444 ntfs_error(vi->i_sb, "Cannot perform "
445 "write to inode "
446 "0x%lx, attribute "
447 "type 0x%x, because "
448 "extending the "
449 "allocation failed "
450 "(error %ld).",
451 vi->i_ino, (unsigned)
452 le32_to_cpu(ni->type),
453 (long)-err);
454 else
455 ntfs_debug("Cannot perform write to "
456 "inode 0x%lx, "
457 "attribute type 0x%x, "
458 "because there is not "
459 "space left.",
460 vi->i_ino, (unsigned)
461 le32_to_cpu(ni->type));
462 goto out;
463 }
464 }
465 }
466 /*
467 * If the write starts beyond the initialized size, extend it up to the
468 * beginning of the write and initialize all non-sparse space between
469 * the old initialized size and the new one. This automatically also
470 * increments the vfs inode->i_size to keep it above or equal to the
471 * initialized_size.
472 */
473 read_lock_irqsave(&ni->size_lock, flags);
474 ll = ni->initialized_size;
475 read_unlock_irqrestore(&ni->size_lock, flags);
476 if (pos > ll) {
477 /*
478 * Wait for ongoing direct i/o to complete before proceeding.
479 * New direct i/o cannot start as we hold i_mutex.
480 */
481 inode_dio_wait(vi);
482 err = ntfs_attr_extend_initialized(ni, pos);
483 if (unlikely(err < 0))
484 ntfs_error(vi->i_sb, "Cannot perform write to inode "
485 "0x%lx, attribute type 0x%x, because "
486 "extending the initialized size "
487 "failed (error %d).", vi->i_ino,
488 (unsigned)le32_to_cpu(ni->type),
489 (int)-err);
490 }
491out:
492 return err;
388} 493}
389 494
390/** 495/**
@@ -421,8 +526,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
421 goto err_out; 526 goto err_out;
422 } 527 }
423 } 528 }
424 err = add_to_page_cache_lru(*cached_page, mapping, index, 529 err = add_to_page_cache_lru(*cached_page, mapping,
425 GFP_KERNEL); 530 index, GFP_KERNEL);
426 if (unlikely(err)) { 531 if (unlikely(err)) {
427 if (err == -EEXIST) 532 if (err == -EEXIST)
428 continue; 533 continue;
@@ -1268,180 +1373,6 @@ rl_not_mapped_enoent:
1268 return err; 1373 return err;
1269} 1374}
1270 1375
1271/*
1272 * Copy as much as we can into the pages and return the number of bytes which
1273 * were successfully copied. If a fault is encountered then clear the pages
1274 * out to (ofs + bytes) and return the number of bytes which were copied.
1275 */
1276static inline size_t ntfs_copy_from_user(struct page **pages,
1277 unsigned nr_pages, unsigned ofs, const char __user *buf,
1278 size_t bytes)
1279{
1280 struct page **last_page = pages + nr_pages;
1281 char *addr;
1282 size_t total = 0;
1283 unsigned len;
1284 int left;
1285
1286 do {
1287 len = PAGE_CACHE_SIZE - ofs;
1288 if (len > bytes)
1289 len = bytes;
1290 addr = kmap_atomic(*pages);
1291 left = __copy_from_user_inatomic(addr + ofs, buf, len);
1292 kunmap_atomic(addr);
1293 if (unlikely(left)) {
1294 /* Do it the slow way. */
1295 addr = kmap(*pages);
1296 left = __copy_from_user(addr + ofs, buf, len);
1297 kunmap(*pages);
1298 if (unlikely(left))
1299 goto err_out;
1300 }
1301 total += len;
1302 bytes -= len;
1303 if (!bytes)
1304 break;
1305 buf += len;
1306 ofs = 0;
1307 } while (++pages < last_page);
1308out:
1309 return total;
1310err_out:
1311 total += len - left;
1312 /* Zero the rest of the target like __copy_from_user(). */
1313 while (++pages < last_page) {
1314 bytes -= len;
1315 if (!bytes)
1316 break;
1317 len = PAGE_CACHE_SIZE;
1318 if (len > bytes)
1319 len = bytes;
1320 zero_user(*pages, 0, len);
1321 }
1322 goto out;
1323}
1324
1325static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
1326 const struct iovec *iov, size_t iov_ofs, size_t bytes)
1327{
1328 size_t total = 0;
1329
1330 while (1) {
1331 const char __user *buf = iov->iov_base + iov_ofs;
1332 unsigned len;
1333 size_t left;
1334
1335 len = iov->iov_len - iov_ofs;
1336 if (len > bytes)
1337 len = bytes;
1338 left = __copy_from_user_inatomic(vaddr, buf, len);
1339 total += len;
1340 bytes -= len;
1341 vaddr += len;
1342 if (unlikely(left)) {
1343 total -= left;
1344 break;
1345 }
1346 if (!bytes)
1347 break;
1348 iov++;
1349 iov_ofs = 0;
1350 }
1351 return total;
1352}
1353
1354static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1355 size_t *iov_ofsp, size_t bytes)
1356{
1357 const struct iovec *iov = *iovp;
1358 size_t iov_ofs = *iov_ofsp;
1359
1360 while (bytes) {
1361 unsigned len;
1362
1363 len = iov->iov_len - iov_ofs;
1364 if (len > bytes)
1365 len = bytes;
1366 bytes -= len;
1367 iov_ofs += len;
1368 if (iov->iov_len == iov_ofs) {
1369 iov++;
1370 iov_ofs = 0;
1371 }
1372 }
1373 *iovp = iov;
1374 *iov_ofsp = iov_ofs;
1375}
1376
1377/*
1378 * This has the same side-effects and return value as ntfs_copy_from_user().
1379 * The difference is that on a fault we need to memset the remainder of the
1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1381 * single-segment behaviour.
1382 *
1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
1384 * atomic and when not atomic. This is ok because it calls
1385 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1386 * fact, the only difference between __copy_from_user_inatomic() and
1387 * __copy_from_user() is that the latter calls might_sleep() and the former
1388 * should not zero the tail of the buffer on error. And on many architectures
1389 * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
1390 * makes no difference at all on those architectures.
1391 */
1392static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1393 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
1394 size_t *iov_ofs, size_t bytes)
1395{
1396 struct page **last_page = pages + nr_pages;
1397 char *addr;
1398 size_t copied, len, total = 0;
1399
1400 do {
1401 len = PAGE_CACHE_SIZE - ofs;
1402 if (len > bytes)
1403 len = bytes;
1404 addr = kmap_atomic(*pages);
1405 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
1406 *iov, *iov_ofs, len);
1407 kunmap_atomic(addr);
1408 if (unlikely(copied != len)) {
1409 /* Do it the slow way. */
1410 addr = kmap(*pages);
1411 copied = __ntfs_copy_from_user_iovec_inatomic(addr +
1412 ofs, *iov, *iov_ofs, len);
1413 if (unlikely(copied != len))
1414 goto err_out;
1415 kunmap(*pages);
1416 }
1417 total += len;
1418 ntfs_set_next_iovec(iov, iov_ofs, len);
1419 bytes -= len;
1420 if (!bytes)
1421 break;
1422 ofs = 0;
1423 } while (++pages < last_page);
1424out:
1425 return total;
1426err_out:
1427 BUG_ON(copied > len);
1428 /* Zero the rest of the target like __copy_from_user(). */
1429 memset(addr + ofs + copied, 0, len - copied);
1430 kunmap(*pages);
1431 total += copied;
1432 ntfs_set_next_iovec(iov, iov_ofs, copied);
1433 while (++pages < last_page) {
1434 bytes -= len;
1435 if (!bytes)
1436 break;
1437 len = PAGE_CACHE_SIZE;
1438 if (len > bytes)
1439 len = bytes;
1440 zero_user(*pages, 0, len);
1441 }
1442 goto out;
1443}
1444
1445static inline void ntfs_flush_dcache_pages(struct page **pages, 1376static inline void ntfs_flush_dcache_pages(struct page **pages,
1446 unsigned nr_pages) 1377 unsigned nr_pages)
1447{ 1378{
@@ -1762,86 +1693,83 @@ err_out:
1762 return err; 1693 return err;
1763} 1694}
1764 1695
1765static void ntfs_write_failed(struct address_space *mapping, loff_t to) 1696/*
1697 * Copy as much as we can into the pages and return the number of bytes which
1698 * were successfully copied. If a fault is encountered then clear the pages
1699 * out to (ofs + bytes) and return the number of bytes which were copied.
1700 */
1701static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
1702 unsigned ofs, struct iov_iter *i, size_t bytes)
1766{ 1703{
1767 struct inode *inode = mapping->host; 1704 struct page **last_page = pages + nr_pages;
1705 size_t total = 0;
1706 struct iov_iter data = *i;
1707 unsigned len, copied;
1768 1708
1769 if (to > inode->i_size) { 1709 do {
1770 truncate_pagecache(inode, inode->i_size); 1710 len = PAGE_CACHE_SIZE - ofs;
1771 ntfs_truncate_vfs(inode); 1711 if (len > bytes)
1772 } 1712 len = bytes;
1713 copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
1714 len);
1715 total += copied;
1716 bytes -= copied;
1717 if (!bytes)
1718 break;
1719 iov_iter_advance(&data, copied);
1720 if (copied < len)
1721 goto err;
1722 ofs = 0;
1723 } while (++pages < last_page);
1724out:
1725 return total;
1726err:
1727 /* Zero the rest of the target like __copy_from_user(). */
1728 len = PAGE_CACHE_SIZE - copied;
1729 do {
1730 if (len > bytes)
1731 len = bytes;
1732 zero_user(*pages, copied, len);
1733 bytes -= len;
1734 copied = 0;
1735 len = PAGE_CACHE_SIZE;
1736 } while (++pages < last_page);
1737 goto out;
1773} 1738}
1774 1739
1775/** 1740/**
1776 * ntfs_file_buffered_write - 1741 * ntfs_perform_write - perform buffered write to a file
1777 * 1742 * @file: file to write to
1778 * Locking: The vfs is holding ->i_mutex on the inode. 1743 * @i: iov_iter with data to write
1744 * @pos: byte offset in file at which to begin writing to
1779 */ 1745 */
1780static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, 1746static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
1781 const struct iovec *iov, unsigned long nr_segs, 1747 loff_t pos)
1782 loff_t pos, loff_t *ppos, size_t count)
1783{ 1748{
1784 struct file *file = iocb->ki_filp;
1785 struct address_space *mapping = file->f_mapping; 1749 struct address_space *mapping = file->f_mapping;
1786 struct inode *vi = mapping->host; 1750 struct inode *vi = mapping->host;
1787 ntfs_inode *ni = NTFS_I(vi); 1751 ntfs_inode *ni = NTFS_I(vi);
1788 ntfs_volume *vol = ni->vol; 1752 ntfs_volume *vol = ni->vol;
1789 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; 1753 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
1790 struct page *cached_page = NULL; 1754 struct page *cached_page = NULL;
1791 char __user *buf = NULL;
1792 s64 end, ll;
1793 VCN last_vcn; 1755 VCN last_vcn;
1794 LCN lcn; 1756 LCN lcn;
1795 unsigned long flags; 1757 size_t bytes;
1796 size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */ 1758 ssize_t status, written = 0;
1797 ssize_t status, written;
1798 unsigned nr_pages; 1759 unsigned nr_pages;
1799 int err;
1800 1760
1801 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1761 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
1802 "pos 0x%llx, count 0x%lx.", 1762 "0x%llx, count 0x%lx.", vi->i_ino,
1803 vi->i_ino, (unsigned)le32_to_cpu(ni->type), 1763 (unsigned)le32_to_cpu(ni->type),
1804 (unsigned long long)pos, (unsigned long)count); 1764 (unsigned long long)pos,
1805 if (unlikely(!count)) 1765 (unsigned long)iov_iter_count(i));
1806 return 0;
1807 BUG_ON(NInoMstProtected(ni));
1808 /*
1809 * If the attribute is not an index root and it is encrypted or
1810 * compressed, we cannot write to it yet. Note we need to check for
1811 * AT_INDEX_ALLOCATION since this is the type of both directory and
1812 * index inodes.
1813 */
1814 if (ni->type != AT_INDEX_ALLOCATION) {
1815 /* If file is encrypted, deny access, just like NT4. */
1816 if (NInoEncrypted(ni)) {
1817 /*
1818 * Reminder for later: Encrypted files are _always_
1819 * non-resident so that the content can always be
1820 * encrypted.
1821 */
1822 ntfs_debug("Denying write access to encrypted file.");
1823 return -EACCES;
1824 }
1825 if (NInoCompressed(ni)) {
1826 /* Only unnamed $DATA attribute can be compressed. */
1827 BUG_ON(ni->type != AT_DATA);
1828 BUG_ON(ni->name_len);
1829 /*
1830 * Reminder for later: If resident, the data is not
1831 * actually compressed. Only on the switch to non-
1832 * resident does compression kick in. This is in
1833 * contrast to encrypted files (see above).
1834 */
1835 ntfs_error(vi->i_sb, "Writing to compressed files is "
1836 "not implemented yet. Sorry.");
1837 return -EOPNOTSUPP;
1838 }
1839 }
1840 /* 1766 /*
1841 * If a previous ntfs_truncate() failed, repeat it and abort if it 1767 * If a previous ntfs_truncate() failed, repeat it and abort if it
1842 * fails again. 1768 * fails again.
1843 */ 1769 */
1844 if (unlikely(NInoTruncateFailed(ni))) { 1770 if (unlikely(NInoTruncateFailed(ni))) {
1771 int err;
1772
1845 inode_dio_wait(vi); 1773 inode_dio_wait(vi);
1846 err = ntfs_truncate(vi); 1774 err = ntfs_truncate(vi);
1847 if (err || NInoTruncateFailed(ni)) { 1775 if (err || NInoTruncateFailed(ni)) {
@@ -1855,81 +1783,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1855 return err; 1783 return err;
1856 } 1784 }
1857 } 1785 }
1858 /* The first byte after the write. */
1859 end = pos + count;
1860 /*
1861 * If the write goes beyond the allocated size, extend the allocation
1862 * to cover the whole of the write, rounded up to the nearest cluster.
1863 */
1864 read_lock_irqsave(&ni->size_lock, flags);
1865 ll = ni->allocated_size;
1866 read_unlock_irqrestore(&ni->size_lock, flags);
1867 if (end > ll) {
1868 /* Extend the allocation without changing the data size. */
1869 ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
1870 if (likely(ll >= 0)) {
1871 BUG_ON(pos >= ll);
1872 /* If the extension was partial truncate the write. */
1873 if (end > ll) {
1874 ntfs_debug("Truncating write to inode 0x%lx, "
1875 "attribute type 0x%x, because "
1876 "the allocation was only "
1877 "partially extended.",
1878 vi->i_ino, (unsigned)
1879 le32_to_cpu(ni->type));
1880 end = ll;
1881 count = ll - pos;
1882 }
1883 } else {
1884 err = ll;
1885 read_lock_irqsave(&ni->size_lock, flags);
1886 ll = ni->allocated_size;
1887 read_unlock_irqrestore(&ni->size_lock, flags);
1888 /* Perform a partial write if possible or fail. */
1889 if (pos < ll) {
1890 ntfs_debug("Truncating write to inode 0x%lx, "
1891 "attribute type 0x%x, because "
1892 "extending the allocation "
1893 "failed (error code %i).",
1894 vi->i_ino, (unsigned)
1895 le32_to_cpu(ni->type), err);
1896 end = ll;
1897 count = ll - pos;
1898 } else {
1899 ntfs_error(vol->sb, "Cannot perform write to "
1900 "inode 0x%lx, attribute type "
1901 "0x%x, because extending the "
1902 "allocation failed (error "
1903 "code %i).", vi->i_ino,
1904 (unsigned)
1905 le32_to_cpu(ni->type), err);
1906 return err;
1907 }
1908 }
1909 }
1910 written = 0;
1911 /*
1912 * If the write starts beyond the initialized size, extend it up to the
1913 * beginning of the write and initialize all non-sparse space between
1914 * the old initialized size and the new one. This automatically also
1915 * increments the vfs inode->i_size to keep it above or equal to the
1916 * initialized_size.
1917 */
1918 read_lock_irqsave(&ni->size_lock, flags);
1919 ll = ni->initialized_size;
1920 read_unlock_irqrestore(&ni->size_lock, flags);
1921 if (pos > ll) {
1922 err = ntfs_attr_extend_initialized(ni, pos);
1923 if (err < 0) {
1924 ntfs_error(vol->sb, "Cannot perform write to inode "
1925 "0x%lx, attribute type 0x%x, because "
1926 "extending the initialized size "
1927 "failed (error code %i).", vi->i_ino,
1928 (unsigned)le32_to_cpu(ni->type), err);
1929 status = err;
1930 goto err_out;
1931 }
1932 }
1933 /* 1786 /*
1934 * Determine the number of pages per cluster for non-resident 1787 * Determine the number of pages per cluster for non-resident
1935 * attributes. 1788 * attributes.
@@ -1937,10 +1790,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1937 nr_pages = 1; 1790 nr_pages = 1;
1938 if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni)) 1791 if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
1939 nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT; 1792 nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
1940 /* Finally, perform the actual write. */
1941 last_vcn = -1; 1793 last_vcn = -1;
1942 if (likely(nr_segs == 1))
1943 buf = iov->iov_base;
1944 do { 1794 do {
1945 VCN vcn; 1795 VCN vcn;
1946 pgoff_t idx, start_idx; 1796 pgoff_t idx, start_idx;
@@ -1965,10 +1815,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1965 vol->cluster_size_bits, false); 1815 vol->cluster_size_bits, false);
1966 up_read(&ni->runlist.lock); 1816 up_read(&ni->runlist.lock);
1967 if (unlikely(lcn < LCN_HOLE)) { 1817 if (unlikely(lcn < LCN_HOLE)) {
1968 status = -EIO;
1969 if (lcn == LCN_ENOMEM) 1818 if (lcn == LCN_ENOMEM)
1970 status = -ENOMEM; 1819 status = -ENOMEM;
1971 else 1820 else {
1821 status = -EIO;
1972 ntfs_error(vol->sb, "Cannot " 1822 ntfs_error(vol->sb, "Cannot "
1973 "perform write to " 1823 "perform write to "
1974 "inode 0x%lx, " 1824 "inode 0x%lx, "
@@ -1977,6 +1827,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1977 "is corrupt.", 1827 "is corrupt.",
1978 vi->i_ino, (unsigned) 1828 vi->i_ino, (unsigned)
1979 le32_to_cpu(ni->type)); 1829 le32_to_cpu(ni->type));
1830 }
1980 break; 1831 break;
1981 } 1832 }
1982 if (lcn == LCN_HOLE) { 1833 if (lcn == LCN_HOLE) {
@@ -1989,8 +1840,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1989 } 1840 }
1990 } 1841 }
1991 } 1842 }
1992 if (bytes > count) 1843 if (bytes > iov_iter_count(i))
1993 bytes = count; 1844 bytes = iov_iter_count(i);
1845again:
1994 /* 1846 /*
1995 * Bring in the user page(s) that we will copy from _first_. 1847 * Bring in the user page(s) that we will copy from _first_.
1996 * Otherwise there is a nasty deadlock on copying from the same 1848 * Otherwise there is a nasty deadlock on copying from the same
@@ -1999,10 +1851,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1999 * pages being swapped out between us bringing them into memory 1851 * pages being swapped out between us bringing them into memory
2000 * and doing the actual copying. 1852 * and doing the actual copying.
2001 */ 1853 */
2002 if (likely(nr_segs == 1)) 1854 if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
2003 ntfs_fault_in_pages_readable(buf, bytes); 1855 status = -EFAULT;
2004 else 1856 break;
2005 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1857 }
2006 /* Get and lock @do_pages starting at index @start_idx. */ 1858 /* Get and lock @do_pages starting at index @start_idx. */
2007 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1859 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2008 pages, &cached_page); 1860 pages, &cached_page);
@@ -2018,56 +1870,57 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2018 status = ntfs_prepare_pages_for_non_resident_write( 1870 status = ntfs_prepare_pages_for_non_resident_write(
2019 pages, do_pages, pos, bytes); 1871 pages, do_pages, pos, bytes);
2020 if (unlikely(status)) { 1872 if (unlikely(status)) {
2021 loff_t i_size;
2022
2023 do { 1873 do {
2024 unlock_page(pages[--do_pages]); 1874 unlock_page(pages[--do_pages]);
2025 page_cache_release(pages[do_pages]); 1875 page_cache_release(pages[do_pages]);
2026 } while (do_pages); 1876 } while (do_pages);
2027 /*
2028 * The write preparation may have instantiated
2029 * allocated space outside i_size. Trim this
2030 * off again. We can ignore any errors in this
2031 * case as we will just be waisting a bit of
2032 * allocated space, which is not a disaster.
2033 */
2034 i_size = i_size_read(vi);
2035 if (pos + bytes > i_size) {
2036 ntfs_write_failed(mapping, pos + bytes);
2037 }
2038 break; 1877 break;
2039 } 1878 }
2040 } 1879 }
2041 u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index; 1880 u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
2042 if (likely(nr_segs == 1)) { 1881 copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
2043 copied = ntfs_copy_from_user(pages + u, do_pages - u, 1882 i, bytes);
2044 ofs, buf, bytes);
2045 buf += copied;
2046 } else
2047 copied = ntfs_copy_from_user_iovec(pages + u,
2048 do_pages - u, ofs, &iov, &iov_ofs,
2049 bytes);
2050 ntfs_flush_dcache_pages(pages + u, do_pages - u); 1883 ntfs_flush_dcache_pages(pages + u, do_pages - u);
2051 status = ntfs_commit_pages_after_write(pages, do_pages, pos, 1884 status = 0;
2052 bytes); 1885 if (likely(copied == bytes)) {
2053 if (likely(!status)) { 1886 status = ntfs_commit_pages_after_write(pages, do_pages,
2054 written += copied; 1887 pos, bytes);
2055 count -= copied; 1888 if (!status)
2056 pos += copied; 1889 status = bytes;
2057 if (unlikely(copied != bytes))
2058 status = -EFAULT;
2059 } 1890 }
2060 do { 1891 do {
2061 unlock_page(pages[--do_pages]); 1892 unlock_page(pages[--do_pages]);
2062 page_cache_release(pages[do_pages]); 1893 page_cache_release(pages[do_pages]);
2063 } while (do_pages); 1894 } while (do_pages);
2064 if (unlikely(status)) 1895 if (unlikely(status < 0))
2065 break; 1896 break;
2066 balance_dirty_pages_ratelimited(mapping); 1897 copied = status;
2067 cond_resched(); 1898 cond_resched();
2068 } while (count); 1899 if (unlikely(!copied)) {
2069err_out: 1900 size_t sc;
2070 *ppos = pos; 1901
1902 /*
1903 * We failed to copy anything. Fall back to single
1904 * segment length write.
1905 *
1906 * This is needed to avoid possible livelock in the
1907 * case that all segments in the iov cannot be copied
1908 * at once without a pagefault.
1909 */
1910 sc = iov_iter_single_seg_count(i);
1911 if (bytes > sc)
1912 bytes = sc;
1913 goto again;
1914 }
1915 iov_iter_advance(i, copied);
1916 pos += copied;
1917 written += copied;
1918 balance_dirty_pages_ratelimited(mapping);
1919 if (fatal_signal_pending(current)) {
1920 status = -EINTR;
1921 break;
1922 }
1923 } while (iov_iter_count(i));
2071 if (cached_page) 1924 if (cached_page)
2072 page_cache_release(cached_page); 1925 page_cache_release(cached_page);
2073 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 1926 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
@@ -2077,59 +1930,56 @@ err_out:
2077} 1930}
2078 1931
2079/** 1932/**
2080 * ntfs_file_aio_write_nolock - 1933 * ntfs_file_write_iter_nolock - write data to a file
1934 * @iocb: IO state structure (file, offset, etc.)
1935 * @from: iov_iter with data to write
1936 *
1937 * Basically the same as __generic_file_write_iter() except that it ends
1938 * up calling ntfs_perform_write() instead of generic_perform_write() and that
1939 * O_DIRECT is not implemented.
2081 */ 1940 */
2082static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, 1941static ssize_t ntfs_file_write_iter_nolock(struct kiocb *iocb,
2083 const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) 1942 struct iov_iter *from)
2084{ 1943{
2085 struct file *file = iocb->ki_filp; 1944 struct file *file = iocb->ki_filp;
2086 struct address_space *mapping = file->f_mapping; 1945 loff_t pos = iocb->ki_pos;
2087 struct inode *inode = mapping->host; 1946 ssize_t written = 0;
2088 loff_t pos; 1947 ssize_t err;
2089 size_t count; /* after file limit checks */ 1948 size_t count = iov_iter_count(from);
2090 ssize_t written, err;
2091 1949
2092 count = iov_length(iov, nr_segs); 1950 err = ntfs_prepare_file_for_write(file, &pos, &count);
2093 pos = *ppos; 1951 if (count && !err) {
2094 /* We can write back this queue in page reclaim. */ 1952 iov_iter_truncate(from, count);
2095 current->backing_dev_info = inode_to_bdi(inode); 1953 written = ntfs_perform_write(file, from, pos);
2096 written = 0; 1954 if (likely(written >= 0))
2097 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1955 iocb->ki_pos = pos + written;
2098 if (err) 1956 }
2099 goto out;
2100 if (!count)
2101 goto out;
2102 err = file_remove_suid(file);
2103 if (err)
2104 goto out;
2105 err = file_update_time(file);
2106 if (err)
2107 goto out;
2108 written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
2109 count);
2110out:
2111 current->backing_dev_info = NULL; 1957 current->backing_dev_info = NULL;
2112 return written ? written : err; 1958 return written ? written : err;
2113} 1959}
2114 1960
2115/** 1961/**
2116 * ntfs_file_aio_write - 1962 * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
1963 * @iocb: IO state structure
1964 * @from: iov_iter with data to write
1965 *
1966 * Basically the same as generic_file_write_iter() except that it ends up
1967 * calling ntfs_file_write_iter_nolock() instead of
1968 * __generic_file_write_iter().
2117 */ 1969 */
2118static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 1970static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2119 unsigned long nr_segs, loff_t pos)
2120{ 1971{
2121 struct file *file = iocb->ki_filp; 1972 struct file *file = iocb->ki_filp;
2122 struct address_space *mapping = file->f_mapping; 1973 struct inode *vi = file_inode(file);
2123 struct inode *inode = mapping->host;
2124 ssize_t ret; 1974 ssize_t ret;
2125 1975
2126 BUG_ON(iocb->ki_pos != pos); 1976 mutex_lock(&vi->i_mutex);
2127 1977 ret = ntfs_file_write_iter_nolock(iocb, from);
2128 mutex_lock(&inode->i_mutex); 1978 mutex_unlock(&vi->i_mutex);
2129 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
2130 mutex_unlock(&inode->i_mutex);
2131 if (ret > 0) { 1979 if (ret > 0) {
2132 int err = generic_write_sync(file, iocb->ki_pos - ret, ret); 1980 ssize_t err;
1981
1982 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
2133 if (err < 0) 1983 if (err < 0)
2134 ret = err; 1984 ret = err;
2135 } 1985 }
@@ -2197,37 +2047,17 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
2197#endif /* NTFS_RW */ 2047#endif /* NTFS_RW */
2198 2048
2199const struct file_operations ntfs_file_ops = { 2049const struct file_operations ntfs_file_ops = {
2200 .llseek = generic_file_llseek, /* Seek inside file. */ 2050 .llseek = generic_file_llseek,
2201 .read = new_sync_read, /* Read from file. */ 2051 .read = new_sync_read,
2202 .read_iter = generic_file_read_iter, /* Async read from file. */ 2052 .read_iter = generic_file_read_iter,
2203#ifdef NTFS_RW 2053#ifdef NTFS_RW
2204 .write = do_sync_write, /* Write to file. */ 2054 .write = new_sync_write,
2205 .aio_write = ntfs_file_aio_write, /* Async write to file. */ 2055 .write_iter = ntfs_file_write_iter,
2206 /*.release = ,*/ /* Last file is closed. See 2056 .fsync = ntfs_file_fsync,
2207 fs/ext2/file.c::
2208 ext2_release_file() for
2209 how to use this to discard
2210 preallocated space for
2211 write opened files. */
2212 .fsync = ntfs_file_fsync, /* Sync a file to disk. */
2213 /*.aio_fsync = ,*/ /* Sync all outstanding async
2214 i/o operations on a
2215 kiocb. */
2216#endif /* NTFS_RW */ 2057#endif /* NTFS_RW */
2217 /*.ioctl = ,*/ /* Perform function on the 2058 .mmap = generic_file_mmap,
2218 mounted filesystem. */ 2059 .open = ntfs_file_open,
2219 .mmap = generic_file_mmap, /* Mmap file. */ 2060 .splice_read = generic_file_splice_read,
2220 .open = ntfs_file_open, /* Open file. */
2221 .splice_read = generic_file_splice_read /* Zero-copy data send with
2222 the data source being on
2223 the ntfs partition. We do
2224 not need to care about the
2225 data destination. */
2226 /*.sendpage = ,*/ /* Zero-copy data send with
2227 the data destination being
2228 on the ntfs partition. We
2229 do not need to care about
2230 the data source. */
2231}; 2061};
2232 2062
2233const struct inode_operations ntfs_file_inode_ops = { 2063const struct inode_operations ntfs_file_inode_ops = {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 898b9949d363..1d0c21df0d80 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -28,7 +28,6 @@
28#include <linux/quotaops.h> 28#include <linux/quotaops.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/log2.h> 30#include <linux/log2.h>
31#include <linux/aio.h>
32 31
33#include "aops.h" 32#include "aops.h"
34#include "attrib.h" 33#include "attrib.h"
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 044158bd22be..2d7f76e52c37 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3370,7 +3370,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3370 ret = ocfs2_get_right_path(et, left_path, &right_path); 3370 ret = ocfs2_get_right_path(et, left_path, &right_path);
3371 if (ret) { 3371 if (ret) {
3372 mlog_errno(ret); 3372 mlog_errno(ret);
3373 goto out; 3373 return ret;
3374 } 3374 }
3375 3375
3376 right_el = path_leaf_el(right_path); 3376 right_el = path_leaf_el(right_path);
@@ -3453,8 +3453,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3453 subtree_index); 3453 subtree_index);
3454 } 3454 }
3455out: 3455out:
3456 if (right_path) 3456 ocfs2_free_path(right_path);
3457 ocfs2_free_path(right_path);
3458 return ret; 3457 return ret;
3459} 3458}
3460 3459
@@ -3536,7 +3535,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3536 ret = ocfs2_get_left_path(et, right_path, &left_path); 3535 ret = ocfs2_get_left_path(et, right_path, &left_path);
3537 if (ret) { 3536 if (ret) {
3538 mlog_errno(ret); 3537 mlog_errno(ret);
3539 goto out; 3538 return ret;
3540 } 3539 }
3541 3540
3542 left_el = path_leaf_el(left_path); 3541 left_el = path_leaf_el(left_path);
@@ -3647,8 +3646,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3647 right_path, subtree_index); 3646 right_path, subtree_index);
3648 } 3647 }
3649out: 3648out:
3650 if (left_path) 3649 ocfs2_free_path(left_path);
3651 ocfs2_free_path(left_path);
3652 return ret; 3650 return ret;
3653} 3651}
3654 3652
@@ -4334,17 +4332,17 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4334 } else if (path->p_tree_depth > 0) { 4332 } else if (path->p_tree_depth > 0) {
4335 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); 4333 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4336 if (status) 4334 if (status)
4337 goto out; 4335 goto exit;
4338 4336
4339 if (left_cpos != 0) { 4337 if (left_cpos != 0) {
4340 left_path = ocfs2_new_path_from_path(path); 4338 left_path = ocfs2_new_path_from_path(path);
4341 if (!left_path) 4339 if (!left_path)
4342 goto out; 4340 goto exit;
4343 4341
4344 status = ocfs2_find_path(et->et_ci, left_path, 4342 status = ocfs2_find_path(et->et_ci, left_path,
4345 left_cpos); 4343 left_cpos);
4346 if (status) 4344 if (status)
4347 goto out; 4345 goto free_left_path;
4348 4346
4349 new_el = path_leaf_el(left_path); 4347 new_el = path_leaf_el(left_path);
4350 4348
@@ -4361,7 +4359,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4361 le16_to_cpu(new_el->l_next_free_rec), 4359 le16_to_cpu(new_el->l_next_free_rec),
4362 le16_to_cpu(new_el->l_count)); 4360 le16_to_cpu(new_el->l_count));
4363 status = -EINVAL; 4361 status = -EINVAL;
4364 goto out; 4362 goto free_left_path;
4365 } 4363 }
4366 rec = &new_el->l_recs[ 4364 rec = &new_el->l_recs[
4367 le16_to_cpu(new_el->l_next_free_rec) - 1]; 4365 le16_to_cpu(new_el->l_next_free_rec) - 1];
@@ -4388,18 +4386,18 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4388 path->p_tree_depth > 0) { 4386 path->p_tree_depth > 0) {
4389 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); 4387 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4390 if (status) 4388 if (status)
4391 goto out; 4389 goto free_left_path;
4392 4390
4393 if (right_cpos == 0) 4391 if (right_cpos == 0)
4394 goto out; 4392 goto free_left_path;
4395 4393
4396 right_path = ocfs2_new_path_from_path(path); 4394 right_path = ocfs2_new_path_from_path(path);
4397 if (!right_path) 4395 if (!right_path)
4398 goto out; 4396 goto free_left_path;
4399 4397
4400 status = ocfs2_find_path(et->et_ci, right_path, right_cpos); 4398 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4401 if (status) 4399 if (status)
4402 goto out; 4400 goto free_right_path;
4403 4401
4404 new_el = path_leaf_el(right_path); 4402 new_el = path_leaf_el(right_path);
4405 rec = &new_el->l_recs[0]; 4403 rec = &new_el->l_recs[0];
@@ -4413,7 +4411,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4413 (unsigned long long)le64_to_cpu(eb->h_blkno), 4411 (unsigned long long)le64_to_cpu(eb->h_blkno),
4414 le16_to_cpu(new_el->l_next_free_rec)); 4412 le16_to_cpu(new_el->l_next_free_rec));
4415 status = -EINVAL; 4413 status = -EINVAL;
4416 goto out; 4414 goto free_right_path;
4417 } 4415 }
4418 rec = &new_el->l_recs[1]; 4416 rec = &new_el->l_recs[1];
4419 } 4417 }
@@ -4430,12 +4428,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4430 ret = contig_type; 4428 ret = contig_type;
4431 } 4429 }
4432 4430
4433out: 4431free_right_path:
4434 if (left_path) 4432 ocfs2_free_path(right_path);
4435 ocfs2_free_path(left_path); 4433free_left_path:
4436 if (right_path) 4434 ocfs2_free_path(left_path);
4437 ocfs2_free_path(right_path); 4435exit:
4438
4439 return ret; 4436 return ret;
4440} 4437}
4441 4438
@@ -6858,13 +6855,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6858 if (pages == NULL) { 6855 if (pages == NULL) {
6859 ret = -ENOMEM; 6856 ret = -ENOMEM;
6860 mlog_errno(ret); 6857 mlog_errno(ret);
6861 goto out; 6858 return ret;
6862 } 6859 }
6863 6860
6864 ret = ocfs2_reserve_clusters(osb, 1, &data_ac); 6861 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6865 if (ret) { 6862 if (ret) {
6866 mlog_errno(ret); 6863 mlog_errno(ret);
6867 goto out; 6864 goto free_pages;
6868 } 6865 }
6869 } 6866 }
6870 6867
@@ -6996,9 +6993,8 @@ out_commit:
6996out: 6993out:
6997 if (data_ac) 6994 if (data_ac)
6998 ocfs2_free_alloc_context(data_ac); 6995 ocfs2_free_alloc_context(data_ac);
6999 if (pages) 6996free_pages:
7000 kfree(pages); 6997 kfree(pages);
7001
7002 return ret; 6998 return ret;
7003} 6999}
7004 7000
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 44db1808cdb5..8d2bc840c288 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,6 +29,7 @@
29#include <linux/mpage.h> 29#include <linux/mpage.h>
30#include <linux/quotaops.h> 30#include <linux/quotaops.h>
31#include <linux/blkdev.h> 31#include <linux/blkdev.h>
32#include <linux/uio.h>
32 33
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
34 35
@@ -663,6 +664,117 @@ static int ocfs2_is_overwrite(struct ocfs2_super *osb,
663 return 0; 664 return 0;
664} 665}
665 666
667static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
668 struct inode *inode, loff_t offset,
669 u64 zero_len, int cluster_align)
670{
671 u32 p_cpos = 0;
672 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
673 unsigned int num_clusters = 0;
674 unsigned int ext_flags = 0;
675 int ret = 0;
676
677 if (offset <= i_size_read(inode) || cluster_align)
678 return 0;
679
680 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
681 &ext_flags);
682 if (ret < 0) {
683 mlog_errno(ret);
684 return ret;
685 }
686
687 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
688 u64 s = i_size_read(inode);
689 sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) +
690 (do_div(s, osb->s_clustersize) >> 9);
691
692 ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
693 zero_len >> 9, GFP_NOFS, false);
694 if (ret < 0)
695 mlog_errno(ret);
696 }
697
698 return ret;
699}
700
701static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
702 struct inode *inode, loff_t offset)
703{
704 u64 zero_start, zero_len, total_zero_len;
705 u32 p_cpos = 0, clusters_to_add;
706 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
707 unsigned int num_clusters = 0;
708 unsigned int ext_flags = 0;
709 u32 size_div, offset_div;
710 int ret = 0;
711
712 {
713 u64 o = offset;
714 u64 s = i_size_read(inode);
715
716 offset_div = do_div(o, osb->s_clustersize);
717 size_div = do_div(s, osb->s_clustersize);
718 }
719
720 if (offset <= i_size_read(inode))
721 return 0;
722
723 clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
724 ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
725 total_zero_len = offset - i_size_read(inode);
726 if (clusters_to_add)
727 total_zero_len -= offset_div;
728
729 /* Allocate clusters to fill out holes, and this is only needed
 730 * when we add more than one cluster. Otherwise the cluster will
731 * be allocated during direct IO */
732 if (clusters_to_add > 1) {
733 ret = ocfs2_extend_allocation(inode,
734 OCFS2_I(inode)->ip_clusters,
735 clusters_to_add - 1, 0);
736 if (ret) {
737 mlog_errno(ret);
738 goto out;
739 }
740 }
741
742 while (total_zero_len) {
743 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
744 &ext_flags);
745 if (ret < 0) {
746 mlog_errno(ret);
747 goto out;
748 }
749
750 zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
751 size_div;
752 zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
753 size_div;
754 zero_len = min(total_zero_len, zero_len);
755
756 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
757 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
758 zero_start >> 9, zero_len >> 9,
759 GFP_NOFS, false);
760 if (ret < 0) {
761 mlog_errno(ret);
762 goto out;
763 }
764 }
765
766 total_zero_len -= zero_len;
767 v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
768
 769 /* Only at the first iteration can the cluster be unaligned.
770 * So set size_div to 0 for the rest */
771 size_div = 0;
772 }
773
774out:
775 return ret;
776}
777
666static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, 778static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
667 struct iov_iter *iter, 779 struct iov_iter *iter,
668 loff_t offset) 780 loff_t offset)
@@ -677,8 +789,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
677 struct buffer_head *di_bh = NULL; 789 struct buffer_head *di_bh = NULL;
678 size_t count = iter->count; 790 size_t count = iter->count;
679 journal_t *journal = osb->journal->j_journal; 791 journal_t *journal = osb->journal->j_journal;
680 u32 zero_len; 792 u64 zero_len_head, zero_len_tail;
681 int cluster_align; 793 int cluster_align_head, cluster_align_tail;
682 loff_t final_size = offset + count; 794 loff_t final_size = offset + count;
683 int append_write = offset >= i_size_read(inode) ? 1 : 0; 795 int append_write = offset >= i_size_read(inode) ? 1 : 0;
684 unsigned int num_clusters = 0; 796 unsigned int num_clusters = 0;
@@ -686,9 +798,16 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
686 798
687 { 799 {
688 u64 o = offset; 800 u64 o = offset;
801 u64 s = i_size_read(inode);
802
803 zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
804 cluster_align_head = !zero_len_head;
689 805
690 zero_len = do_div(o, 1 << osb->s_clustersize_bits); 806 zero_len_tail = osb->s_clustersize -
691 cluster_align = !zero_len; 807 do_div(s, osb->s_clustersize);
808 if ((offset - i_size_read(inode)) < zero_len_tail)
809 zero_len_tail = offset - i_size_read(inode);
810 cluster_align_tail = !zero_len_tail;
692 } 811 }
693 812
694 /* 813 /*
@@ -706,21 +825,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
706 } 825 }
707 826
708 if (append_write) { 827 if (append_write) {
709 ret = ocfs2_inode_lock(inode, &di_bh, 1); 828 ret = ocfs2_inode_lock(inode, NULL, 1);
710 if (ret < 0) { 829 if (ret < 0) {
711 mlog_errno(ret); 830 mlog_errno(ret);
712 goto clean_orphan; 831 goto clean_orphan;
713 } 832 }
714 833
834 /* zeroing out the previously allocated cluster tail
 835 * that was not zeroed */
715 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 836 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
716 ret = ocfs2_zero_extend(inode, di_bh, offset); 837 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
838 zero_len_tail, cluster_align_tail);
717 else 839 else
718 ret = ocfs2_extend_no_holes(inode, di_bh, offset, 840 ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
719 offset); 841 offset);
720 if (ret < 0) { 842 if (ret < 0) {
721 mlog_errno(ret); 843 mlog_errno(ret);
722 ocfs2_inode_unlock(inode, 1); 844 ocfs2_inode_unlock(inode, 1);
723 brelse(di_bh);
724 goto clean_orphan; 845 goto clean_orphan;
725 } 846 }
726 847
@@ -728,13 +849,10 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
728 if (is_overwrite < 0) { 849 if (is_overwrite < 0) {
729 mlog_errno(is_overwrite); 850 mlog_errno(is_overwrite);
730 ocfs2_inode_unlock(inode, 1); 851 ocfs2_inode_unlock(inode, 1);
731 brelse(di_bh);
732 goto clean_orphan; 852 goto clean_orphan;
733 } 853 }
734 854
735 ocfs2_inode_unlock(inode, 1); 855 ocfs2_inode_unlock(inode, 1);
736 brelse(di_bh);
737 di_bh = NULL;
738 } 856 }
739 857
740 written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, 858 written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
@@ -771,15 +889,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
771 if (ret < 0) 889 if (ret < 0)
772 mlog_errno(ret); 890 mlog_errno(ret);
773 } 891 }
774 } else if (written < 0 && append_write && !is_overwrite && 892 } else if (written > 0 && append_write && !is_overwrite &&
775 !cluster_align) { 893 !cluster_align_head) {
894 /* zeroing out the allocated cluster head */
776 u32 p_cpos = 0; 895 u32 p_cpos = 0;
777 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); 896 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
778 897
898 ret = ocfs2_inode_lock(inode, NULL, 0);
899 if (ret < 0) {
900 mlog_errno(ret);
901 goto clean_orphan;
902 }
903
779 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, 904 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
780 &num_clusters, &ext_flags); 905 &num_clusters, &ext_flags);
781 if (ret < 0) { 906 if (ret < 0) {
782 mlog_errno(ret); 907 mlog_errno(ret);
908 ocfs2_inode_unlock(inode, 0);
783 goto clean_orphan; 909 goto clean_orphan;
784 } 910 }
785 911
@@ -787,9 +913,11 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
787 913
788 ret = blkdev_issue_zeroout(osb->sb->s_bdev, 914 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
789 p_cpos << (osb->s_clustersize_bits - 9), 915 p_cpos << (osb->s_clustersize_bits - 9),
790 zero_len >> 9, GFP_KERNEL, false); 916 zero_len_head >> 9, GFP_NOFS, false);
791 if (ret < 0) 917 if (ret < 0)
792 mlog_errno(ret); 918 mlog_errno(ret);
919
920 ocfs2_inode_unlock(inode, 0);
793 } 921 }
794 922
795clean_orphan: 923clean_orphan:
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 6cae155d54df..dd59599b022d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,7 +22,7 @@
22#ifndef OCFS2_AOPS_H 22#ifndef OCFS2_AOPS_H
23#define OCFS2_AOPS_H 23#define OCFS2_AOPS_H
24 24
25#include <linux/aio.h> 25#include <linux/fs.h>
26 26
27handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 27handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
28 struct page *page, 28 struct page *page,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 16eff45727ee..8e19b9d7aba8 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1312,7 +1312,9 @@ static int o2hb_debug_init(void)
1312 int ret = -ENOMEM; 1312 int ret = -ENOMEM;
1313 1313
1314 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); 1314 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
1315 if (!o2hb_debug_dir) { 1315 if (IS_ERR_OR_NULL(o2hb_debug_dir)) {
1316 ret = o2hb_debug_dir ?
1317 PTR_ERR(o2hb_debug_dir) : -ENOMEM;
1316 mlog_errno(ret); 1318 mlog_errno(ret);
1317 goto bail; 1319 goto bail;
1318 } 1320 }
@@ -1325,7 +1327,9 @@ static int o2hb_debug_init(void)
1325 sizeof(o2hb_live_node_bitmap), 1327 sizeof(o2hb_live_node_bitmap),
1326 O2NM_MAX_NODES, 1328 O2NM_MAX_NODES,
1327 o2hb_live_node_bitmap); 1329 o2hb_live_node_bitmap);
1328 if (!o2hb_debug_livenodes) { 1330 if (IS_ERR_OR_NULL(o2hb_debug_livenodes)) {
1331 ret = o2hb_debug_livenodes ?
1332 PTR_ERR(o2hb_debug_livenodes) : -ENOMEM;
1329 mlog_errno(ret); 1333 mlog_errno(ret);
1330 goto bail; 1334 goto bail;
1331 } 1335 }
@@ -1338,7 +1342,9 @@ static int o2hb_debug_init(void)
1338 sizeof(o2hb_live_region_bitmap), 1342 sizeof(o2hb_live_region_bitmap),
1339 O2NM_MAX_REGIONS, 1343 O2NM_MAX_REGIONS,
1340 o2hb_live_region_bitmap); 1344 o2hb_live_region_bitmap);
1341 if (!o2hb_debug_liveregions) { 1345 if (IS_ERR_OR_NULL(o2hb_debug_liveregions)) {
1346 ret = o2hb_debug_liveregions ?
1347 PTR_ERR(o2hb_debug_liveregions) : -ENOMEM;
1342 mlog_errno(ret); 1348 mlog_errno(ret);
1343 goto bail; 1349 goto bail;
1344 } 1350 }
@@ -1352,7 +1358,9 @@ static int o2hb_debug_init(void)
1352 sizeof(o2hb_quorum_region_bitmap), 1358 sizeof(o2hb_quorum_region_bitmap),
1353 O2NM_MAX_REGIONS, 1359 O2NM_MAX_REGIONS,
1354 o2hb_quorum_region_bitmap); 1360 o2hb_quorum_region_bitmap);
1355 if (!o2hb_debug_quorumregions) { 1361 if (IS_ERR_OR_NULL(o2hb_debug_quorumregions)) {
1362 ret = o2hb_debug_quorumregions ?
1363 PTR_ERR(o2hb_debug_quorumregions) : -ENOMEM;
1356 mlog_errno(ret); 1364 mlog_errno(ret);
1357 goto bail; 1365 goto bail;
1358 } 1366 }
@@ -1366,7 +1374,9 @@ static int o2hb_debug_init(void)
1366 sizeof(o2hb_failed_region_bitmap), 1374 sizeof(o2hb_failed_region_bitmap),
1367 O2NM_MAX_REGIONS, 1375 O2NM_MAX_REGIONS,
1368 o2hb_failed_region_bitmap); 1376 o2hb_failed_region_bitmap);
1369 if (!o2hb_debug_failedregions) { 1377 if (IS_ERR_OR_NULL(o2hb_debug_failedregions)) {
1378 ret = o2hb_debug_failedregions ?
1379 PTR_ERR(o2hb_debug_failedregions) : -ENOMEM;
1370 mlog_errno(ret); 1380 mlog_errno(ret);
1371 goto bail; 1381 goto bail;
1372 } 1382 }
@@ -2000,7 +2010,8 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2000 2010
2001 reg->hr_debug_dir = 2011 reg->hr_debug_dir =
2002 debugfs_create_dir(config_item_name(&reg->hr_item), dir); 2012 debugfs_create_dir(config_item_name(&reg->hr_item), dir);
2003 if (!reg->hr_debug_dir) { 2013 if (IS_ERR_OR_NULL(reg->hr_debug_dir)) {
2014 ret = reg->hr_debug_dir ? PTR_ERR(reg->hr_debug_dir) : -ENOMEM;
2004 mlog_errno(ret); 2015 mlog_errno(ret);
2005 goto bail; 2016 goto bail;
2006 } 2017 }
@@ -2013,7 +2024,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2013 O2HB_DB_TYPE_REGION_LIVENODES, 2024 O2HB_DB_TYPE_REGION_LIVENODES,
2014 sizeof(reg->hr_live_node_bitmap), 2025 sizeof(reg->hr_live_node_bitmap),
2015 O2NM_MAX_NODES, reg); 2026 O2NM_MAX_NODES, reg);
2016 if (!reg->hr_debug_livenodes) { 2027 if (IS_ERR_OR_NULL(reg->hr_debug_livenodes)) {
2028 ret = reg->hr_debug_livenodes ?
2029 PTR_ERR(reg->hr_debug_livenodes) : -ENOMEM;
2017 mlog_errno(ret); 2030 mlog_errno(ret);
2018 goto bail; 2031 goto bail;
2019 } 2032 }
@@ -2025,7 +2038,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2025 sizeof(*(reg->hr_db_regnum)), 2038 sizeof(*(reg->hr_db_regnum)),
2026 O2HB_DB_TYPE_REGION_NUMBER, 2039 O2HB_DB_TYPE_REGION_NUMBER,
2027 0, O2NM_MAX_NODES, reg); 2040 0, O2NM_MAX_NODES, reg);
2028 if (!reg->hr_debug_regnum) { 2041 if (IS_ERR_OR_NULL(reg->hr_debug_regnum)) {
2042 ret = reg->hr_debug_regnum ?
2043 PTR_ERR(reg->hr_debug_regnum) : -ENOMEM;
2029 mlog_errno(ret); 2044 mlog_errno(ret);
2030 goto bail; 2045 goto bail;
2031 } 2046 }
@@ -2037,7 +2052,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2037 sizeof(*(reg->hr_db_elapsed_time)), 2052 sizeof(*(reg->hr_db_elapsed_time)),
2038 O2HB_DB_TYPE_REGION_ELAPSED_TIME, 2053 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
2039 0, 0, reg); 2054 0, 0, reg);
2040 if (!reg->hr_debug_elapsed_time) { 2055 if (IS_ERR_OR_NULL(reg->hr_debug_elapsed_time)) {
2056 ret = reg->hr_debug_elapsed_time ?
2057 PTR_ERR(reg->hr_debug_elapsed_time) : -ENOMEM;
2041 mlog_errno(ret); 2058 mlog_errno(ret);
2042 goto bail; 2059 goto bail;
2043 } 2060 }
@@ -2049,13 +2066,16 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2049 sizeof(*(reg->hr_db_pinned)), 2066 sizeof(*(reg->hr_db_pinned)),
2050 O2HB_DB_TYPE_REGION_PINNED, 2067 O2HB_DB_TYPE_REGION_PINNED,
2051 0, 0, reg); 2068 0, 0, reg);
2052 if (!reg->hr_debug_pinned) { 2069 if (IS_ERR_OR_NULL(reg->hr_debug_pinned)) {
2070 ret = reg->hr_debug_pinned ?
2071 PTR_ERR(reg->hr_debug_pinned) : -ENOMEM;
2053 mlog_errno(ret); 2072 mlog_errno(ret);
2054 goto bail; 2073 goto bail;
2055 } 2074 }
2056 2075
2057 ret = 0; 2076 return 0;
2058bail: 2077bail:
2078 debugfs_remove_recursive(reg->hr_debug_dir);
2059 return ret; 2079 return ret;
2060} 2080}
2061 2081
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 2260fb9e6508..7fdc25a4d8c0 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -196,13 +196,14 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
196 } \ 196 } \
197} while (0) 197} while (0)
198 198
199#define mlog_errno(st) do { \ 199#define mlog_errno(st) ({ \
200 int _st = (st); \ 200 int _st = (st); \
201 if (_st != -ERESTARTSYS && _st != -EINTR && \ 201 if (_st != -ERESTARTSYS && _st != -EINTR && \
202 _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ 202 _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
203 _st != -EDQUOT) \ 203 _st != -EDQUOT) \
204 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ 204 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
205} while (0) 205 _st; \
206})
206 207
207#define mlog_bug_on_msg(cond, fmt, args...) do { \ 208#define mlog_bug_on_msg(cond, fmt, args...) do { \
208 if (cond) { \ 209 if (cond) { \
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b08050bd3f2e..ccd4dcfc3645 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -18,7 +18,7 @@
18 * 18 *
19 * linux/fs/minix/dir.c 19 * linux/fs/minix/dir.c
20 * 20 *
21 * Copyright (C) 1991, 1992 Linux Torvalds 21 * Copyright (C) 1991, 1992 Linus Torvalds
22 * 22 *
23 * This program is free software; you can redistribute it and/or 23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public 24 * modify it under the terms of the GNU General Public
@@ -2047,22 +2047,19 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
2047 const char *name, 2047 const char *name,
2048 int namelen) 2048 int namelen)
2049{ 2049{
2050 int ret; 2050 int ret = 0;
2051 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2051 struct ocfs2_dir_lookup_result lookup = { NULL, };
2052 2052
2053 trace_ocfs2_check_dir_for_entry( 2053 trace_ocfs2_check_dir_for_entry(
2054 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); 2054 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
2055 2055
2056 ret = -EEXIST; 2056 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) {
2057 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) 2057 ret = -EEXIST;
2058 goto bail; 2058 mlog_errno(ret);
2059 }
2059 2060
2060 ret = 0;
2061bail:
2062 ocfs2_free_dir_lookup_result(&lookup); 2061 ocfs2_free_dir_lookup_result(&lookup);
2063 2062
2064 if (ret)
2065 mlog_errno(ret);
2066 return ret; 2063 return ret;
2067} 2064}
2068 2065
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 11849a44dc5a..956edf67be20 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1391,6 +1391,11 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1391 int noqueue_attempted = 0; 1391 int noqueue_attempted = 0;
1392 int dlm_locked = 0; 1392 int dlm_locked = 0;
1393 1393
1394 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
1395 mlog_errno(-EINVAL);
1396 return -EINVAL;
1397 }
1398
1394 ocfs2_init_mask_waiter(&mw); 1399 ocfs2_init_mask_waiter(&mw);
1395 1400
1396 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 1401 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
@@ -2954,7 +2959,7 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2954 osb->osb_debug_root, 2959 osb->osb_debug_root,
2955 osb, 2960 osb,
2956 &ocfs2_dlm_debug_fops); 2961 &ocfs2_dlm_debug_fops);
2957 if (!dlm_debug->d_locking_state) { 2962 if (IS_ERR_OR_NULL(dlm_debug->d_locking_state)) {
2958 ret = -EINVAL; 2963 ret = -EINVAL;
2959 mlog(ML_ERROR, 2964 mlog(ML_ERROR,
2960 "Unable to create locking state debugfs file.\n"); 2965 "Unable to create locking state debugfs file.\n");
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 29651167190d..540dc4bdd042 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -82,7 +82,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
82 } 82 }
83 83
84 status = ocfs2_test_inode_bit(osb, blkno, &set); 84 status = ocfs2_test_inode_bit(osb, blkno, &set);
85 trace_ocfs2_get_dentry_test_bit(status, set);
86 if (status < 0) { 85 if (status < 0) {
87 if (status == -EINVAL) { 86 if (status == -EINVAL) {
88 /* 87 /*
@@ -96,6 +95,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
96 goto unlock_nfs_sync; 95 goto unlock_nfs_sync;
97 } 96 }
98 97
98 trace_ocfs2_get_dentry_test_bit(status, set);
99 /* If the inode allocator bit is clear, this inode must be stale */ 99 /* If the inode allocator bit is clear, this inode must be stale */
100 if (!set) { 100 if (!set) {
101 status = -ESTALE; 101 status = -ESTALE;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 46e0d4e857c7..91f03ce98108 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2280,7 +2280,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2280 file->f_path.dentry->d_name.name, 2280 file->f_path.dentry->d_name.name,
2281 (unsigned int)from->nr_segs); /* GRRRRR */ 2281 (unsigned int)from->nr_segs); /* GRRRRR */
2282 2282
2283 if (iocb->ki_nbytes == 0) 2283 if (count == 0)
2284 return 0; 2284 return 0;
2285 2285
2286 appending = file->f_flags & O_APPEND ? 1 : 0; 2286 appending = file->f_flags & O_APPEND ? 1 : 0;
@@ -2330,8 +2330,7 @@ relock:
2330 } 2330 }
2331 2331
2332 can_do_direct = direct_io; 2332 can_do_direct = direct_io;
2333 ret = ocfs2_prepare_inode_for_write(file, ppos, 2333 ret = ocfs2_prepare_inode_for_write(file, ppos, count, appending,
2334 iocb->ki_nbytes, appending,
2335 &can_do_direct, &has_refcount); 2334 &can_do_direct, &has_refcount);
2336 if (ret < 0) { 2335 if (ret < 0) {
2337 mlog_errno(ret); 2336 mlog_errno(ret);
@@ -2339,8 +2338,7 @@ relock:
2339 } 2338 }
2340 2339
2341 if (direct_io && !is_sync_kiocb(iocb)) 2340 if (direct_io && !is_sync_kiocb(iocb))
2342 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, 2341 unaligned_dio = ocfs2_is_io_unaligned(inode, count, *ppos);
2343 *ppos);
2344 2342
2345 /* 2343 /*
2346 * We can't complete the direct I/O as requested, fall back to 2344 * We can't complete the direct I/O as requested, fall back to
@@ -2394,7 +2392,6 @@ relock:
2394 /* 2392 /*
2395 * for completing the rest of the request. 2393 * for completing the rest of the request.
2396 */ 2394 */
2397 *ppos += written;
2398 count -= written; 2395 count -= written;
2399 written_buffered = generic_perform_write(file, from, *ppos); 2396 written_buffered = generic_perform_write(file, from, *ppos);
2400 /* 2397 /*
@@ -2409,7 +2406,6 @@ relock:
2409 goto out_dio; 2406 goto out_dio;
2410 } 2407 }
2411 2408
2412 iocb->ki_pos = *ppos + written_buffered;
2413 /* We need to ensure that the page cache pages are written to 2409 /* We need to ensure that the page cache pages are written to
2414 * disk and invalidated to preserve the expected O_DIRECT 2410 * disk and invalidated to preserve the expected O_DIRECT
2415 * semantics. 2411 * semantics.
@@ -2418,6 +2414,7 @@ relock:
2418 ret = filemap_write_and_wait_range(file->f_mapping, *ppos, 2414 ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
2419 endbyte); 2415 endbyte);
2420 if (ret == 0) { 2416 if (ret == 0) {
2417 iocb->ki_pos = *ppos + written_buffered;
2421 written += written_buffered; 2418 written += written_buffered;
2422 invalidate_mapping_pages(mapping, 2419 invalidate_mapping_pages(mapping,
2423 *ppos >> PAGE_CACHE_SHIFT, 2420 *ppos >> PAGE_CACHE_SHIFT,
@@ -2440,10 +2437,14 @@ out_dio:
2440 /* buffered aio wouldn't have proper lock coverage today */ 2437 /* buffered aio wouldn't have proper lock coverage today */
2441 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2438 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2442 2439
2440 if (unlikely(written <= 0))
2441 goto no_sync;
2442
2443 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2443 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2444 ((file->f_flags & O_DIRECT) && !direct_io)) { 2444 ((file->f_flags & O_DIRECT) && !direct_io)) {
2445 ret = filemap_fdatawrite_range(file->f_mapping, *ppos, 2445 ret = filemap_fdatawrite_range(file->f_mapping,
2446 *ppos + count - 1); 2446 iocb->ki_pos - written,
2447 iocb->ki_pos - 1);
2447 if (ret < 0) 2448 if (ret < 0)
2448 written = ret; 2449 written = ret;
2449 2450
@@ -2454,10 +2455,12 @@ out_dio:
2454 } 2455 }
2455 2456
2456 if (!ret) 2457 if (!ret)
2457 ret = filemap_fdatawait_range(file->f_mapping, *ppos, 2458 ret = filemap_fdatawait_range(file->f_mapping,
2458 *ppos + count - 1); 2459 iocb->ki_pos - written,
2460 iocb->ki_pos - 1);
2459 } 2461 }
2460 2462
2463no_sync:
2461 /* 2464 /*
2462 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2465 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2463 * function pointer which is called when o_direct io completes so that 2466 * function pointer which is called when o_direct io completes so that
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 3025c0da6b8a..be71ca0937f7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -624,7 +624,7 @@ static int ocfs2_remove_inode(struct inode *inode,
624 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, 624 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
625 le16_to_cpu(di->i_suballoc_slot)); 625 le16_to_cpu(di->i_suballoc_slot));
626 if (!inode_alloc_inode) { 626 if (!inode_alloc_inode) {
627 status = -EEXIST; 627 status = -ENOENT;
628 mlog_errno(status); 628 mlog_errno(status);
629 goto bail; 629 goto bail;
630 } 630 }
@@ -742,7 +742,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
742 ORPHAN_DIR_SYSTEM_INODE, 742 ORPHAN_DIR_SYSTEM_INODE,
743 orphaned_slot); 743 orphaned_slot);
744 if (!orphan_dir_inode) { 744 if (!orphan_dir_inode) {
745 status = -EEXIST; 745 status = -ENOENT;
746 mlog_errno(status); 746 mlog_errno(status);
747 goto bail; 747 goto bail;
748 } 748 }
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 044013455621..857bbbcd39f3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -666,7 +666,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
666 if (le32_to_cpu(alloc->id1.bitmap1.i_used) != 666 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
667 ocfs2_local_alloc_count_bits(alloc)) { 667 ocfs2_local_alloc_count_bits(alloc)) {
668 ocfs2_error(osb->sb, "local alloc inode %llu says it has " 668 ocfs2_error(osb->sb, "local alloc inode %llu says it has "
669 "%u free bits, but a count shows %u", 669 "%u used bits, but a count shows %u",
670 (unsigned long long)le64_to_cpu(alloc->i_blkno), 670 (unsigned long long)le64_to_cpu(alloc->i_blkno),
671 le32_to_cpu(alloc->id1.bitmap1.i_used), 671 le32_to_cpu(alloc->id1.bitmap1.i_used),
672 ocfs2_local_alloc_count_bits(alloc)); 672 ocfs2_local_alloc_count_bits(alloc));
@@ -839,7 +839,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
839 u32 *numbits, 839 u32 *numbits,
840 struct ocfs2_alloc_reservation *resv) 840 struct ocfs2_alloc_reservation *resv)
841{ 841{
842 int numfound, bitoff, left, startoff, lastzero; 842 int numfound = 0, bitoff, left, startoff, lastzero;
843 int local_resv = 0; 843 int local_resv = 0;
844 struct ocfs2_alloc_reservation r; 844 struct ocfs2_alloc_reservation r;
845 void *bitmap = NULL; 845 void *bitmap = NULL;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b5c3a5ea3ee6..09f90cbf0e24 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2322,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2322 2322
2323 trace_ocfs2_orphan_del( 2323 trace_ocfs2_orphan_del(
2324 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2324 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2325 name, namelen); 2325 name, strlen(name));
2326 2326
2327 /* find it's spot in the orphan directory */ 2327 /* find it's spot in the orphan directory */
2328 status = ocfs2_find_entry(name, namelen, orphan_dir_inode, 2328 status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
2329 &lookup); 2329 &lookup);
2330 if (status) { 2330 if (status) {
2331 mlog_errno(status); 2331 mlog_errno(status);
@@ -2808,7 +2808,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2808 ORPHAN_DIR_SYSTEM_INODE, 2808 ORPHAN_DIR_SYSTEM_INODE,
2809 osb->slot_num); 2809 osb->slot_num);
2810 if (!orphan_dir_inode) { 2810 if (!orphan_dir_inode) {
2811 status = -EEXIST; 2811 status = -ENOENT;
2812 mlog_errno(status); 2812 mlog_errno(status);
2813 goto leave; 2813 goto leave;
2814 } 2814 }
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ee541f92dab4..df3a500789c7 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4276,7 +4276,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4276 error = posix_acl_create(dir, &mode, &default_acl, &acl); 4276 error = posix_acl_create(dir, &mode, &default_acl, &acl);
4277 if (error) { 4277 if (error) {
4278 mlog_errno(error); 4278 mlog_errno(error);
4279 goto out; 4279 return error;
4280 } 4280 }
4281 4281
4282 error = ocfs2_create_inode_in_orphan(dir, mode, 4282 error = ocfs2_create_inode_in_orphan(dir, mode,
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d5493e361a38..e78a203d44c8 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -427,7 +427,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
427 if (!si) { 427 if (!si) {
428 status = -ENOMEM; 428 status = -ENOMEM;
429 mlog_errno(status); 429 mlog_errno(status);
430 goto bail; 430 return status;
431 } 431 }
432 432
433 si->si_extended = ocfs2_uses_extended_slot_map(osb); 433 si->si_extended = ocfs2_uses_extended_slot_map(osb);
@@ -452,7 +452,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
452 452
453 osb->slot_info = (struct ocfs2_slot_info *)si; 453 osb->slot_info = (struct ocfs2_slot_info *)si;
454bail: 454bail:
455 if (status < 0 && si) 455 if (status < 0)
456 __ocfs2_free_slot_info(si); 456 __ocfs2_free_slot_info(si);
457 457
458 return status; 458 return status;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 1724d43d3da1..220cae7bbdbc 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -295,7 +295,7 @@ static int o2cb_cluster_check(void)
295 set_bit(node_num, netmap); 295 set_bit(node_num, netmap);
296 if (!memcmp(hbmap, netmap, sizeof(hbmap))) 296 if (!memcmp(hbmap, netmap, sizeof(hbmap)))
297 return 0; 297 return 0;
298 if (i < O2CB_MAP_STABILIZE_COUNT) 298 if (i < O2CB_MAP_STABILIZE_COUNT - 1)
299 msleep(1000); 299 msleep(1000);
300 } 300 }
301 301
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 720aa389e0ea..2768eb1da2b8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1004,10 +1004,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
1004 BUG_ON(conn == NULL); 1004 BUG_ON(conn == NULL);
1005 1005
1006 lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); 1006 lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
1007 if (!lc) { 1007 if (!lc)
1008 rc = -ENOMEM; 1008 return -ENOMEM;
1009 goto out;
1010 }
1011 1009
1012 init_waitqueue_head(&lc->oc_wait); 1010 init_waitqueue_head(&lc->oc_wait);
1013 init_completion(&lc->oc_sync_wait); 1011 init_completion(&lc->oc_sync_wait);
@@ -1063,7 +1061,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
1063 } 1061 }
1064 1062
1065out: 1063out:
1066 if (rc && lc) 1064 if (rc)
1067 kfree(lc); 1065 kfree(lc);
1068 return rc; 1066 return rc;
1069} 1067}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 0cb889a17ae1..4479029630bb 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2499,6 +2499,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2499 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2499 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2500 if (status < 0) { 2500 if (status < 0) {
2501 mlog_errno(status); 2501 mlog_errno(status);
2502 ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2503 start_bit, count);
2502 goto bail; 2504 goto bail;
2503 } 2505 }
2504 2506
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 26675185b886..837ddce4b659 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1112,7 +1112,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1112 1112
1113 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, 1113 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
1114 ocfs2_debugfs_root); 1114 ocfs2_debugfs_root);
1115 if (!osb->osb_debug_root) { 1115 if (IS_ERR_OR_NULL(osb->osb_debug_root)) {
1116 status = -EINVAL; 1116 status = -EINVAL;
1117 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); 1117 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
1118 goto read_super_error; 1118 goto read_super_error;
@@ -1122,7 +1122,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1122 osb->osb_debug_root, 1122 osb->osb_debug_root,
1123 osb, 1123 osb,
1124 &ocfs2_osb_debug_fops); 1124 &ocfs2_osb_debug_fops);
1125 if (!osb->osb_ctxt) { 1125 if (IS_ERR_OR_NULL(osb->osb_ctxt)) {
1126 status = -EINVAL; 1126 status = -EINVAL;
1127 mlog_errno(status); 1127 mlog_errno(status);
1128 goto read_super_error; 1128 goto read_super_error;
@@ -1606,8 +1606,9 @@ static int __init ocfs2_init(void)
1606 } 1606 }
1607 1607
1608 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1608 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1609 if (!ocfs2_debugfs_root) { 1609 if (IS_ERR_OR_NULL(ocfs2_debugfs_root)) {
1610 status = -ENOMEM; 1610 status = ocfs2_debugfs_root ?
1611 PTR_ERR(ocfs2_debugfs_root) : -ENOMEM;
1611 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1612 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1612 goto out4; 1613 goto out4;
1613 } 1614 }
@@ -2069,6 +2070,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2069 cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); 2070 cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
2070 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 2071 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
2071 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 2072 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
2073 memcpy(sb->s_uuid, di->id2.i_super.s_uuid,
2074 sizeof(di->id2.i_super.s_uuid));
2072 2075
2073 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; 2076 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
2074 2077
@@ -2333,7 +2336,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2333 mlog_errno(status); 2336 mlog_errno(status);
2334 goto bail; 2337 goto bail;
2335 } 2338 }
2336 cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); 2339 cleancache_init_shared_fs(sb);
2337 2340
2338bail: 2341bail:
2339 return status; 2342 return status;
@@ -2563,22 +2566,22 @@ static void ocfs2_handle_error(struct super_block *sb)
2563 ocfs2_set_ro_flag(osb, 0); 2566 ocfs2_set_ro_flag(osb, 0);
2564} 2567}
2565 2568
2566static char error_buf[1024]; 2569void __ocfs2_error(struct super_block *sb, const char *function,
2567 2570 const char *fmt, ...)
2568void __ocfs2_error(struct super_block *sb,
2569 const char *function,
2570 const char *fmt, ...)
2571{ 2571{
2572 struct va_format vaf;
2572 va_list args; 2573 va_list args;
2573 2574
2574 va_start(args, fmt); 2575 va_start(args, fmt);
2575 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2576 vaf.fmt = fmt;
2576 va_end(args); 2577 vaf.va = &args;
2577 2578
2578 /* Not using mlog here because we want to show the actual 2579 /* Not using mlog here because we want to show the actual
2579 * function the error came from. */ 2580 * function the error came from. */
2580 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", 2581 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
2581 sb->s_id, function, error_buf); 2582 sb->s_id, function, &vaf);
2583
2584 va_end(args);
2582 2585
2583 ocfs2_handle_error(sb); 2586 ocfs2_handle_error(sb);
2584} 2587}
@@ -2586,18 +2589,21 @@ void __ocfs2_error(struct super_block *sb,
2586/* Handle critical errors. This is intentionally more drastic than 2589/* Handle critical errors. This is intentionally more drastic than
2587 * ocfs2_handle_error, so we only use for things like journal errors, 2590 * ocfs2_handle_error, so we only use for things like journal errors,
2588 * etc. */ 2591 * etc. */
2589void __ocfs2_abort(struct super_block* sb, 2592void __ocfs2_abort(struct super_block *sb, const char *function,
2590 const char *function,
2591 const char *fmt, ...) 2593 const char *fmt, ...)
2592{ 2594{
2595 struct va_format vaf;
2593 va_list args; 2596 va_list args;
2594 2597
2595 va_start(args, fmt); 2598 va_start(args, fmt);
2596 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
2597 va_end(args);
2598 2599
2599 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", 2600 vaf.fmt = fmt;
2600 sb->s_id, function, error_buf); 2601 vaf.va = &args;
2602
2603 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
2604 sb->s_id, function, &vaf);
2605
2606 va_end(args);
2601 2607
2602 /* We don't have the cluster support yet to go straight to 2608 /* We don't have the cluster support yet to go straight to
2603 * hard readonly in here. Until then, we want to keep 2609 * hard readonly in here. Until then, we want to keep
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 85b190dc132f..4ca7533be479 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1238,6 +1238,10 @@ static int ocfs2_xattr_block_get(struct inode *inode,
1238 i, 1238 i,
1239 &block_off, 1239 &block_off,
1240 &name_offset); 1240 &name_offset);
1241 if (ret) {
1242 mlog_errno(ret);
1243 goto cleanup;
1244 }
1241 xs->base = bucket_block(xs->bucket, block_off); 1245 xs->base = bucket_block(xs->bucket, block_off);
1242 } 1246 }
1243 if (ocfs2_xattr_is_local(xs->here)) { 1247 if (ocfs2_xattr_is_local(xs->here)) {
@@ -5665,6 +5669,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5665 5669
5666 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, 5670 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
5667 i, &xv, NULL); 5671 i, &xv, NULL);
5672 if (ret) {
5673 mlog_errno(ret);
5674 break;
5675 }
5668 5676
5669 ret = ocfs2_lock_xattr_remove_allocators(inode, xv, 5677 ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
5670 args->ref_ci, 5678 args->ref_ci,
diff --git a/fs/open.c b/fs/open.c
index 33f9cbf2610b..6a83c47d5904 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -570,6 +570,7 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
570 uid = make_kuid(current_user_ns(), user); 570 uid = make_kuid(current_user_ns(), user);
571 gid = make_kgid(current_user_ns(), group); 571 gid = make_kgid(current_user_ns(), group);
572 572
573retry_deleg:
573 newattrs.ia_valid = ATTR_CTIME; 574 newattrs.ia_valid = ATTR_CTIME;
574 if (user != (uid_t) -1) { 575 if (user != (uid_t) -1) {
575 if (!uid_valid(uid)) 576 if (!uid_valid(uid))
@@ -586,7 +587,6 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
586 if (!S_ISDIR(inode->i_mode)) 587 if (!S_ISDIR(inode->i_mode))
587 newattrs.ia_valid |= 588 newattrs.ia_valid |=
588 ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; 589 ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
589retry_deleg:
590 mutex_lock(&inode->i_mutex); 590 mutex_lock(&inode->i_mutex);
591 error = security_path_chown(path, uid, gid); 591 error = security_path_chown(path, uid, gid);
592 if (!error) 592 if (!error)
@@ -988,9 +988,6 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
988 return ERR_PTR(err); 988 return ERR_PTR(err);
989 if (flags & O_CREAT) 989 if (flags & O_CREAT)
990 return ERR_PTR(-EINVAL); 990 return ERR_PTR(-EINVAL);
991 if (!filename && (flags & O_DIRECTORY))
992 if (!dentry->d_inode->i_op->lookup)
993 return ERR_PTR(-ENOTDIR);
994 return do_file_open_root(dentry, mnt, filename, &op); 991 return do_file_open_root(dentry, mnt, filename, &op);
995} 992}
996EXPORT_SYMBOL(file_open_root); 993EXPORT_SYMBOL(file_open_root);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index b90952f528b1..5f0d1993e6e3 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -529,8 +529,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data)
529{ 529{
530 struct ovl_fs *ufs = sb->s_fs_info; 530 struct ovl_fs *ufs = sb->s_fs_info;
531 531
532 if (!(*flags & MS_RDONLY) && 532 if (!(*flags & MS_RDONLY) && !ufs->upper_mnt)
533 (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)))
534 return -EROFS; 533 return -EROFS;
535 534
536 return 0; 535 return 0;
@@ -615,9 +614,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
615 break; 614 break;
616 615
617 default: 616 default:
617 pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
618 return -EINVAL; 618 return -EINVAL;
619 } 619 }
620 } 620 }
621
622 /* Workdir is useless in non-upper mount */
623 if (!config->upperdir && config->workdir) {
624 pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
625 config->workdir);
626 kfree(config->workdir);
627 config->workdir = NULL;
628 }
629
621 return 0; 630 return 0;
622} 631}
623 632
@@ -837,7 +846,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
837 846
838 sb->s_stack_depth = 0; 847 sb->s_stack_depth = 0;
839 if (ufs->config.upperdir) { 848 if (ufs->config.upperdir) {
840 /* FIXME: workdir is not needed for a R/O mount */
841 if (!ufs->config.workdir) { 849 if (!ufs->config.workdir) {
842 pr_err("overlayfs: missing 'workdir'\n"); 850 pr_err("overlayfs: missing 'workdir'\n");
843 goto out_free_config; 851 goto out_free_config;
@@ -847,6 +855,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
847 if (err) 855 if (err)
848 goto out_free_config; 856 goto out_free_config;
849 857
858 /* Upper fs should not be r/o */
859 if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) {
860 pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
861 err = -EINVAL;
862 goto out_put_upperpath;
863 }
864
850 err = ovl_mount_dir(ufs->config.workdir, &workpath); 865 err = ovl_mount_dir(ufs->config.workdir, &workpath);
851 if (err) 866 if (err)
852 goto out_put_upperpath; 867 goto out_put_upperpath;
@@ -869,8 +884,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
869 884
870 err = -EINVAL; 885 err = -EINVAL;
871 stacklen = ovl_split_lowerdirs(lowertmp); 886 stacklen = ovl_split_lowerdirs(lowertmp);
872 if (stacklen > OVL_MAX_STACK) 887 if (stacklen > OVL_MAX_STACK) {
888 pr_err("overlayfs: too many lower directries, limit is %d\n",
889 OVL_MAX_STACK);
873 goto out_free_lowertmp; 890 goto out_free_lowertmp;
891 } else if (!ufs->config.upperdir && stacklen == 1) {
892 pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
893 goto out_free_lowertmp;
894 }
874 895
875 stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); 896 stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
876 if (!stack) 897 if (!stack)
@@ -932,8 +953,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
932 ufs->numlower++; 953 ufs->numlower++;
933 } 954 }
934 955
935 /* If the upper fs is r/o or nonexistent, we mark overlayfs r/o too */ 956 /* If the upper fs is nonexistent, we mark overlayfs r/o too */
936 if (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)) 957 if (!ufs->upper_mnt)
937 sb->s_flags |= MS_RDONLY; 958 sb->s_flags |= MS_RDONLY;
938 959
939 sb->s_d_op = &ovl_dentry_operations; 960 sb->s_d_op = &ovl_dentry_operations;
diff --git a/fs/pipe.c b/fs/pipe.c
index 21981e58e2a6..2d084f2d0b83 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,7 +21,6 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/fcntl.h> 23#include <linux/fcntl.h>
24#include <linux/aio.h>
25 24
26#include <asm/uaccess.h> 25#include <asm/uaccess.h>
27#include <asm/ioctls.h> 26#include <asm/ioctls.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 956b75d61809..6dee68d013ff 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1325,6 +1325,9 @@ out:
1325 1325
1326static int pagemap_open(struct inode *inode, struct file *file) 1326static int pagemap_open(struct inode *inode, struct file *file)
1327{ 1327{
1328 /* do not disclose physical addresses: attack vector */
1329 if (!capable(CAP_SYS_ADMIN))
1330 return -EPERM;
1328 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " 1331 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1329 "to stop being page-shift some time soon. See the " 1332 "to stop being page-shift some time soon. See the "
1330 "linux/Documentation/vm/pagemap.txt for details.\n"); 1333 "linux/Documentation/vm/pagemap.txt for details.\n");
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 39d1373128e9..44a549beeafa 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -539,6 +539,9 @@ static int ramoops_probe(struct platform_device *pdev)
539 mem_address = pdata->mem_address; 539 mem_address = pdata->mem_address;
540 record_size = pdata->record_size; 540 record_size = pdata->record_size;
541 dump_oops = pdata->dump_oops; 541 dump_oops = pdata->dump_oops;
542 ramoops_console_size = pdata->console_size;
543 ramoops_pmsg_size = pdata->pmsg_size;
544 ramoops_ftrace_size = pdata->ftrace_size;
542 545
543 pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n", 546 pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n",
544 cxt->size, (unsigned long long)cxt->phys_addr, 547 cxt->size, (unsigned long long)cxt->phys_addr,
diff --git a/fs/read_write.c b/fs/read_write.c
index 8e1b68786d66..69128b378646 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
9#include <linux/fcntl.h> 9#include <linux/fcntl.h>
10#include <linux/file.h> 10#include <linux/file.h>
11#include <linux/uio.h> 11#include <linux/uio.h>
12#include <linux/aio.h>
13#include <linux/fsnotify.h> 12#include <linux/fsnotify.h>
14#include <linux/security.h> 13#include <linux/security.h>
15#include <linux/export.h> 14#include <linux/export.h>
@@ -343,13 +342,10 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
343 342
344 init_sync_kiocb(&kiocb, file); 343 init_sync_kiocb(&kiocb, file);
345 kiocb.ki_pos = *ppos; 344 kiocb.ki_pos = *ppos;
346 kiocb.ki_nbytes = iov_iter_count(iter);
347 345
348 iter->type |= READ; 346 iter->type |= READ;
349 ret = file->f_op->read_iter(&kiocb, iter); 347 ret = file->f_op->read_iter(&kiocb, iter);
350 if (ret == -EIOCBQUEUED) 348 BUG_ON(ret == -EIOCBQUEUED);
351 ret = wait_on_sync_kiocb(&kiocb);
352
353 if (ret > 0) 349 if (ret > 0)
354 *ppos = kiocb.ki_pos; 350 *ppos = kiocb.ki_pos;
355 return ret; 351 return ret;
@@ -366,13 +362,10 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
366 362
367 init_sync_kiocb(&kiocb, file); 363 init_sync_kiocb(&kiocb, file);
368 kiocb.ki_pos = *ppos; 364 kiocb.ki_pos = *ppos;
369 kiocb.ki_nbytes = iov_iter_count(iter);
370 365
371 iter->type |= WRITE; 366 iter->type |= WRITE;
372 ret = file->f_op->write_iter(&kiocb, iter); 367 ret = file->f_op->write_iter(&kiocb, iter);
373 if (ret == -EIOCBQUEUED) 368 BUG_ON(ret == -EIOCBQUEUED);
374 ret = wait_on_sync_kiocb(&kiocb);
375
376 if (ret > 0) 369 if (ret > 0)
377 *ppos = kiocb.ki_pos; 370 *ppos = kiocb.ki_pos;
378 return ret; 371 return ret;
@@ -426,11 +419,9 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
426 419
427 init_sync_kiocb(&kiocb, filp); 420 init_sync_kiocb(&kiocb, filp);
428 kiocb.ki_pos = *ppos; 421 kiocb.ki_pos = *ppos;
429 kiocb.ki_nbytes = len;
430 422
431 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 423 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
432 if (-EIOCBQUEUED == ret) 424 BUG_ON(ret == -EIOCBQUEUED);
433 ret = wait_on_sync_kiocb(&kiocb);
434 *ppos = kiocb.ki_pos; 425 *ppos = kiocb.ki_pos;
435 return ret; 426 return ret;
436} 427}
@@ -446,12 +437,10 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p
446 437
447 init_sync_kiocb(&kiocb, filp); 438 init_sync_kiocb(&kiocb, filp);
448 kiocb.ki_pos = *ppos; 439 kiocb.ki_pos = *ppos;
449 kiocb.ki_nbytes = len;
450 iov_iter_init(&iter, READ, &iov, 1, len); 440 iov_iter_init(&iter, READ, &iov, 1, len);
451 441
452 ret = filp->f_op->read_iter(&kiocb, &iter); 442 ret = filp->f_op->read_iter(&kiocb, &iter);
453 if (-EIOCBQUEUED == ret) 443 BUG_ON(ret == -EIOCBQUEUED);
454 ret = wait_on_sync_kiocb(&kiocb);
455 *ppos = kiocb.ki_pos; 444 *ppos = kiocb.ki_pos;
456 return ret; 445 return ret;
457} 446}
@@ -510,11 +499,9 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
510 499
511 init_sync_kiocb(&kiocb, filp); 500 init_sync_kiocb(&kiocb, filp);
512 kiocb.ki_pos = *ppos; 501 kiocb.ki_pos = *ppos;
513 kiocb.ki_nbytes = len;
514 502
515 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 503 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
516 if (-EIOCBQUEUED == ret) 504 BUG_ON(ret == -EIOCBQUEUED);
517 ret = wait_on_sync_kiocb(&kiocb);
518 *ppos = kiocb.ki_pos; 505 *ppos = kiocb.ki_pos;
519 return ret; 506 return ret;
520} 507}
@@ -530,12 +517,10 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo
530 517
531 init_sync_kiocb(&kiocb, filp); 518 init_sync_kiocb(&kiocb, filp);
532 kiocb.ki_pos = *ppos; 519 kiocb.ki_pos = *ppos;
533 kiocb.ki_nbytes = len;
534 iov_iter_init(&iter, WRITE, &iov, 1, len); 520 iov_iter_init(&iter, WRITE, &iov, 1, len);
535 521
536 ret = filp->f_op->write_iter(&kiocb, &iter); 522 ret = filp->f_op->write_iter(&kiocb, &iter);
537 if (-EIOCBQUEUED == ret) 523 BUG_ON(ret == -EIOCBQUEUED);
538 ret = wait_on_sync_kiocb(&kiocb);
539 *ppos = kiocb.ki_pos; 524 *ppos = kiocb.ki_pos;
540 return ret; 525 return ret;
541} 526}
@@ -710,60 +695,47 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
710} 695}
711EXPORT_SYMBOL(iov_shorten); 696EXPORT_SYMBOL(iov_shorten);
712 697
713static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov, 698static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
714 unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn) 699 loff_t *ppos, iter_fn_t fn)
715{ 700{
716 struct kiocb kiocb; 701 struct kiocb kiocb;
717 struct iov_iter iter;
718 ssize_t ret; 702 ssize_t ret;
719 703
720 init_sync_kiocb(&kiocb, filp); 704 init_sync_kiocb(&kiocb, filp);
721 kiocb.ki_pos = *ppos; 705 kiocb.ki_pos = *ppos;
722 kiocb.ki_nbytes = len;
723 706
724 iov_iter_init(&iter, rw, iov, nr_segs, len); 707 ret = fn(&kiocb, iter);
725 ret = fn(&kiocb, &iter); 708 BUG_ON(ret == -EIOCBQUEUED);
726 if (ret == -EIOCBQUEUED)
727 ret = wait_on_sync_kiocb(&kiocb);
728 *ppos = kiocb.ki_pos; 709 *ppos = kiocb.ki_pos;
729 return ret; 710 return ret;
730} 711}
731 712
732static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, 713static ssize_t do_sync_readv_writev(struct file *filp, struct iov_iter *iter,
733 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) 714 loff_t *ppos, iov_fn_t fn)
734{ 715{
735 struct kiocb kiocb; 716 struct kiocb kiocb;
736 ssize_t ret; 717 ssize_t ret;
737 718
738 init_sync_kiocb(&kiocb, filp); 719 init_sync_kiocb(&kiocb, filp);
739 kiocb.ki_pos = *ppos; 720 kiocb.ki_pos = *ppos;
740 kiocb.ki_nbytes = len;
741 721
742 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); 722 ret = fn(&kiocb, iter->iov, iter->nr_segs, kiocb.ki_pos);
743 if (ret == -EIOCBQUEUED) 723 BUG_ON(ret == -EIOCBQUEUED);
744 ret = wait_on_sync_kiocb(&kiocb);
745 *ppos = kiocb.ki_pos; 724 *ppos = kiocb.ki_pos;
746 return ret; 725 return ret;
747} 726}
748 727
749/* Do it by hand, with file-ops */ 728/* Do it by hand, with file-ops */
750static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, 729static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
751 unsigned long nr_segs, loff_t *ppos, io_fn_t fn) 730 loff_t *ppos, io_fn_t fn)
752{ 731{
753 struct iovec *vector = iov;
754 ssize_t ret = 0; 732 ssize_t ret = 0;
755 733
756 while (nr_segs > 0) { 734 while (iov_iter_count(iter)) {
757 void __user *base; 735 struct iovec iovec = iov_iter_iovec(iter);
758 size_t len;
759 ssize_t nr; 736 ssize_t nr;
760 737
761 base = vector->iov_base; 738 nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
762 len = vector->iov_len;
763 vector++;
764 nr_segs--;
765
766 nr = fn(filp, base, len, ppos);
767 739
768 if (nr < 0) { 740 if (nr < 0) {
769 if (!ret) 741 if (!ret)
@@ -771,8 +743,9 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
771 break; 743 break;
772 } 744 }
773 ret += nr; 745 ret += nr;
774 if (nr != len) 746 if (nr != iovec.iov_len)
775 break; 747 break;
748 iov_iter_advance(iter, nr);
776 } 749 }
777 750
778 return ret; 751 return ret;
@@ -863,17 +836,20 @@ static ssize_t do_readv_writev(int type, struct file *file,
863 size_t tot_len; 836 size_t tot_len;
864 struct iovec iovstack[UIO_FASTIOV]; 837 struct iovec iovstack[UIO_FASTIOV];
865 struct iovec *iov = iovstack; 838 struct iovec *iov = iovstack;
839 struct iov_iter iter;
866 ssize_t ret; 840 ssize_t ret;
867 io_fn_t fn; 841 io_fn_t fn;
868 iov_fn_t fnv; 842 iov_fn_t fnv;
869 iter_fn_t iter_fn; 843 iter_fn_t iter_fn;
870 844
871 ret = rw_copy_check_uvector(type, uvector, nr_segs, 845 ret = import_iovec(type, uvector, nr_segs,
872 ARRAY_SIZE(iovstack), iovstack, &iov); 846 ARRAY_SIZE(iovstack), &iov, &iter);
873 if (ret <= 0) 847 if (ret < 0)
874 goto out; 848 return ret;
875 849
876 tot_len = ret; 850 tot_len = iov_iter_count(&iter);
851 if (!tot_len)
852 goto out;
877 ret = rw_verify_area(type, file, pos, tot_len); 853 ret = rw_verify_area(type, file, pos, tot_len);
878 if (ret < 0) 854 if (ret < 0)
879 goto out; 855 goto out;
@@ -891,20 +867,17 @@ static ssize_t do_readv_writev(int type, struct file *file,
891 } 867 }
892 868
893 if (iter_fn) 869 if (iter_fn)
894 ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, 870 ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
895 pos, iter_fn);
896 else if (fnv) 871 else if (fnv)
897 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, 872 ret = do_sync_readv_writev(file, &iter, pos, fnv);
898 pos, fnv);
899 else 873 else
900 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); 874 ret = do_loop_readv_writev(file, &iter, pos, fn);
901 875
902 if (type != READ) 876 if (type != READ)
903 file_end_write(file); 877 file_end_write(file);
904 878
905out: 879out:
906 if (iov != iovstack) 880 kfree(iov);
907 kfree(iov);
908 if ((ret + (type == READ)) > 0) { 881 if ((ret + (type == READ)) > 0) {
909 if (type == READ) 882 if (type == READ)
910 fsnotify_access(file); 883 fsnotify_access(file);
@@ -1043,17 +1016,20 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1043 compat_ssize_t tot_len; 1016 compat_ssize_t tot_len;
1044 struct iovec iovstack[UIO_FASTIOV]; 1017 struct iovec iovstack[UIO_FASTIOV];
1045 struct iovec *iov = iovstack; 1018 struct iovec *iov = iovstack;
1019 struct iov_iter iter;
1046 ssize_t ret; 1020 ssize_t ret;
1047 io_fn_t fn; 1021 io_fn_t fn;
1048 iov_fn_t fnv; 1022 iov_fn_t fnv;
1049 iter_fn_t iter_fn; 1023 iter_fn_t iter_fn;
1050 1024
1051 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, 1025 ret = compat_import_iovec(type, uvector, nr_segs,
1052 UIO_FASTIOV, iovstack, &iov); 1026 UIO_FASTIOV, &iov, &iter);
1053 if (ret <= 0) 1027 if (ret < 0)
1054 goto out; 1028 return ret;
1055 1029
1056 tot_len = ret; 1030 tot_len = iov_iter_count(&iter);
1031 if (!tot_len)
1032 goto out;
1057 ret = rw_verify_area(type, file, pos, tot_len); 1033 ret = rw_verify_area(type, file, pos, tot_len);
1058 if (ret < 0) 1034 if (ret < 0)
1059 goto out; 1035 goto out;
@@ -1071,20 +1047,17 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1071 } 1047 }
1072 1048
1073 if (iter_fn) 1049 if (iter_fn)
1074 ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, 1050 ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
1075 pos, iter_fn);
1076 else if (fnv) 1051 else if (fnv)
1077 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, 1052 ret = do_sync_readv_writev(file, &iter, pos, fnv);
1078 pos, fnv);
1079 else 1053 else
1080 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); 1054 ret = do_loop_readv_writev(file, &iter, pos, fn);
1081 1055
1082 if (type != READ) 1056 if (type != READ)
1083 file_end_write(file); 1057 file_end_write(file);
1084 1058
1085out: 1059out:
1086 if (iov != iovstack) 1060 kfree(iov);
1087 kfree(iov);
1088 if ((ret + (type == READ)) > 0) { 1061 if ((ret + (type == READ)) > 0) {
1089 if (type == READ) 1062 if (type == READ)
1090 fsnotify_access(file); 1063 fsnotify_access(file);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index e72401e1f995..9312b7842e03 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,7 +18,7 @@
18#include <linux/writeback.h> 18#include <linux/writeback.h>
19#include <linux/quotaops.h> 19#include <linux/quotaops.h>
20#include <linux/swap.h> 20#include <linux/swap.h>
21#include <linux/aio.h> 21#include <linux/uio.h>
22 22
23int reiserfs_commit_write(struct file *f, struct page *page, 23int reiserfs_commit_write(struct file *f, struct page *page,
24 unsigned from, unsigned to); 24 unsigned from, unsigned to);
diff --git a/fs/splice.c b/fs/splice.c
index 7968da96bebb..41cbb16299e0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -32,7 +32,6 @@
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/socket.h> 33#include <linux/socket.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/aio.h>
36#include "internal.h" 35#include "internal.h"
37 36
38/* 37/*
@@ -1534,34 +1533,29 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1534 struct iovec iovstack[UIO_FASTIOV]; 1533 struct iovec iovstack[UIO_FASTIOV];
1535 struct iovec *iov = iovstack; 1534 struct iovec *iov = iovstack;
1536 struct iov_iter iter; 1535 struct iov_iter iter;
1537 ssize_t count;
1538 1536
1539 pipe = get_pipe_info(file); 1537 pipe = get_pipe_info(file);
1540 if (!pipe) 1538 if (!pipe)
1541 return -EBADF; 1539 return -EBADF;
1542 1540
1543 ret = rw_copy_check_uvector(READ, uiov, nr_segs, 1541 ret = import_iovec(READ, uiov, nr_segs,
1544 ARRAY_SIZE(iovstack), iovstack, &iov); 1542 ARRAY_SIZE(iovstack), &iov, &iter);
1545 if (ret <= 0) 1543 if (ret < 0)
1546 goto out; 1544 return ret;
1547
1548 count = ret;
1549 iov_iter_init(&iter, READ, iov, nr_segs, count);
1550 1545
1546 sd.total_len = iov_iter_count(&iter);
1551 sd.len = 0; 1547 sd.len = 0;
1552 sd.total_len = count;
1553 sd.flags = flags; 1548 sd.flags = flags;
1554 sd.u.data = &iter; 1549 sd.u.data = &iter;
1555 sd.pos = 0; 1550 sd.pos = 0;
1556 1551
1557 pipe_lock(pipe); 1552 if (sd.total_len) {
1558 ret = __splice_from_pipe(pipe, &sd, pipe_to_user); 1553 pipe_lock(pipe);
1559 pipe_unlock(pipe); 1554 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1560 1555 pipe_unlock(pipe);
1561out: 1556 }
1562 if (iov != iovstack)
1563 kfree(iov);
1564 1557
1558 kfree(iov);
1565 return ret; 1559 return ret;
1566} 1560}
1567 1561
diff --git a/fs/stat.c b/fs/stat.c
index ae0c3cef9927..19636af5e75c 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -66,7 +66,7 @@ int vfs_getattr(struct path *path, struct kstat *stat)
66{ 66{
67 int retval; 67 int retval;
68 68
69 retval = security_inode_getattr(path->mnt, path->dentry); 69 retval = security_inode_getattr(path);
70 if (retval) 70 if (retval)
71 return retval; 71 return retval;
72 return vfs_getattr_nosec(path, stat); 72 return vfs_getattr_nosec(path, stat);
diff --git a/fs/super.c b/fs/super.c
index 2b7dc90ccdbb..928c20f47af9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
224 s->s_maxbytes = MAX_NON_LFS; 224 s->s_maxbytes = MAX_NON_LFS;
225 s->s_op = &default_op; 225 s->s_op = &default_op;
226 s->s_time_gran = 1000000000; 226 s->s_time_gran = 1000000000;
227 s->cleancache_poolid = -1; 227 s->cleancache_poolid = CLEANCACHE_NO_POOL;
228 228
229 s->s_shrink.seeks = DEFAULT_SEEKS; 229 s->s_shrink.seeks = DEFAULT_SEEKS;
230 s->s_shrink.scan_objects = super_cache_scan; 230 s->s_shrink.scan_objects = super_cache_scan;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 2554d8835b48..b400c04371f0 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -41,7 +41,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
41 41
42 if (grp->attrs) { 42 if (grp->attrs) {
43 for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { 43 for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
44 umode_t mode = 0; 44 umode_t mode = (*attr)->mode;
45 45
46 /* 46 /*
47 * In update mode, we're changing the permissions or 47 * In update mode, we're changing the permissions or
@@ -55,9 +55,14 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
55 if (!mode) 55 if (!mode)
56 continue; 56 continue;
57 } 57 }
58
59 WARN(mode & ~(SYSFS_PREALLOC | 0664),
60 "Attribute %s: Invalid permissions 0%o\n",
61 (*attr)->name, mode);
62
63 mode &= SYSFS_PREALLOC | 0664;
58 error = sysfs_add_file_mode_ns(parent, *attr, false, 64 error = sysfs_add_file_mode_ns(parent, *attr, false,
59 (*attr)->mode | mode, 65 mode, NULL);
60 NULL);
61 if (unlikely(error)) 66 if (unlikely(error))
62 break; 67 break;
63 } 68 }
diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
new file mode 100644
index 000000000000..82fa35b656c4
--- /dev/null
+++ b/fs/tracefs/Makefile
@@ -0,0 +1,4 @@
1tracefs-objs := inode.o
2
3obj-$(CONFIG_TRACING) += tracefs.o
4
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
new file mode 100644
index 000000000000..d92bdf3b079a
--- /dev/null
+++ b/fs/tracefs/inode.c
@@ -0,0 +1,650 @@
1/*
2 * inode.c - part of tracefs, a pseudo file system for activating tracing
3 *
4 * Based on debugfs by: Greg Kroah-Hartman <greg@kroah.com>
5 *
6 * Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version
10 * 2 as published by the Free Software Foundation.
11 *
12 * tracefs is the file system that is used by the tracing infrastructure.
13 *
14 */
15
16#include <linux/module.h>
17#include <linux/fs.h>
18#include <linux/mount.h>
19#include <linux/kobject.h>
20#include <linux/namei.h>
21#include <linux/tracefs.h>
22#include <linux/fsnotify.h>
23#include <linux/seq_file.h>
24#include <linux/parser.h>
25#include <linux/magic.h>
26#include <linux/slab.h>
27
28#define TRACEFS_DEFAULT_MODE 0700
29
30static struct vfsmount *tracefs_mount;
31static int tracefs_mount_count;
32static bool tracefs_registered;
33
/* Fallback ->read for tracefs files created without fops: always EOF. */
static ssize_t default_read_file(struct file *file, char __user *buf,
				 size_t count, loff_t *ppos)
{
	return 0;
}
39
/* Fallback ->write: discard the data but report it all as written. */
static ssize_t default_write_file(struct file *file, const char __user *buf,
				  size_t count, loff_t *ppos)
{
	return count;
}
45
46static const struct file_operations tracefs_file_operations = {
47 .read = default_read_file,
48 .write = default_write_file,
49 .open = simple_open,
50 .llseek = noop_llseek,
51};
52
53static struct tracefs_dir_ops {
54 int (*mkdir)(const char *name);
55 int (*rmdir)(const char *name);
56} tracefs_ops;
57
58static char *get_dname(struct dentry *dentry)
59{
60 const char *dname;
61 char *name;
62 int len = dentry->d_name.len;
63
64 dname = dentry->d_name.name;
65 name = kmalloc(len + 1, GFP_KERNEL);
66 if (!name)
67 return NULL;
68 memcpy(name, dname, len);
69 name[len] = 0;
70 return name;
71}
72
/*
 * ->mkdir for the "instances" directory: forward the new directory's
 * name to the callback registered via tracefs_create_instance_dir().
 *
 * Returns -ENOMEM if the name cannot be copied, otherwise whatever the
 * registered mkdir callback returns.
 */
static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode)
{
	char *name;
	int ret;

	name = get_dname(dentry);
	if (!name)
		return -ENOMEM;

	/*
	 * The mkdir call can call the generic functions that create
	 * the files within the tracefs system. It is up to the individual
	 * mkdir routine to handle races.
	 */
	mutex_unlock(&inode->i_mutex);
	ret = tracefs_ops.mkdir(name);
	mutex_lock(&inode->i_mutex);

	kfree(name);

	return ret;
}
95
/*
 * ->rmdir for the "instances" directory: forward the victim directory's
 * name to the callback registered via tracefs_create_instance_dir().
 *
 * Returns -ENOMEM if the name cannot be copied, otherwise the callback's
 * return value.
 */
static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
{
	char *name;
	int ret;

	name = get_dname(dentry);
	if (!name)
		return -ENOMEM;

	/*
	 * The rmdir call can call the generic functions that create
	 * the files within the tracefs system. It is up to the individual
	 * rmdir routine to handle races.
	 * This time we need to unlock not only the parent (inode) but
	 * also the directory that is being deleted.
	 */
	mutex_unlock(&inode->i_mutex);
	mutex_unlock(&dentry->d_inode->i_mutex);

	ret = tracefs_ops.rmdir(name);

	/* Retake both locks in parent-then-child order to avoid deadlock. */
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
	mutex_lock(&dentry->d_inode->i_mutex);

	kfree(name);

	return ret;
}
124
125static const struct inode_operations tracefs_dir_inode_operations = {
126 .lookup = simple_lookup,
127 .mkdir = tracefs_syscall_mkdir,
128 .rmdir = tracefs_syscall_rmdir,
129};
130
131static struct inode *tracefs_get_inode(struct super_block *sb)
132{
133 struct inode *inode = new_inode(sb);
134 if (inode) {
135 inode->i_ino = get_next_ino();
136 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
137 }
138 return inode;
139}
140
141struct tracefs_mount_opts {
142 kuid_t uid;
143 kgid_t gid;
144 umode_t mode;
145};
146
147enum {
148 Opt_uid,
149 Opt_gid,
150 Opt_mode,
151 Opt_err
152};
153
154static const match_table_t tokens = {
155 {Opt_uid, "uid=%u"},
156 {Opt_gid, "gid=%u"},
157 {Opt_mode, "mode=%o"},
158 {Opt_err, NULL}
159};
160
161struct tracefs_fs_info {
162 struct tracefs_mount_opts mount_opts;
163};
164
/*
 * Parse the "uid=", "gid=" and "mode=" mount options from @data into
 * @opts.  @data may be NULL (no options).  opts->mode is always reset
 * to the default; uid/gid are written only when the matching option is
 * present and maps to a valid id in the current user namespace.
 *
 * Returns 0 on success or -EINVAL for a malformed/invalid value.
 */
static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
{
	substring_t args[MAX_OPT_ARGS];
	int option;
	int token;
	kuid_t uid;
	kgid_t gid;
	char *p;

	opts->mode = TRACEFS_DEFAULT_MODE;

	while ((p = strsep(&data, ",")) != NULL) {
		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_uid:
			if (match_int(&args[0], &option))
				return -EINVAL;
			uid = make_kuid(current_user_ns(), option);
			if (!uid_valid(uid))
				return -EINVAL;
			opts->uid = uid;
			break;
		case Opt_gid:
			if (match_int(&args[0], &option))
				return -EINVAL;
			gid = make_kgid(current_user_ns(), option);
			if (!gid_valid(gid))
				return -EINVAL;
			opts->gid = gid;
			break;
		case Opt_mode:
			if (match_octal(&args[0], &option))
				return -EINVAL;
			/* Keep only the permission bits. */
			opts->mode = option & S_IALLUGO;
			break;
		/*
		 * We might like to report bad mount options here;
		 * but traditionally tracefs has ignored all mount options
		 */
		}
	}

	return 0;
}
212
213static int tracefs_apply_options(struct super_block *sb)
214{
215 struct tracefs_fs_info *fsi = sb->s_fs_info;
216 struct inode *inode = sb->s_root->d_inode;
217 struct tracefs_mount_opts *opts = &fsi->mount_opts;
218
219 inode->i_mode &= ~S_IALLUGO;
220 inode->i_mode |= opts->mode;
221
222 inode->i_uid = opts->uid;
223 inode->i_gid = opts->gid;
224
225 return 0;
226}
227
228static int tracefs_remount(struct super_block *sb, int *flags, char *data)
229{
230 int err;
231 struct tracefs_fs_info *fsi = sb->s_fs_info;
232
233 sync_filesystem(sb);
234 err = tracefs_parse_options(data, &fsi->mount_opts);
235 if (err)
236 goto fail;
237
238 tracefs_apply_options(sb);
239
240fail:
241 return err;
242}
243
244static int tracefs_show_options(struct seq_file *m, struct dentry *root)
245{
246 struct tracefs_fs_info *fsi = root->d_sb->s_fs_info;
247 struct tracefs_mount_opts *opts = &fsi->mount_opts;
248
249 if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
250 seq_printf(m, ",uid=%u",
251 from_kuid_munged(&init_user_ns, opts->uid));
252 if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
253 seq_printf(m, ",gid=%u",
254 from_kgid_munged(&init_user_ns, opts->gid));
255 if (opts->mode != TRACEFS_DEFAULT_MODE)
256 seq_printf(m, ",mode=%o", opts->mode);
257
258 return 0;
259}
260
261static const struct super_operations tracefs_super_operations = {
262 .statfs = simple_statfs,
263 .remount_fs = tracefs_remount,
264 .show_options = tracefs_show_options,
265};
266
/*
 * Fill a tracefs superblock: allocate per-sb option state, parse the
 * mount options, create the (initially empty) root directory and apply
 * the options to it.  On failure the fs_info is freed here; teardown of
 * the superblock itself is handled by mount_single()'s caller.
 */
static int trace_fill_super(struct super_block *sb, void *data, int silent)
{
	/* No static files: the empty tree_descr only creates the root. */
	static struct tree_descr trace_files[] = {{""}};
	struct tracefs_fs_info *fsi;
	int err;

	save_mount_options(sb, data);

	fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL);
	sb->s_fs_info = fsi;
	if (!fsi) {
		err = -ENOMEM;
		goto fail;
	}

	err = tracefs_parse_options(data, &fsi->mount_opts);
	if (err)
		goto fail;

	err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files);
	if (err)
		goto fail;

	sb->s_op = &tracefs_super_operations;

	tracefs_apply_options(sb);

	return 0;

fail:
	kfree(fsi);
	sb->s_fs_info = NULL;
	return err;
}
301
302static struct dentry *trace_mount(struct file_system_type *fs_type,
303 int flags, const char *dev_name,
304 void *data)
305{
306 return mount_single(fs_type, flags, data, trace_fill_super);
307}
308
309static struct file_system_type trace_fs_type = {
310 .owner = THIS_MODULE,
311 .name = "tracefs",
312 .mount = trace_mount,
313 .kill_sb = kill_litter_super,
314};
315MODULE_ALIAS_FS("tracefs");
316
/*
 * Pin the tracefs filesystem and look up a new negative dentry @name
 * under @parent (or under the tracefs root when @parent is NULL).
 *
 * On success the parent's i_mutex is returned HELD; the caller must
 * finish with end_creating() or failed_creating().  Returns
 * ERR_PTR(-EEXIST) if the name already exists, or another ERR_PTR on
 * failure (in which case the lock has been dropped).
 */
static struct dentry *start_creating(const char *name, struct dentry *parent)
{
	struct dentry *dentry;
	int error;

	pr_debug("tracefs: creating file '%s'\n",name);

	error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
			      &tracefs_mount_count);
	if (error)
		return ERR_PTR(error);

	/* If the parent is not specified, we create it in the root.
	 * We need the root dentry to do this, which is in the super
	 * block. A pointer to that is in the struct vfsmount that we
	 * have around.
	 */
	if (!parent)
		parent = tracefs_mount->mnt_root;

	mutex_lock(&parent->d_inode->i_mutex);
	dentry = lookup_one_len(name, parent, strlen(name));
	if (!IS_ERR(dentry) && dentry->d_inode) {
		/* Positive dentry means the name is already taken. */
		dput(dentry);
		dentry = ERR_PTR(-EEXIST);
	}
	if (IS_ERR(dentry))
		mutex_unlock(&parent->d_inode->i_mutex);
	return dentry;
}
347
/*
 * Error-path counterpart of start_creating(): drop the parent's i_mutex,
 * the dentry reference and the filesystem pin.  Always returns NULL so
 * callers can simply "return failed_creating(dentry);".
 */
static struct dentry *failed_creating(struct dentry *dentry)
{
	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
	dput(dentry);
	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
	return NULL;
}
355
/* Success-path counterpart of start_creating(): release the parent lock. */
static struct dentry *end_creating(struct dentry *dentry)
{
	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
	return dentry;
}
361
/**
 * tracefs_create_file - create a file in the tracefs filesystem
 * @name: a pointer to a string containing the name of the file to create.
 * @mode: the permission that the file should have.
 * @parent: a pointer to the parent dentry for this file. This should be a
 *	  directory dentry if set. If this parameter is NULL, then the
 *	  file will be created in the root of the tracefs filesystem.
 * @data: a pointer to something that the caller will want to get to later
 *        on. The inode.i_private pointer will point to this value on
 *        the open() call.
 * @fops: a pointer to a struct file_operations that should be used for
 *        this file.
 *
 * This is the basic "create a file" function for tracefs. It allows for a
 * wide range of flexibility in creating a file, or a directory (if you want
 * to create a directory, the tracefs_create_dir() function is
 * recommended to be used instead.)
 *
 * This function will return a pointer to a dentry if it succeeds. This
 * pointer must be passed to the tracefs_remove() function when the file is
 * to be removed (no automatic cleanup happens if your module is unloaded,
 * you are responsible here.) If an error occurs, %NULL will be returned.
 *
 * If tracefs is not enabled in the kernel, the value -%ENODEV will be
 * returned.
 */
struct dentry *tracefs_create_file(const char *name, umode_t mode,
				   struct dentry *parent, void *data,
				   const struct file_operations *fops)
{
	struct dentry *dentry;
	struct inode *inode;

	/* Only regular files are allowed; default the type bits if unset. */
	if (!(mode & S_IFMT))
		mode |= S_IFREG;
	BUG_ON(!S_ISREG(mode));
	dentry = start_creating(name, parent);

	if (IS_ERR(dentry))
		return NULL;

	inode = tracefs_get_inode(dentry->d_sb);
	if (unlikely(!inode))
		return failed_creating(dentry);

	inode->i_mode = mode;
	/* NULL fops falls back to the no-op default operations. */
	inode->i_fop = fops ? fops : &tracefs_file_operations;
	inode->i_private = data;
	d_instantiate(dentry, inode);
	fsnotify_create(dentry->d_parent->d_inode, dentry);
	return end_creating(dentry);
}
414
/*
 * Common worker for directory creation: allocate the inode, wire up
 * @ops plus the simple dir fops, and fix up link counts for "." and
 * "..".  Returns the new dentry, or NULL on failure.
 */
static struct dentry *__create_dir(const char *name, struct dentry *parent,
				   const struct inode_operations *ops)
{
	struct dentry *dentry = start_creating(name, parent);
	struct inode *inode;

	if (IS_ERR(dentry))
		return NULL;

	inode = tracefs_get_inode(dentry->d_sb);
	if (unlikely(!inode))
		return failed_creating(dentry);

	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
	inode->i_op = ops;
	inode->i_fop = &simple_dir_operations;

	/* directory inodes start off with i_nlink == 2 (for "." entry) */
	inc_nlink(inode);
	d_instantiate(dentry, inode);
	/* the new directory's ".." adds a link to the parent */
	inc_nlink(dentry->d_parent->d_inode);
	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
	return end_creating(dentry);
}
439
/**
 * tracefs_create_dir - create a directory in the tracefs filesystem
 * @name: a pointer to a string containing the name of the directory to
 *        create.
 * @parent: a pointer to the parent dentry for this file. This should be a
 *          directory dentry if set. If this parameter is NULL, then the
 *          directory will be created in the root of the tracefs filesystem.
 *
 * This function creates a directory in tracefs with the given name.
 *
 * This function will return a pointer to a dentry if it succeeds. This
 * pointer must be passed to the tracefs_remove() function when the file is
 * to be removed. If an error occurs, %NULL will be returned.
 *
 * If tracing is not enabled in the kernel, the value -%ENODEV will be
 * returned.
 */
struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
{
	/* Plain directory: no userspace mkdir/rmdir support. */
	return __create_dir(name, parent, &simple_dir_inode_operations);
}
461
/**
 * tracefs_create_instance_dir - create the tracing instances directory
 * @name: The name of the instances directory to create
 * @parent: The parent directory that the instances directory will exist
 * @mkdir: The function to call when a mkdir is performed.
 * @rmdir: The function to call when a rmdir is performed.
 *
 * Only one instances directory is allowed.
 *
 * The instances directory is special as it allows for mkdir and rmdir
 * to be done by userspace. When a mkdir or rmdir is performed, the inode
 * locks are released and the methods passed in (@mkdir and @rmdir) are
 * called without locks and with the name of the directory being created
 * within the instances directory.
 *
 * Returns the dentry of the instances directory.
 */
struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
					   int (*mkdir)(const char *name),
					   int (*rmdir)(const char *name))
{
	struct dentry *dentry;

	/* Only allow one instance of the instances directory. */
	if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
		return NULL;

	dentry = __create_dir(name, parent, &tracefs_dir_inode_operations);
	if (!dentry)
		return NULL;

	/* Publish the callbacks used by tracefs_syscall_mkdir/rmdir(). */
	tracefs_ops.mkdir = mkdir;
	tracefs_ops.rmdir = rmdir;

	return dentry;
}
498
/* True when @dentry is live: has an inode and has not been unhashed. */
static inline int tracefs_positive(struct dentry *dentry)
{
	return dentry->d_inode && !d_unhashed(dentry);
}
503
504static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
505{
506 int ret = 0;
507
508 if (tracefs_positive(dentry)) {
509 if (dentry->d_inode) {
510 dget(dentry);
511 switch (dentry->d_inode->i_mode & S_IFMT) {
512 case S_IFDIR:
513 ret = simple_rmdir(parent->d_inode, dentry);
514 break;
515 default:
516 simple_unlink(parent->d_inode, dentry);
517 break;
518 }
519 if (!ret)
520 d_delete(dentry);
521 dput(dentry);
522 }
523 }
524 return ret;
525}
526
/**
 * tracefs_remove - removes a file or directory from the tracefs filesystem
 * @dentry: a pointer to a the dentry of the file or directory to be
 *          removed.
 *
 * This function removes a file or directory in tracefs that was previously
 * created with a call to another tracefs function (like
 * tracefs_create_file() or variants thereof.)
 *
 * NULL and ERR_PTR dentries are silently ignored, so callers may pass
 * the result of a failed create directly.
 */
void tracefs_remove(struct dentry *dentry)
{
	struct dentry *parent;
	int ret;

	if (IS_ERR_OR_NULL(dentry))
		return;

	parent = dentry->d_parent;
	if (!parent || !parent->d_inode)
		return;

	mutex_lock(&parent->d_inode->i_mutex);
	ret = __tracefs_remove(dentry, parent);
	mutex_unlock(&parent->d_inode->i_mutex);
	/* Each successful removal drops one pin taken at creation time. */
	if (!ret)
		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
}
554
/**
 * tracefs_remove_recursive - recursively removes a directory
 * @dentry: a pointer to a the dentry of the directory to be removed.
 *
 * This function recursively removes a directory tree in tracefs that
 * was previously created with a call to another tracefs function
 * (like tracefs_create_file() or variants thereof.)
 */
void tracefs_remove_recursive(struct dentry *dentry)
{
	struct dentry *child, *parent;

	if (IS_ERR_OR_NULL(dentry))
		return;

	parent = dentry->d_parent;
	if (!parent || !parent->d_inode)
		return;

	/*
	 * Iterative depth-first walk: descend via "down", scan siblings
	 * via "loop", and climb back up after a directory is emptied.
	 */
	parent = dentry;
 down:
	mutex_lock(&parent->d_inode->i_mutex);
 loop:
	/*
	 * The parent->d_subdirs is protected by the d_lock. Outside that
	 * lock, the child can be unlinked and set to be freed which can
	 * use the d_u.d_child as the rcu head and corrupt this list.
	 */
	spin_lock(&parent->d_lock);
	list_for_each_entry(child, &parent->d_subdirs, d_child) {
		if (!tracefs_positive(child))
			continue;

		/* perhaps simple_empty(child) makes more sense */
		if (!list_empty(&child->d_subdirs)) {
			/* Non-empty subdirectory: descend into it first. */
			spin_unlock(&parent->d_lock);
			mutex_unlock(&parent->d_inode->i_mutex);
			parent = child;
			goto down;
		}

		spin_unlock(&parent->d_lock);

		if (!__tracefs_remove(child, parent))
			simple_release_fs(&tracefs_mount, &tracefs_mount_count);

		/*
		 * The parent->d_lock protects against child from unlinking
		 * from d_subdirs. When releasing the parent->d_lock we can
		 * no longer trust that the next pointer is valid.
		 * Restart the loop. We'll skip this one with the
		 * tracefs_positive() check.
		 */
		goto loop;
	}
	spin_unlock(&parent->d_lock);

	/* Directory emptied: go up one level and continue scanning there. */
	mutex_unlock(&parent->d_inode->i_mutex);
	child = parent;
	parent = parent->d_parent;
	mutex_lock(&parent->d_inode->i_mutex);

	if (child != dentry)
		/* go up */
		goto loop;

	/* Finally remove the root of the requested subtree itself. */
	if (!__tracefs_remove(child, parent))
		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
	mutex_unlock(&parent->d_inode->i_mutex);
}
625
/**
 * tracefs_initialized - Tells whether tracefs has been registered
 *
 * Returns true once tracefs_init() has successfully registered the
 * filesystem type.
 */
bool tracefs_initialized(void)
{
	return tracefs_registered;
}
633
634static struct kobject *trace_kobj;
635
636static int __init tracefs_init(void)
637{
638 int retval;
639
640 trace_kobj = kobject_create_and_add("tracing", kernel_kobj);
641 if (!trace_kobj)
642 return -EINVAL;
643
644 retval = register_filesystem(&trace_fs_type);
645 if (!retval)
646 tracefs_registered = true;
647
648 return retval;
649}
650core_initcall(tracefs_init);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e627c0acf626..c3d15fe83403 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -50,7 +50,6 @@
50 */ 50 */
51 51
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/aio.h>
54#include <linux/mount.h> 53#include <linux/mount.h>
55#include <linux/namei.h> 54#include <linux/namei.h>
56#include <linux/slab.h> 55#include <linux/slab.h>
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 08f3555fbeac..7f885cc8b0b7 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,7 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
37#include <linux/aio.h> 37#include <linux/uio.h>
38 38
39#include "udf_i.h" 39#include "udf_i.h"
40#include "udf_sb.h" 40#include "udf_sb.h"
@@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
122 struct file *file = iocb->ki_filp; 122 struct file *file = iocb->ki_filp;
123 struct inode *inode = file_inode(file); 123 struct inode *inode = file_inode(file);
124 int err, pos; 124 int err, pos;
125 size_t count = iocb->ki_nbytes; 125 size_t count = iov_iter_count(from);
126 struct udf_inode_info *iinfo = UDF_I(inode); 126 struct udf_inode_info *iinfo = UDF_I(inode);
127 127
128 mutex_lock(&inode->i_mutex); 128 mutex_lock(&inode->i_mutex);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a445d599098d..9c1fbd23913d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,7 +38,7 @@
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/crc-itu-t.h> 39#include <linux/crc-itu-t.h>
40#include <linux/mpage.h> 40#include <linux/mpage.h>
41#include <linux/aio.h> 41#include <linux/uio.h>
42 42
43#include "udf_i.h" 43#include "udf_i.h"
44#include "udf_sb.h" 44#include "udf_sb.h"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a9b7a1b8704..4f8cdc59bc38 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,7 +31,6 @@
31#include "xfs_bmap.h" 31#include "xfs_bmap.h"
32#include "xfs_bmap_util.h" 32#include "xfs_bmap_util.h"
33#include "xfs_bmap_btree.h" 33#include "xfs_bmap_btree.h"
34#include <linux/aio.h>
35#include <linux/gfp.h> 34#include <linux/gfp.h>
36#include <linux/mpage.h> 35#include <linux/mpage.h>
37#include <linux/pagevec.h> 36#include <linux/pagevec.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a2e1cb8a568b..f44212fae653 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,7 +38,6 @@
38#include "xfs_icache.h" 38#include "xfs_icache.h"
39#include "xfs_pnfs.h" 39#include "xfs_pnfs.h"
40 40
41#include <linux/aio.h>
42#include <linux/dcache.h> 41#include <linux/dcache.h>
43#include <linux/falloc.h> 42#include <linux/falloc.h>
44#include <linux/pagevec.h> 43#include <linux/pagevec.h>