23 files changed, 296 insertions, 289 deletions
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 783a94355efd..84a2d1868271 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -413,7 +413,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
        ret = 0;
 fail:
        while (ret < 0 && !list_empty(&tmplist)) {
-                sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+                sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 659f2ea9e6f7..cefca661464b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2638,7 +2638,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
        for (i = 0; i < CEPH_CAP_BITS; i++)
                if ((dirty & (1 << i)) &&
-                    flush_tid == ci->i_cap_flush_tid[i])
+                    (u16)flush_tid == ci->i_cap_flush_tid[i])
                        cleaned |= 1 << i;
        dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5228f201d3d5..4f46f7a05289 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -378,7 +378,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
        loff_t offset = header->args.offset;
        size_t count = header->args.count;
        struct page **pages = header->args.pages;
-        int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+        int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
        unsigned int pg_len;
        struct blk_plug plug;
        int i;
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index e966c023b1b7..acbf9ca4018c 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -65,17 +65,18 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
        dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+        mutex_lock(&nn->bl_mutex);
        bl_pipe_msg.bl_wq = &nn->bl_wq;
        b->simple.len += 4;     /* single volume */
        if (b->simple.len > PAGE_SIZE)
-                return -EIO;
+                goto out_unlock;
        memset(msg, 0, sizeof(*msg));
        msg->len = sizeof(*bl_msg) + b->simple.len;
        msg->data = kzalloc(msg->len, gfp_mask);
        if (!msg->data)
-                goto out;
+                goto out_free_data;
        bl_msg = msg->data;
        bl_msg->type = BL_DEVICE_MOUNT,
@@ -87,7 +88,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
        rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
        if (rc < 0) {
                remove_wait_queue(&nn->bl_wq, &wq);
-                goto out;
+                goto out_free_data;
        }
        set_current_state(TASK_UNINTERRUPTIBLE);
@@ -97,12 +98,14 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
        if (reply->status != BL_DEVICE_REQUEST_PROC) {
                printk(KERN_WARNING "%s failed to decode device: %d\n",
                        __func__, reply->status);
-                goto out;
+                goto out_free_data;
        }
        dev = MKDEV(reply->major, reply->minor);
-out:
+out_free_data:
        kfree(msg->data);
+out_unlock:
+        mutex_unlock(&nn->bl_mutex);
        return dev;
 }
@@ -232,6 +235,7 @@ static int nfs4blocklayout_net_init(struct net *net)
        struct nfs_net *nn = net_generic(net, nfs_net_id);
        struct dentry *dentry;
+        mutex_init(&nn->bl_mutex);
        init_waitqueue_head(&nn->bl_wq);
        nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
        if (IS_ERR(nn->bl_device_pipe))
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5853f53db732..7f3f60641344 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -125,6 +125,8 @@ again:
                        continue;
                if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
                        continue;
+                if (!nfs4_valid_open_stateid(state))
+                        continue;
                if (!nfs4_stateid_match(&state->stateid, stateid))
                        continue;
                get_nfs_open_context(ctx);
@@ -193,7 +195,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
 {
        int res = 0;
-        res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+        if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+                res = nfs4_proc_delegreturn(inode,
+                                delegation->cred,
+                                &delegation->stateid,
+                                issync);
        nfs_free_delegation(delegation);
        return res;
 }
@@ -380,11 +386,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
 {
        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
        struct nfs_inode *nfsi = NFS_I(inode);
-        int err;
+        int err = 0;
        if (delegation == NULL)
                return 0;
        do {
+                if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+                        break;
                err = nfs_delegation_claim_opens(inode, &delegation->stateid);
                if (!issync || err != -EAGAIN)
                        break;
@@ -605,10 +613,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
        rcu_read_unlock();
 }
+/*
+ * Flag the delegation currently attached to @inode, if there is one, as
+ * revoked: set NFS_DELEGATION_REVOKED in its flags and pass it to
+ * nfs_mark_return_delegation().  Does nothing when the inode has no
+ * delegation.
+ */
+static void nfs_revoke_delegation(struct inode *inode)
+{
+        struct nfs_delegation *delegation;
+        /* The delegation pointer is only used inside this RCU read section. */
+        rcu_read_lock();
+        delegation = rcu_dereference(NFS_I(inode)->delegation);
+        if (delegation != NULL) {
+                set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+                nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
+        }
+        rcu_read_unlock();
+}
 void nfs_remove_bad_delegation(struct inode *inode)
 {
        struct nfs_delegation *delegation;
+        nfs_revoke_delegation(inode);
        delegation = nfs_inode_detach_delegation(inode);
        if (delegation) {
                nfs_inode_find_state_and_recover(inode, &delegation->stateid);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 5c1cce39297f..e3c20a3ccc93 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -31,6 +31,7 @@ enum {
        NFS_DELEGATION_RETURN_IF_CLOSED,
        NFS_DELEGATION_REFERENCED,
        NFS_DELEGATION_RETURNING,
+        NFS_DELEGATION_REVOKED,
 };
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06e8cfcbb670..6e62155abf26 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1527,6 +1527,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
                case -ENOENT:
                        d_drop(dentry);
                        d_add(dentry, NULL);
+                        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
                        break;
                case -EISDIR:
                case -ENOTDIR:
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 20cffc830468..10bf07280f4a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref)
 {
        struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+        nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
        if (dreq->l_ctx != NULL)
                nfs_put_lock_context(dreq->l_ctx);
        if (dreq->ctx != NULL)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 46fab1cb455a..7afb52f6a25a 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -145,9 +145,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
        case -NFS4ERR_DELEG_REVOKED:
        case -NFS4ERR_ADMIN_REVOKED:
        case -NFS4ERR_BAD_STATEID:
-                if (state == NULL)
-                        break;
-                nfs_remove_bad_delegation(state->inode);
        case -NFS4ERR_OPENMODE:
                if (state == NULL)
                        break;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6388a59f2add..00689a8a85e4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -626,7 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
        struct inode *inode = dentry->d_inode;
        int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
-        int err;
+        int err = 0;
        trace_nfs_getattr_enter(inode);
        /* Flush out writes to the server in order to update c/mtime.  */
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index ef221fb8a183..f0e06e4acbef 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -19,6 +19,7 @@ struct nfs_net {
        struct rpc_pipe *bl_device_pipe;
        struct bl_dev_msg bl_mount_reply;
        wait_queue_head_t bl_wq;
+        struct mutex bl_mutex;
        struct list_head nfs_client_list;
        struct list_head nfs_volume_list;
 #if IS_ENABLED(CONFIG_NFS_V4)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 405bd95c1f58..69dc20a743f9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -370,11 +370,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
                case -NFS4ERR_DELEG_REVOKED:
                case -NFS4ERR_ADMIN_REVOKED:
                case -NFS4ERR_BAD_STATEID:
-                        if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) {
-                                nfs_remove_bad_delegation(inode);
-                                exception->retry = 1;
-                                break;
-                        }
                        if (state == NULL)
                                break;
                        ret = nfs4_schedule_stateid_recovery(server, state);
@@ -1654,7 +1649,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
                        nfs_inode_find_state_and_recover(state->inode,
                                        stateid);
                        nfs4_schedule_stateid_recovery(server, state);
-                        return 0;
+                        return -EAGAIN;
                case -NFS4ERR_DELAY:
                case -NFS4ERR_GRACE:
                        set_bit(NFS_DELEGATED_STATE, &state->flags);
@@ -2109,46 +2104,60 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
        return ret;
 }
+/*
+ * Common tail for dropping delegation state from @state: remove the bad
+ * delegation from the inode, make the open stateid the current stateid
+ * again, and clear NFS_DELEGATED_STATE.
+ */
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+{
+        nfs_remove_bad_delegation(state->inode);
+        /* state->stateid is rewritten under the state's seqlock. */
+        write_seqlock(&state->seqlock);
+        nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+        write_sequnlock(&state->seqlock);
+        clear_bit(NFS_DELEGATED_STATE, &state->flags);
+}
+/*
+ * Clear delegation state for @state, but only when the inode actually has
+ * a delegation attached; otherwise this is a no-op.
+ */
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+        /* The pointer is only tested for NULL here, never dereferenced. */
+        if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+                nfs_finish_clear_delegation_stateid(state);
+}
+/*
+ * Open-expired recovery wrapper: first discard any delegation state via
+ * nfs40_clear_delegation_stateid(), then run nfs4_open_expired() and
+ * return its result.
+ */
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+        /* NFSv4.0 doesn't allow for delegation recovery on open expire */
+        nfs40_clear_delegation_stateid(state);
+        return nfs4_open_expired(sp, state);
+}
 #if defined(CONFIG_NFS_V4_1)
-static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs41_check_delegation_stateid(struct nfs4_state *state)
 {
        struct nfs_server *server = NFS_SERVER(state->inode);
-        nfs4_stateid *stateid = &state->stateid;
+        nfs4_stateid stateid;
        struct nfs_delegation *delegation;
-        struct rpc_cred *cred = NULL;
+        struct rpc_cred *cred;
-        int status = -NFS4ERR_BAD_STATEID;
+        int status;
-        /* If a state reset has been done, test_stateid is unneeded */
-        if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
-                return;
        /* Get the delegation credential for use by test/free_stateid */
        rcu_read_lock();
        delegation = rcu_dereference(NFS_I(state->inode)->delegation);
-        if (delegation != NULL &&
+        if (delegation == NULL) {
-            nfs4_stateid_match(&delegation->stateid, stateid)) {
-                cred = get_rpccred(delegation->cred);
-                rcu_read_unlock();
-                status = nfs41_test_stateid(server, stateid, cred);
-                trace_nfs4_test_delegation_stateid(state, NULL, status);
-        } else
                rcu_read_unlock();
+                return;
+        }
+        nfs4_stateid_copy(&stateid, &delegation->stateid);
+        cred = get_rpccred(delegation->cred);
+        rcu_read_unlock();
+        status = nfs41_test_stateid(server, &stateid, cred);
+        trace_nfs4_test_delegation_stateid(state, NULL, status);
        if (status != NFS_OK) {
                /* Free the stateid unless the server explicitly
                 * informs us the stateid is unrecognized. */
                if (status != -NFS4ERR_BAD_STATEID)
-                        nfs41_free_stateid(server, stateid, cred);
+                        nfs41_free_stateid(server, &stateid, cred);
-                nfs_remove_bad_delegation(state->inode);
+                nfs_finish_clear_delegation_stateid(state);
-                write_seqlock(&state->seqlock);
-                nfs4_stateid_copy(&state->stateid, &state->open_stateid);
-                write_sequnlock(&state->seqlock);
-                clear_bit(NFS_DELEGATED_STATE, &state->flags);
        }
-        if (cred != NULL)
+        put_rpccred(cred);
-                put_rpccred(cred);
 }
 /**
@@ -2192,7 +2201,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
 {
        int status;
-        nfs41_clear_delegation_stateid(state);
+        nfs41_check_delegation_stateid(state);
        status = nfs41_check_open_stateid(state);
        if (status != NFS_OK)
                status = nfs4_open_expired(sp, state);
@@ -2231,19 +2240,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
        seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
        ret = _nfs4_proc_open(opendata);
-        if (ret != 0) {
+        if (ret != 0)
-                if (ret == -ENOENT) {
-                        dentry = opendata->dentry;
-                        if (dentry->d_inode)
-                                d_delete(dentry);
-                        else if (d_unhashed(dentry))
-                                d_add(dentry, NULL);
-                        nfs_set_verifier(dentry,
-                                         nfs_save_change_attribute(opendata->dir->d_inode));
-                }
                goto out;
-        }
        state = nfs4_opendata_to_nfs4_state(opendata);
        ret = PTR_ERR(state);
@@ -4841,9 +4839,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
                case -NFS4ERR_DELEG_REVOKED:
                case -NFS4ERR_ADMIN_REVOKED:
                case -NFS4ERR_BAD_STATEID:
-                        if (state == NULL)
-                                break;
-                        nfs_remove_bad_delegation(state->inode);
                case -NFS4ERR_OPENMODE:
                        if (state == NULL)
                                break;
@@ -8341,7 +8336,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
        .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
        .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
-        .recover_open   = nfs4_open_expired,
+        .recover_open   = nfs40_open_expired,
        .recover_lock   = nfs4_lock_expired,
        .establish_clid = nfs4_init_clientid,
 };
@@ -8408,8 +8403,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
                | NFS_CAP_CHANGE_ATTR
                | NFS_CAP_POSIX_LOCK
                | NFS_CAP_STATEID_NFSV41
-                | NFS_CAP_ATOMIC_OPEN_V1
+                | NFS_CAP_ATOMIC_OPEN_V1,
-                | NFS_CAP_SEEK,
        .init_client = nfs41_init_client,
        .shutdown_client = nfs41_shutdown_client,
        .match_stateid = nfs41_match_stateid,
@@ -8431,7 +8425,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
                | NFS_CAP_CHANGE_ATTR
                | NFS_CAP_POSIX_LOCK
                | NFS_CAP_STATEID_NFSV41
-                | NFS_CAP_ATOMIC_OPEN_V1,
+                | NFS_CAP_ATOMIC_OPEN_V1
+                | NFS_CAP_SEEK,
        .init_client = nfs41_init_client,
        .shutdown_client = nfs41_shutdown_client,
        .match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 12493846a2d3..f83b02dc9166 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -715,8 +715,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
        if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
                nfs_release_request(req);
-        else
-                WARN_ON_ONCE(1);
 }
 static void
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9d3e9c50066a..89326acd4561 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -229,8 +229,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
                                              &fsnotify_mark_srcu);
        }
+        /*
+         * We need to merge inode & vfsmount mark lists so that inode mark
+         * ignore masks are properly reflected for mount mark notifications.
+         * That's why this traversal is so complicated...
+         */
        while (inode_node || vfsmount_node) {
-                inode_group = vfsmount_group = NULL;
+                inode_group = NULL;
+                inode_mark = NULL;
+                vfsmount_group = NULL;
+                vfsmount_mark = NULL;
                if (inode_node) {
                        inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
@@ -244,21 +252,19 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
                        vfsmount_group = vfsmount_mark->group;
                }
-                if (inode_group > vfsmount_group) {
+                if (inode_group && vfsmount_group) {
-                        /* handle inode */
+                        int cmp = fsnotify_compare_groups(inode_group,
-                        ret = send_to_group(to_tell, inode_mark, NULL, mask,
+                                                          vfsmount_group);
-                                            data, data_is, cookie, file_name);
+                        if (cmp > 0) {
-                        /* we didn't use the vfsmount_mark */
+                                inode_group = NULL;
-                        vfsmount_group = NULL;
+                                inode_mark = NULL;
-                } else if (vfsmount_group > inode_group) {
+                        } else if (cmp < 0) {
-                        ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
+                                vfsmount_group = NULL;
-                                            data, data_is, cookie, file_name);
+                                vfsmount_mark = NULL;
-                        inode_group = NULL;
+                        }
-                } else {
-                        ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
-                                            mask, data, data_is, cookie,
-                                            file_name);
                }
+                ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
+                                    data, data_is, cookie, file_name);
                if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
                        goto out;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 9c0898c4cfe1..3b68b0ae0a97 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -12,6 +12,10 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group);
 /* protects reads of inode and vfsmount marks list */
 extern struct srcu_struct fsnotify_mark_srcu;
+/* compare two groups for sorting of marks lists */
+extern int fsnotify_compare_groups(struct fsnotify_group *a,
+                                   struct fsnotify_group *b);
 extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
                                                __u32 mask);
 /* add a mark to an inode */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e8497144b323..dfbf5447eea4 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -194,6 +194,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 {
        struct fsnotify_mark *lmark, *last = NULL;
        int ret = 0;
+        int cmp;
        mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
@@ -219,11 +220,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                        goto out;
                }
-                if (mark->group->priority < lmark->group->priority)
+                cmp = fsnotify_compare_groups(lmark->group, mark->group);
-                        continue;
+                if (cmp < 0)
-                if ((mark->group->priority == lmark->group->priority) &&
-                    (mark->group < lmark->group))
                        continue;
                hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d90deaa08e78..34c38fabf514 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -210,6 +210,42 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas
 }
 /*
+ * Sorting function for lists of fsnotify marks.
+ *
+ * Fanotify supports different notification classes (reflected as priority of
+ * notification group). Events shall be passed to notification groups in
+ * decreasing priority order. To achieve this marks in notification lists for
+ * inodes and vfsmounts are sorted so that priorities of corresponding groups
+ * are descending.
+ *
+ * Furthermore correct handling of the ignore mask requires processing inode
+ * and vfsmount marks of each group together. Using the group address as
+ * further sort criterion provides a unique sorting order and thus we can
+ * merge inode and vfsmount lists of marks in linear time and find groups
+ * present in both lists.
+ *
+ * A return value of 1 signifies that b has priority over a.
+ * A return value of 0 signifies that the two marks have to be handled together.
+ * A return value of -1 signifies that a has priority over b.
+ */
+int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
+{
+        /* Identical pointers (including two NULLs) compare equal. */
+        if (a == b)
+                return 0;
+        /* A missing group always loses to a present one. */
+        if (!a)
+                return 1;
+        if (!b)
+                return -1;
+        /* The group with the larger ->priority value has priority. */
+        if (a->priority < b->priority)
+                return 1;
+        if (a->priority > b->priority)
+                return -1;
+        /*
+         * Equal priority: fall back to the group addresses so that two
+         * distinct groups never compare as 0.
+         */
+        if (a < b)
+                return 1;
+        return -1;
+}
+/*
 * Attach an initialized mark to a given group and fs object.
 * These marks may be used for the fsnotify backend to determine which
 * event types should be delivered to which group.
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index ac851e8376b1..faefa72a11eb 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -153,6 +153,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
        struct mount *m = real_mount(mnt);
        struct fsnotify_mark *lmark, *last = NULL;
        int ret = 0;
+        int cmp;
        mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
@@ -178,11 +179,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
                        goto out;
                }
-                if (mark->group->priority < lmark->group->priority)
+                cmp = fsnotify_compare_groups(lmark->group, mark->group);
-                        continue;
+                if (cmp < 0)
-                if ((mark->group->priority == lmark->group->priority) &&
-                    (mark->group < lmark->group))
                        continue;
                hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 97de0fbd9f78..a96044004064 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -925,7 +925,7 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
                              size_t veclen, size_t total)
 {
        int ret;
-        struct msghdr msg;
+        struct msghdr msg = {.msg_flags = 0,};
        if (sock == NULL) {
                ret = -EINVAL;
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 4e9d7c1fea52..2a7ef4f8e2a6 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -168,7 +168,7 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
 {
        struct ovl_dir_cache *cache = od->cache;
-        list_del(&od->cursor.l_node);
+        list_del_init(&od->cursor.l_node);
        WARN_ON(cache->refcount <= 0);
        cache->refcount--;
        if (!cache->refcount) {
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 92e8f99a5857..281002689d64 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1338,7 +1338,10 @@ xfs_free_file_space(
        goto out;
 }
+/*
+ * Preallocate and zero a range of a file. This mechanism has the allocation
+ * semantics of fallocate and in addition converts data in the range to zeroes.
+ */
 int
 xfs_zero_file_space(
        struct xfs_inode        *ip,
@@ -1346,65 +1349,30 @@ xfs_zero_file_space(
        xfs_off_t               len)
 {
        struct xfs_mount        *mp = ip->i_mount;
-        uint                    granularity;
+        uint                    blksize;
-        xfs_off_t               start_boundary;
-        xfs_off_t               end_boundary;
        int                     error;
        trace_xfs_zero_file_space(ip);
-        granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+        blksize = 1 << mp->m_sb.sb_blocklog;
        /*
-         * Round the range of extents we are going to convert inwards.  If the
+         * Punch a hole and prealloc the range. We use hole punch rather than
-         * offset is aligned, then it doesn't get changed so we zero from the
+         * unwritten extent conversion for two reasons:
-         * start of the block offset points to.
+         *
+         * 1.) Hole punch handles partial block zeroing for us.
+         *
+         * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
+         * by virtue of the hole punch.
         */
-        start_boundary = round_up(offset, granularity);
+        error = xfs_free_file_space(ip, offset, len);
-        end_boundary = round_down(offset + len, granularity);
+        if (error)
+                goto out;
-        ASSERT(start_boundary >= offset);
-        ASSERT(end_boundary <= offset + len);
-        if (start_boundary < end_boundary - 1) {
-                /*
-                 * Writeback the range to ensure any inode size updates due to
-                 * appending writes make it to disk (otherwise we could just
-                 * punch out the delalloc blocks).
-                 */
-                error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-                                start_boundary, end_boundary - 1);
-                if (error)
-                        goto out;
-                truncate_pagecache_range(VFS_I(ip), start_boundary,
-                                         end_boundary - 1);
-                /* convert the blocks */
-                error = xfs_alloc_file_space(ip, start_boundary,
-                                        end_boundary - start_boundary - 1,
-                                        XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
-                if (error)
-                        goto out;
-                /* We've handled the interior of the range, now for the edges */
-                if (start_boundary != offset) {
-                        error = xfs_iozero(ip, offset, start_boundary - offset);
-                        if (error)
-                                goto out;
-                }
-                if (end_boundary != offset + len)
-                        error = xfs_iozero(ip, end_boundary,
-                                           offset + len - end_boundary);
-        } else {
-                /*
-                 * It's either a sub-granularity range or the range spanned lies
-                 * partially across two adjacent blocks.
-                 */
-                error = xfs_iozero(ip, offset, len);
-        }
+        error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+                                     round_up(offset + len, blksize) -
+                                     round_down(offset, blksize),
+                                     XFS_BMAPI_PREALLOC);
 out:
        return error;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f1deb961a296..894924a5129b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -236,8 +236,10 @@ xfs_bulkstat_grab_ichunk(
        XFS_WANT_CORRUPTED_RETURN(stat == 1);
        /* Check if the record contains the inode in request */
-        if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
+        if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
-                return -EINVAL;
+                *icount = 0;
+                return 0;
+        }
        idx = agino - irec->ir_startino + 1;
        if (idx < XFS_INODES_PER_CHUNK &&
@@ -262,75 +264,76 @@ xfs_bulkstat_grab_ichunk(
 #define XFS_BULKSTAT_UBLEFT(ubleft)     ((ubleft) >= statstruct_size)
+struct xfs_bulkstat_agichunk {
+        char            __user **ac_ubuffer;/* pointer into user's buffer */
+        int             ac_ubleft;      /* bytes left in user's buffer */
+        int             ac_ubelem;      /* spaces used in user's buffer */
+};
 /*
 * Process inodes in chunk with a pointer to a formatter function
 * that will iget the inode and fill in the appropriate structure.
 */
-int
+static int
 xfs_bulkstat_ag_ichunk(
        struct xfs_mount                *mp,
        xfs_agnumber_t                  agno,
        struct xfs_inobt_rec_incore     *irbp,
        bulkstat_one_pf                 formatter,
        size_t                          statstruct_size,
-        struct xfs_bulkstat_agichunk    *acp)
+        struct xfs_bulkstat_agichunk    *acp,
+        xfs_agino_t                     *last_agino)
 {
-        xfs_ino_t                       lastino = acp->ac_lastino;
        char                            __user **ubufp = acp->ac_ubuffer;
-        int                             ubleft = acp->ac_ubleft;
+        int                             chunkidx;
-        int                             ubelem = acp->ac_ubelem;
-        int                             chunkidx, clustidx;
        int                             error = 0;
-        xfs_agino_t                     agino;
+        xfs_agino_t                     agino = irbp->ir_startino;
-        for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
+        for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
-             XFS_BULKSTAT_UBLEFT(ubleft) &&
+             chunkidx++, agino++) {
-             irbp->ir_freecount < XFS_INODES_PER_CHUNK;
+                int             fmterror;
-             chunkidx++, clustidx++, agino++) {
-                int             fmterror;       /* bulkstat formatter result */
                int             ubused;
-                xfs_ino_t       ino = XFS_AGINO_TO_INO(mp, agno, agino);
-                ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
+                /* inode won't fit in buffer, we are done */
+                if (acp->ac_ubleft < statstruct_size)
+                        break;
                /* Skip if this inode is free */
-                if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
+                if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
-                        lastino = ino;
                        continue;
-                }
-                /*
-                 * Count used inodes as free so we can tell when the
-                 * chunk is used up.
-                 */
-                irbp->ir_freecount++;
                /* Get the inode and fill in a single buffer */
                ubused = statstruct_size;
-                error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror);
+                error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
-                if (fmterror == BULKSTAT_RV_NOTHING) {
+                                  *ubufp, acp->ac_ubleft, &ubused, &fmterror);
-                        if (error && error != -ENOENT && error != -EINVAL) {
-                                ubleft = 0;
+                if (fmterror == BULKSTAT_RV_GIVEUP ||
-                                break;
+                    (error && error != -ENOENT && error != -EINVAL)) {
-                        }
+                        acp->ac_ubleft = 0;
-                        lastino = ino;
-                        continue;
-                }
-                if (fmterror == BULKSTAT_RV_GIVEUP) {
-                        ubleft = 0;
                        ASSERT(error);
                        break;
                }
-                if (*ubufp)
-                        *ubufp += ubused;
+                /* be careful not to leak error if at end of chunk */
-                ubleft -= ubused;
+                if (fmterror == BULKSTAT_RV_NOTHING || error) {
-                ubelem++;
+                        error = 0;
-                lastino = ino;
+                        continue;
+                }
+                *ubufp += ubused;
+                acp->ac_ubleft -= ubused;
+                acp->ac_ubelem++;
        }
-        acp->ac_lastino = lastino;
+        /*
-        acp->ac_ubleft = ubleft;
+         * Post-update *last_agino. At this point, agino will always point one
-        acp->ac_ubelem = ubelem;
+         * inode past the last inode we processed successfully. Hence we
+         * subtract that inode when setting the *last_agino cursor so that we
+         * return the correct cookie to userspace. On the next bulkstat call,
+         * the inode under the lastino cookie will be skipped as we have already
+         * processed it here.
+         */
+        *last_agino = agino - 1;
        return error;
 }
@@ -353,45 +356,33 @@ xfs_bulkstat(
        xfs_agino_t             agino;  /* inode # in allocation group */
        xfs_agnumber_t          agno;   /* allocation group number */
        xfs_btree_cur_t         *cur;   /* btree cursor for ialloc btree */
-        int                     end_of_ag; /* set if we've seen the ag end */
-        int                     error;  /* error code */
-        int                     fmterror;/* bulkstat formatter result */
-        int                     i;      /* loop index */
-        int                     icount; /* count of inodes good in irbuf */
        size_t                  irbsize; /* size of irec buffer in bytes */
-        xfs_ino_t               ino;    /* inode number (filesystem) */
-        xfs_inobt_rec_incore_t  *irbp;  /* current irec buffer pointer */
        xfs_inobt_rec_incore_t  *irbuf; /* start of irec buffer */
-        xfs_inobt_rec_incore_t  *irbufend; /* end of good irec buffer entries */
-        xfs_ino_t               lastino; /* last inode number returned */
        int                     nirbuf; /* size of irbuf */
-        int                     rval;   /* return value error code */
-        int                     tmp;    /* result value from btree calls */
        int                     ubcount; /* size of user's buffer */
-        int                     ubleft; /* bytes left in user's buffer */
+        struct xfs_bulkstat_agichunk ac;
-        char                    __user *ubufp;  /* pointer into user's buffer */
+        int                     error = 0;
-        int                     ubelem; /* spaces used in user's buffer */
        /*
         * Get the last inode value, see if there's nothing to do.
         */
-        ino = (xfs_ino_t)*lastinop;
+        agno = XFS_INO_TO_AGNO(mp, *lastinop);
-        lastino = ino;
+        agino = XFS_INO_TO_AGINO(mp, *lastinop);
-        agno = XFS_INO_TO_AGNO(mp, ino);
-        agino = XFS_INO_TO_AGINO(mp, ino);
        if (agno >= mp->m_sb.sb_agcount ||
-            ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+            *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
                *done = 1;
                *ubcountp = 0;
                return 0;
        }
        ubcount = *ubcountp; /* statstruct's */
-        ubleft = ubcount * statstruct_size; /* bytes */
+        ac.ac_ubuffer = &ubuffer;
-        *ubcountp = ubelem = 0;
+        ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
+        ac.ac_ubelem = 0;
+        *ubcountp = 0;
        *done = 0;
-        fmterror = 0;
-        ubufp = ubuffer;
        irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
        if (!irbuf)
                return -ENOMEM;
@@ -402,9 +393,13 @@ xfs_bulkstat(
         * Loop over the allocation groups, starting from the last
         * inode returned; 0 means start of the allocation group.
         */
-        rval = 0;
+        while (agno < mp->m_sb.sb_agcount) {
-        while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
+                struct xfs_inobt_rec_incore     *irbp = irbuf;
-                cond_resched();
+                struct xfs_inobt_rec_incore     *irbufend = irbuf + nirbuf;
+                bool                            end_of_ag = false;
+                int                             icount = 0;
+                int                             stat;
                error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
                if (error)
                        break;
@@ -414,10 +409,6 @@ xfs_bulkstat(
                 */
                cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
                                            XFS_BTNUM_INO);
-                irbp = irbuf;
-                irbufend = irbuf + nirbuf;
-                end_of_ag = 0;
-                icount = 0;
                if (agino > 0) {
                        /*
                         * In the middle of an allocation group, we need to get
@@ -427,22 +418,23 @@ xfs_bulkstat(
                        error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
                        if (error)
-                                break;
+                                goto del_cursor;
                        if (icount) {
                                irbp->ir_startino = r.ir_startino;
                                irbp->ir_freecount = r.ir_freecount;
                                irbp->ir_free = r.ir_free;
                                irbp++;
-                                agino = r.ir_startino + XFS_INODES_PER_CHUNK;
                        }
                        /* Increment to the next record */
-                        error = xfs_btree_increment(cur, 0, &tmp);
+                        error = xfs_btree_increment(cur, 0, &stat);
                } else {
                        /* Start of ag.  Lookup the first inode chunk */
-                        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
+                        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
+                }
+                if (error || stat == 0) {
+                        end_of_ag = true;
+                        goto del_cursor;
                }
-                if (error)
-                        break;
                /*
                 * Loop through inode btree records in this ag,
@@ -451,10 +443,10 @@ xfs_bulkstat(
                while (irbp < irbufend && icount < ubcount) {
                        struct xfs_inobt_rec_incore     r;
-                        error = xfs_inobt_get_rec(cur, &r, &i);
+                        error = xfs_inobt_get_rec(cur, &r, &stat);
-                        if (error || i == 0) {
+                        if (error || stat == 0) {
-                                end_of_ag = 1;
+                                end_of_ag = true;
-                                break;
+                                goto del_cursor;
                        }
                        /*
@@ -469,77 +461,79 @@ xfs_bulkstat(
                                irbp++;
                                icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
                        }
-                        /*
+                        error = xfs_btree_increment(cur, 0, &stat);
-                         * Set agino to after this chunk and bump the cursor.
+                        if (error || stat == 0) {
-                         */
+                                end_of_ag = true;
-                        agino = r.ir_startino + XFS_INODES_PER_CHUNK;
+                                goto del_cursor;
-                        error = xfs_btree_increment(cur, 0, &tmp);
+                        }
                        cond_resched();
                }
                /*
-                 * Drop the btree buffers and the agi buffer.
+                 * Drop the btree buffers and the agi buffer as we can't hold any
-                 * We can't hold any of the locks these represent
+                 * of the locks these represent when calling iget. If there is a
-                 * when calling iget.
+                 * pending error, then we are done.
                 */
+del_cursor:
                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
                xfs_buf_relse(agbp);
+                if (error)
+                        break;
                /*
-                 * Now format all the good inodes into the user's buffer.
+                 * Now format all the good inodes into the user's buffer. The
+                 * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
+                 * for the next loop iteration.
                 */
                irbufend = irbp;
                for (irbp = irbuf;
-                     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
+                     irbp < irbufend && ac.ac_ubleft >= statstruct_size;
-                        struct xfs_bulkstat_agichunk ac;
+                     irbp++) {
-                        ac.ac_lastino = lastino;
-                        ac.ac_ubuffer = &ubuffer;
-                        ac.ac_ubleft = ubleft;
-                        ac.ac_ubelem = ubelem;
                        error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
-                                        formatter, statstruct_size, &ac);
+                                        formatter, statstruct_size, &ac,
+                                        &agino);
                        if (error)
-                                rval = error;
+                                break;
-                        lastino = ac.ac_lastino;
-                        ubleft = ac.ac_ubleft;
-                        ubelem = ac.ac_ubelem;
                        cond_resched();
                }
                /*
-                 * Set up for the next loop iteration.
+                 * If we've run out of space or had a formatting error, we
+                 * are now done
                 */
-                if (XFS_BULKSTAT_UBLEFT(ubleft)) {
+                if (ac.ac_ubleft < statstruct_size || error)
-                        if (end_of_ag) {
-                                agno++;
-                                agino = 0;
-                        } else
-                                agino = XFS_INO_TO_AGINO(mp, lastino);
-                } else
                        break;
+                if (end_of_ag) {
+                        agno++;
+                        agino = 0;
+                }
        }
        /*
         * Done, we're either out of filesystem or space to put the data.
         */
        kmem_free(irbuf);
-        *ubcountp = ubelem;
+        *ubcountp = ac.ac_ubelem;
        /*
-         * Found some inodes, return them now and return the error next time.
+         * We found some inodes, so clear the error status and return them.
+         * The lastino pointer will point directly at the inode that triggered
+         * any error that occurred, so on the next call the error will be
+         * triggered again and propagated to userspace as there will be no
+         * formatted inodes in the buffer.
         */
-        if (ubelem)
+        if (ac.ac_ubelem)
-                rval = 0;
+                error = 0;
-        if (agno >= mp->m_sb.sb_agcount) {
-                /*
+        /*
-                 * If we ran out of filesystem, mark lastino as off
+         * If we ran out of filesystem, lastino will point off the end of
-                 * the end of the filesystem, so the next call
+         * the filesystem so the next call will return immediately.
-                 * will return immediately.
+         */
-                 */
+        *lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
-                *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
+        if (agno >= mp->m_sb.sb_agcount)
                *done = 1;
-        } else
-                *lastinop = (xfs_ino_t)lastino;
-        return rval;
+        return error;
 }
 int
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index aaed08022eb9..6ea8b3912fa4 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
                               int              *ubused,
                               int              *stat);
-struct xfs_bulkstat_agichunk {
-        xfs_ino_t       ac_lastino;     /* last inode returned */
-        char            __user **ac_ubuffer;/* pointer into user's buffer */
-        int             ac_ubleft;      /* bytes left in user's buffer */
-        int             ac_ubelem;      /* spaces used in user's buffer */
-};
-int
-xfs_bulkstat_ag_ichunk(
-        struct xfs_mount                *mp,
-        xfs_agnumber_t                  agno,
-        struct xfs_inobt_rec_incore     *irbp,
-        bulkstat_one_pf                 formatter,
-        size_t                          statstruct_size,
-        struct xfs_bulkstat_agichunk    *acp);
 /*
 * Values for stat return value.
 */