119 files changed, 2206 insertions, 1043 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 4b0eff6da674..85737e96ab8b 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -189,11 +189,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
        case 1:
                _debug("extract FID count");
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("FID count: %u", call->count);
@@ -210,11 +207,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
                _debug("extract FID array");
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       call->count * 3 * 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                _debug("unmarshall FID array");
                call->request = kcalloc(call->count,
@@ -239,11 +233,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
        case 3:
                _debug("extract CB count");
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                tmp = ntohl(call->tmp);
                _debug("CB count: %u", tmp);
@@ -258,11 +249,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
                _debug("extract CB array");
                ret = afs_extract_data(call, skb, last, call->request,
                                       call->count * 3 * 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                _debug("unmarshall CB array");
                cb = call->request;
@@ -278,9 +266,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
                call->unmarshall++;
        case 5:
-                _debug("trailer");
+                ret = afs_data_complete(call, skb, last);
-                if (skb->len != 0)
+                if (ret < 0)
-                        return -EBADMSG;
+                        return ret;
                /* Record that the message was unmarshalled successfully so
                 * that the call destructor can know do the callback breaking
@@ -294,8 +282,6 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
                break;
        }
-        if (!last)
-                return 0;
        call->state = AFS_CALL_REPLYING;
@@ -335,13 +321,13 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
 {
        struct afs_server *server;
        struct in_addr addr;
+        int ret;
        _enter(",{%u},%d", skb->len, last);
-        if (skb->len > 0)
+        ret = afs_data_complete(call, skb, last);
-                return -EBADMSG;
+        if (ret < 0)
-        if (!last)
+                return ret;
-                return 0;
        /* no unmarshalling required */
        call->state = AFS_CALL_REPLYING;
@@ -371,8 +357,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
        _enter(",{%u},%d", skb->len, last);
+        /* There are some arguments that we ignore */
+        afs_data_consumed(call, skb);
        if (!last)
-                return 0;
+                return -EAGAIN;
        /* no unmarshalling required */
        call->state = AFS_CALL_REPLYING;
@@ -408,12 +396,13 @@ static void SRXAFSCB_Probe(struct work_struct *work)
 static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
                                bool last)
 {
+        int ret;
        _enter(",{%u},%d", skb->len, last);
-        if (skb->len > 0)
+        ret = afs_data_complete(call, skb, last);
-                return -EBADMSG;
+        if (ret < 0)
-        if (!last)
+                return ret;
-                return 0;
        /* no unmarshalling required */
        call->state = AFS_CALL_REPLYING;
@@ -460,10 +449,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        if (skb->len > 0)
+        ret = afs_data_complete(call, skb, last);
-                return -EBADMSG;
+        if (ret < 0)
-        if (!last)
+                return ret;
-                return 0;
        switch (call->unmarshall) {
        case 0:
@@ -509,8 +497,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
                break;
        }
-        if (!last)
+        ret = afs_data_complete(call, skb, last);
-                return 0;
+        if (ret < 0)
+                return ret;
        call->state = AFS_CALL_REPLYING;
@@ -588,12 +577,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
                                                 struct sk_buff *skb, bool last)
 {
+        int ret;
        _enter(",{%u},%d", skb->len, last);
-        if (skb->len > 0)
+        ret = afs_data_complete(call, skb, last);
-                return -EBADMSG;
+        if (ret < 0)
-        if (!last)
+                return ret;
-                return 0;
        /* no unmarshalling required */
        call->state = AFS_CALL_REPLYING;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index c2e930ec2888..9312b92e54be 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -240,15 +240,13 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter(",,%u", last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -335,11 +333,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
        case 1:
                _debug("extract data length (MSW)");
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("DATA length MSW: %u", call->count);
@@ -353,11 +348,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
        case 2:
                _debug("extract data length");
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("DATA length: %u", call->count);
@@ -375,11 +367,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
                        ret = afs_extract_data(call, skb, last, buffer,
                                               call->count);
                        kunmap_atomic(buffer);
-                        switch (ret) {
+                        if (ret < 0)
-                        case 0:         break;
+                                return ret;
-                        case -EAGAIN:   return 0;
-                        default:        return ret;
-                        }
                }
                call->offset = 0;
@@ -389,11 +378,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
        case 4:
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       (21 + 3 + 6) * 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                bp = call->buffer;
                xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
@@ -405,15 +391,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
                call->unmarshall++;
        case 5:
-                _debug("trailer");
+                ret = afs_data_complete(call, skb, last);
-                if (skb->len != 0)
+                if (ret < 0)
-                        return -EBADMSG;
+                        return ret;
                break;
        }
-        if (!last)
-                return 0;
        if (call->count < PAGE_SIZE) {
                _debug("clear");
                page = call->reply3;
@@ -537,9 +520,8 @@ static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
 {
        _enter(",{%u},%d", skb->len, last);
-        if (skb->len > 0)
+        /* shouldn't be any reply data */
-                return -EBADMSG; /* shouldn't be any reply data */
+        return afs_data_complete(call, skb, last);
-        return 0;
 }
 /*
@@ -622,15 +604,13 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -721,15 +701,13 @@ static int afs_deliver_fs_remove(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -804,15 +782,13 @@ static int afs_deliver_fs_link(struct afs_call *call,
 {
        struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -892,15 +868,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -999,15 +973,13 @@ static int afs_deliver_fs_rename(struct afs_call *call,
 {
        struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -1105,20 +1077,13 @@ static int afs_deliver_fs_store_data(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter(",,%u", last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last) {
+        if (ret < 0)
-                _leave(" = 0 [more]");
+                return ret;
-                return 0;
-        }
-        if (call->reply_size != call->reply_max) {
-                _leave(" = -EBADMSG [%u != %u]",
-                       call->reply_size, call->reply_max);
-                return -EBADMSG;
-        }
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -1292,20 +1257,13 @@ static int afs_deliver_fs_store_status(struct afs_call *call,
        afs_dataversion_t *store_version;
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter(",,%u", last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last) {
+        if (ret < 0)
-                _leave(" = 0 [more]");
+                return ret;
-                return 0;
-        }
-        if (call->reply_size != call->reply_max) {
-                _leave(" = -EBADMSG [%u != %u]",
-                       call->reply_size, call->reply_max);
-                return -EBADMSG;
-        }
        /* unmarshall the reply once we've received all of it */
        store_version = NULL;
@@ -1504,11 +1462,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                _debug("extract status");
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       12 * 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                bp = call->buffer;
                xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
@@ -1518,11 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                /* extract the volume name length */
        case 2:
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("volname length: %u", call->count);
@@ -1537,11 +1489,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                if (call->count > 0) {
                        ret = afs_extract_data(call, skb, last, call->reply3,
                                               call->count);
-                        switch (ret) {
+                        if (ret < 0)
-                        case 0:         break;
+                                return ret;
-                        case -EAGAIN:   return 0;
-                        default:        return ret;
-                        }
                }
                p = call->reply3;
@@ -1561,11 +1510,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
        case 4:
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       call->count);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->offset = 0;
                call->unmarshall++;
@@ -1574,11 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                /* extract the offline message length */
        case 5:
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("offline msg length: %u", call->count);
@@ -1593,11 +1536,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                if (call->count > 0) {
                        ret = afs_extract_data(call, skb, last, call->reply3,
                                               call->count);
-                        switch (ret) {
+                        if (ret < 0)
-                        case 0:         break;
+                                return ret;
-                        case -EAGAIN:   return 0;
-                        default:        return ret;
-                        }
                }
                p = call->reply3;
@@ -1617,11 +1557,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
        case 7:
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       call->count);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->offset = 0;
                call->unmarshall++;
@@ -1630,11 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                /* extract the message of the day length */
        case 8:
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("motd length: %u", call->count);
@@ -1649,11 +1583,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                if (call->count > 0) {
                        ret = afs_extract_data(call, skb, last, call->reply3,
                                               call->count);
-                        switch (ret) {
+                        if (ret < 0)
-                        case 0:         break;
+                                return ret;
-                        case -EAGAIN:   return 0;
-                        default:        return ret;
-                        }
                }
                p = call->reply3;
@@ -1673,26 +1604,20 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
        case 10:
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       call->count);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->offset = 0;
                call->unmarshall++;
        no_motd_padding:
        case 11:
-                _debug("trailer %d", skb->len);
+                ret = afs_data_complete(call, skb, last);
-                if (skb->len != 0)
+                if (ret < 0)
-                        return -EBADMSG;
+                        return ret;
                break;
        }
-        if (!last)
-                return 0;
        _leave(" = 0 [done]");
        return 0;
 }
@@ -1764,15 +1689,13 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
                                    struct sk_buff *skb, bool last)
 {
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 71d5982312f3..df976b2a7f40 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -609,17 +609,29 @@ extern void afs_proc_cell_remove(struct afs_cell *);
 */
 extern int afs_open_socket(void);
 extern void afs_close_socket(void);
+extern void afs_data_consumed(struct afs_call *, struct sk_buff *);
 extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
                         const struct afs_wait_mode *);
 extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
                                            size_t, size_t);
 extern void afs_flat_call_destructor(struct afs_call *);
-extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
+extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool);
 extern void afs_send_empty_reply(struct afs_call *);
 extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
 extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
                            size_t);
+static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb,
+                                    bool last)
+{
+        if (skb->len > 0)
+                return -EBADMSG;        /* no payload is expected here */
+        afs_data_consumed(call, skb);
+        if (!last)
+                return -EAGAIN;         /* not flagged as the last skb */
+        return 0;
+}
 /*
 * security.c
 */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 4832de84d52c..14d04c848465 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -150,10 +150,9 @@ void afs_close_socket(void)
 }
 /*
- * note that the data in a socket buffer is now delivered and that the buffer
+ * Note that the data in a socket buffer is now consumed.
- * should be freed
 */
-static void afs_data_delivered(struct sk_buff *skb)
+void afs_data_consumed(struct afs_call *call, struct sk_buff *skb)
 {
        if (!skb) {
                _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
@@ -161,9 +160,7 @@ static void afs_data_delivered(struct sk_buff *skb)
        } else {
                _debug("DLVR %p{%u} [%d]",
                       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
-                if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+                rxrpc_kernel_data_consumed(call->rxcall, skb);
-                        BUG();
-                rxrpc_kernel_data_delivered(skb);
        }
 }
@@ -489,9 +486,15 @@ static void afs_deliver_to_call(struct afs_call *call)
                        last = rxrpc_kernel_is_data_last(skb);
                        ret = call->type->deliver(call, skb, last);
                        switch (ret) {
+                        case -EAGAIN:
+                                if (last) {
+                                        _debug("short data");
+                                        goto unmarshal_error;
+                                }
+                                break;
                        case 0:
-                                if (last &&
+                                ASSERT(last);
-                                    call->state == AFS_CALL_AWAIT_REPLY)
+                                if (call->state == AFS_CALL_AWAIT_REPLY)
                                        call->state = AFS_CALL_COMPLETE;
                                break;
                        case -ENOTCONN:
@@ -501,6 +504,7 @@ static void afs_deliver_to_call(struct afs_call *call)
                                abort_code = RX_INVALID_OPERATION;
                                goto do_abort;
                        default:
+                        unmarshal_error:
                                abort_code = RXGEN_CC_UNMARSHAL;
                                if (call->state != AFS_CALL_AWAIT_REPLY)
                                        abort_code = RXGEN_SS_UNMARSHAL;
@@ -511,9 +515,7 @@ static void afs_deliver_to_call(struct afs_call *call)
                                call->state = AFS_CALL_ERROR;
                                break;
                        }
-                        afs_data_delivered(skb);
+                        break;
-                        skb = NULL;
-                        continue;
                case RXRPC_SKB_MARK_FINAL_ACK:
                        _debug("Rcv ACK");
                        call->state = AFS_CALL_COMPLETE;
@@ -685,15 +687,35 @@ static void afs_process_async_call(struct afs_call *call)
 }
 /*
- * empty a socket buffer into a flat reply buffer
+ * Empty a socket buffer into a flat reply buffer.
+ *
+ * The skb's contents are appended to call->buffer at call->reply_size and the
+ * skb is then passed to afs_data_consumed().  Returns -EBADMSG if the data
+ * would overrun call->reply_max or if the last skb leaves the reply short,
+ * -EAGAIN if this was not the last skb, and 0 once the reply is complete.
  */
-void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
+int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last)
 {
        size_t len = skb->len;
-        if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0)
+        if (len > call->reply_max - call->reply_size) {
-                BUG();
+                _leave(" = -EBADMSG [%zu > %u]",
-        call->reply_size += len;
+                       len, call->reply_max - call->reply_size);
+                return -EBADMSG;
+        }
+        if (len > 0) {
+                if (skb_copy_bits(skb, 0, call->buffer + call->reply_size,
+                                  len) < 0)
+                        BUG();
+                call->reply_size += len;
+        }
+        afs_data_consumed(call, skb);
+        if (!last)
+                return -EAGAIN;
+        if (call->reply_size != call->reply_max) {
+                _leave(" = -EBADMSG [%u != %u]",
+                       call->reply_size, call->reply_max);
+                return -EBADMSG;
+        }
+        return 0;
 }
 /*
@@ -745,7 +767,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
 }
 /*
- * grab the operation ID from an incoming cache manager call
+ * Grab the operation ID from an incoming cache manager call.  The socket
+ * buffer is discarded on error or if we don't yet have sufficient data.
 */
 static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
                                bool last)
@@ -766,12 +789,9 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
        call->offset += len;
        if (call->offset < 4) {
-                if (last) {
+                afs_data_consumed(call, skb);
-                        _leave(" = -EBADMSG [op ID short]");
+                _leave(" = -EAGAIN");
-                        return -EBADMSG;
+                return -EAGAIN;
-                }
-                _leave(" = 0 [incomplete]");
-                return 0;
        }
        call->state = AFS_CALL_AWAIT_REQUEST;
@@ -855,7 +875,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 }
 /*
- * extract a piece of data from the received data socket buffers
+ * Extract a piece of data from the received data socket buffers.
 */
 int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
                     bool last, void *buf, size_t count)
@@ -873,10 +893,7 @@ int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
        call->offset += len;
        if (call->offset < count) {
-                if (last) {
+                afs_data_consumed(call, skb);
-                        _leave(" = -EBADMSG [%d < %zu]", call->offset, count);
-                        return -EBADMSG;
-                }
                _leave(" = -EAGAIN");
                return -EAGAIN;
        }
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 340afd0cd182..f94d1abdc3eb 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -64,16 +64,13 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
        struct afs_cache_vlocation *entry;
        __be32 *bp;
        u32 tmp;
-        int loop;
+        int loop, ret;
        _enter(",,%u", last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        entry = call->reply;
diff --git a/fs/aio.c b/fs/aio.c
index fb8e45b88cd4..4fe81d1c60f9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
        static const struct dentry_operations ops = {
                .d_dname        = simple_dname,
        };
-        return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
+        struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops,
+                                           AIO_RING_MAGIC);
+        if (!IS_ERR(root))
+                root->d_sb->s_iflags |= SB_I_NOEXEC;
+        return root;
 }
 /* aio_setup
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b493909e7492..d8e6d421c27f 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry,
        }
        return NULL;
 }
 /*
 * Find an eligible tree to time-out
 * A tree is eligible if :-
@@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
        struct dentry *root = sb->s_root;
        struct dentry *dentry;
        struct dentry *expired;
+        struct dentry *found;
        struct autofs_info *ino;
        if (!root)
@@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
        dentry = NULL;
        while ((dentry = get_next_positive_subdir(dentry, root))) {
+                int flags = how;
                spin_lock(&sbi->fs_lock);
                ino = autofs4_dentry_ino(dentry);
-                if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
+                if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
-                        expired = NULL;
-                else
-                        expired = should_expire(dentry, mnt, timeout, how);
-                if (!expired) {
                        spin_unlock(&sbi->fs_lock);
                        continue;
                }
+                spin_unlock(&sbi->fs_lock);
+                expired = should_expire(dentry, mnt, timeout, flags);
+                if (!expired)
+                        continue;
+                spin_lock(&sbi->fs_lock);
                ino = autofs4_dentry_ino(expired);
                ino->flags |= AUTOFS_INF_WANT_EXPIRE;
                spin_unlock(&sbi->fs_lock);
                synchronize_rcu();
-                spin_lock(&sbi->fs_lock);
-                if (should_expire(expired, mnt, timeout, how)) {
-                        if (expired != dentry)
-                                dput(dentry);
-                        goto found;
-                }
+                /* Recheck with AUTOFS_EXP_LEAVES cleared so that a
+                 * reference is not taken on found if things have changed.
+                 */
+                flags &= ~AUTOFS_EXP_LEAVES;
+                found = should_expire(expired, mnt, timeout, flags);
+                if (!found || found != expired)
+                        /* Something has changed, continue */
+                        goto next;
+                if (expired != dentry)
+                        dput(dentry);
+                spin_lock(&sbi->fs_lock);
+                goto found;
+next:
+                spin_lock(&sbi->fs_lock);
                ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
+                spin_unlock(&sbi->fs_lock);
                if (expired != dentry)
                        dput(expired);
-                spin_unlock(&sbi->fs_lock);
        }
        return NULL;
@@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
        struct autofs_info *ino = autofs4_dentry_ino(dentry);
        int status;
+        int state;
        /* Block on any pending expire */
        if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
@@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
        if (rcu_walk)
                return -ECHILD;
+retry:
        spin_lock(&sbi->fs_lock);
-        if (ino->flags & AUTOFS_INF_EXPIRING) {
+        state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING);
+        if (state == AUTOFS_INF_WANT_EXPIRE) {
+                spin_unlock(&sbi->fs_lock);
+                /*
+                 * Possibly being selected for expire, wait until
+                 * it's selected or not.
+                 */
+                schedule_timeout_uninterruptible(HZ/10);
+                goto retry;
+        }
+        if (state & AUTOFS_INF_EXPIRING) {
                spin_unlock(&sbi->fs_lock);
                pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7f6aff3f72eb..e5495f37c6ed 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -853,6 +853,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
                current->flags |= PF_RANDOMIZE;
        setup_new_exec(bprm);
+        install_exec_creds(bprm);
        /* Do this so that we can load the interpreter, if need be.  We will
           change some of these later */
@@ -1044,7 +1045,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
                goto out;
 #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
-        install_exec_creds(bprm);
        retval = create_elf_tables(bprm, &loc->elf_ex,
                          load_addr, interp_load_addr);
        if (retval < 0)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c3cdde87cc8c..08ae99343d92 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -249,7 +249,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
                 * thaw_bdev drops it.
                 */
                sb = get_super(bdev);
-                drop_super(sb);
+                if (sb)
+                        drop_super(sb);
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                return sb;
        }
@@ -646,7 +647,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type,
 {
        struct dentry *dent;
        dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
-        if (dent)
+        if (!IS_ERR(dent))
                dent->d_sb->s_iflags |= SB_I_CGROUPWB;
        return dent;
 }
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2b88439c2ee8..455a6b2fd539 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode)
                        list_del(&ref2->list);
                        kmem_cache_free(btrfs_prelim_ref_cache, ref2);
+                        cond_resched();
                }
        }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2fe8f89091a3..33fe03551105 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -427,6 +427,7 @@ struct btrfs_space_info {
        struct list_head ro_bgs;
        struct list_head priority_tickets;
        struct list_head tickets;
+        u64 tickets_id;
        struct rw_semaphore groups_sem;
        /* for block groups in our same type */
@@ -1028,6 +1029,7 @@ struct btrfs_fs_info {
        struct btrfs_workqueue *qgroup_rescan_workers;
        struct completion qgroup_rescan_completion;
        struct btrfs_work qgroup_rescan_work;
+        bool qgroup_rescan_running;     /* protected by qgroup_rescan_lock */
        /* filesystem state */
        unsigned long fs_state;
@@ -1079,6 +1081,8 @@ struct btrfs_fs_info {
        struct list_head pinned_chunks;
        int creating_free_space_tree;
+        /* Used to record internally whether fs has been frozen */
+        int fs_frozen;
 };
 struct btrfs_subvolume_writers {
@@ -2578,7 +2582,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   u64 root_objectid, u64 owner, u64 offset,
                                   struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
                         u64 min_alloc_size, u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index b6d210e7a993..ac02e041464b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        struct btrfs_delayed_ref_head *existing;
        struct btrfs_delayed_ref_head *head_ref = NULL;
        struct btrfs_delayed_ref_root *delayed_refs;
-        struct btrfs_qgroup_extent_record *qexisting;
        int count_mod = 1;
        int must_insert_reserved = 0;
@@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
                qrecord->num_bytes = num_bytes;
                qrecord->old_roots = NULL;
-                qexisting = btrfs_qgroup_insert_dirty_extent(fs_info,
+                if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info,
-                                                             delayed_refs,
+                                        delayed_refs, qrecord))
-                                                             qrecord);
-                if (qexisting)
                        kfree(qrecord);
        }
@@ -862,33 +859,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        return 0;
 }
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
-                                     struct btrfs_trans_handle *trans,
-                                     u64 ref_root, u64 bytenr, u64 num_bytes)
-{
-        struct btrfs_delayed_ref_root *delayed_refs;
-        struct btrfs_delayed_ref_head *ref_head;
-        int ret = 0;
-        if (!fs_info->quota_enabled || !is_fstree(ref_root))
-                return 0;
-        delayed_refs = &trans->transaction->delayed_refs;
-        spin_lock(&delayed_refs->lock);
-        ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
-        if (!ref_head) {
-                ret = -ENOENT;
-                goto out;
-        }
-        WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
-        ref_head->qgroup_ref_root = ref_root;
-        ref_head->qgroup_reserved = num_bytes;
-out:
-        spin_unlock(&delayed_refs->lock);
-        return ret;
-}
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5fca9534a271..43f3629760e9 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -250,9 +250,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, u64 reserved, int action,
                               struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
-                                     struct btrfs_trans_handle *trans,
-                                     u64 ref_root, u64 bytenr, u64 num_bytes);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 59febfb8d04a..54bc8c7c6bcd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -559,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root,
        u32 nritems = btrfs_header_nritems(leaf);
        int slot;
-        if (nritems == 0)
+        if (nritems == 0) {
+                struct btrfs_root *check_root;
+                key.objectid = btrfs_header_owner(leaf);
+                key.type = BTRFS_ROOT_ITEM_KEY;
+                key.offset = (u64)-1;
+                check_root = btrfs_get_fs_root(root->fs_info, &key, false);
+                /*
+                 * The only reason we also check NULL here is that during
+                 * open_ctree() some roots have not yet been set up.
+                 */
+                if (!IS_ERR_OR_NULL(check_root)) {
+                        /* if leaf is the root, then it's fine */
+                        if (leaf->start !=
+                            btrfs_root_bytenr(&check_root->root_item)) {
+                                CORRUPT("non-root leaf's nritems is 0",
+                                        leaf, root, 0);
+                                return -EIO;
+                        }
+                }
                return 0;
+        }
        /* Check the 0 item */
        if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
@@ -612,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root,
        return 0;
 }
+static int check_node(struct btrfs_root *root, struct extent_buffer *node)
+{
+        unsigned long nr = btrfs_header_nritems(node);
+        if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
+                btrfs_crit(root->fs_info,
+                           "corrupt node: block %llu root %llu nritems %lu",
+                           node->start, root->objectid, nr);
+                return -EIO;
+        }
+        return 0;
+}
 static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
                                      u64 phy_offset, struct page *page,
                                      u64 start, u64 end, int mirror)
@@ -682,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
                ret = -EIO;
        }
+        if (found_level > 0 && check_node(root, eb))
+                ret = -EIO;
        if (!ret)
                set_extent_buffer_uptodate(eb);
 err:
@@ -1618,8 +1655,8 @@ fail:
        return ret;
 }
-static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
-                                               u64 root_id)
+                                        u64 root_id)
 {
        struct btrfs_root *root;
@@ -2298,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
        fs_info->quota_enabled = 0;
        fs_info->pending_quota_state = 0;
        fs_info->qgroup_ulist = NULL;
+        fs_info->qgroup_rescan_running = false;
        mutex_init(&fs_info->qgroup_rescan_lock);
 }
@@ -2624,6 +2662,7 @@ int open_ctree(struct super_block *sb,
        atomic_set(&fs_info->qgroup_op_seq, 0);
        atomic_set(&fs_info->reada_works_cnt, 0);
        atomic64_set(&fs_info->tree_mod_seq, 0);
+        fs_info->fs_frozen = 0;
        fs_info->sb = sb;
        fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
        fs_info->metadata_ratio = 0;
@@ -3739,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
        if (btrfs_root_refs(&root->root_item) == 0)
                synchronize_srcu(&fs_info->subvol_srcu);
-        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
                btrfs_free_log(NULL, root);
+                if (root->reloc_root) {
+                        free_extent_buffer(root->reloc_root->node);
+                        free_extent_buffer(root->reloc_root->commit_root);
+                        btrfs_put_fs_root(root->reloc_root);
+                        root->reloc_root = NULL;
+                }
+        }
        if (root->free_ino_pinned)
                __btrfs_remove_free_space_cache(root->free_ino_pinned);
@@ -3851,7 +3897,7 @@ void close_ctree(struct btrfs_root *root)
        smp_mb();
        /* wait for the qgroup rescan worker to stop */
-        btrfs_qgroup_wait_for_completion(fs_info);
+        btrfs_qgroup_wait_for_completion(fs_info, false);
        /* wait for the uuid_scan task to finish */
        down(&fs_info->uuid_tree_rescan_sem);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b3207a0e09f7..f19a982f5a4f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
                                      struct btrfs_key *location);
 int btrfs_init_fs_root(struct btrfs_root *root);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+                                        u64 root_id);
 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
                         struct btrfs_root *root);
 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 61b494e8e604..665da8f66ff1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -60,21 +60,6 @@ enum {
        CHUNK_ALLOC_FORCE = 2,
 };
-/*
- * Control how reservations are dealt with.
- *
- * RESERVE_FREE - freeing a reservation.
- * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
- *   ENOSPC accounting
- * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
- *   bytes_may_use as the ENOSPC accounting is done elsewhere
- */
-enum {
-        RESERVE_FREE = 0,
-        RESERVE_ALLOC = 1,
-        RESERVE_ALLOC_NO_ACCOUNT = 2,
-};
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root, u64 bytenr,
                              u64 num_bytes, int alloc);
@@ -104,9 +89,10 @@ static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
-static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                       u64 num_bytes, int reserve,
+                                    u64 ram_bytes, u64 num_bytes, int delalloc);
-                                       int delalloc);
+static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                     u64 num_bytes, int delalloc);
 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
                               u64 num_bytes);
 int btrfs_pin_extent(struct btrfs_root *root,
@@ -3501,7 +3487,6 @@ again:
                dcs = BTRFS_DC_SETUP;
        else if (ret == -ENOSPC)
                set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
-        btrfs_free_reserved_data_space(inode, 0, num_pages);
 out_put:
        iput(inode);
@@ -4286,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
        if (ret < 0)
                return ret;
-        /*
+        /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
-         * Use new btrfs_qgroup_reserve_data to reserve precious data space
-         *
-         * TODO: Find a good method to avoid reserve data space for NOCOW
-         * range, but don't impact performance on quota disable case.
-         */
        ret = btrfs_qgroup_reserve_data(inode, start, len);
+        if (ret)
+                btrfs_free_reserved_data_space_noquota(inode, start, len);
        return ret;
 }
@@ -4472,6 +4454,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
        }
 }
+/*
+ * If force is CHUNK_ALLOC_FORCE:
+ *    - return 1 if it successfully allocates a chunk,
+ *    - return errors including -ENOSPC otherwise.
+ * If force is NOT CHUNK_ALLOC_FORCE:
+ *    - return 0 if it doesn't need to allocate a new chunk,
+ *    - return 1 if it successfully allocates a chunk,
+ *    - return errors including -ENOSPC otherwise.
+ */
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags, int force)
 {
@@ -4882,7 +4873,7 @@ static int flush_space(struct btrfs_root *root,
                                     btrfs_get_alloc_profile(root, 0),
                                     CHUNK_ALLOC_NO_FORCE);
                btrfs_end_transaction(trans, root);
-                if (ret == -ENOSPC)
+                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
                break;
        case COMMIT_TRANS:
@@ -4907,11 +4898,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
        u64 expected;
        u64 to_reclaim = 0;
-        to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
-        if (can_overcommit(root, space_info, to_reclaim,
-                           BTRFS_RESERVE_FLUSH_ALL))
-                return 0;
        list_for_each_entry(ticket, &space_info->tickets, list)
                to_reclaim += ticket->bytes;
        list_for_each_entry(ticket, &space_info->priority_tickets, list)
@@ -4919,6 +4905,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
        if (to_reclaim)
                return to_reclaim;
+        to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
+        if (can_overcommit(root, space_info, to_reclaim,
+                           BTRFS_RESERVE_FLUSH_ALL))
+                return 0;
        used = space_info->bytes_used + space_info->bytes_reserved +
               space_info->bytes_pinned + space_info->bytes_readonly +
               space_info->bytes_may_use;
@@ -4972,12 +4963,12 @@ static void wake_all_tickets(struct list_head *head)
 */
 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 {
-        struct reserve_ticket *last_ticket = NULL;
        struct btrfs_fs_info *fs_info;
        struct btrfs_space_info *space_info;
        u64 to_reclaim;
        int flush_state;
        int commit_cycles = 0;
+        u64 last_tickets_id;
        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
@@ -4990,8 +4981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
                spin_unlock(&space_info->lock);
                return;
        }
-        last_ticket = list_first_entry(&space_info->tickets,
+        last_tickets_id = space_info->tickets_id;
-                                       struct reserve_ticket, list);
        spin_unlock(&space_info->lock);
        flush_state = FLUSH_DELAYED_ITEMS_NR;
@@ -5011,10 +5001,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
                                                              space_info);
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);
-                if (last_ticket == ticket) {
+                if (last_tickets_id == space_info->tickets_id) {
                        flush_state++;
                } else {
-                        last_ticket = ticket;
+                        last_tickets_id = space_info->tickets_id;
                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                        if (commit_cycles)
                                commit_cycles--;
@@ -5390,6 +5380,7 @@ again:
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
                        ticket->bytes = 0;
+                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        ticket->bytes -= num_bytes;
@@ -5432,6 +5423,7 @@ again:
                        num_bytes -= ticket->bytes;
                        space_info->bytes_may_use += ticket->bytes;
                        ticket->bytes = 0;
+                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        trace_btrfs_space_reservation(fs_info, "space_info",
@@ -6497,19 +6489,15 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 }
 /**
- * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * btrfs_add_reserved_bytes - update the block_group and space info counters
 * @cache:      The cache we are manipulating
+ * @ram_bytes:  The number of bytes of file content; this is the same as
+ *              @num_bytes except on the compression path.
 * @num_bytes:  The number of bytes in question
- * @reserve:    One of the reservation enums
 * @delalloc:   The blocks are allocated for the delalloc write
 *
- * This is called by the allocator when it reserves space, or by somebody who is
+ * This is called by the allocator when it reserves space.  @ram_bytes is
- * freeing space that was never actually used on disk.  For example if you
+ * subtracted from the space_info's bytes_may_use so we do the proper
- * reserve some space for a new leaf in transaction A and before transaction A
- * commits you free that leaf, you call this with reserve set to 0 in order to
- * clear the reservation.
- *
- * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
 * ENOSPC accounting.  For data we handle the reservation through clearing the
 * delalloc bits in the io_tree.  We have to do this since we could end up
 * allocating less disk space for the amount of data we have reserved in the
@@ -6519,44 +6507,63 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 * make the reservation and return -EAGAIN, otherwise this function always
 * succeeds.
 */
-static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                       u64 num_bytes, int reserve, int delalloc)
+                                    u64 ram_bytes, u64 num_bytes, int delalloc)
 {
        struct btrfs_space_info *space_info = cache->space_info;
        int ret = 0;
        spin_lock(&space_info->lock);
        spin_lock(&cache->lock);
-        if (reserve != RESERVE_FREE) {
+        if (cache->ro) {
-                if (cache->ro) {
+                ret = -EAGAIN;
-                        ret = -EAGAIN;
-                } else {
-                        cache->reserved += num_bytes;
-                        space_info->bytes_reserved += num_bytes;
-                        if (reserve == RESERVE_ALLOC) {
-                                trace_btrfs_space_reservation(cache->fs_info,
-                                                "space_info", space_info->flags,
-                                                num_bytes, 0);
-                                space_info->bytes_may_use -= num_bytes;
-                        }
-                        if (delalloc)
-                                cache->delalloc_bytes += num_bytes;
-                }
        } else {
-                if (cache->ro)
+                cache->reserved += num_bytes;
-                        space_info->bytes_readonly += num_bytes;
+                space_info->bytes_reserved += num_bytes;
-                cache->reserved -= num_bytes;
-                space_info->bytes_reserved -= num_bytes;
+                trace_btrfs_space_reservation(cache->fs_info,
+                                "space_info", space_info->flags,
+                                ram_bytes, 0);
+                space_info->bytes_may_use -= ram_bytes;
                if (delalloc)
-                        cache->delalloc_bytes -= num_bytes;
+                        cache->delalloc_bytes += num_bytes;
        }
        spin_unlock(&cache->lock);
        spin_unlock(&space_info->lock);
        return ret;
 }
+/**
+ * btrfs_free_reserved_bytes - update the block_group and space info counters
+ * @cache:      The cache we are manipulating
+ * @num_bytes:  The number of bytes in question
+ * @delalloc:   The blocks are allocated for the delalloc write
+ *
+ * This is called by somebody who is freeing space that was never actually used
+ * on disk.  For example if you reserve some space for a new leaf in transaction
+ * A and before transaction A commits you free that leaf, you call this to
+ * clear the reservation.  Always returns 0.
+ */
+static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                     u64 num_bytes, int delalloc)
+{
+        struct btrfs_space_info *space_info = cache->space_info;
+        int ret = 0;
+        spin_lock(&space_info->lock);
+        spin_lock(&cache->lock);
+        if (cache->ro)
+                space_info->bytes_readonly += num_bytes;
+        cache->reserved -= num_bytes;
+        space_info->bytes_reserved -= num_bytes;
+        if (delalloc)
+                cache->delalloc_bytes -= num_bytes;
+        spin_unlock(&cache->lock);
+        spin_unlock(&space_info->lock);
+        return ret;
+}
 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
 {
@@ -7191,7 +7198,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
                btrfs_add_free_space(cache, buf->start, buf->len);
-                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+                btrfs_free_reserved_bytes(cache, buf->len, 0);
                btrfs_put_block_group(cache);
                trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
                pin = 0;
@@ -7416,9 +7423,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
 * the free space extent currently.
 */
 static noinline int find_free_extent(struct btrfs_root *orig_root,
-                                     u64 num_bytes, u64 empty_size,
+                                u64 ram_bytes, u64 num_bytes, u64 empty_size,
-                                     u64 hint_byte, struct btrfs_key *ins,
+                                u64 hint_byte, struct btrfs_key *ins,
-                                     u64 flags, int delalloc)
+                                u64 flags, int delalloc)
 {
        int ret = 0;
        struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -7430,8 +7437,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = __get_raid_index(flags);
-        int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
-                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
        bool use_cluster = true;
@@ -7763,8 +7768,8 @@ checks:
                                             search_start - offset);
                BUG_ON(offset > search_start);
-                ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+                ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
-                                                  alloc_type, delalloc);
+                                num_bytes, delalloc);
                if (ret == -EAGAIN) {
                        btrfs_add_free_space(block_group, offset, num_bytes);
                        goto loop;
@@ -7936,7 +7941,7 @@ again:
        up_read(&info->groups_sem);
 }
-int btrfs_reserve_extent(struct btrfs_root *root,
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
@@ -7948,8 +7953,8 @@ int btrfs_reserve_extent(struct btrfs_root *root,
        flags = btrfs_get_alloc_profile(root, is_data);
 again:
        WARN_ON(num_bytes < root->sectorsize);
-        ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
+        ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
-                               flags, delalloc);
+                               hint_byte, ins, flags, delalloc);
        if (!ret && !is_data) {
                btrfs_dec_block_group_reservations(root->fs_info,
                                                   ins->objectid);
@@ -7958,6 +7963,7 @@ again:
                        num_bytes = min(num_bytes >> 1, ins->offset);
                        num_bytes = round_down(num_bytes, root->sectorsize);
                        num_bytes = max(num_bytes, min_alloc_size);
+                        ram_bytes = num_bytes;
                        if (num_bytes == min_alloc_size)
                                final_tried = true;
                        goto again;
@@ -7995,7 +8001,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
                if (btrfs_test_opt(root->fs_info, DISCARD))
                        ret = btrfs_discard_extent(root, start, len, NULL);
                btrfs_add_free_space(cache, start, len);
-                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+                btrfs_free_reserved_bytes(cache, len, delalloc);
                trace_btrfs_reserved_extent_free(root, start, len);
        }
@@ -8208,6 +8214,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
        struct btrfs_block_group_cache *block_group;
+        struct btrfs_space_info *space_info;
        /*
         * Mixed block groups will exclude before processing the log so we only
@@ -8223,9 +8230,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        if (!block_group)
                return -EINVAL;
-        ret = btrfs_update_reserved_bytes(block_group, ins->offset,
+        space_info = block_group->space_info;
-                                          RESERVE_ALLOC_NO_ACCOUNT, 0);
+        spin_lock(&space_info->lock);
-        BUG_ON(ret); /* logic error */
+        spin_lock(&block_group->lock);
+        space_info->bytes_reserved += ins->offset;
+        block_group->reserved += ins->offset;
+        spin_unlock(&block_group->lock);
+        spin_unlock(&space_info->lock);
        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
                                         0, owner, offset, ins, 1);
        btrfs_put_block_group(block_group);
@@ -8368,7 +8380,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
        if (IS_ERR(block_rsv))
                return ERR_CAST(block_rsv);
-        ret = btrfs_reserve_extent(root, blocksize, blocksize,
+        ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
                                   empty_size, hint, &ins, 0, 0);
        if (ret)
                goto out_unuse;
@@ -8521,35 +8533,6 @@ reada:
        wc->reada_slot = slot;
 }
-/*
- * These may not be seen by the usual inc/dec ref code so we have to
- * add them here.
- */
-static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
-                                     struct btrfs_root *root, u64 bytenr,
-                                     u64 num_bytes)
-{
-        struct btrfs_qgroup_extent_record *qrecord;
-        struct btrfs_delayed_ref_root *delayed_refs;
-        qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
-        if (!qrecord)
-                return -ENOMEM;
-        qrecord->bytenr = bytenr;
-        qrecord->num_bytes = num_bytes;
-        qrecord->old_roots = NULL;
-        delayed_refs = &trans->transaction->delayed_refs;
-        spin_lock(&delayed_refs->lock);
-        if (btrfs_qgroup_insert_dirty_extent(trans->fs_info,
-                                             delayed_refs, qrecord))
-                kfree(qrecord);
-        spin_unlock(&delayed_refs->lock);
-        return 0;
-}
 static int account_leaf_items(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *eb)
@@ -8583,7 +8566,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-                ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+                ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+                                bytenr, num_bytes, GFP_NOFS);
                if (ret)
                        return ret;
        }
@@ -8732,8 +8716,9 @@ walk_down:
                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-                        ret = record_one_subtree_extent(trans, root, child_bytenr,
+                        ret = btrfs_qgroup_insert_dirty_extent(trans,
-                                                        root->nodesize);
+                                        root->fs_info, child_bytenr,
+                                        root->nodesize, GFP_NOFS);
                        if (ret)
                                goto out;
                }
@@ -9906,6 +9891,7 @@ static int find_first_block_group(struct btrfs_root *root,
                        } else {
                                ret = 0;
                        }
+                        free_extent_map(em);
                        goto out;
                }
                path->slots[0]++;
@@ -9942,6 +9928,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
                block_group->iref = 0;
                block_group->inode = NULL;
                spin_unlock(&block_group->lock);
+                ASSERT(block_group->io_ctl.inode == NULL);
                iput(inode);
                last = block_group->key.objectid + block_group->key.offset;
                btrfs_put_block_group(block_group);
@@ -9999,6 +9986,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                        free_excluded_extents(info->extent_root, block_group);
                btrfs_remove_free_space_cache(block_group);
+                ASSERT(list_empty(&block_group->dirty_list));
+                ASSERT(list_empty(&block_group->io_list));
+                ASSERT(list_empty(&block_group->bg_list));
+                ASSERT(atomic_read(&block_group->count) == 1);
                btrfs_put_block_group(block_group);
                spin_lock(&info->block_group_cache_lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bc2729a7612d..28cd88fccc7e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,6 +20,7 @@
 #define EXTENT_DAMAGED          (1U << 14)
 #define EXTENT_NORESERVE        (1U << 15)
 #define EXTENT_QGROUP_RESERVED  (1U << 16)
+#define EXTENT_CLEAR_DATA_RESV  (1U << 17)
 #define EXTENT_IOBITS           (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS          (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9404121fd5f7..fea31a4a6e36 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2033,6 +2033,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 */
                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                          &BTRFS_I(inode)->runtime_flags);
+                /*
+                 * An ordered extent might have started before and completed
+                 * already with io errors, in which case the inode was not
+                 * updated and we end up here. So check the inode's mapping
+                 * flags for any errors that might have happened while doing
+                 * writeback of file data.
+                 */
+                ret = btrfs_inode_check_errors(inode);
                inode_unlock(inode);
                goto out;
        }
@@ -2062,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        }
        trans->sync = true;
-        btrfs_init_log_ctx(&ctx);
+        btrfs_init_log_ctx(&ctx, inode);
        ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
        if (ret < 0) {
@@ -2667,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode,
        alloc_start = round_down(offset, blocksize);
        alloc_end = round_up(offset + len, blocksize);
+        cur_offset = alloc_start;
        /* Make sure we aren't being give some crap mode */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2759,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode,
        /* First, check if we exceed the qgroup limit */
        INIT_LIST_HEAD(&reserve_list);
-        cur_offset = alloc_start;
        while (1) {
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
@@ -2786,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode,
                                        last_byte - cur_offset);
                        if (ret < 0)
                                break;
+                } else {
+                        /*
+                         * Do not need to reserve unwritten extent for this
+                         * range, free reserved data space first, otherwise
+                         * it'll result in false ENOSPC error.
+                         */
+                        btrfs_free_reserved_data_space(inode, cur_offset,
+                                last_byte - cur_offset);
                }
                free_extent_map(em);
                cur_offset = last_byte;
@@ -2803,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode,
                                        range->start,
                                        range->len, 1 << inode->i_blkbits,
                                        offset + len, &alloc_hint);
+                else
+                        btrfs_free_reserved_data_space(inode, range->start,
+                                                       range->len);
                list_del(&range->list);
                kfree(range);
        }
@@ -2837,18 +2856,11 @@ out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                             &cached_state, GFP_KERNEL);
 out:
-        /*
-         * As we waited the extent range, the data_rsv_map must be empty
-         * in the range, as written data range will be released from it.
-         * And for prealloacted extent, it will also be released when
-         * its metadata is written.
-         * So this is completely used as cleanup.
-         */
-        btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
        inode_unlock(inode);
        /* Let go of our reservation. */
-        btrfs_free_reserved_data_space(inode, alloc_start,
+        if (ret != 0)
-                                       alloc_end - alloc_start);
+                btrfs_free_reserved_data_space(inode, alloc_start,
+                                       alloc_end - cur_offset);
        return ret;
 }
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index aa6fabaee72e..359ee861b5a4 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -495,10 +495,9 @@ again:
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
                                              prealloc, prealloc, &alloc_hint);
        if (ret) {
-                btrfs_delalloc_release_space(inode, 0, prealloc);
+                btrfs_delalloc_release_metadata(inode, prealloc);
                goto out_put;
        }
-        btrfs_free_reserved_data_space(inode, 0, prealloc);
        ret = btrfs_write_out_ino_cache(root, trans, path, inode);
 out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2f5975954ccf..e6811c42e41e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -566,6 +566,8 @@ cont:
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
+                        btrfs_free_reserved_data_space_noquota(inode, start,
+                                                end - start + 1);
                        goto free_pages_out;
                }
        }
@@ -742,7 +744,7 @@ retry:
                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);
-                ret = btrfs_reserve_extent(root,
+                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
@@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode,
                                     EXTENT_DEFRAG, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
+                        btrfs_free_reserved_data_space_noquota(inode, start,
+                                                end - start + 1);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
@@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode,
                unsigned long op;
                cur_alloc_size = disk_num_bytes;
-                ret = btrfs_reserve_extent(root, cur_alloc_size,
+                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
@@ -1489,8 +1492,10 @@ out_check:
                extent_clear_unlock_delalloc(inode, cur_offset,
                                             cur_offset + num_bytes - 1,
                                             locked_page, EXTENT_LOCKED |
-                                             EXTENT_DELALLOC, PAGE_UNLOCK |
+                                             EXTENT_DELALLOC |
-                                             PAGE_SET_PRIVATE2);
+                                             EXTENT_CLEAR_DATA_RESV,
+                                             PAGE_UNLOCK | PAGE_SET_PRIVATE2);
                if (!nolock && nocow)
                        btrfs_end_write_no_snapshoting(root);
                cur_offset = extent_end;
@@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                        return;
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-                    && do_list && !(state->state & EXTENT_NORESERVE))
+                    && do_list && !(state->state & EXTENT_NORESERVE)
+                    && (*bits & (EXTENT_DO_ACCOUNTING |
+                    EXTENT_CLEAR_DATA_RESV)))
                        btrfs_free_reserved_data_space_noquota(inode,
                                        state->start, len);
@@ -3435,10 +3442,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                found_key.offset = 0;
                inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
                ret = PTR_ERR_OR_ZERO(inode);
-                if (ret && ret != -ESTALE)
+                if (ret && ret != -ENOENT)
                        goto out;
-                if (ret == -ESTALE && root == root->fs_info->tree_root) {
+                if (ret == -ENOENT && root == root->fs_info->tree_root) {
                        struct btrfs_root *dead_root;
                        struct btrfs_fs_info *fs_info = root->fs_info;
                        int is_dead_root = 0;
@@ -3474,7 +3481,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                 * Inode is already gone but the orphan item is still there,
                 * kill the orphan item.
                 */
-                if (ret == -ESTALE) {
+                if (ret == -ENOENT) {
                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
@@ -3633,7 +3640,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 /*
 * read an inode from the btree into the in-memory inode
 */
-static void btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode)
 {
        struct btrfs_path *path;
        struct extent_buffer *leaf;
@@ -3652,14 +3659,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
                filled = true;
        path = btrfs_alloc_path();
-        if (!path)
+        if (!path) {
+                ret = -ENOMEM;
                goto make_bad;
+        }
        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
-        if (ret)
+        if (ret) {
+                if (ret > 0)
+                        ret = -ENOENT;
                goto make_bad;
+        }
        leaf = path->nodes[0];
@@ -3812,11 +3824,12 @@ cache_acl:
        }
        btrfs_update_iflags(inode);
-        return;
+        return 0;
 make_bad:
        btrfs_free_path(path);
        make_bad_inode(inode);
+        return ret;
 }
 /*
@@ -4204,6 +4217,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        int err = 0;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
+        u64 last_unlink_trans;
        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
@@ -4226,11 +4240,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (err)
                goto out;
+        last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
        /* now the directory is empty */
        err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
                                 dentry->d_name.name, dentry->d_name.len);
-        if (!err)
+        if (!err) {
                btrfs_i_size_write(inode, 0);
+                /*
+                 * Propagate the last_unlink_trans value of the deleted dir to
+                 * its parent directory. This is to prevent an unrecoverable
+                 * log tree in the case we do something like this:
+                 * 1) create dir foo
+                 * 2) create snapshot under dir foo
+                 * 3) delete the snapshot
+                 * 4) rmdir foo
+                 * 5) mkdir foo
+                 * 6) fsync foo or some file inside foo
+                 */
+                if (last_unlink_trans >= trans->transid)
+                        BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+        }
 out:
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
@@ -5606,7 +5636,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
                return ERR_PTR(-ENOMEM);
        if (inode->i_state & I_NEW) {
-                btrfs_read_locked_inode(inode);
+                int ret;
+                ret = btrfs_read_locked_inode(inode);
                if (!is_bad_inode(inode)) {
                        inode_tree_add(inode);
                        unlock_new_inode(inode);
@@ -5615,7 +5647,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
                } else {
                        unlock_new_inode(inode);
                        iput(inode);
-                        inode = ERR_PTR(-ESTALE);
+                        ASSERT(ret < 0);
+                        inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
                }
        }
@@ -7225,7 +7258,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
        int ret;
        alloc_hint = get_extent_allocation_hint(inode, start, len);
-        ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
+        ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
                                   alloc_hint, &ins, 1, 1);
        if (ret)
                return ERR_PTR(ret);
@@ -7725,6 +7758,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                ret = PTR_ERR(em2);
                                goto unlock_err;
                        }
+                        /*
+                         * For inode marked NODATACOW or extent marked PREALLOC,
+                         * use the existing or preallocated extent, so does not
+                         * need to adjust btrfs_space_info's bytes_may_use.
+                         */
+                        btrfs_free_reserved_data_space_noquota(inode,
+                                        start, len);
                        goto unlock;
                }
        }
@@ -7759,7 +7799,6 @@ unlock:
                        i_size_write(inode, start + len);
                adjust_dio_outstanding_extents(inode, dio_data, len);
-                btrfs_free_reserved_data_space(inode, start, len);
                WARN_ON(dio_data->reserve < len);
                dio_data->reserve -= len;
                dio_data->unsubmitted_oe_range_end = start + len;
@@ -10280,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
        u64 last_alloc = (u64)-1;
        int ret = 0;
        bool own_trans = true;
+        u64 end = start + num_bytes - 1;
        if (trans)
                own_trans = false;
@@ -10301,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                 * sized chunks.
                 */
                cur_bytes = min(cur_bytes, last_alloc);
-                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
+                ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
-                                           *alloc_hint, &ins, 1, 0);
+                                min_size, 0, *alloc_hint, &ins, 1, 0);
                if (ret) {
                        if (own_trans)
                                btrfs_end_transaction(trans, root);
@@ -10388,6 +10428,9 @@ next:
                if (own_trans)
                        btrfs_end_transaction(trans, root);
        }
+        if (cur_offset < end)
+                btrfs_free_reserved_data_space(inode, cur_offset,
+                        end - cur_offset + 1);
        return ret;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14ed1e9e6bc8..7fd939bfbd99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1634,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
        int namelen;
        int ret = 0;
+        if (!S_ISDIR(file_inode(file)->i_mode))
+                return -ENOTDIR;
        ret = mnt_want_write_file(file);
        if (ret)
                goto out;
@@ -1691,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
        struct btrfs_ioctl_vol_args *vol_args;
        int ret;
+        if (!S_ISDIR(file_inode(file)->i_mode))
+                return -ENOTDIR;
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
                return PTR_ERR(vol_args);
@@ -1714,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
        bool readonly = false;
        struct btrfs_qgroup_inherit *inherit = NULL;
+        if (!S_ISDIR(file_inode(file)->i_mode))
+                return -ENOTDIR;
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
                return PTR_ERR(vol_args);
@@ -2357,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        int ret;
        int err = 0;
+        if (!S_ISDIR(dir->i_mode))
+                return -ENOTDIR;
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
                return PTR_ERR(vol_args);
@@ -5084,7 +5096,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        return btrfs_qgroup_wait_for_completion(root->fs_info);
+        return btrfs_qgroup_wait_for_completion(root->fs_info, true);
 }
 static long _btrfs_ioctl_set_received_subvol(struct file *file,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 93ee1c18ef9d..8db2e29fdcf4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
                goto out;
        fs_info->quota_enabled = 0;
        fs_info->pending_quota_state = 0;
-        btrfs_qgroup_wait_for_completion(fs_info);
+        btrfs_qgroup_wait_for_completion(fs_info, false);
        spin_lock(&fs_info->qgroup_lock);
        quota_root = fs_info->quota_root;
        fs_info->quota_root = NULL;
@@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
        return ret;
 }
-struct btrfs_qgroup_extent_record *
+int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
-btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+                                struct btrfs_delayed_ref_root *delayed_refs,
-                                 struct btrfs_delayed_ref_root *delayed_refs,
+                                struct btrfs_qgroup_extent_record *record)
-                                 struct btrfs_qgroup_extent_record *record)
 {
        struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
        struct rb_node *parent_node = NULL;
@@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
                else if (bytenr > entry->bytenr)
                        p = &(*p)->rb_right;
                else
-                        return entry;
+                        return 1;
        }
        rb_link_node(&record->node, parent_node, p);
        rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
-        return NULL;
+        return 0;
+}
+int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+                struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
+                gfp_t gfp_flag)
+{
+        struct btrfs_qgroup_extent_record *record;
+        struct btrfs_delayed_ref_root *delayed_refs;
+        int ret;
+        if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0)
+                return 0;
+        if (WARN_ON(trans == NULL))
+                return -EINVAL;
+        record = kmalloc(sizeof(*record), gfp_flag);
+        if (!record)
+                return -ENOMEM;
+        delayed_refs = &trans->transaction->delayed_refs;
+        record->bytenr = bytenr;
+        record->num_bytes = num_bytes;
+        record->old_roots = NULL;
+        spin_lock(&delayed_refs->lock);
+        ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs,
+                                                      record);
+        spin_unlock(&delayed_refs->lock);
+        if (ret > 0)
+                kfree(record);
+        return 0;
 }
 #define UPDATE_NEW      0
@@ -2303,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
        int err = -ENOMEM;
        int ret = 0;
+        mutex_lock(&fs_info->qgroup_rescan_lock);
+        fs_info->qgroup_rescan_running = true;
+        mutex_unlock(&fs_info->qgroup_rescan_lock);
        path = btrfs_alloc_path();
        if (!path)
                goto out;
@@ -2369,6 +2402,9 @@ out:
        }
 done:
+        mutex_lock(&fs_info->qgroup_rescan_lock);
+        fs_info->qgroup_rescan_running = false;
+        mutex_unlock(&fs_info->qgroup_rescan_lock);
        complete_all(&fs_info->qgroup_rescan_completion);
 }
@@ -2487,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
        return 0;
 }
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+                                     bool interruptible)
 {
        int running;
        int ret = 0;
        mutex_lock(&fs_info->qgroup_rescan_lock);
        spin_lock(&fs_info->qgroup_lock);
-        running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+        running = fs_info->qgroup_rescan_running;
        spin_unlock(&fs_info->qgroup_lock);
        mutex_unlock(&fs_info->qgroup_rescan_lock);
-        if (running)
+        if (!running)
+                return 0;
+        if (interruptible)
                ret = wait_for_completion_interruptible(
                                        &fs_info->qgroup_rescan_completion);
+        else
+                wait_for_completion(&fs_info->qgroup_rescan_completion);
        return ret;
 }
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 710887c06aaf..1bc64c864b62 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
                        struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
 void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+                                     bool interruptible);
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
@@ -63,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
 struct btrfs_delayed_extent_op;
 int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info);
-struct btrfs_qgroup_extent_record *
+/*
-btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+ * Insert one dirty extent record into @delayed_refs, informing qgroup to
-                                 struct btrfs_delayed_ref_root *delayed_refs,
+ * account that extent at commit trans time.
-                                 struct btrfs_qgroup_extent_record *record);
+ *
+ * No lock version, caller must acquire delayed ref lock and allocate memory.
+ *
+ * Return 0 for successful insert
+ * Return >0 for existing record, caller can free @record safely.
+ * Error is not possible
+ */
+int btrfs_qgroup_insert_dirty_extent_nolock(
+                struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_root *delayed_refs,
+                struct btrfs_qgroup_extent_record *record);
+/*
+ * Insert one dirty extent record into @delayed_refs, informing qgroup to
+ * account that extent at commit trans time.
+ *
+ * Better encapsulated version.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for error, like memory allocation failure or invalid parameter
+ * (NULL trans)
+ */
+int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+                struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
+                gfp_t gfp_flag);
 int
 btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
                            struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b26a5aea41b4..c0c13dc6fe12 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -31,6 +31,7 @@
 #include "async-thread.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "qgroup.h"
 /*
 * backref_node, mapping_node and tree_block start with this
@@ -3037,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode,
        u64 num_bytes;
        int nr = 0;
        int ret = 0;
+        u64 prealloc_start = cluster->start - offset;
+        u64 prealloc_end = cluster->end - offset;
+        u64 cur_offset;
        BUG_ON(cluster->start != cluster->boundary[0]);
        inode_lock(inode);
-        ret = btrfs_check_data_free_space(inode, cluster->start,
+        ret = btrfs_check_data_free_space(inode, prealloc_start,
-                                          cluster->end + 1 - cluster->start);
+                                          prealloc_end + 1 - prealloc_start);
        if (ret)
                goto out;
+        cur_offset = prealloc_start;
        while (nr < cluster->nr) {
                start = cluster->boundary[nr] - offset;
                if (nr + 1 < cluster->nr)
@@ -3055,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode,
                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
                num_bytes = end + 1 - start;
+                if (cur_offset < start)
+                        btrfs_free_reserved_data_space(inode, cur_offset,
+                                        start - cur_offset);
                ret = btrfs_prealloc_file_range(inode, 0, start,
                                                num_bytes, num_bytes,
                                                end + 1, &alloc_hint);
+                cur_offset = end + 1;
                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
                if (ret)
                        break;
                nr++;
        }
-        btrfs_free_reserved_data_space(inode, cluster->start,
+        if (cur_offset < prealloc_end)
-                                       cluster->end + 1 - cluster->start);
+                btrfs_free_reserved_data_space(inode, cur_offset,
+                                       prealloc_end + 1 - cur_offset);
 out:
        inode_unlock(inode);
        return ret;
@@ -3916,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc)
        return 0;
 }
+/*
+ * Qgroup fixer for data chunk relocation.
+ *
+ * The data relocation is done in the following steps:
+ * 1) Copy data extents into data reloc tree
+ * 2) Create tree reloc tree (special snapshot) for related subvolumes
+ * 3) Modify file extents in tree reloc tree
+ * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
+ *
+ * The problem is, data and tree reloc tree are not accounted to qgroup,
+ * and 4) will only inform qgroup to track tree block changes, not the file
+ * extents in the tree blocks.
+ *
+ * The good news is, related data extents are all in data reloc tree, so we
+ * only need to inform qgroup to track all file extents in data reloc tree
+ * before commit trans.
+ *
+ * @trans: transaction handle passed to btrfs_qgroup_insert_dirty_extent()
+ * @rc:    relocation control; supplies fs_info, the data reloc inode, the
+ *         current stage and the extents_found field
+ *
+ * Return 0 when there is nothing to fix or every regular file extent item of
+ * the data reloc inode was recorded.
+ * Return <0 on error (path allocation, tree search/iteration or
+ * btrfs_qgroup_insert_dirty_extent() failure).
+ */
+static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
+                                             struct reloc_control *rc)
+{
+        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+        struct inode *inode = rc->data_inode;
+        struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        int ret = 0;
+        if (!fs_info->quota_enabled)
+                return 0;
+        /*
+         * Only for stage where we update data pointers the qgroup fix is
+         * valid.
+         * For MOVING_DATA stage, we will miss the timing of swapping tree
+         * blocks, and won't fix it.
+         */
+        if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
+                return 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        /* Start at the lowest possible file extent key of the inode. */
+        key.objectid = btrfs_ino(inode);
+        key.type = BTRFS_EXTENT_DATA_KEY;
+        key.offset = 0;
+        ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
+        if (ret < 0)
+                goto out;
+        /*
+         * A non-exact match may leave the slot one past the last item of the
+         * leaf; move to the next leaf before reading a key from the slot.
+         */
+        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+                ret = btrfs_next_leaf(data_reloc_root, path);
+                if (ret < 0)
+                        goto out;
+                if (ret > 0) {
+                        /* No further leaf, so there is nothing to record. */
+                        ret = 0;
+                        goto out;
+                }
+        }
+        /* Do not leak the positive "key not found" search result. */
+        ret = 0;
+        lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
+        while (1) {
+                struct btrfs_file_extent_item *fi;
+                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+                /* Walked past the items of the data reloc inode. */
+                if (key.objectid > btrfs_ino(inode))
+                        break;
+                if (key.type != BTRFS_EXTENT_DATA_KEY)
+                        goto next;
+                fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                    struct btrfs_file_extent_item);
+                /* Only regular file extents are recorded. */
+                if (btrfs_file_extent_type(path->nodes[0], fi) !=
+                                BTRFS_FILE_EXTENT_REG)
+                        goto next;
+                ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
+                        btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
+                        btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
+                        GFP_NOFS);
+                if (ret < 0)
+                        break;
+next:
+                ret = btrfs_next_item(data_reloc_root, path);
+                if (ret < 0)
+                        break;
+                if (ret > 0) {
+                        /* End of the tree is not an error. */
+                        ret = 0;
+                        break;
+                }
+        }
+        unlock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
        struct rb_root blocks = RB_ROOT;
@@ -4102,10 +4196,18 @@ restart:
        /* get rid of pinned extents */
        trans = btrfs_join_transaction(rc->extent_root);
-        if (IS_ERR(trans))
+        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
-        else
+                goto out_free;
-                btrfs_commit_transaction(trans, rc->extent_root);
+        }
+        ret = qgroup_fix_relocated_data_extents(trans, rc);
+        if (ret < 0) {
+                btrfs_abort_transaction(trans, ret);
+                if (!err)
+                        err = ret;
+                goto out_free;
+        }
+        btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
        btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
        btrfs_free_path(path);
@@ -4468,10 +4570,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        unset_reloc_control(rc);
        trans = btrfs_join_transaction(rc->extent_root);
-        if (IS_ERR(trans))
+        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
-        else
+                goto out_free;
-                err = btrfs_commit_transaction(trans, rc->extent_root);
+        }
+        err = qgroup_fix_relocated_data_extents(trans, rc);
+        if (err < 0) {
+                btrfs_abort_transaction(trans, err);
+                goto out_free;
+        }
+        err = btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
        kfree(rc);
 out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7fd7e1830cfe..091296062456 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                root_key.objectid = key.offset;
                key.offset++;
+                /*
+                 * The root might have been inserted already, as before we look
+                 * for orphan roots, log replay might have happened, which
+                 * triggers a transaction commit and qgroup accounting, which
+                 * in turn reads and inserts fs roots while doing backref
+                 * walking.
+                 */
+                root = btrfs_lookup_fs_root(tree_root->fs_info,
+                                            root_key.objectid);
+                if (root) {
+                        WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+                                          &root->state));
+                        if (btrfs_root_refs(&root->root_item) == 0)
+                                btrfs_add_dead_root(root);
+                        continue;
+                }
                root = btrfs_read_fs_root(tree_root, &root_key);
                err = PTR_ERR_OR_ZERO(root);
                if (err && err != -ENOENT) {
@@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
                err = btrfs_insert_fs_root(root->fs_info, root);
-                /*
-                 * The root might have been inserted already, as before we look
-                 * for orphan roots, log replay might have happened, which
-                 * triggers a transaction commit and qgroup accounting, which
-                 * in turn reads and inserts fs roots while doing backref
-                 * walking.
-                 */
-                if (err == -EEXIST)
-                        err = 0;
                if (err) {
+                        BUG_ON(err == -EEXIST);
                        btrfs_free_fs_root(root);
                        break;
                }
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b71dd298385c..a87675ffd02b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -231,7 +231,6 @@ struct pending_dir_move {
        u64 parent_ino;
        u64 ino;
        u64 gen;
-        bool is_orphan;
        struct list_head update_refs;
 };
@@ -274,6 +273,39 @@ struct name_cache_entry {
        char name[];
 };
+/*
+ * Log an error describing an inconsistent snapshot found during send: a
+ * changed item (@what) that belongs to an inode other than the one named by
+ * sctx->cmp_key.
+ *
+ * @sctx:   send context; supplies the send root, the optional parent root and
+ *          the key being compared
+ * @result: tree comparison result, translated to a word for the message
+ * @what:   name of the item kind, inserted verbatim into the message
+ */
+static void inconsistent_snapshot_error(struct send_ctx *sctx,
+                                        enum btrfs_compare_tree_result result,
+                                        const char *what)
+{
+        const char *change = "unexpected";
+        u64 parent_objectid = 0;
+        if (result == BTRFS_COMPARE_TREE_NEW) {
+                change = "new";
+        } else if (result == BTRFS_COMPARE_TREE_DELETED) {
+                change = "deleted";
+        } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+                change = "updated";
+        } else {
+                /* Any other result, including SAME, trips the assertion. */
+                ASSERT(0);
+                if (result == BTRFS_COMPARE_TREE_SAME)
+                        change = "unchanged";
+        }
+        /* A send without a parent root reports 0 as the parent root id. */
+        if (sctx->parent_root)
+                parent_objectid = sctx->parent_root->root_key.objectid;
+        btrfs_err(sctx->send_root->fs_info,
+                  "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
+                  change, what, sctx->cmp_key->objectid,
+                  sctx->send_root->root_key.objectid,
+                  parent_objectid);
+}
 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
 static struct waiting_dir_move *
@@ -1861,7 +1893,8 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
         * was already unlinked/moved, so we can safely assume that we will not
         * overwrite anything at this point in time.
         */
-        if (other_inode > sctx->send_progress) {
+        if (other_inode > sctx->send_progress ||
+            is_waiting_for_move(sctx, other_inode)) {
                ret = get_inode_info(sctx->parent_root, other_inode, NULL,
                                who_gen, NULL, NULL, NULL, NULL);
                if (ret < 0)
@@ -2502,6 +2535,8 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
        ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+        if (ret > 0)
+                ret = -ENOENT;
        if (ret < 0)
                goto out;
@@ -2947,6 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
                }
                if (loc.objectid > send_progress) {
+                        struct orphan_dir_info *odi;
+                        odi = get_orphan_dir_info(sctx, dir);
+                        free_orphan_dir_info(sctx, odi);
                        ret = 0;
                        goto out;
                }
@@ -3047,7 +3086,6 @@ static int add_pending_dir_move(struct send_ctx *sctx,
        pm->parent_ino = parent_ino;
        pm->ino = ino;
        pm->gen = ino_gen;
-        pm->is_orphan = is_orphan;
        INIT_LIST_HEAD(&pm->list);
        INIT_LIST_HEAD(&pm->update_refs);
        RB_CLEAR_NODE(&pm->node);
@@ -3113,6 +3151,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
        return NULL;
 }
+/*
+ * Walk from @ino up through its parents until BTRFS_FIRST_FREE_OBJECTID is
+ * reached, looking for a parent that is the starting inode itself.
+ *
+ * For an inode that is waiting for a move the parent is looked up in the
+ * parent root with get_first_ref(); otherwise
+ * __get_cur_name_and_parent() is used. The walk stops early at an inode that
+ * is waiting for rmdir.
+ *
+ * @name:         scratch path, reset on every step
+ * @ancestor_ino: set to 0 on entry, then to the first inode on the walk that
+ *                is waiting for a move or, if there was none, to the inode
+ *                whose parent turned out to be the starting inode
+ *
+ * Return 1 if a parent equal to the starting inode was found, <0 if a lookup
+ * failed, otherwise the result of the last lookup.
+ * NOTE(review): that is presumably always 0 - confirm get_first_ref() never
+ * returns a positive value.
+ */
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+                     u64 ino, u64 gen, u64 *ancestor_ino)
+{
+        const u64 origin = ino;
+        u64 up_ino = 0;
+        u64 up_gen = 0;
+        int ret = 0;
+        *ancestor_ino = 0;
+        for (; ino != BTRFS_FIRST_FREE_OBJECTID; ino = up_ino, gen = up_gen) {
+                fs_path_reset(name);
+                if (is_waiting_for_rm(sctx, ino))
+                        return ret;
+                if (!is_waiting_for_move(sctx, ino)) {
+                        ret = __get_cur_name_and_parent(sctx, ino, gen,
+                                                        &up_ino, &up_gen,
+                                                        name);
+                        /* A positive result ends the walk without a loop. */
+                        if (ret > 0)
+                                return 0;
+                } else {
+                        if (*ancestor_ino == 0)
+                                *ancestor_ino = ino;
+                        ret = get_first_ref(sctx->parent_root, ino,
+                                            &up_ino, &up_gen, name);
+                }
+                if (ret < 0)
+                        return ret;
+                if (up_ino == origin) {
+                        /* The chain of parents leads back to the start. */
+                        if (*ancestor_ino == 0)
+                                *ancestor_ino = ino;
+                        return 1;
+                }
+        }
+        return ret;
+}
 static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 {
        struct fs_path *from_path = NULL;
@@ -3123,6 +3203,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
        u64 parent_ino, parent_gen;
        struct waiting_dir_move *dm = NULL;
        u64 rmdir_ino = 0;
+        u64 ancestor;
+        bool is_orphan;
        int ret;
        name = fs_path_alloc();
@@ -3135,9 +3217,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
        dm = get_waiting_dir_move(sctx, pm->ino);
        ASSERT(dm);
        rmdir_ino = dm->rmdir_ino;
+        is_orphan = dm->orphanized;
        free_waiting_dir_move(sctx, dm);
-        if (pm->is_orphan) {
+        if (is_orphan) {
                ret = gen_unique_name(sctx, pm->ino,
                                      pm->gen, from_path);
        } else {
@@ -3155,6 +3238,24 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
                goto out;
        sctx->send_progress = sctx->cur_ino + 1;
+        ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+        if (ret < 0)
+                goto out;
+        if (ret) {
+                LIST_HEAD(deleted_refs);
+                ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+                ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+                                           &pm->update_refs, &deleted_refs,
+                                           is_orphan);
+                if (ret < 0)
+                        goto out;
+                if (rmdir_ino) {
+                        dm = get_waiting_dir_move(sctx, pm->ino);
+                        ASSERT(dm);
+                        dm->rmdir_ino = rmdir_ino;
+                }
+                goto out;
+        }
        fs_path_reset(name);
        to_path = name;
        name = NULL;
@@ -3174,7 +3275,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
                        /* already deleted */
                        goto finish;
                }
-                ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+                ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
                if (ret < 0)
                        goto out;
                if (!ret)
@@ -3204,8 +3305,18 @@ finish:
         * and old parent(s).
         */
        list_for_each_entry(cur, &pm->update_refs, list) {
-                if (cur->dir == rmdir_ino)
+                /*
+                 * The parent inode might have been deleted in the send snapshot
+                 */
+                ret = get_inode_info(sctx->send_root, cur->dir, NULL,
+                                     NULL, NULL, NULL, NULL, NULL);
+                if (ret == -ENOENT) {
+                        ret = 0;
                        continue;
+                }
+                if (ret < 0)
+                        goto out;
                ret = send_utimes(sctx, cur->dir, cur->dir_gen);
                if (ret < 0)
                        goto out;
@@ -3325,6 +3436,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
        u64 left_gen;
        u64 right_gen;
        int ret = 0;
+        struct waiting_dir_move *wdm;
        if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
                return 0;
@@ -3383,7 +3495,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
                goto out;
        }
-        if (is_waiting_for_move(sctx, di_key.objectid)) {
+        wdm = get_waiting_dir_move(sctx, di_key.objectid);
+        if (wdm && !wdm->orphanized) {
                ret = add_pending_dir_move(sctx,
                                           sctx->cur_ino,
                                           sctx->cur_inode_gen,
@@ -3470,7 +3583,8 @@ static int wait_for_parent_move(struct send_ctx *sctx,
                        ret = is_ancestor(sctx->parent_root,
                                          sctx->cur_ino, sctx->cur_inode_gen,
                                          ino, path_before);
-                        break;
+                        if (ret)
+                                break;
                }
                fs_path_reset(path_before);
@@ -3643,11 +3757,26 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                                goto out;
                        if (ret) {
                                struct name_cache_entry *nce;
+                                struct waiting_dir_move *wdm;
                                ret = orphanize_inode(sctx, ow_inode, ow_gen,
                                                cur->full_path);
                                if (ret < 0)
                                        goto out;
+                                /*
+                                 * If ow_inode has its rename operation delayed
+                                 * make sure that its orphanized name is used in
+                                 * the source path when performing its rename
+                                 * operation.
+                                 */
+                                if (is_waiting_for_move(sctx, ow_inode)) {
+                                        wdm = get_waiting_dir_move(sctx,
+                                                                   ow_inode);
+                                        ASSERT(wdm);
+                                        wdm->orphanized = true;
+                                }
                                /*
                                 * Make sure we clear our orphanized inode's
                                 * name from the name cache. This is because the
@@ -3663,6 +3792,19 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                                        name_cache_delete(sctx, nce);
                                        kfree(nce);
                                }
+                                /*
+                                 * ow_inode might currently be an ancestor of
+                                 * cur_ino, therefore compute valid_path (the
+                                 * current path of cur_ino) again because it
+                                 * might contain the pre-orphanization name of
+                                 * ow_inode, which is no longer valid.
+                                 */
+                                fs_path_reset(valid_path);
+                                ret = get_cur_path(sctx, sctx->cur_ino,
+                                           sctx->cur_inode_gen, valid_path);
+                                if (ret < 0)
+                                        goto out;
                        } else {
                                ret = send_unlink(sctx, cur->full_path);
                                if (ret < 0)
@@ -4126,10 +4268,12 @@ static int process_all_refs(struct send_ctx *sctx,
        }
        btrfs_release_path(path);
+        /*
+         * We don't actually care about pending_move as we are simply
+         * re-creating this inode and will be rename'ing it into place once we
+         * rename the parent directory.
+         */
        ret = process_recorded_refs(sctx, &pending_move);
-        /* Only applicable to an incremental send. */
-        ASSERT(pending_move == 0);
 out:
        btrfs_free_path(path);
        return ret;
@@ -5602,7 +5746,10 @@ static int changed_ref(struct send_ctx *sctx,
 {
        int ret = 0;
-        BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+        if (sctx->cur_ino != sctx->cmp_key->objectid) {
+                inconsistent_snapshot_error(sctx, result, "reference");
+                return -EIO;
+        }
        if (!sctx->cur_inode_new_gen &&
            sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
@@ -5627,7 +5774,10 @@ static int changed_xattr(struct send_ctx *sctx,
 {
        int ret = 0;
-        BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+        if (sctx->cur_ino != sctx->cmp_key->objectid) {
+                inconsistent_snapshot_error(sctx, result, "xattr");
+                return -EIO;
+        }
        if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
                if (result == BTRFS_COMPARE_TREE_NEW)
@@ -5651,7 +5801,10 @@ static int changed_extent(struct send_ctx *sctx,
 {
        int ret = 0;
-        BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+        if (sctx->cur_ino != sctx->cmp_key->objectid) {
+                inconsistent_snapshot_error(sctx, result, "extent");
+                return -EIO;
+        }
        if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
                if (result != BTRFS_COMPARE_TREE_DELETED)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 864ce334f696..4071fe2bd098 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2241,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb)
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+        root->fs_info->fs_frozen = 1;
+        /*
+         * We don't need a barrier here, we'll wait for any transaction that
+         * could be in progress on other threads (and do delayed iputs that
+         * we want to avoid on a frozen filesystem), or do the commit
+         * ourselves.
+         */
        trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans)) {
                /* no transaction, don't bother */
@@ -2251,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb)
        return btrfs_commit_transaction(trans, root);
 }
+/*
+ * Clear the fs_frozen flag that btrfs_freeze() sets. Always returns 0.
+ */
+static int btrfs_unfreeze(struct super_block *sb)
+{
+        struct btrfs_fs_info *info = btrfs_sb(sb)->tree_root->fs_info;
+        info->fs_frozen = 0;
+        return 0;
+}
 static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2299,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = {
        .statfs         = btrfs_statfs,
        .remount_fs     = btrfs_remount,
        .freeze_fs      = btrfs_freeze,
+        .unfreeze_fs    = btrfs_unfreeze,
 };
 static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9cca0a721961..95d41919d034 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2278,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
+        /*
+         * If the fs has been frozen we cannot run delayed iputs here,
+         * otherwise we would deadlock on SB_FREEZE_FS.
+         */
        if (current != root->fs_info->transaction_kthread &&
-            current != root->fs_info->cleaner_kthread)
+            current != root->fs_info->cleaner_kthread &&
+            !root->fs_info->fs_frozen)
                btrfs_run_delayed_iputs(root);
        return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d31a0c4f56be..ef9c55bc7907 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
 #include "backref.h"
 #include "hash.h"
 #include "compression.h"
+#include "qgroup.h"
 /* magic values for the inode_only field in btrfs_log_inode:
 *
@@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                ins.type = BTRFS_EXTENT_ITEM_KEY;
                offset = key->offset - btrfs_file_extent_offset(eb, item);
+                /*
+                 * Manually record the dirty extent: here we do a shallow
+                 * file extent item copy and skip the normal backref update,
+                 * modifying the extent tree ourselves.
+                 * So we need to record the dirty extent for qgroup by hand,
+                 * as the owner of the file extent changes from the log tree
+                 * (doesn't affect qgroup) to the fs/file tree (affects qgroup).
+                 */
+                ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+                                btrfs_file_extent_disk_bytenr(eb, item),
+                                btrfs_file_extent_disk_num_bytes(eb, item),
+                                GFP_NOFS);
+                if (ret < 0)
+                        goto out;
                if (ins.objectid > 0) {
                        u64 csum_start;
                        u64 csum_end;
@@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         */
        mutex_unlock(&root->log_mutex);
-        btrfs_init_log_ctx(&root_log_ctx);
+        btrfs_init_log_ctx(&root_log_ctx, NULL);
        mutex_lock(&log_root_tree->log_mutex);
        atomic_inc(&log_root_tree->log_batch);
@@ -2851,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
                blk_finish_plug(&plug);
+                list_del_init(&root_log_ctx.list);
                mutex_unlock(&log_root_tree->log_mutex);
                ret = root_log_ctx.log_ret;
                goto out;
@@ -4469,7 +4486,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                         const int slot,
                                         const struct btrfs_key *key,
-                                         struct inode *inode)
+                                         struct inode *inode,
+                                         u64 *other_ino)
 {
        int ret;
        struct btrfs_path *search_path;
@@ -4528,7 +4546,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                           search_path, parent,
                                           name, this_name_len, 0);
                if (di && !IS_ERR(di)) {
-                        ret = 1;
+                        struct btrfs_key di_key;
+                        btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+                                                  di, &di_key);
+                        if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+                                ret = 1;
+                                *other_ino = di_key.objectid;
+                        } else {
+                                ret = -EAGAIN;
+                        }
                        goto out;
                } else if (IS_ERR(di)) {
                        ret = PTR_ERR(di);
@@ -4722,16 +4749,72 @@ again:
                if ((min_key.type == BTRFS_INODE_REF_KEY ||
                     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
                    BTRFS_I(inode)->generation == trans->transid) {
+                        u64 other_ino = 0;
                        ret = btrfs_check_ref_name_override(path->nodes[0],
                                                            path->slots[0],
-                                                            &min_key, inode);
+                                                            &min_key, inode,
+                                                            &other_ino);
                        if (ret < 0) {
                                err = ret;
                                goto out_unlock;
-                        } else if (ret > 0) {
+                        } else if (ret > 0 && ctx &&
-                                err = 1;
+                                   other_ino != btrfs_ino(ctx->inode)) {
-                                btrfs_set_log_full_commit(root->fs_info, trans);
+                                struct btrfs_key inode_key;
-                                goto out_unlock;
+                                struct inode *other_inode;
+                                if (ins_nr > 0) {
+                                        ins_nr++;
+                                } else {
+                                        ins_nr = 1;
+                                        ins_start_slot = path->slots[0];
+                                }
+                                ret = copy_items(trans, inode, dst_path, path,
+                                                 &last_extent, ins_start_slot,
+                                                 ins_nr, inode_only,
+                                                 logged_isize);
+                                if (ret < 0) {
+                                        err = ret;
+                                        goto out_unlock;
+                                }
+                                ins_nr = 0;
+                                btrfs_release_path(path);
+                                inode_key.objectid = other_ino;
+                                inode_key.type = BTRFS_INODE_ITEM_KEY;
+                                inode_key.offset = 0;
+                                other_inode = btrfs_iget(root->fs_info->sb,
+                                                         &inode_key, root,
+                                                         NULL);
+                                /*
+                                 * If the other inode that had a conflicting dir
+                                 * entry was deleted in the current transaction,
+                                 * we don't need to do more work nor fallback to
+                                 * a transaction commit.
+                                 */
+                                if (IS_ERR(other_inode) &&
+                                    PTR_ERR(other_inode) == -ENOENT) {
+                                        goto next_key;
+                                } else if (IS_ERR(other_inode)) {
+                                        err = PTR_ERR(other_inode);
+                                        goto out_unlock;
+                                }
+                                /*
+                                 * We are safe logging the other inode without
+                                 * acquiring its i_mutex as long as we log with
+                                 * the LOG_INODE_EXISTS mode. We're safe against
+                                 * concurrent renames of the other inode as well
+                                 * because during a rename we pin the log and
+                                 * update the log with the new name before we
+                                 * unpin it.
+                                 */
+                                err = btrfs_log_inode(trans, root, other_inode,
+                                                      LOG_INODE_EXISTS,
+                                                      0, LLONG_MAX, ctx);
+                                iput(other_inode);
+                                if (err)
+                                        goto out_unlock;
+                                else
+                                        goto next_key;
                        }
                }
@@ -4799,7 +4882,7 @@ next_slot:
                        ins_nr = 0;
                }
                btrfs_release_path(path);
+next_key:
                if (min_key.offset < (u64)-1) {
                        min_key.offset++;
                } else if (min_key.type < max_key.type) {
@@ -4993,8 +5076,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
                if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                        break;
-                if (IS_ROOT(parent))
+                if (IS_ROOT(parent)) {
+                        inode = d_inode(parent);
+                        if (btrfs_must_commit_transaction(trans, inode))
+                                ret = 1;
                        break;
+                }
                parent = dget_parent(parent);
                dput(old_parent);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index a9f1b75d080d..ab858e31ccbc 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -30,15 +30,18 @@ struct btrfs_log_ctx {
        int log_transid;
        int io_err;
        bool log_new_dentries;
+        struct inode *inode;
        struct list_head list;
 };
-static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
+                                      struct inode *inode)
 {
        ctx->log_ret = 0;
        ctx->log_transid = 0;
        ctx->io_err = 0;
        ctx->log_new_dentries = false;
+        ctx->inode = inode;
        INIT_LIST_HEAD(&ctx->list);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 51f125508771..035efce603a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -834,10 +834,6 @@ static void __free_device(struct work_struct *work)
        struct btrfs_device *device;
        device = container_of(work, struct btrfs_device, rcu_work);
-        if (device->bdev)
-                blkdev_put(device->bdev, device->mode);
        rcu_string_free(device->name);
        kfree(device);
 }
@@ -852,6 +848,17 @@ static void free_device(struct rcu_head *head)
        schedule_work(&device->rcu_work);
 }
+/*
+ * Release the block device attached to @device, if there is one.
+ *
+ * A writeable device is first synced and has its cached data invalidated;
+ * the block device reference is then dropped with the mode recorded in
+ * device->mode.  Nothing is done when device->bdev is NULL.
+ */
+static void btrfs_close_bdev(struct btrfs_device *device)
+{
+        /* No block device attached: nothing to flush or put. */
+        if (!device->bdev)
+                return;
+        if (device->writeable) {
+                sync_blockdev(device->bdev);
+                invalidate_bdev(device->bdev);
+        }
+        blkdev_put(device->bdev, device->mode);
+}
 static void btrfs_close_one_device(struct btrfs_device *device)
 {
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
@@ -870,10 +877,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
        if (device->missing)
                fs_devices->missing_devices--;
-        if (device->bdev && device->writeable) {
+        btrfs_close_bdev(device);
-                sync_blockdev(device->bdev);
-                invalidate_bdev(device->bdev);
-        }
        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
@@ -1932,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
                btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
        }
+        btrfs_close_bdev(device);
        call_rcu(&device->rcu, free_device);
        num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
@@ -2025,6 +2031,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
                /* zero out the old super if it is writable */
                btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
        }
+        btrfs_close_bdev(srcdev);
        call_rcu(&srcdev->rcu, free_device);
        /*
@@ -2080,6 +2089,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
         * the device_list_mutex lock.
         */
        btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+        btrfs_close_bdev(tgtdev);
        call_rcu(&tgtdev->rcu, free_device);
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 99115cae1652..16e6ded0b7f2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1347,9 +1347,12 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
 {
        struct inode *inode = &ci->vfs_inode;
        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
-        struct ceph_mds_session *session = *psession;
+        struct ceph_mds_session *session = NULL;
        int mds;
        dout("ceph_flush_snaps %p\n", inode);
+        if (psession)
+                session = *psession;
 retry:
        spin_lock(&ci->i_ceph_lock);
        if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c64a0b794d49..df4b3e6fa563 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -597,7 +597,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
        if (is_hash_order(new_pos)) {
                /* no need to reset last_name for a forward seek when
                 * dentries are sotred in hash order */
-        } else if (fi->frag |= fpos_frag(new_pos)) {
+        } else if (fi->frag != fpos_frag(new_pos)) {
                return true;
        }
        rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fa59a85226b2..f72d4ae303b2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2759,6 +2759,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
        } else {
                path = NULL;
                pathlen = 0;
+                pathbase = 0;
        }
        spin_lock(&ci->i_ceph_lock);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 6bbec5e784cd..14ae4b8e1a3c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -609,6 +609,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
        char *s, *p;
        char sep;
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+                return dget(sb->s_root);
        full_path = cifs_build_path_to_root(vol, cifs_sb,
                                            cifs_sb_master_tcon(cifs_sb));
        if (full_path == NULL)
@@ -686,26 +689,22 @@ cifs_do_mount(struct file_system_type *fs_type,
        cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
        if (cifs_sb->mountdata == NULL) {
                root = ERR_PTR(-ENOMEM);
-                goto out_cifs_sb;
+                goto out_free;
        }
-        if (volume_info->prepath) {
+        rc = cifs_setup_cifs_sb(volume_info, cifs_sb);
-                cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL);
+        if (rc) {
-                if (cifs_sb->prepath == NULL) {
+                root = ERR_PTR(rc);
-                        root = ERR_PTR(-ENOMEM);
+                goto out_free;
-                        goto out_cifs_sb;
-                }
        }
-        cifs_setup_cifs_sb(volume_info, cifs_sb);
        rc = cifs_mount(cifs_sb, volume_info);
        if (rc) {
                if (!(flags & MS_SILENT))
                        cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
                                 rc);
                root = ERR_PTR(rc);
-                goto out_mountdata;
+                goto out_free;
        }
        mnt_data.vol = volume_info;
@@ -735,11 +734,7 @@ cifs_do_mount(struct file_system_type *fs_type,
                sb->s_flags |= MS_ACTIVE;
        }
-        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+        root = cifs_get_root(volume_info, sb);
-                root = dget(sb->s_root);
-        else
-                root = cifs_get_root(volume_info, sb);
        if (IS_ERR(root))
                goto out_super;
@@ -752,9 +747,9 @@ out:
        cifs_cleanup_volume_info(volume_info);
        return root;
-out_mountdata:
+out_free:
+        kfree(cifs_sb->prepath);
        kfree(cifs_sb->mountdata);
-out_cifs_sb:
        kfree(cifs_sb);
 out_nls:
        unload_nls(volume_info->local_nls);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1243bd326591..95dab43646f0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -184,7 +184,7 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
                                 unsigned int to_read);
 extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
                                      struct page *page, unsigned int to_read);
-extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
                               struct cifs_sb_info *cifs_sb);
 extern int cifs_match_super(struct super_block *, void *);
 extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7ae03283bd61..2e4f4bad8b1e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2781,6 +2781,24 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
        return 1;
 }
+/*
+ * Decide whether the prefix path of an existing superblock agrees with the
+ * one requested by a new mount.
+ *
+ * @sb:       existing superblock being considered for reuse
+ * @mnt_data: mount data of the new mount; its cifs_sb carries the new flags
+ *            and prepath
+ *
+ * Returns 1 when the two agree, 0 otherwise.  They agree when neither side
+ * has CIFS_MOUNT_USE_PREFIX_PATH set, or when both have it set and their
+ * prepath strings are equal.  cifs_match_super() treats a 0 result as
+ * "superblocks do not match", so returning 0 for the common case where no
+ * prefix path is in use at all would prevent any superblock sharing.
+ */
+static int
+match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
+{
+        struct cifs_sb_info *old = CIFS_SB(sb);
+        struct cifs_sb_info *new = mnt_data->cifs_sb;
+        bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
+        bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
+        /* Neither mount uses a prefix path: nothing to compare. */
+        if (!old_set && !new_set)
+                return 1;
+        /* Only one of them uses a prefix path: they cannot be the same. */
+        if (old_set != new_set)
+                return 0;
+        /* Both prepaths are NUL-terminated strings; equal strings match. */
+        return !strcmp(new->prepath, old->prepath);
+}
 int
 cifs_match_super(struct super_block *sb, void *data)
 {
@@ -2808,7 +2826,8 @@ cifs_match_super(struct super_block *sb, void *data)
        if (!match_server(tcp_srv, volume_info) ||
            !match_session(ses, volume_info) ||
-            !match_tcon(tcon, volume_info->UNC)) {
+            !match_tcon(tcon, volume_info->UNC) ||
+            !match_prepath(sb, mnt_data)) {
                rc = 0;
                goto out;
        }
@@ -3222,7 +3241,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
        }
 }
-void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
                        struct cifs_sb_info *cifs_sb)
 {
        INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
@@ -3316,6 +3335,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
        if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
                cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n");
+        if (pvolume_info->prepath) {
+                cifs_sb->prepath = kstrdup(pvolume_info->prepath, GFP_KERNEL);
+                if (cifs_sb->prepath == NULL)
+                        return -ENOMEM;
+        }
+        return 0;
 }
 static void
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index c30cf49b69d2..2c6312db8516 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -333,6 +333,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
                if (bin_attr->cb_max_size &&
                        *ppos + count > bin_attr->cb_max_size) {
                        len = -EFBIG;
+                        goto out;
                }
                tbuf = vmalloc(*ppos + count);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 0f9961eede1e..ed115acb5dee 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -11,6 +11,7 @@
 #include <linux/random.h>
 #include <linux/string.h>
 #include <linux/fscrypto.h>
+#include <linux/mount.h>
 static int inode_has_encryption_context(struct inode *inode)
 {
@@ -92,26 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode,
        return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
 }
-int fscrypt_process_policy(struct inode *inode,
+int fscrypt_process_policy(struct file *filp,
                                const struct fscrypt_policy *policy)
 {
+        struct inode *inode = file_inode(filp);
+        int ret;
+        if (!inode_owner_or_capable(inode))
+                return -EACCES;
        if (policy->version != 0)
                return -EINVAL;
+        ret = mnt_want_write_file(filp);
+        if (ret)
+                return ret;
        if (!inode_has_encryption_context(inode)) {
-                if (!inode->i_sb->s_cop->empty_dir)
+                if (!S_ISDIR(inode->i_mode))
-                        return -EOPNOTSUPP;
+                        ret = -EINVAL;
-                if (!inode->i_sb->s_cop->empty_dir(inode))
+                else if (!inode->i_sb->s_cop->empty_dir)
-                        return -ENOTEMPTY;
+                        ret = -EOPNOTSUPP;
-                return create_encryption_context_from_policy(inode, policy);
+                else if (!inode->i_sb->s_cop->empty_dir(inode))
+                        ret = -ENOTEMPTY;
+                else
+                        ret = create_encryption_context_from_policy(inode,
+                                                                    policy);
+        } else if (!is_encryption_context_consistent_with_policy(inode,
+                                                                 policy)) {
+                printk(KERN_WARNING
+                       "%s: Policy inconsistent with encryption context\n",
+                       __func__);
+                ret = -EINVAL;
        }
-        if (is_encryption_context_consistent_with_policy(inode, policy))
+        mnt_drop_write_file(filp);
-                return 0;
+        return ret;
-        printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
-               __func__);
-        return -EINVAL;
 }
 EXPORT_SYMBOL(fscrypt_process_policy);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d116453b0276..79a5941c2474 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -585,7 +585,8 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
 */
 void *devpts_get_priv(struct dentry *dentry)
 {
-        WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC);
+        if (dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC)
+                return NULL;
        return dentry->d_fsdata;
 }
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index eea64912c9c0..466f7d60edc2 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -607,20 +607,54 @@ static const struct file_operations format2_fops;
 static const struct file_operations format3_fops;
 static const struct file_operations format4_fops;
-static int table_open(struct inode *inode, struct file *file)
+static int table_open1(struct inode *inode, struct file *file)
 {
        struct seq_file *seq;
-        int ret = -1;
+        int ret;
-        if (file->f_op == &format1_fops)
+        ret = seq_open(file, &format1_seq_ops);
-                ret = seq_open(file, &format1_seq_ops);
+        if (ret)
-        else if (file->f_op == &format2_fops)
+                return ret;
-                ret = seq_open(file, &format2_seq_ops);
-        else if (file->f_op == &format3_fops)
+        seq = file->private_data;
-                ret = seq_open(file, &format3_seq_ops);
+        seq->private = inode->i_private; /* the dlm_ls */
-        else if (file->f_op == &format4_fops)
+        return 0;
-                ret = seq_open(file, &format4_seq_ops);
+}
+/*
+ * open() handler for the format 2 debugfs table: start a seq_file session
+ * using format2_seq_ops and hand it the inode's private pointer.
+ * Returns 0 on success or the error reported by seq_open().
+ */
+static int table_open2(struct inode *inode, struct file *file)
+{
+        int err = seq_open(file, &format2_seq_ops);
+        if (err == 0) {
+                /* the seq_file is read back from file->private_data */
+                struct seq_file *sf = file->private_data;
+                sf->private = inode->i_private; /* the dlm_ls */
+        }
+        return err;
+}
+/*
+ * open() handler for the format 3 debugfs table: start a seq_file session
+ * using format3_seq_ops and hand it the inode's private pointer.
+ * Returns 0 on success or the error reported by seq_open().
+ */
+static int table_open3(struct inode *inode, struct file *file)
+{
+        int err = seq_open(file, &format3_seq_ops);
+        if (err == 0) {
+                /* the seq_file is read back from file->private_data */
+                struct seq_file *sf = file->private_data;
+                sf->private = inode->i_private; /* the dlm_ls */
+        }
+        return err;
+}
+static int table_open4(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq;
+        int ret;
+        ret = seq_open(file, &format4_seq_ops);
        if (ret)
                return ret;
@@ -631,7 +665,7 @@ static int table_open(struct inode *inode, struct file *file)
 static const struct file_operations format1_fops = {
        .owner   = THIS_MODULE,
-        .open    = table_open,
+        .open    = table_open1,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release
@@ -639,7 +673,7 @@ static const struct file_operations format1_fops = {
 static const struct file_operations format2_fops = {
        .owner   = THIS_MODULE,
-        .open    = table_open,
+        .open    = table_open2,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release
@@ -647,7 +681,7 @@ static const struct file_operations format2_fops = {
 static const struct file_operations format3_fops = {
        .owner   = THIS_MODULE,
-        .open    = table_open,
+        .open    = table_open3,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release
@@ -655,7 +689,7 @@ static const struct file_operations format3_fops = {
 static const struct file_operations format4_fops = {
        .owner   = THIS_MODULE,
-        .open    = table_open,
+        .open    = table_open4,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3131747199e1..c6ea25a190f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5466,8 +5466,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
                                                      sbi->s_want_extra_isize,
                                                      iloc, handle);
                        if (ret) {
-                                ext4_set_inode_state(inode,
-                                                     EXT4_STATE_NO_EXPAND);
                                if (mnt_count !=
                                        le16_to_cpu(sbi->s_es->s_mnt_count)) {
                                        ext4_warning(inode->i_sb,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 10686fd67fb4..1bb7df5e4536 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -776,7 +776,7 @@ resizefs_out:
                                   (struct fscrypt_policy __user *)arg,
                                   sizeof(policy)))
                        return -EFAULT;
-                return fscrypt_process_policy(inode, &policy);
+                return fscrypt_process_policy(filp, &policy);
 #else
                return -EOPNOTSUPP;
 #endif
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1c593aa0218e..3ec8708989ca 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2211,6 +2211,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
 /* Called at mount-time, super-block is locked */
 static int ext4_check_descriptors(struct super_block *sb,
+                                  ext4_fsblk_t sb_block,
                                  ext4_group_t *first_not_zeroed)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2241,6 +2242,11 @@ static int ext4_check_descriptors(struct super_block *sb,
                        grp = i;
                block_bitmap = ext4_block_bitmap(sb, gdp);
+                if (block_bitmap == sb_block) {
+                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+                                 "Block bitmap for group %u overlaps "
+                                 "superblock", i);
+                }
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Block bitmap for group %u not in group "
@@ -2248,6 +2254,11 @@ static int ext4_check_descriptors(struct super_block *sb,
                        return 0;
                }
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
+                if (inode_bitmap == sb_block) {
+                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+                                 "Inode bitmap for group %u overlaps "
+                                 "superblock", i);
+                }
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode bitmap for group %u not in group "
@@ -2255,6 +2266,11 @@ static int ext4_check_descriptors(struct super_block *sb,
                        return 0;
                }
                inode_table = ext4_inode_table(sb, gdp);
+                if (inode_table == sb_block) {
+                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+                                 "Inode table for group %u overlaps "
+                                 "superblock", i);
+                }
                if (inode_table < first_block ||
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -3757,7 +3773,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        goto failed_mount2;
                }
        }
-        if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
+        if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                ret = -EFSCORRUPTED;
                goto failed_mount2;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 39e9cfb1b371..2eb935ca5d9e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1353,15 +1353,19 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
        size_t min_offs, free;
        int total_ino;
        void *base, *start, *end;
-        int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
+        int error = 0, tried_min_extra_isize = 0;
        int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
+        int isize_diff; /* How much do we need to grow i_extra_isize */
        down_write(&EXT4_I(inode)->xattr_sem);
+        /*
+         * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty
+         */
+        ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 retry:
-        if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
+        isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize;
-                up_write(&EXT4_I(inode)->xattr_sem);
+        if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
-                return 0;
+                goto out;
-        }
        header = IHDR(inode, raw_inode);
        entry = IFIRST(header);
@@ -1382,7 +1386,7 @@ retry:
                goto cleanup;
        free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
-        if (free >= new_extra_isize) {
+        if (free >= isize_diff) {
                entry = IFIRST(header);
                ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
                                - new_extra_isize, (void *)raw_inode +
@@ -1390,8 +1394,7 @@ retry:
                                (void *)header, total_ino,
                                inode->i_sb->s_blocksize);
                EXT4_I(inode)->i_extra_isize = new_extra_isize;
-                error = 0;
+                goto out;
-                goto cleanup;
        }
        /*
@@ -1414,7 +1417,7 @@ retry:
                end = bh->b_data + bh->b_size;
                min_offs = end - base;
                free = ext4_xattr_free_space(first, &min_offs, base, NULL);
-                if (free < new_extra_isize) {
+                if (free < isize_diff) {
                        if (!tried_min_extra_isize && s_min_extra_isize) {
                                tried_min_extra_isize++;
                                new_extra_isize = s_min_extra_isize;
@@ -1428,7 +1431,7 @@ retry:
                free = inode->i_sb->s_blocksize;
        }
-        while (new_extra_isize > 0) {
+        while (isize_diff > 0) {
                size_t offs, size, entry_size;
                struct ext4_xattr_entry *small_entry = NULL;
                struct ext4_xattr_info i = {
@@ -1459,7 +1462,7 @@ retry:
                        EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
                                        EXT4_XATTR_LEN(last->e_name_len);
                        if (total_size <= free && total_size < min_total_size) {
-                                if (total_size < new_extra_isize) {
+                                if (total_size < isize_diff) {
                                        small_entry = last;
                                } else {
                                        entry = last;
@@ -1514,22 +1517,22 @@ retry:
                error = ext4_xattr_ibody_set(handle, inode, &i, is);
                if (error)
                        goto cleanup;
+                total_ino -= entry_size;
                entry = IFIRST(header);
-                if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
+                if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff)
-                        shift_bytes = new_extra_isize;
+                        shift_bytes = isize_diff;
                else
-                        shift_bytes = entry_size + size;
+                        shift_bytes = entry_size + EXT4_XATTR_SIZE(size);
                /* Adjust the offsets and shift the remaining entries ahead */
-                ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize -
+                ext4_xattr_shift_entries(entry, -shift_bytes,
-                        shift_bytes, (void *)raw_inode +
+                        (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
-                        EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
+                        EXT4_I(inode)->i_extra_isize + shift_bytes,
-                        (void *)header, total_ino - entry_size,
+                        (void *)header, total_ino, inode->i_sb->s_blocksize);
-                        inode->i_sb->s_blocksize);
-                extra_isize += shift_bytes;
+                isize_diff -= shift_bytes;
-                new_extra_isize -= shift_bytes;
+                EXT4_I(inode)->i_extra_isize += shift_bytes;
-                EXT4_I(inode)->i_extra_isize = extra_isize;
+                header = IHDR(inode, raw_inode);
                i.name = b_entry_name;
                i.value = buffer;
@@ -1551,6 +1554,8 @@ retry:
                kfree(bs);
        }
        brelse(bh);
+out:
+        ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
        up_write(&EXT4_I(inode)->xattr_sem);
        return 0;
@@ -1562,6 +1567,10 @@ cleanup:
        kfree(is);
        kfree(bs);
        brelse(bh);
+        /*
+         * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode
+         * size expansion failed.
+         */
        up_write(&EXT4_I(inode)->xattr_sem);
        return error;
 }
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 69dd3e6566e0..a92e783fa057 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -24,6 +24,7 @@
 #define EXT4_XATTR_INDEX_SYSTEM                 7
 #define EXT4_XATTR_INDEX_RICHACL                8
 #define EXT4_XATTR_INDEX_ENCRYPTION             9
+#define EXT4_XATTR_INDEX_HURD                   10 /* Reserved for Hurd */
 struct ext4_xattr_header {
        __le32  h_magic;        /* magic number for identification */
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d64d2a515cb2..ccb401eebc11 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1699,11 +1699,11 @@ static int f2fs_write_end(struct file *file,
        trace_f2fs_write_end(inode, pos, len, copied);
        set_page_dirty(page);
-        f2fs_put_page(page, 1);
        if (pos + copied > i_size_read(inode))
                f2fs_i_size_write(inode, pos + copied);
+        f2fs_put_page(page, 1);
        f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
        return copied;
 }
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 675fa79d86f6..14f5fe2b841e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -538,7 +538,7 @@ struct f2fs_nm_info {
        /* NAT cache management */
        struct radix_tree_root nat_root;/* root of the nat entry cache */
        struct radix_tree_root nat_set_root;/* root of the nat set cache */
-        struct percpu_rw_semaphore nat_tree_lock;       /* protect nat_tree_lock */
+        struct rw_semaphore nat_tree_lock;      /* protect nat_tree_lock */
        struct list_head nat_entries;   /* cached nat entry list (clean) */
        unsigned int nat_cnt;           /* the # of cached nat entries */
        unsigned int dirty_nat_cnt;     /* total num of nat entries in set */
@@ -787,7 +787,7 @@ struct f2fs_sb_info {
        struct f2fs_checkpoint *ckpt;           /* raw checkpoint pointer */
        struct inode *meta_inode;               /* cache meta blocks */
        struct mutex cp_mutex;                  /* checkpoint procedure lock */
-        struct percpu_rw_semaphore cp_rwsem;            /* blocking FS operations */
+        struct rw_semaphore cp_rwsem;           /* blocking FS operations */
        struct rw_semaphore node_write;         /* locking node writes */
        wait_queue_head_t cp_wait;
        unsigned long last_time[MAX_TIME];      /* to store time in jiffies */
@@ -1074,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
 static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
 {
-        percpu_down_read(&sbi->cp_rwsem);
+        down_read(&sbi->cp_rwsem);
 }
 static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
 {
-        percpu_up_read(&sbi->cp_rwsem);
+        up_read(&sbi->cp_rwsem);
 }
 static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
 {
-        percpu_down_write(&sbi->cp_rwsem);
+        down_write(&sbi->cp_rwsem);
 }
 static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 {
-        percpu_up_write(&sbi->cp_rwsem);
+        up_write(&sbi->cp_rwsem);
 }
 static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0e493f63ea41..28f4f4cbb8d8 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 {
        struct fscrypt_policy policy;
        struct inode *inode = file_inode(filp);
-        int ret;
        if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
                                                        sizeof(policy)))
                return -EFAULT;
-        ret = mnt_want_write_file(filp);
-        if (ret)
-                return ret;
        f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
-        ret = fscrypt_process_policy(inode, &policy);
-        mnt_drop_write_file(filp);
+        return fscrypt_process_policy(filp, &policy);
-        return ret;
 }
 static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
@@ -2086,15 +2079,19 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
        if (unlikely(f2fs_readonly(src->i_sb)))
                return -EROFS;
-        if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode))
+        if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode))
-                return -EISDIR;
+                return -EINVAL;
        if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
                return -EOPNOTSUPP;
        inode_lock(src);
-        if (src != dst)
+        if (src != dst) {
-                inode_lock(dst);
+                if (!inode_trylock(dst)) {
+                        ret = -EBUSY;
+                        goto out;
+                }
+        }
        ret = -EINVAL;
        if (pos_in + len > src->i_size || pos_in + len < pos_in)
@@ -2152,6 +2149,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
 out_unlock:
        if (src != dst)
                inode_unlock(dst);
+out:
        inode_unlock(src);
        return ret;
 }
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b2fa4b615925..f75d197d5beb 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -206,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
        struct nat_entry *e;
        bool need = false;
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, nid);
        if (e) {
                if (!get_nat_flag(e, IS_CHECKPOINTED) &&
                                !get_nat_flag(e, HAS_FSYNCED_INODE))
                        need = true;
        }
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        return need;
 }
@@ -223,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
        struct nat_entry *e;
        bool is_cp = true;
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, nid);
        if (e && !get_nat_flag(e, IS_CHECKPOINTED))
                is_cp = false;
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        return is_cp;
 }
@@ -237,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
        struct nat_entry *e;
        bool need_update = true;
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, ino);
        if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
                        (get_nat_flag(e, IS_CHECKPOINTED) ||
                         get_nat_flag(e, HAS_FSYNCED_INODE)))
                need_update = false;
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        return need_update;
 }
@@ -284,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct nat_entry *e;
-        percpu_down_write(&nm_i->nat_tree_lock);
+        down_write(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, ni->nid);
        if (!e) {
                e = grab_nat_entry(nm_i, ni->nid);
@@ -334,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
                        set_nat_flag(e, HAS_FSYNCED_INODE, true);
                set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
        }
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
 }
 int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -342,7 +342,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        int nr = nr_shrink;
-        percpu_down_write(&nm_i->nat_tree_lock);
+        if (!down_write_trylock(&nm_i->nat_tree_lock))
+                return 0;
        while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
                struct nat_entry *ne;
@@ -351,7 +352,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
                __del_from_nat_cache(nm_i, ne);
                nr_shrink--;
        }
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
        return nr - nr_shrink;
 }
@@ -373,13 +374,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
        ni->nid = nid;
        /* Check nat cache */
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, nid);
        if (e) {
                ni->ino = nat_get_ino(e);
                ni->blk_addr = nat_get_blkaddr(e);
                ni->version = nat_get_version(e);
-                percpu_up_read(&nm_i->nat_tree_lock);
+                up_read(&nm_i->nat_tree_lock);
                return;
        }
@@ -403,11 +404,11 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
        node_info_from_raw_nat(ni, &ne);
        f2fs_put_page(page, 1);
 cache:
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        /* cache nat entry */
-        percpu_down_write(&nm_i->nat_tree_lock);
+        down_write(&nm_i->nat_tree_lock);
        cache_nat_entry(sbi, nid, &ne);
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
 }
 /*
@@ -1788,7 +1789,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
        ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
                                                        META_NAT, true);
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        while (1) {
                struct page *page = get_current_nat_page(sbi, nid);
@@ -1820,7 +1821,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
                        remove_free_nid(nm_i, nid);
        }
        up_read(&curseg->journal_rwsem);
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
                                        nm_i->ra_nid_pages, META_NAT, false);
@@ -2209,7 +2210,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
        if (!nm_i->dirty_nat_cnt)
                return;
-        percpu_down_write(&nm_i->nat_tree_lock);
+        down_write(&nm_i->nat_tree_lock);
        /*
         * if there are no enough space in journal to store dirty nat
@@ -2232,7 +2233,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
        list_for_each_entry_safe(set, tmp, &sets, set_list)
                __flush_nat_entry_set(sbi, set);
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
        f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
 }
@@ -2268,8 +2269,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
        mutex_init(&nm_i->build_lock);
        spin_lock_init(&nm_i->free_nid_list_lock);
-        if (percpu_init_rwsem(&nm_i->nat_tree_lock))
+        init_rwsem(&nm_i->nat_tree_lock);
-                return -ENOMEM;
        nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
        nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -2326,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
        spin_unlock(&nm_i->free_nid_list_lock);
        /* destroy nat cache */
-        percpu_down_write(&nm_i->nat_tree_lock);
+        down_write(&nm_i->nat_tree_lock);
        while ((found = __gang_lookup_nat_cache(nm_i,
                                        nid, NATVEC_SIZE, natvec))) {
                unsigned idx;
@@ -2351,9 +2351,8 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
                        kmem_cache_free(nat_entry_set_slab, setvec[idx]);
                }
        }
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
-        percpu_free_rwsem(&nm_i->nat_tree_lock);
        kfree(nm_i->nat_bitmap);
        sbi->nm_info = NULL;
        kfree(nm_i);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1b86d3f638ef..7f863a645ab1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -706,8 +706,6 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
                percpu_counter_destroy(&sbi->nr_pages[i]);
        percpu_counter_destroy(&sbi->alloc_valid_block_count);
        percpu_counter_destroy(&sbi->total_valid_inode_count);
-        percpu_free_rwsem(&sbi->cp_rwsem);
 }
 static void f2fs_put_super(struct super_block *sb)
@@ -1483,9 +1481,6 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
 {
        int i, err;
-        if (percpu_init_rwsem(&sbi->cp_rwsem))
-                return -ENOMEM;
        for (i = 0; i < NR_COUNT_TYPE; i++) {
                err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL);
                if (err)
@@ -1686,6 +1681,7 @@ try_onemore:
                sbi->write_io[i].bio = NULL;
        }
+        init_rwsem(&sbi->cp_rwsem);
        init_waitqueue_head(&sbi->cp_wait);
        init_sb_info(sbi);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4d09d4441e3e..05713a5da083 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1949,6 +1949,12 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 {
        struct backing_dev_info *bdi;
+        /*
+         * If we are expecting writeback progress we must submit plugged IO.
+         */
+        if (blk_needs_flush_plug(current))
+                blk_schedule_flush_plug(current);
        if (!nr_pages)
                nr_pages = get_nr_dirty_pages();
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f394aff59c36..3988b43c2f5a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -530,13 +530,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
        req->out.args[0].size = count;
 }
-static void fuse_release_user_pages(struct fuse_req *req, int write)
+static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty)
 {
        unsigned i;
        for (i = 0; i < req->num_pages; i++) {
                struct page *page = req->pages[i];
-                if (write)
+                if (should_dirty)
                        set_page_dirty_lock(page);
                put_page(page);
        }
@@ -1320,6 +1320,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
                       loff_t *ppos, int flags)
 {
        int write = flags & FUSE_DIO_WRITE;
+        bool should_dirty = !write && iter_is_iovec(iter);
        int cuse = flags & FUSE_DIO_CUSE;
        struct file *file = io->file;
        struct inode *inode = file->f_mapping->host;
@@ -1363,7 +1364,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
                        nres = fuse_send_read(req, io, pos, nbytes, owner);
                if (!io->async)
-                        fuse_release_user_pages(req, !write);
+                        fuse_release_user_pages(req, should_dirty);
                if (req->out.h.error) {
                        err = req->out.h.error;
                        break;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0f56deb24ce6..c415668c86d4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -568,7 +568,7 @@ static int ioctl_fsthaw(struct file *filp)
        return thaw_super(sb);
 }
-static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
+static int ioctl_file_dedupe_range(struct file *file, void __user *arg)
 {
        struct file_dedupe_range __user *argp = arg;
        struct file_dedupe_range *same = NULL;
@@ -582,6 +582,10 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
        }
        size = offsetof(struct file_dedupe_range __user, info[count]);
+        if (size > PAGE_SIZE) {
+                ret = -ENOMEM;
+                goto out;
+        }
        same = memdup_user(argp, size);
        if (IS_ERR(same)) {
diff --git a/fs/iomap.c b/fs/iomap.c
index 48141b8eff5f..706270f21b35 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -84,8 +84,11 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
         * Now the data has been copied, commit the range we've copied.  This
         * should not fail unless the filesystem has had a fatal error.
         */
-        ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
+        if (ops->iomap_end) {
-                        flags, &iomap);
+                ret = ops->iomap_end(inode, pos, length,
+                                     written > 0 ? written : 0,
+                                     flags, &iomap);
+        }
        return written ? written : ret;
 }
@@ -194,12 +197,9 @@ again:
                if (mapping_writably_mapped(inode->i_mapping))
                        flush_dcache_page(page);
-                pagefault_disable();
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
-                pagefault_enable();
                flush_dcache_page(page);
-                mark_page_accessed(page);
                status = iomap_write_end(inode, pos, bytes, copied, page);
                if (unlikely(status < 0))
@@ -428,9 +428,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
                break;
        }
+        if (iomap->flags & IOMAP_F_MERGED)
+                flags |= FIEMAP_EXTENT_MERGED;
        return fiemap_fill_next_extent(fi, iomap->offset,
                        iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
-                        iomap->length, flags | FIEMAP_EXTENT_MERGED);
+                        iomap->length, flags);
 }
@@ -470,13 +473,18 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
        if (ret)
                return ret;
-        ret = filemap_write_and_wait(inode->i_mapping);
+        if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
-        if (ret)
+                ret = filemap_write_and_wait(inode->i_mapping);
-                return ret;
+                if (ret)
+                        return ret;
+        }
        while (len > 0) {
                ret = iomap_apply(inode, start, len, 0, ops, &ctx,
                                iomap_fiemap_actor);
+                /* inode with no (attribute) mapping will give ENOENT */
+                if (ret == -ENOENT)
+                        break;
                if (ret < 0)
                        return ret;
                if (ret == 0)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e1574008adc9..2bcb86e6e6ca 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -840,21 +840,35 @@ repeat:
        mutex_lock(&kernfs_mutex);
        list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
+                struct kernfs_node *parent;
                struct inode *inode;
-                struct dentry *dentry;
+                /*
+                 * We want fsnotify_modify() on @kn, but as the
+                 * modifications aren't originating from userland we don't
+                 * have the matching @file available.  Look up the inodes
+                 * and generate the events manually.
+                 */
                inode = ilookup(info->sb, kn->ino);
                if (!inode)
                        continue;
-                dentry = d_find_any_alias(inode);
+                parent = kernfs_get_parent(kn);
-                if (dentry) {
+                if (parent) {
-                        fsnotify_parent(NULL, dentry, FS_MODIFY);
+                        struct inode *p_inode;
-                        fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
-                                 NULL, 0);
+                        p_inode = ilookup(info->sb, parent->ino);
-                        dput(dentry);
+                        if (p_inode) {
+                                fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
+                                         inode, FSNOTIFY_EVENT_INODE, kn->name, 0);
+                                iput(p_inode);
+                        }
+                        kernfs_put(parent);
                }
+                fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
+                         kn->name, 0);
                iput(inode);
        }
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index f55a4e756047..217847679f0e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -346,7 +346,7 @@ static void bl_write_cleanup(struct work_struct *work)
                        PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
                ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
-                                        (end - start) >> SECTOR_SHIFT);
+                                        (end - start) >> SECTOR_SHIFT, end);
        }
        pnfs_ld_write_done(hdr);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 18e6fd0b9506..efc007f00742 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -141,6 +141,7 @@ struct pnfs_block_layout {
        struct rb_root          bl_ext_ro;
        spinlock_t              bl_ext_lock;   /* Protects list manipulation */
        bool                    bl_scsi_layout;
+        u64                     bl_lwb;
 };
 static inline struct pnfs_block_layout *
@@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
 int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
                sector_t end);
 int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
-                sector_t len);
+                sector_t len, u64 lwb);
 bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
                struct pnfs_block_extent *ret, bool rw);
 int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 992bcb19c11e..c85fbfd2d0d9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
 int
 ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
-                sector_t len)
+                sector_t len, u64 lwb)
 {
        struct rb_root *root = &bl->bl_ext_rw;
        sector_t end = start + len;
@@ -471,6 +471,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
                }
        }
 out:
+        if (bl->bl_lwb < lwb)
+                bl->bl_lwb = lwb;
        spin_unlock(&bl->bl_ext_lock);
        __ext_put_deviceids(&tmp);
@@ -518,7 +520,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
 }
 static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
-                size_t buffer_size, size_t *count)
+                size_t buffer_size, size_t *count, __u64 *lastbyte)
 {
        struct pnfs_block_extent *be;
        int ret = 0;
@@ -542,6 +544,8 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
                        p = encode_block_extent(be, p);
                be->be_tag = EXTENT_COMMITTING;
        }
+        *lastbyte = bl->bl_lwb - 1;
+        bl->bl_lwb = 0;
        spin_unlock(&bl->bl_ext_lock);
        return ret;
@@ -564,7 +568,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
        arg->layoutupdate_pages = &arg->layoutupdate_page;
 retry:
-        ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+        ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
        if (unlikely(ret)) {
                ext_tree_free_commitdata(arg, buffer_size);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a7f2e6e33305..52a28311e2a4 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -275,6 +275,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
 err_socks:
        svc_rpcb_cleanup(serv, net);
 err_bind:
+        nn->cb_users[minorversion]--;
        dprintk("NFS: Couldn't create callback socket: err = %d; "
                        "net = %p\n", ret, net);
        return ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c92a75e066a6..f953ef6b2f2e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -454,11 +454,8 @@ static bool referring_call_exists(struct nfs_client *clp,
                                ((u32 *)&rclist->rcl_sessionid.data)[3],
                                ref->rc_sequenceid, ref->rc_slotid);
-                        spin_lock(&tbl->slot_tbl_lock);
+                        status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid,
-                        status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
+                                        ref->rc_sequenceid, HZ >> 1) < 0;
-                                  tbl->slots[ref->rc_slotid].seq_nr ==
-                                        ref->rc_sequenceid);
-                        spin_unlock(&tbl->slot_tbl_lock);
                        if (status)
                                goto out;
                }
@@ -487,7 +484,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
                goto out;
        tbl = &clp->cl_session->bc_slot_table;
-        slot = tbl->slots + args->csa_slotid;
        /* Set up res before grabbing the spinlock */
        memcpy(&res->csr_sessionid, &args->csa_sessionid,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 003ebce4bbc4..1e106780a237 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -426,7 +426,7 @@ EXPORT_SYMBOL_GPL(nfs_mark_client_ready);
 * Initialise the timeout values for a connection
 */
 void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
-                                    unsigned int timeo, unsigned int retrans)
+                                    int timeo, int retrans)
 {
        to->to_initval = timeo * HZ / 10;
        to->to_retries = retrans;
@@ -434,9 +434,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
        switch (proto) {
        case XPRT_TRANSPORT_TCP:
        case XPRT_TRANSPORT_RDMA:
-                if (to->to_retries == 0)
+                if (retrans == NFS_UNSPEC_RETRANS)
                        to->to_retries = NFS_DEF_TCP_RETRANS;
-                if (to->to_initval == 0)
+                if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0)
                        to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
                if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
                        to->to_initval = NFS_MAX_TCP_TIMEOUT;
@@ -449,9 +449,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
                to->to_exponential = 0;
                break;
        case XPRT_TRANSPORT_UDP:
-                if (to->to_retries == 0)
+                if (retrans == NFS_UNSPEC_RETRANS)
                        to->to_retries = NFS_DEF_UDP_RETRANS;
-                if (!to->to_initval)
+                if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
                        to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
                if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
                        to->to_initval = NFS_MAX_UDP_TIMEOUT;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7d620970f2e1..ca699ddc11c1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -657,7 +657,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
        if (result <= 0)
                goto out;
-        written = generic_write_sync(iocb, result);
+        result = generic_write_sync(iocb, result);
+        if (result < 0)
+                goto out;
+        written = result;
        iocb->ki_pos += written;
        /* Return error values */
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index e6206eaf2bdf..51b51369704c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -37,6 +37,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
        if (ffl) {
                INIT_LIST_HEAD(&ffl->error_list);
                INIT_LIST_HEAD(&ffl->mirrors);
+                ffl->last_report_time = ktime_get();
                return &ffl->generic_hdr;
        } else
                return NULL;
@@ -640,19 +641,18 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
 {
        static const ktime_t notime = {0};
        s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
+        struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
        nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
        if (ktime_equal(mirror->start_time, notime))
                mirror->start_time = now;
-        if (ktime_equal(mirror->last_report_time, notime))
-                mirror->last_report_time = now;
        if (mirror->report_interval != 0)
                report_interval = (s64)mirror->report_interval * 1000LL;
        else if (layoutstats_timer != 0)
                report_interval = (s64)layoutstats_timer * 1000LL;
-        if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
+        if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
                        report_interval) {
-                mirror->last_report_time = now;
+                ffl->last_report_time = now;
                return true;
        }
@@ -806,11 +806,14 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
 {
        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
        struct nfs4_pnfs_ds *ds;
+        bool fail_return = false;
        int idx;
        /* mirrors are sorted by efficiency */
        for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
-                ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+                if (idx+1 == fls->mirror_array_cnt)
+                        fail_return = true;
+                ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return);
                if (ds) {
                        *best_idx = idx;
                        return ds;
@@ -859,6 +862,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
        struct nfs4_pnfs_ds *ds;
        int ds_idx;
+retry:
        /* Use full layout for now */
        if (!pgio->pg_lseg)
                ff_layout_pg_get_read(pgio, req, false);
@@ -871,10 +875,13 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
        ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
        if (!ds) {
-                if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+                if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
-                        goto out_pnfs;
-                else
                        goto out_mds;
+                pnfs_put_lseg(pgio->pg_lseg);
+                pgio->pg_lseg = NULL;
+                /* Sleep for 1 second before retrying */
+                ssleep(1);
+                goto retry;
        }
        mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -890,12 +897,6 @@ out_mds:
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = NULL;
        nfs_pageio_reset_read_mds(pgio);
-        return;
-out_pnfs:
-        pnfs_set_lo_fail(pgio->pg_lseg);
-        pnfs_put_lseg(pgio->pg_lseg);
-        pgio->pg_lseg = NULL;
 }
 static void
@@ -909,6 +910,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
        int i;
        int status;
+retry:
        if (!pgio->pg_lseg) {
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   req->wb_context,
@@ -940,10 +942,13 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
        for (i = 0; i < pgio->pg_mirror_count; i++) {
                ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
                if (!ds) {
-                        if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+                        if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
-                                goto out_pnfs;
-                        else
                                goto out_mds;
+                        pnfs_put_lseg(pgio->pg_lseg);
+                        pgio->pg_lseg = NULL;
+                        /* Sleep for 1 second before retrying */
+                        ssleep(1);
+                        goto retry;
                }
                pgm = &pgio->pg_mirrors[i];
                mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
@@ -956,12 +961,6 @@ out_mds:
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = NULL;
        nfs_pageio_reset_write_mds(pgio);
-        return;
-out_pnfs:
-        pnfs_set_lo_fail(pgio->pg_lseg);
-        pnfs_put_lseg(pgio->pg_lseg);
-        pgio->pg_lseg = NULL;
 }
 static unsigned int
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 1bcdb15d0c41..3ee0c9fcea76 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -84,7 +84,6 @@ struct nfs4_ff_layout_mirror {
        struct nfs4_ff_layoutstat       read_stat;
        struct nfs4_ff_layoutstat       write_stat;
        ktime_t                         start_time;
-        ktime_t                         last_report_time;
        u32                             report_interval;
 };
@@ -101,6 +100,7 @@ struct nfs4_flexfile_layout {
        struct pnfs_ds_commit_info commit_info;
        struct list_head        mirrors;
        struct list_head        error_list; /* nfs4_ff_layout_ds_err */
+        ktime_t                 last_report_time; /* Layoutstat report times */
 };
 static inline struct nfs4_flexfile_layout *
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 0aa36be71fce..f7a3f6b05369 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -17,8 +17,8 @@
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
-static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
+static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
-static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+static unsigned int dataserver_retrans;
 void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
 {
@@ -379,7 +379,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
        devid = &mirror->mirror_ds->id_node;
        if (ff_layout_test_devid_unavailable(devid))
-                goto out;
+                goto out_fail;
        ds = mirror->mirror_ds->ds;
        /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -405,15 +405,16 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                        mirror->mirror_ds->ds_versions[0].rsize = max_payload;
                if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
                        mirror->mirror_ds->ds_versions[0].wsize = max_payload;
-        } else {
+                goto out;
-                ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
-                                         mirror, lseg->pls_range.offset,
-                                         lseg->pls_range.length, NFS4ERR_NXIO,
-                                         OP_ILLEGAL, GFP_NOIO);
-                if (fail_return || !ff_layout_has_available_ds(lseg))
-                        pnfs_error_mark_layout_for_return(ino, lseg);
-                ds = NULL;
        }
+        ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+                                 mirror, lseg->pls_range.offset,
+                                 lseg->pls_range.length, NFS4ERR_NXIO,
+                                 OP_ILLEGAL, GFP_NOIO);
+out_fail:
+        if (fail_return || !ff_layout_has_available_ds(lseg))
+                pnfs_error_mark_layout_for_return(ino, lseg);
+        ds = NULL;
 out:
        return ds;
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7ce5e023c3c3..74935a19e4bf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -58,6 +58,9 @@ struct nfs_clone_mount {
 */
 #define NFS_UNSPEC_PORT         (-1)
+#define NFS_UNSPEC_RETRANS      (UINT_MAX)
+#define NFS_UNSPEC_TIMEO        (UINT_MAX)
 /*
 * Maximum number of pages that readdir can use for creating
 * a vmapped array of pages.
@@ -156,7 +159,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
 int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
 void nfs_server_insert_lists(struct nfs_server *);
 void nfs_server_remove_lists(struct nfs_server *);
-void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int);
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans);
 int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
                rpc_authflavor_t);
 struct nfs_server *nfs_alloc_server(void);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 33da841a21bb..64b43b4ad9dd 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -318,10 +318,22 @@ static void
 nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs42_layoutstat_data *data = calldata;
-        struct nfs_server *server = NFS_SERVER(data->args.inode);
+        struct inode *inode = data->inode;
+        struct nfs_server *server = NFS_SERVER(inode);
+        struct pnfs_layout_hdr *lo;
+        spin_lock(&inode->i_lock);
+        lo = NFS_I(inode)->layout;
+        if (!pnfs_layout_is_valid(lo)) {
+                spin_unlock(&inode->i_lock);
+                rpc_exit(task, 0);
+                return;
+        }
+        nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid);
+        spin_unlock(&inode->i_lock);
        nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
                             &data->res.seq_res, task);
 }
 static void
@@ -338,12 +350,14 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
        case 0:
                break;
        case -NFS4ERR_EXPIRED:
+        case -NFS4ERR_ADMIN_REVOKED:
+        case -NFS4ERR_DELEG_REVOKED:
        case -NFS4ERR_STALE_STATEID:
-        case -NFS4ERR_OLD_STATEID:
        case -NFS4ERR_BAD_STATEID:
                spin_lock(&inode->i_lock);
                lo = NFS_I(inode)->layout;
-                if (lo && nfs4_stateid_match(&data->args.stateid,
+                if (pnfs_layout_is_valid(lo) &&
+                    nfs4_stateid_match(&data->args.stateid,
                                             &lo->plh_stateid)) {
                        LIST_HEAD(head);
@@ -357,11 +371,23 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
                } else
                        spin_unlock(&inode->i_lock);
                break;
+        case -NFS4ERR_OLD_STATEID:
+                spin_lock(&inode->i_lock);
+                lo = NFS_I(inode)->layout;
+                if (pnfs_layout_is_valid(lo) &&
+                    nfs4_stateid_match_other(&data->args.stateid,
+                                        &lo->plh_stateid)) {
+                        /* Do we need to delay before resending? */
+                        if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+                                                &data->args.stateid))
+                                rpc_delay(task, HZ);
+                        rpc_restart_call_prepare(task);
+                }
+                spin_unlock(&inode->i_lock);
+                break;
        case -ENOTSUPP:
        case -EOPNOTSUPP:
                NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
-        default:
-                break;
        }
        dprintk("%s server returns %d\n", __func__, task->tk_status);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 324bfdc21250..9bf64eacba5b 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -396,6 +396,10 @@ extern void nfs4_schedule_state_renewal(struct nfs_client *);
 extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
 extern void nfs4_kill_renewd(struct nfs_client *);
 extern void nfs4_renew_state(struct work_struct *);
+extern void nfs4_set_lease_period(struct nfs_client *clp,
+                unsigned long lease,
+                unsigned long lastrenewed);
 /* nfs4state.c */
 struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8d7d08d4f95f..cd3b7cfdde16 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -817,6 +817,11 @@ static int nfs4_set_client(struct nfs_server *server,
                goto error;
        }
+        if (server->nfs_client == clp) {
+                error = -ELOOP;
+                goto error;
+        }
        /*
         * Query for the lease time on clientid setup or renewal
         *
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a036e93bdf96..a9dec32ba9ba 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -634,15 +634,11 @@ out_sleep:
 }
 EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
-static int nfs40_sequence_done(struct rpc_task *task,
+static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res)
-                               struct nfs4_sequence_res *res)
 {
        struct nfs4_slot *slot = res->sr_slot;
        struct nfs4_slot_table *tbl;
-        if (slot == NULL)
-                goto out;
        tbl = slot->table;
        spin_lock(&tbl->slot_tbl_lock);
        if (!nfs41_wake_and_assign_slot(tbl, slot))
@@ -650,7 +646,13 @@ static int nfs40_sequence_done(struct rpc_task *task,
        spin_unlock(&tbl->slot_tbl_lock);
        res->sr_slot = NULL;
-out:
+}
+static int nfs40_sequence_done(struct rpc_task *task,
+                               struct nfs4_sequence_res *res)
+{
+        if (res->sr_slot != NULL)
+                nfs40_sequence_free_slot(res);
        return 1;
 }
@@ -666,6 +668,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
        tbl = slot->table;
        session = tbl->session;
+        /* Bump the slot sequence number */
+        if (slot->seq_done)
+                slot->seq_nr++;
+        slot->seq_done = 0;
        spin_lock(&tbl->slot_tbl_lock);
        /* Be nice to the server: try to ensure that the last transmitted
         * value for highest_user_slotid <= target_highest_slotid
@@ -686,9 +693,12 @@ out_unlock:
        res->sr_slot = NULL;
        if (send_new_highest_used_slotid)
                nfs41_notify_server(session->clp);
+        if (waitqueue_active(&tbl->slot_waitq))
+                wake_up_all(&tbl->slot_waitq);
 }
-int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+static int nfs41_sequence_process(struct rpc_task *task,
+                struct nfs4_sequence_res *res)
 {
        struct nfs4_session *session;
        struct nfs4_slot *slot = res->sr_slot;
@@ -714,7 +724,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
        switch (res->sr_status) {
        case 0:
                /* Update the slot's sequence and clientid lease timer */
-                ++slot->seq_nr;
+                slot->seq_done = 1;
                clp = session->clp;
                do_renew_lease(clp, res->sr_timestamp);
                /* Check sequence flags */
@@ -769,16 +779,16 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
                goto retry_nowait;
        default:
                /* Just update the slot sequence no. */
-                ++slot->seq_nr;
+                slot->seq_done = 1;
        }
 out:
        /* The session may be reset by one of the error handlers. */
        dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
-        nfs41_sequence_free_slot(res);
 out_noaction:
        return ret;
 retry_nowait:
        if (rpc_restart_call_prepare(task)) {
+                nfs41_sequence_free_slot(res);
                task->tk_status = 0;
                ret = 0;
        }
@@ -789,8 +799,37 @@ out_retry:
        rpc_delay(task, NFS4_POLL_RETRY_MAX);
        return 0;
 }
+/*
+ * nfs41_sequence_done - process a SEQUENCE result, then release the slot
+ *
+ * Returns 0 immediately when nfs41_sequence_process() returns 0.
+ * Otherwise frees the slot still attached to @res (if any) and returns 1.
+ */
+int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+        if (!nfs41_sequence_process(task, res))
+                return 0;
+        if (res->sr_slot != NULL)
+                nfs41_sequence_free_slot(res);
+        return 1;
+}
 EXPORT_SYMBOL_GPL(nfs41_sequence_done);
+/*
+ * Dispatch sequence processing based on the slot attached to @res:
+ * returns 1 when no slot is attached, calls nfs41_sequence_process()
+ * when the slot's table has a session, and nfs40_sequence_done()
+ * otherwise.
+ */
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+        if (res->sr_slot == NULL)
+                return 1;
+        if (res->sr_slot->table->session != NULL)
+                return nfs41_sequence_process(task, res);
+        return nfs40_sequence_done(task, res);
+}
+/*
+ * Release the slot attached to @res, if there is one. The nfs41 helper
+ * is used when the slot's table has a session, the nfs40 helper
+ * otherwise. Does nothing when res->sr_slot is NULL.
+ */
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+        if (res->sr_slot != NULL) {
+                if (res->sr_slot->table->session != NULL)
+                        nfs41_sequence_free_slot(res);
+                else
+                        nfs40_sequence_free_slot(res);
+        }
+}
 int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
        if (res->sr_slot == NULL)
@@ -920,6 +959,17 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
                                    args, res, task);
 }
+/*
+ * Variant that unconditionally defers to nfs40_sequence_done().
+ * NOTE(review): presumably the definition used when NFSv4.1 support is
+ * compiled out - confirm against the enclosing preprocessor conditional.
+ */
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+        return nfs40_sequence_done(task, res);
+}
+/*
+ * Variant that releases the slot attached to @res (if any) with the
+ * nfs40 helper only.
+ * NOTE(review): presumably the definition used when NFSv4.1 support is
+ * compiled out - confirm against the enclosing preprocessor conditional.
+ */
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+        if (res->sr_slot != NULL)
+                nfs40_sequence_free_slot(res);
+}
 int nfs4_sequence_done(struct rpc_task *task,
                       struct nfs4_sequence_res *res)
 {
@@ -1197,6 +1247,7 @@ static void nfs4_opendata_free(struct kref *kref)
        struct super_block *sb = p->dentry->d_sb;
        nfs_free_seqid(p->o_arg.seqid);
+        nfs4_sequence_free_slot(&p->o_res.seq_res);
        if (p->state != NULL)
                nfs4_put_open_state(p->state);
        nfs4_put_state_owner(p->owner);
@@ -1656,9 +1707,14 @@ err:
 static struct nfs4_state *
 nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 {
+        struct nfs4_state *ret;
        if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
-                return _nfs4_opendata_reclaim_to_nfs4_state(data);
+                ret =_nfs4_opendata_reclaim_to_nfs4_state(data);
-        return _nfs4_opendata_to_nfs4_state(data);
+        else
+                ret = _nfs4_opendata_to_nfs4_state(data);
+        nfs4_sequence_free_slot(&data->o_res.seq_res);
+        return ret;
 }
 static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
@@ -2056,7 +2112,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
        data->rpc_status = task->tk_status;
-        if (!nfs4_sequence_done(task, &data->o_res.seq_res))
+        if (!nfs4_sequence_process(task, &data->o_res.seq_res))
                return;
        if (task->tk_status == 0) {
@@ -4237,12 +4293,9 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
                err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
                trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
                if (err == 0) {
-                        struct nfs_client *clp = server->nfs_client;
+                        nfs4_set_lease_period(server->nfs_client,
+                                        fsinfo->lease_time * HZ,
-                        spin_lock(&clp->cl_lock);
+                                        now);
-                        clp->cl_lease_time = fsinfo->lease_time * HZ;
-                        clp->cl_last_renewal = now;
-                        spin_unlock(&clp->cl_lock);
                        break;
                }
                err = nfs4_handle_exception(server, err, &exception);
@@ -7517,12 +7570,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
        status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
        trace_nfs4_create_session(clp, status);
+        switch (status) {
+        case -NFS4ERR_STALE_CLIENTID:
+        case -NFS4ERR_DELAY:
+        case -ETIMEDOUT:
+        case -EACCES:
+        case -EAGAIN:
+                goto out;
+        };
+        clp->cl_seqid++;
        if (!status) {
                /* Verify the session's negotiated channel_attrs values */
                status = nfs4_verify_channel_attrs(&args, &res);
                /* Increment the clientid slot sequence id */
-                if (clp->cl_seqid == res.seqid)
-                        clp->cl_seqid++;
                if (status)
                        goto out;
                nfs4_update_session(session, &res);
@@ -7867,7 +7928,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
        struct nfs4_layoutget *lgp = calldata;
        dprintk("--> %s\n", __func__);
-        nfs41_sequence_done(task, &lgp->res.seq_res);
+        nfs41_sequence_process(task, &lgp->res.seq_res);
        dprintk("<-- %s\n", __func__);
 }
@@ -8083,6 +8144,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
        /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
        if (status == 0 && lgp->res.layoutp->len)
                lseg = pnfs_layout_process(lgp);
+        nfs4_sequence_free_slot(&lgp->res.seq_res);
        rpc_put_task(task);
        dprintk("<-- %s status=%d\n", __func__, status);
        if (status)
@@ -8109,7 +8171,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
        dprintk("--> %s\n", __func__);
-        if (!nfs41_sequence_done(task, &lrp->res.seq_res))
+        if (!nfs41_sequence_process(task, &lrp->res.seq_res))
                return;
        server = NFS_SERVER(lrp->args.inode);
@@ -8121,6 +8183,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
        case -NFS4ERR_DELAY:
                if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
                        break;
+                nfs4_sequence_free_slot(&lrp->res.seq_res);
                rpc_restart_call_prepare(task);
                return;
        }
@@ -8135,12 +8198,16 @@ static void nfs4_layoutreturn_release(void *calldata)
        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
-        pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
+        if (lrp->res.lrs_present) {
-                        be32_to_cpu(lrp->args.stateid.seqid));
+                pnfs_mark_matching_lsegs_invalid(lo, &freeme,
-        if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
+                                &lrp->args.range,
+                                be32_to_cpu(lrp->args.stateid.seqid));
                pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+        } else
+                pnfs_mark_layout_stateid_invalid(lo, &freeme);
        pnfs_clear_layoutreturn_waitbit(lo);
        spin_unlock(&lo->plh_inode->i_lock);
+        nfs4_sequence_free_slot(&lrp->res.seq_res);
        pnfs_free_lseg_list(&freeme);
        pnfs_put_layout_hdr(lrp->args.layout);
        nfs_iput_and_deactive(lrp->inode);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e1ba58c3d1ad..82e77198d17e 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -136,6 +136,26 @@ nfs4_kill_renewd(struct nfs_client *clp)
        cancel_delayed_work_sync(&clp->cl_renewd);
 }
+/**
+ * nfs4_set_lease_period - Sets the lease period on a nfs_client
+ *
+ * @clp: pointer to nfs_client
+ * @lease: new value for lease period
+ * @lastrenewed: time at which lease was last renewed
+ *
+ * Stores @lease and @lastrenewed in @clp while holding cl_lock, then caps
+ * the maximum reconnect timeout of clp->cl_rpcclient at half of @lease.
+ * NOTE(review): callers visible in this patch pass lease_time * HZ, so
+ * @lease is presumably in jiffies - confirm.
+ */
+void nfs4_set_lease_period(struct nfs_client *clp,
+                unsigned long lease,
+                unsigned long lastrenewed)
+{
+        spin_lock(&clp->cl_lock);
+        clp->cl_lease_time = lease;
+        clp->cl_last_renewal = lastrenewed;
+        spin_unlock(&clp->cl_lock);
+        /* Cap maximum reconnect timeout at 1/2 lease period */
+        rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1);
+}
 /*
 * Local variables:
 *   c-basic-offset: 8
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 332d06e64fa9..b62973045a3e 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -28,6 +28,7 @@ static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue)
        tbl->highest_used_slotid = NFS4_NO_SLOT;
        spin_lock_init(&tbl->slot_tbl_lock);
        rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue);
+        init_waitqueue_head(&tbl->slot_waitq);
        init_completion(&tbl->complete);
 }
@@ -172,6 +173,58 @@ struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
        return ERR_PTR(-E2BIG);
 }
+/*
+ * nfs4_slot_get_seqid - read the current sequence number of a slot
+ *
+ * Looks up @slotid in @tbl and stores the slot's seq_nr in *@seq_nr.
+ * Returns 0 on success, or the error encoded in the pointer returned by
+ * nfs4_lookup_slot(). Annotated as requiring tbl->slot_tbl_lock to be
+ * held by the caller.
+ */
+static int nfs4_slot_get_seqid(struct nfs4_slot_table  *tbl, u32 slotid,
+                u32 *seq_nr)
+        __must_hold(&tbl->slot_tbl_lock)
+{
+        struct nfs4_slot *slot;
+        slot = nfs4_lookup_slot(tbl, slotid);
+        if (IS_ERR(slot))
+                return PTR_ERR(slot);
+        *seq_nr = slot->seq_nr;
+        return 0;
+}
+/*
+ * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use
+ *
+ * Given a slot table, slot id and sequence number, determine if the
+ * RPC call in question is still in flight. This function is mainly
+ * intended for use by the callback channel.
+ *
+ * Takes and releases tbl->slot_tbl_lock internally.
+ */
+static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl,
+                u32 slotid, u32 seq_nr)
+{
+        u32 cur_seq;
+        bool ret = false;
+        spin_lock(&tbl->slot_tbl_lock);
+        /*
+         * In use only if the slot can be looked up, its current sequence
+         * number still equals seq_nr, and its bit is set in used_slots.
+         */
+        if (nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 &&
+            cur_seq == seq_nr && test_bit(slotid, tbl->used_slots))
+                ret = true;
+        spin_unlock(&tbl->slot_tbl_lock);
+        return ret;
+}
+/*
+ * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete
+ *
+ * Given a slot table, slot id and sequence number, wait until the
+ * corresponding RPC call completes. This function is mainly
+ * intended for use by the callback channel.
+ *
+ * Sleeps on tbl->slot_waitq; @timeout is passed unchanged to
+ * wait_event_timeout(). Returns -ETIMEDOUT when that wait returns 0,
+ * and 0 otherwise.
+ */
+int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+                u32 slotid, u32 seq_nr,
+                unsigned long timeout)
+{
+        if (wait_event_timeout(tbl->slot_waitq,
+                        !nfs4_slot_seqid_in_use(tbl, slotid, seq_nr),
+                        timeout) == 0)
+                return -ETIMEDOUT;
+        return 0;
+}
 /*
 * nfs4_alloc_slot - efficiently look for a free slot
 *
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 5b51298d1d03..f703b755351b 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -21,7 +21,8 @@ struct nfs4_slot {
        unsigned long           generation;
        u32                     slot_nr;
        u32                     seq_nr;
-        unsigned int            interrupted : 1;
+        unsigned int            interrupted : 1,
+                                seq_done : 1;
 };
 /* Sessions */
@@ -36,6 +37,7 @@ struct nfs4_slot_table {
        unsigned long   used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
        spinlock_t      slot_tbl_lock;
        struct rpc_wait_queue   slot_tbl_waitq; /* allocators may wait here */
+        wait_queue_head_t       slot_waitq;     /* Completion wait on slot */
        u32             max_slots;              /* # slots in table */
        u32             max_slotid;             /* Max allowed slotid value */
        u32             highest_used_slotid;    /* sent to server on each SEQ.
@@ -78,6 +80,9 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
 extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
 extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
 extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
+extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+                u32 slotid, u32 seq_nr,
+                unsigned long timeout);
 extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
 extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
 extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 834b875900d6..cada00aa5096 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -277,20 +277,17 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
 {
        int status;
        struct nfs_fsinfo fsinfo;
+        unsigned long now;
        if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
                nfs4_schedule_state_renewal(clp);
                return 0;
        }
+        now = jiffies;
        status = nfs4_proc_get_lease_time(clp, &fsinfo);
        if (status == 0) {
-                /* Update lease time and schedule renewal */
+                nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now);
-                spin_lock(&clp->cl_lock);
-                clp->cl_lease_time = fsinfo.lease_time * HZ;
-                clp->cl_last_renewal = jiffies;
-                spin_unlock(&clp->cl_lock);
                nfs4_schedule_state_renewal(clp);
        }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 70806cae0d36..2c93a85eda51 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -365,7 +365,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
        /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
        atomic_dec(&lo->plh_refcount);
        if (list_empty(&lo->plh_segs)) {
-                set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+                if (atomic_read(&lo->plh_outstanding) == 0)
+                        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
                clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
        }
        rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
@@ -768,17 +769,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
        pnfs_destroy_layouts_byclid(clp, false);
 }
+/*
+ * Reset the layoutreturn bookkeeping on @lo: zero plh_return_iomode and
+ * plh_return_seq, and clear the NFS_LAYOUT_RETURN_REQUESTED flag.
+ */
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+        lo->plh_return_iomode = 0;
+        lo->plh_return_seq = 0;
+        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        bool update_barrier)
 {
        u32 oldseq, newseq, new_barrier = 0;
-        bool invalid = !pnfs_layout_is_valid(lo);
        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);
-        if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
+        if (!pnfs_layout_is_valid(lo)) {
+                nfs4_stateid_copy(&lo->plh_stateid, new);
+                lo->plh_barrier = newseq;
+                pnfs_clear_layoutreturn_info(lo);
+                clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+                return;
+        }
+        if (pnfs_seqid_is_newer(newseq, oldseq)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                /*
                 * Because of wraparound, we want to keep the barrier
@@ -790,7 +806,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                new_barrier = be32_to_cpu(new->seqid);
        else if (new_barrier == 0)
                return;
-        if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+        if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
                lo->plh_barrier = new_barrier;
 }
@@ -886,19 +902,14 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
        rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 }
-static void
-pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
-{
-        lo->plh_return_iomode = 0;
-        lo->plh_return_seq = 0;
-        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
-}
 static bool
 pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
                nfs4_stateid *stateid,
                enum pnfs_iomode *iomode)
 {
+        /* Serialise LAYOUTGET/LAYOUTRETURN */
+        if (atomic_read(&lo->plh_outstanding) != 0)
+                return false;
        if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
                return false;
        pnfs_get_layout_hdr(lo);
@@ -1555,6 +1566,7 @@ pnfs_update_layout(struct inode *ino,
        }
 lookup_again:
+        nfs4_client_recover_expired_lease(clp);
        first = false;
        spin_lock(&ino->i_lock);
        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
@@ -1797,16 +1809,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
                 */
                pnfs_mark_layout_stateid_invalid(lo, &free_me);
-                nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
+                pnfs_set_layout_stateid(lo, &res->stateid, true);
-                lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
        }
        pnfs_get_lseg(lseg);
        pnfs_layout_insert_lseg(lo, lseg, &free_me);
-        if (!pnfs_layout_is_valid(lo)) {
-                pnfs_clear_layoutreturn_info(lo);
-                clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-        }
        if (res->return_on_close)
@@ -2510,7 +2517,6 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
        data->args.fh = NFS_FH(inode);
        data->args.inode = inode;
-        nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
        status = ld->prepare_layoutstats(&data->args);
        if (status)
                goto out_free;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 18d446e1a82b..d39601381adf 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -923,6 +923,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (data) {
+                data->timeo             = NFS_UNSPEC_TIMEO;
+                data->retrans           = NFS_UNSPEC_RETRANS;
                data->acregmin          = NFS_DEF_ACREGMIN;
                data->acregmax          = NFS_DEF_ACREGMAX;
                data->acdirmin          = NFS_DEF_ACDIRMIN;
@@ -1189,6 +1191,19 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option)
        return rc;
 }
+/*
+ * nfs_get_option_ul_bound - parse an unsigned long option and range-check it
+ *
+ * Parses via nfs_get_option_ul() and returns its result unchanged when it
+ * is non-zero. Otherwise returns -ERANGE if the parsed value lies outside
+ * the inclusive range [@l_bound, @u_bound], and 0 if it lies within it.
+ */
+static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option,
+                unsigned long l_bound, unsigned long u_bound)
+{
+        int ret;
+        ret = nfs_get_option_ul(args, option);
+        if (ret != 0)
+                return ret;
+        if (*option < l_bound || *option > u_bound)
+                return -ERANGE;
+        return 0;
+}
 /*
 * Error-check and convert a string of mount options from user space into
 * a data structure.  The whole mount string is processed; bad options are
@@ -1352,12 +1367,12 @@ static int nfs_parse_mount_options(char *raw,
                        mnt->bsize = option;
                        break;
                case Opt_timeo:
-                        if (nfs_get_option_ul(args, &option) || option == 0)
+                        if (nfs_get_option_ul_bound(args, &option, 1, INT_MAX))
                                goto out_invalid_value;
                        mnt->timeo = option;
                        break;
                case Opt_retrans:
-                        if (nfs_get_option_ul(args, &option) || option == 0)
+                        if (nfs_get_option_ul_bound(args, &option, 0, INT_MAX))
                                goto out_invalid_value;
                        mnt->retrans = option;
                        break;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8410ca275db1..a204d7e109d4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4903,6 +4903,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        return nfs_ok;
 }
+/*
+ * nfsd4_free_lock_stateid - free a lock stateid for nfsd4_free_stateid()
+ *
+ * With stp->st_mutex held: returns the check_stateid_generation() error
+ * if the generation check fails, nfserr_locks_held if check_for_locks()
+ * reports locks for this file/lockowner, and otherwise releases the lock
+ * stateid and returns nfs_ok.
+ *
+ * On every path the mutex is dropped and nfs4_put_stid() is called on @s;
+ * the caller in nfsd4_free_stateid() increments s->sc_count beforehand.
+ */
+static __be32
+nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s)
+{
+        struct nfs4_ol_stateid *stp = openlockstateid(s);
+        __be32 ret;
+        mutex_lock(&stp->st_mutex);
+        ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+        if (ret)
+                goto out;
+        ret = nfserr_locks_held;
+        if (check_for_locks(stp->st_stid.sc_file,
+                            lockowner(stp->st_stateowner)))
+                goto out;
+        release_lock_stateid(stp);
+        ret = nfs_ok;
+out:
+        mutex_unlock(&stp->st_mutex);
+        nfs4_put_stid(s);
+        return ret;
+}
 __be32
 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                   struct nfsd4_free_stateid *free_stateid)
@@ -4910,7 +4936,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        stateid_t *stateid = &free_stateid->fr_stateid;
        struct nfs4_stid *s;
        struct nfs4_delegation *dp;
-        struct nfs4_ol_stateid *stp;
        struct nfs4_client *cl = cstate->session->se_client;
        __be32 ret = nfserr_bad_stateid;
@@ -4929,18 +4954,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                ret = nfserr_locks_held;
                break;
        case NFS4_LOCK_STID:
-                ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+                atomic_inc(&s->sc_count);
-                if (ret)
-                        break;
-                stp = openlockstateid(s);
-                ret = nfserr_locks_held;
-                if (check_for_locks(stp->st_stid.sc_file,
-                                    lockowner(stp->st_stateowner)))
-                        break;
-                WARN_ON(!unhash_lock_stateid(stp));
                spin_unlock(&cl->cl_lock);
-                nfs4_put_stid(s);
+                ret = nfsd4_free_lock_stateid(stateid, s);
-                ret = nfs_ok;
                goto out;
        case NFS4_REVOKED_DELEG_STID:
                dp = delegstateid(s);
@@ -5507,7 +5523,7 @@ static __be32
 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
                            struct nfs4_ol_stateid *ost,
                            struct nfsd4_lock *lock,
-                            struct nfs4_ol_stateid **lst, bool *new)
+                            struct nfs4_ol_stateid **plst, bool *new)
 {
        __be32 status;
        struct nfs4_file *fi = ost->st_stid.sc_file;
@@ -5515,7 +5531,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
        struct nfs4_client *cl = oo->oo_owner.so_client;
        struct inode *inode = d_inode(cstate->current_fh.fh_dentry);
        struct nfs4_lockowner *lo;
+        struct nfs4_ol_stateid *lst;
        unsigned int strhashval;
+        bool hashed;
        lo = find_lockowner_str(cl, &lock->lk_new_owner);
        if (!lo) {
@@ -5531,12 +5549,27 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
                        goto out;
        }
-        *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
+retry:
-        if (*lst == NULL) {
+        lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
+        if (lst == NULL) {
                status = nfserr_jukebox;
                goto out;
        }
+        mutex_lock(&lst->st_mutex);
+        /* See if it's still hashed to avoid race with FREE_STATEID */
+        spin_lock(&cl->cl_lock);
+        hashed = !list_empty(&lst->st_perfile);
+        spin_unlock(&cl->cl_lock);
+        if (!hashed) {
+                mutex_unlock(&lst->st_mutex);
+                nfs4_put_stid(&lst->st_stid);
+                goto retry;
+        }
        status = nfs_ok;
+        *plst = lst;
 out:
        nfs4_put_stateowner(&lo->lo_owner);
        return status;
@@ -5603,8 +5636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out;
                status = lookup_or_create_lock_state(cstate, open_stp, lock,
                                                        &lock_stp, &new);
-                if (status == nfs_ok)
-                        mutex_lock(&lock_stp->st_mutex);
        } else {
                status = nfs4_preprocess_seqid_op(cstate,
                                       lock->lk_old_lock_seqid,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ba944123167b..ff476e654b8f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1252,10 +1252,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (IS_ERR(dchild))
                return nfserrno(host_err);
        err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
-        if (err) {
+        /*
-                dput(dchild);
+         * We unconditionally drop our ref to dchild as fh_compose will have
+         * already grabbed its own ref for it.
+         */
+        dput(dchild);
+        if (err)
                return err;
-        }
        return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
                                        rdev, resfhp);
 }
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d2f97ecca6a5..e0e5f7c3c99f 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-        wait_event(group->fanotify_data.access_waitq, event->response ||
+        wait_event(group->fanotify_data.access_waitq, event->response);
-                                atomic_read(&group->fanotify_data.bypass_perm));
-        if (!event->response) { /* bypass_perm set */
-                /*
-                 * Event was canceled because group is being destroyed. Remove
-                 * it from group's event list because we are responsible for
-                 * freeing the permission event.
-                 */
-                fsnotify_remove_event(group, &event->fae.fse);
-                return 0;
-        }
        /* userspace responded, convert to something usable */
        switch (event->response) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8e8e6bcd1d43..a64313868d3a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
        struct fanotify_perm_event_info *event, *next;
+        struct fsnotify_event *fsn_event;
        /*
-         * There may be still new events arriving in the notification queue
+         * Stop new events from arriving in the notification queue. Since
-         * but since userspace cannot use fanotify fd anymore, no event can
+         * userspace cannot use fanotify fd anymore, no event can enter or
-         * enter or leave access_list by now.
+         * leave access_list by now either.
         */
-        spin_lock(&group->fanotify_data.access_lock);
+        fsnotify_group_stop_queueing(group);
-        atomic_inc(&group->fanotify_data.bypass_perm);
+        /*
+         * Process all permission events on access_list and notification queue
+         * and simulate reply from userspace.
+         */
+        spin_lock(&group->fanotify_data.access_lock);
        list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
                                 fae.fse.list) {
                pr_debug("%s: found group=%p event=%p\n", __func__, group,
@@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
        spin_unlock(&group->fanotify_data.access_lock);
        /*
-         * Since bypass_perm is set, newly queued events will not wait for
+         * Destroy all non-permission events. For permission events just
-         * access response. Wake up the already sleeping ones now.
+         * dequeue them and set the response. They will be freed once the
-         * synchronize_srcu() in fsnotify_destroy_group() will wait for all
+         * response is consumed and fanotify_get_response() returns.
-         * processes sleeping in fanotify_handle_event() waiting for access
-         * response and thus also for all permission events to be freed.
         */
+        mutex_lock(&group->notification_mutex);
+        while (!fsnotify_notify_queue_is_empty(group)) {
+                fsn_event = fsnotify_remove_first_event(group);
+                if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
+                        fsnotify_destroy_event(group, fsn_event);
+                else
+                        FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+        }
+        mutex_unlock(&group->notification_mutex);
+        /* Response for all permission events is set, wake up waiters */
        wake_up(&group->fanotify_data.access_waitq);
 #endif
@@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
        spin_lock_init(&group->fanotify_data.access_lock);
        init_waitqueue_head(&group->fanotify_data.access_waitq);
        INIT_LIST_HEAD(&group->fanotify_data.access_list);
-        atomic_set(&group->fanotify_data.bypass_perm, 0);
 #endif
        switch (flags & FAN_ALL_CLASS_BITS) {
        case FAN_CLASS_NOTIF:
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 3e2dd85be5dd..b47f7cfdcaa4 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -40,6 +40,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
 }
 /*
+ * Stop queueing new events for this group. Once this function returns
+ * fsnotify_add_event() will not add any new events to the group's queue.
+ */
+void fsnotify_group_stop_queueing(struct fsnotify_group *group)
+{
+        mutex_lock(&group->notification_mutex);
+        group->shutdown = true; /* tested by fsnotify_add_event() under this same mutex */
+        mutex_unlock(&group->notification_mutex);
+}
+/*
 * Trying to get rid of a group. Remove all marks, flush all events and release
 * the group reference.
 * Note that another thread calling fsnotify_clear_marks_by_group() may still
@@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
 */
 void fsnotify_destroy_group(struct fsnotify_group *group)
 {
+        /*
+         * Stop queueing new events. The code below is careful enough to not
+         * require this but fanotify needs to stop queuing events even before
+         * fsnotify_destroy_group() is called and this makes the other callers
+         * of fsnotify_destroy_group() see the same behavior.
+         */
+        fsnotify_group_stop_queueing(group);
        /* clear all inode marks for this group, attach them to destroy_list */
        fsnotify_detach_group_marks(group);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index a95d8e037aeb..e455e83ceeeb 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
 * Add an event to the group notification queue.  The group can later pull this
 * event off the queue to deal with.  The function returns 0 if the event was
 * added to the queue, 1 if the event was merged with some other queued event,
- * 2 if the queue of events has overflown.
+ * 2 if the event was not queued - either the queue of events has overflown
+ * or the group is shutting down.
 */
 int fsnotify_add_event(struct fsnotify_group *group,
                       struct fsnotify_event *event,
@@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group,
        mutex_lock(&group->notification_mutex);
+        if (group->shutdown) {
+                mutex_unlock(&group->notification_mutex);
+                return 2;
+        }
        if (group->q_len >= group->max_events) {
                ret = 2;
                /* Queue overflow event only if it isn't already queued */
@@ -126,21 +132,6 @@ queue:
 }
 /*
- * Remove @event from group's notification queue. It is the responsibility of
- * the caller to destroy the event.
- */
-void fsnotify_remove_event(struct fsnotify_group *group,
-                           struct fsnotify_event *event)
-{
-        mutex_lock(&group->notification_mutex);
-        if (!list_empty(&event->list)) {
-                list_del_init(&event->list);
-                group->q_len--;
-        }
-        mutex_unlock(&group->notification_mutex);
-}
-/*
 * Remove and return the first event from the notification list.  It is the
 * responsibility of the caller to destroy the obtained event
 */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 7dabbc31060e..f165f867f332 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5922,7 +5922,6 @@ bail:
 }
 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
-                                         handle_t *handle,
                                         struct inode *data_alloc_inode,
                                         struct buffer_head *data_alloc_bh)
 {
@@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
        struct ocfs2_truncate_log *tl;
        struct inode *tl_inode = osb->osb_tl_inode;
        struct buffer_head *tl_bh = osb->osb_tl_bh;
+        handle_t *handle;
        di = (struct ocfs2_dinode *) tl_bh->b_data;
        tl = &di->id2.i_dealloc;
        i = le16_to_cpu(tl->tl_used) - 1;
        while (i >= 0) {
+                handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+                if (IS_ERR(handle)) {
+                        status = PTR_ERR(handle);
+                        mlog_errno(status);
+                        goto bail;
+                }
                /* Caller has given us at least enough credits to
                 * update the truncate log dinode */
                status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
@@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
                        }
                }
-                status = ocfs2_extend_trans(handle,
+                ocfs2_commit_trans(osb, handle);
-                                OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
-                if (status < 0) {
-                        mlog_errno(status);
-                        goto bail;
-                }
                i--;
        }
@@ -5994,7 +5996,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 {
        int status;
        unsigned int num_to_flush;
-        handle_t *handle;
        struct inode *tl_inode = osb->osb_tl_inode;
        struct inode *data_alloc_inode = NULL;
        struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
                goto out_mutex;
        }
-        handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+        status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
-        if (IS_ERR(handle)) {
-                status = PTR_ERR(handle);
-                mlog_errno(status);
-                goto out_unlock;
-        }
-        status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
                                               data_alloc_bh);
        if (status < 0)
                mlog_errno(status);
-        ocfs2_commit_trans(osb, handle);
-out_unlock:
        brelse(data_alloc_bh);
        ocfs2_inode_unlock(data_alloc_inode, 1);
@@ -6413,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
                goto out_mutex;
        }
-        handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out_unlock;
-        }
        while (head) {
                if (head->free_bg)
                        bg_blkno = head->free_bg;
                else
                        bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
                                                              head->free_bit);
+                handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+                if (IS_ERR(handle)) {
+                        ret = PTR_ERR(handle);
+                        mlog_errno(ret);
+                        goto out_unlock;
+                }
                trace_ocfs2_free_cached_blocks(
                     (unsigned long long)head->free_blk, head->free_bit);
                ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
                                               head->free_bit, bg_blkno, 1);
-                if (ret) {
+                if (ret)
                        mlog_errno(ret);
-                        goto out_journal;
-                }
-                ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
+                ocfs2_commit_trans(osb, handle);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto out_journal;
-                }
                tmp = head;
                head = head->free_next;
                kfree(tmp);
        }
-out_journal:
-        ocfs2_commit_trans(osb, handle);
 out_unlock:
        ocfs2_inode_unlock(inode, 1);
        brelse(di_bh);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 94b18369b1cc..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,9 +44,6 @@
 * version here in tcp_internal.h should not need to be bumped for
 * filesystem locking changes.
 *
- * New in version 12
- *      - Negotiate hb timeout when storage is down.
- *
 * New in version 11
 *      - Negotiation of filesystem locking in the dlm join.
 *
@@ -78,7 +75,7 @@
 *      - full 64 bit i_size in the metadata lock lvbs
 *      - introduction of "rw" lock and pushing meta/data locking down
 */
-#define O2NET_PROTOCOL_VERSION 12ULL
+#define O2NET_PROTOCOL_VERSION 11ULL
 struct o2net_handshake {
        __be64  protocol_version;
        __be64  connector_id;
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index cdeafb4e7ed6..0bb128659d4b 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
                                  struct dlm_lock *lock, int flags, int type)
 {
        enum dlm_status status;
-        u8 old_owner = res->owner;
        mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
             lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
        spin_lock(&res->spinlock);
        res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
-        lock->convert_pending = 0;
        /* if it failed, move it back to granted queue.
         * if master returns DLM_NORMAL and then down before sending ast,
         * it may have already been moved to granted queue, reset to
@@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
                if (status != DLM_NOTQUEUED)
                        dlm_error(status);
                dlm_revert_pending_convert(res, lock);
-        } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
+        } else if (!lock->convert_pending) {
-                        (old_owner != res->owner)) {
+                mlog(0, "%s: res %.*s, owner died and lock has been moved back "
-                mlog(0, "res %.*s is in recovering or has been recovered.\n",
+                                "to granted list, retry convert.\n",
-                                res->lockname.len, res->lockname.name);
+                                dlm->name, res->lockname.len, res->lockname.name);
                status = DLM_RECOVERING;
        }
+        lock->convert_pending = 0;
 bail:
        spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4e7b0dc22450..0b055bfb8e86 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
                                       u64 start, u64 len)
 {
        int ret = 0;
-        u64 tmpend, end = start + len;
+        u64 tmpend = 0;
+        u64 end = start + len;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        unsigned int csize = osb->s_clustersize;
        handle_t *handle;
@@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
        }
        /*
-         * We want to get the byte offset of the end of the 1st cluster.
+         * If start is on a cluster boundary and end is somewhere in another
+         * cluster, we have not COWed the cluster starting at start, unless
+         * end is also within the same cluster. So, in this case, we skip this
+         * first call to ocfs2_zero_range_for_truncate() and move on
+         * to the next one.
         */
-        tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
+        if ((start & (csize - 1)) != 0) {
-        if (tmpend > end)
+                /*
-                tmpend = end;
+                 * We want to get the byte offset of the end of the 1st
+                 * cluster.
+                 */
+                tmpend = (u64)osb->s_clustersize +
+                        (start & ~(osb->s_clustersize - 1));
+                if (tmpend > end)
+                        tmpend = end;
-        trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
+                trace_ocfs2_zero_partial_clusters_range1(
-                                                 (unsigned long long)tmpend);
+                        (unsigned long long)start,
+                        (unsigned long long)tmpend);
-        ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
+                ret = ocfs2_zero_range_for_truncate(inode, handle, start,
-        if (ret)
+                                                    tmpend);
-                mlog_errno(ret);
+                if (ret)
+                        mlog_errno(ret);
+        }
        if (tmpend < end) {
                /*
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index ea47120a85ff..6ad3533940ba 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1199,14 +1199,24 @@ retry:
                        inode_unlock((*ac)->ac_inode);
                        ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
-                        if (ret == 1)
+                        if (ret == 1) {
+                                iput((*ac)->ac_inode);
+                                (*ac)->ac_inode = NULL;
                                goto retry;
+                        }
                        if (ret < 0)
                                mlog_errno(ret);
                        inode_lock((*ac)->ac_inode);
-                        ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+                        ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+                        if (ret < 0) {
+                                mlog_errno(ret);
+                                inode_unlock((*ac)->ac_inode);
+                                iput((*ac)->ac_inode);
+                                (*ac)->ac_inode = NULL;
+                                goto bail;
+                        }
                }
                if (status < 0) {
                        if (status != -ENOSPC)
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 54e5d6681786..43fdc2765aea 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -80,6 +80,8 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
        }
        for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
+                if (ovl_is_private_xattr(name))
+                        continue;
 retry:
                size = vfs_getxattr(old, name, value, value_size);
                if (size == -ERANGE)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 12bcd07b9e32..1560fdc09a5f 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -12,6 +12,8 @@
 #include <linux/xattr.h>
 #include <linux/security.h>
 #include <linux/cred.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
 #include "overlayfs.h"
 void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
@@ -186,6 +188,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
        struct dentry *newdentry;
        int err;
+        if (!hardlink && !IS_POSIXACL(udir))
+                stat->mode &= ~current_umask();
        inode_lock_nested(udir, I_MUTEX_PARENT);
        newdentry = lookup_one_len(dentry->d_name.name, upperdir,
                                   dentry->d_name.len);
@@ -335,6 +340,32 @@ out_free:
        return ret;
 }
+static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
+                             const struct posix_acl *acl)
+{ /* Convert @acl to xattr form and set it as xattr @name on @upperdentry. */
+        void *buffer;
+        size_t size;
+        int err;
+        if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) /* ACLs compiled out or no ACL given */
+                return 0; /* nothing to store counts as success */
+        size = posix_acl_to_xattr(NULL, acl, NULL, 0); /* no buffer passed; result sizes the allocation below */
+        buffer = kmalloc(size, GFP_KERNEL);
+        if (!buffer)
+                return -ENOMEM;
+        size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); /* second call fills buffer */
+        err = size; /* copy into an int so a negative result can be detected */
+        if (err < 0)
+                goto out_free;
+        err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE);
+out_free:
+        kfree(buffer);
+        return err; /* conversion error or vfs_setxattr() result */
+}
 static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
                                    struct kstat *stat, const char *link,
                                    struct dentry *hardlink)
@@ -346,10 +377,18 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
        struct dentry *upper;
        struct dentry *newdentry;
        int err;
+        struct posix_acl *acl, *default_acl;
        if (WARN_ON(!workdir))
                return -EROFS;
+        if (!hardlink) {
+                err = posix_acl_create(dentry->d_parent->d_inode,
+                                       &stat->mode, &default_acl, &acl);
+                if (err)
+                        return err;
+        }
        err = ovl_lock_rename_workdir(workdir, upperdir);
        if (err)
                goto out;
@@ -384,6 +423,17 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
                if (err)
                        goto out_cleanup;
        }
+        if (!hardlink) {
+                err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS,
+                                        acl);
+                if (err)
+                        goto out_cleanup;
+                err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT,
+                                        default_acl);
+                if (err)
+                        goto out_cleanup;
+        }
        if (!hardlink && S_ISDIR(stat->mode)) {
                err = ovl_set_opaque(newdentry);
@@ -410,6 +460,10 @@ out_dput:
 out_unlock:
        unlock_rename(workdir, upperdir);
 out:
+        if (!hardlink) {
+                posix_acl_release(acl);
+                posix_acl_release(default_acl);
+        }
        return err;
 out_cleanup:
@@ -950,9 +1004,9 @@ const struct inode_operations ovl_dir_inode_operations = {
        .permission     = ovl_permission,
        .getattr        = ovl_dir_getattr,
        .setxattr       = generic_setxattr,
-        .getxattr       = ovl_getxattr,
+        .getxattr       = generic_getxattr,
        .listxattr      = ovl_listxattr,
-        .removexattr    = ovl_removexattr,
+        .removexattr    = generic_removexattr,
        .get_acl        = ovl_get_acl,
        .update_time    = ovl_update_time,
 };
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 1b885c156028..c75625c1efa3 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -10,6 +10,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/xattr.h>
+#include <linux/posix_acl.h>
 #include "overlayfs.h"
 static int ovl_copy_up_truncate(struct dentry *dentry)
@@ -191,32 +192,44 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
        return err;
 }
-static bool ovl_is_private_xattr(const char *name)
+bool ovl_is_private_xattr(const char *name)
 {
-#define OVL_XATTR_PRE_NAME OVL_XATTR_PREFIX "."
+        return strncmp(name, OVL_XATTR_PREFIX,
-        return strncmp(name, OVL_XATTR_PRE_NAME,
+                       sizeof(OVL_XATTR_PREFIX) - 1) == 0;
-                       sizeof(OVL_XATTR_PRE_NAME) - 1) == 0;
 }
-int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value,
-                 const char *name, const void *value,
+                  size_t size, int flags)
-                 size_t size, int flags)
 {
        int err;
-        struct dentry *upperdentry;
+        struct path realpath;
+        enum ovl_path_type type = ovl_path_real(dentry, &realpath);
        const struct cred *old_cred;
        err = ovl_want_write(dentry);
        if (err)
                goto out;
+        if (!value && !OVL_TYPE_UPPER(type)) {
+                err = vfs_getxattr(realpath.dentry, name, NULL, 0);
+                if (err < 0)
+                        goto out_drop_write;
+        }
        err = ovl_copy_up(dentry);
        if (err)
                goto out_drop_write;
-        upperdentry = ovl_dentry_upper(dentry);
+        if (!OVL_TYPE_UPPER(type))
+                ovl_path_upper(dentry, &realpath);
        old_cred = ovl_override_creds(dentry->d_sb);
-        err = vfs_setxattr(upperdentry, name, value, size, flags);
+        if (value)
+                err = vfs_setxattr(realpath.dentry, name, value, size, flags);
+        else {
+                WARN_ON(flags != XATTR_REPLACE);
+                err = vfs_removexattr(realpath.dentry, name);
+        }
        revert_creds(old_cred);
 out_drop_write:
@@ -225,16 +238,13 @@ out:
        return err;
 }
-ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
+int ovl_xattr_get(struct dentry *dentry, const char *name,
-                     const char *name, void *value, size_t size)
+                  void *value, size_t size)
 {
        struct dentry *realdentry = ovl_dentry_real(dentry);
        ssize_t res;
        const struct cred *old_cred;
-        if (ovl_is_private_xattr(name))
-                return -ENODATA;
        old_cred = ovl_override_creds(dentry->d_sb);
        res = vfs_getxattr(realdentry, name, value, size);
        revert_creds(old_cred);
@@ -245,7 +255,8 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
 {
        struct dentry *realdentry = ovl_dentry_real(dentry);
        ssize_t res;
-        int off;
+        size_t len;
+        char *s;
        const struct cred *old_cred;
        old_cred = ovl_override_creds(dentry->d_sb);
@@ -255,73 +266,39 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
                return res;
        /* filter out private xattrs */
-        for (off = 0; off < res;) {
+        for (s = list, len = res; len;) {
-                char *s = list + off;
+                size_t slen = strnlen(s, len) + 1;
-                size_t slen = strlen(s) + 1;
-                BUG_ON(off + slen > res);
+                /* underlying fs providing us with a broken xattr list? */
+                if (WARN_ON(slen > len))
+                        return -EIO;
+                len -= slen;
                if (ovl_is_private_xattr(s)) {
                        res -= slen;
-                        memmove(s, s + slen, res - off);
+                        memmove(s, s + slen, len);
                } else {
-                        off += slen;
+                        s += slen;
                }
        }
        return res;
 }
-int ovl_removexattr(struct dentry *dentry, const char *name)
-{
-        int err;
-        struct path realpath;
-        enum ovl_path_type type = ovl_path_real(dentry, &realpath);
-        const struct cred *old_cred;
-        err = ovl_want_write(dentry);
-        if (err)
-                goto out;
-        err = -ENODATA;
-        if (ovl_is_private_xattr(name))
-                goto out_drop_write;
-        if (!OVL_TYPE_UPPER(type)) {
-                err = vfs_getxattr(realpath.dentry, name, NULL, 0);
-                if (err < 0)
-                        goto out_drop_write;
-                err = ovl_copy_up(dentry);
-                if (err)
-                        goto out_drop_write;
-                ovl_path_upper(dentry, &realpath);
-        }
-        old_cred = ovl_override_creds(dentry->d_sb);
-        err = vfs_removexattr(realpath.dentry, name);
-        revert_creds(old_cred);
-out_drop_write:
-        ovl_drop_write(dentry);
-out:
-        return err;
-}
 struct posix_acl *ovl_get_acl(struct inode *inode, int type)
 {
        struct inode *realinode = ovl_inode_real(inode, NULL);
        const struct cred *old_cred;
        struct posix_acl *acl;
-        if (!IS_POSIXACL(realinode))
+        if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
                return NULL;
        if (!realinode->i_op->get_acl)
                return NULL;
        old_cred = ovl_override_creds(inode->i_sb);
-        acl = realinode->i_op->get_acl(realinode, type);
+        acl = get_acl(realinode, type);
        revert_creds(old_cred);
        return acl;
@@ -391,9 +368,9 @@ static const struct inode_operations ovl_file_inode_operations = {
        .permission     = ovl_permission,
        .getattr        = ovl_getattr,
        .setxattr       = generic_setxattr,
-        .getxattr       = ovl_getxattr,
+        .getxattr       = generic_getxattr,
        .listxattr      = ovl_listxattr,
-        .removexattr    = ovl_removexattr,
+        .removexattr    = generic_removexattr,
        .get_acl        = ovl_get_acl,
        .update_time    = ovl_update_time,
 };
@@ -404,9 +381,9 @@ static const struct inode_operations ovl_symlink_inode_operations = {
        .readlink       = ovl_readlink,
        .getattr        = ovl_getattr,
        .setxattr       = generic_setxattr,
-        .getxattr       = ovl_getxattr,
+        .getxattr       = generic_getxattr,
        .listxattr      = ovl_listxattr,
-        .removexattr    = ovl_removexattr,
+        .removexattr    = generic_removexattr,
        .update_time    = ovl_update_time,
 };
@@ -415,6 +392,9 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
        inode->i_ino = get_next_ino();
        inode->i_mode = mode;
        inode->i_flags |= S_NOCMTIME;
+#ifdef CONFIG_FS_POSIX_ACL
+        inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
+#endif
        mode &= S_IFMT;
        switch (mode) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e4f5c9536bfe..5813ccff8cd9 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -24,8 +24,8 @@ enum ovl_path_type {
        (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
-#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay"
+#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
-#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX ".opaque"
+#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
 #define OVL_ISUPPER_MASK 1UL
@@ -179,20 +179,21 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
 void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
 void ovl_cache_free(struct list_head *list);
 int ovl_check_d_type_supported(struct path *realpath);
+void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
+                         struct dentry *dentry, int level);
 /* inode.c */
 int ovl_setattr(struct dentry *dentry, struct iattr *attr);
 int ovl_permission(struct inode *inode, int mask);
-int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value,
-                 const char *name, const void *value,
+                  size_t size, int flags);
-                 size_t size, int flags);
+int ovl_xattr_get(struct dentry *dentry, const char *name,
-ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
+                  void *value, size_t size);
-                     const char *name, void *value, size_t size);
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
-int ovl_removexattr(struct dentry *dentry, const char *name);
 struct posix_acl *ovl_get_acl(struct inode *inode, int type);
 int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
 int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
+bool ovl_is_private_xattr(const char *name);
 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
 struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index cf37fc76fc9f..f241b4ee3d8a 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -248,7 +248,7 @@ static inline int ovl_dir_read(struct path *realpath,
                        err = rdd->err;
        } while (!err && rdd->count);
-        if (!err && rdd->first_maybe_whiteout)
+        if (!err && rdd->first_maybe_whiteout && rdd->dentry)
                err = ovl_check_whiteouts(realpath->dentry, rdd);
        fput(realfile);
@@ -606,3 +606,64 @@ int ovl_check_d_type_supported(struct path *realpath)
        return rdd.d_type_supported;
 }
+static void ovl_workdir_cleanup_recurse(struct path *path, int level) /* read the directory at @path and run ovl_workdir_cleanup() on every entry */
+{
+        int err;
+        struct inode *dir = path->dentry->d_inode;
+        LIST_HEAD(list);
+        struct ovl_cache_entry *p;
+        struct ovl_readdir_data rdd = {
+                .ctx.actor = ovl_fill_merge,
+                .dentry = NULL, /* with no dentry set, ovl_dir_read() skips ovl_check_whiteouts() */
+                .list = &list, /* entries read from the directory are collected here */
+                .root = RB_ROOT,
+                .is_lowest = false,
+        };
+        err = ovl_dir_read(path, &rdd);
+        if (err)
+                goto out; /* still frees whatever was put on the list */
+        inode_lock_nested(dir, I_MUTEX_PARENT); /* held across the loop; ovl_workdir_cleanup() drops and retakes it if it recurses */
+        list_for_each_entry(p, &list, l_node) {
+                struct dentry *dentry;
+                if (p->name[0] == '.') { /* skip the "." and ".." entries */
+                        if (p->len == 1)
+                                continue;
+                        if (p->len == 2 && p->name[1] == '.')
+                                continue;
+                }
+                dentry = lookup_one_len(p->name, path->dentry, p->len);
+                if (IS_ERR(dentry))
+                        continue; /* lookup failed: skip this entry and carry on with the rest */
+                if (dentry->d_inode) /* negative dentry: nothing to clean up */
+                        ovl_workdir_cleanup(dir, path->mnt, dentry, level);
+                dput(dentry);
+        }
+        inode_unlock(dir);
+out:
+        ovl_cache_free(&list); /* reached on both the success and the error path */
+}
+void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
+                         struct dentry *dentry, int level) /* remove @dentry from @dir; @level is the current recursion depth */
+{
+        int err;
+        if (!d_is_dir(dentry) || level > 1) { /* non-directories, and anything past the depth limit, get a plain ovl_cleanup() */
+                ovl_cleanup(dir, dentry);
+                return;
+        }
+        err = ovl_do_rmdir(dir, dentry); /* try removing the directory directly first */
+        if (err) { /* rmdir failed (presumably directory not empty - TODO confirm): clean out its children */
+                struct path path = { .mnt = mnt, .dentry = dentry };
+                inode_unlock(dir); /* drop @dir's lock (assumed held by the caller) around the recursion, which locks @dentry's inode itself */
+                ovl_workdir_cleanup_recurse(&path, level + 1);
+                inode_lock_nested(dir, I_MUTEX_PARENT);
+                ovl_cleanup(dir, dentry); /* second attempt, after the recursive cleanup */
+        }
+}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 4036132842b5..e2a94a26767b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -814,6 +814,10 @@ retry:
                struct kstat stat = {
                        .mode = S_IFDIR | 0,
                };
+                struct iattr attr = {
+                        .ia_valid = ATTR_MODE,
+                        .ia_mode = stat.mode,
+                };
                if (work->d_inode) {
                        err = -EEXIST;
@@ -821,7 +825,7 @@ retry:
                                goto out_dput;
                        retried = true;
-                        ovl_cleanup(dir, work);
+                        ovl_workdir_cleanup(dir, mnt, work, 0);
                        dput(work);
                        goto retry;
                }
@@ -829,6 +833,21 @@ retry:
                err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
                if (err)
                        goto out_dput;
+                err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
+                if (err && err != -ENODATA && err != -EOPNOTSUPP)
+                        goto out_dput;
+                err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS);
+                if (err && err != -ENODATA && err != -EOPNOTSUPP)
+                        goto out_dput;
+                /* Clear any inherited mode bits */
+                inode_lock(work->d_inode);
+                err = notify_change(work, &attr, NULL);
+                inode_unlock(work->d_inode);
+                if (err)
+                        goto out_dput;
        }
 out_unlock:
        inode_unlock(dir);
@@ -967,10 +986,19 @@ static unsigned int ovl_split_lowerdirs(char *str)
        return ctr;
 }
-static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+static int __maybe_unused
-                                   struct dentry *dentry, struct inode *inode,
+ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
-                                   const char *name, const void *value,
+                        struct dentry *dentry, struct inode *inode,
-                                   size_t size, int flags)
+                        const char *name, void *buffer, size_t size)
+{
+        return ovl_xattr_get(dentry, handler->name, buffer, size);
+}
+static int __maybe_unused
+ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+                        struct dentry *dentry, struct inode *inode,
+                        const char *name, const void *value,
+                        size_t size, int flags)
 {
        struct dentry *workdir = ovl_workdir(dentry);
        struct inode *realinode = ovl_inode_real(inode, NULL);
@@ -998,19 +1026,22 @@ static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
        posix_acl_release(acl);
-        return ovl_setxattr(dentry, inode, handler->name, value, size, flags);
+        err = ovl_xattr_set(dentry, handler->name, value, size, flags);
+        if (!err)
+                ovl_copyattr(ovl_inode_real(inode, NULL), inode);
+        return err;
 out_acl_release:
        posix_acl_release(acl);
        return err;
 }
-static int ovl_other_xattr_set(const struct xattr_handler *handler,
+static int ovl_own_xattr_get(const struct xattr_handler *handler,
-                               struct dentry *dentry, struct inode *inode,
+                             struct dentry *dentry, struct inode *inode,
-                               const char *name, const void *value,
+                             const char *name, void *buffer, size_t size)
-                               size_t size, int flags)
 {
-        return ovl_setxattr(dentry, inode, name, value, size, flags);
+        return -EPERM;
 }
 static int ovl_own_xattr_set(const struct xattr_handler *handler,
@@ -1021,42 +1052,59 @@ static int ovl_own_xattr_set(const struct xattr_handler *handler,
        return -EPERM;
 }
-static const struct xattr_handler ovl_posix_acl_access_xattr_handler = {
+static int ovl_other_xattr_get(const struct xattr_handler *handler, /* ->get callback of the catch-all ("" prefix) xattr handler */
+                               struct dentry *dentry, struct inode *inode,
+                               const char *name, void *buffer, size_t size)
+{
+        return ovl_xattr_get(dentry, name, buffer, size); /* @handler and @inode are unused; the rest is forwarded as-is */
+}
+static int ovl_other_xattr_set(const struct xattr_handler *handler, /* ->set callback of the catch-all ("" prefix) xattr handler */
+                               struct dentry *dentry, struct inode *inode,
+                               const char *name, const void *value,
+                               size_t size, int flags)
+{
+        return ovl_xattr_set(dentry, name, value, size, flags); /* @handler and @inode are unused; the rest is forwarded as-is */
+}
+static const struct xattr_handler __maybe_unused
+ovl_posix_acl_access_xattr_handler = {
        .name = XATTR_NAME_POSIX_ACL_ACCESS,
        .flags = ACL_TYPE_ACCESS,
+        .get = ovl_posix_acl_xattr_get,
        .set = ovl_posix_acl_xattr_set,
 };
-static const struct xattr_handler ovl_posix_acl_default_xattr_handler = {
+static const struct xattr_handler __maybe_unused
+ovl_posix_acl_default_xattr_handler = {
        .name = XATTR_NAME_POSIX_ACL_DEFAULT,
        .flags = ACL_TYPE_DEFAULT,
+        .get = ovl_posix_acl_xattr_get,
        .set = ovl_posix_acl_xattr_set,
 };
 static const struct xattr_handler ovl_own_xattr_handler = {
        .prefix = OVL_XATTR_PREFIX,
+        .get = ovl_own_xattr_get,
        .set = ovl_own_xattr_set,
 };
 static const struct xattr_handler ovl_other_xattr_handler = {
        .prefix = "", /* catch all */
+        .get = ovl_other_xattr_get,
        .set = ovl_other_xattr_set,
 };
 static const struct xattr_handler *ovl_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
        &ovl_posix_acl_access_xattr_handler,
        &ovl_posix_acl_default_xattr_handler,
+#endif
        &ovl_own_xattr_handler,
        &ovl_other_xattr_handler,
        NULL
 };
-static const struct xattr_handler *ovl_xattr_noacl_handlers[] = {
-        &ovl_own_xattr_handler,
-        &ovl_other_xattr_handler,
-        NULL,
-};
 static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct path upperpath = { NULL, NULL };
@@ -1132,7 +1180,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        err = -EINVAL;
        stacklen = ovl_split_lowerdirs(lowertmp);
        if (stacklen > OVL_MAX_STACK) {
-                pr_err("overlayfs: too many lower directries, limit is %d\n",
+                pr_err("overlayfs: too many lower directories, limit is %d\n",
                       OVL_MAX_STACK);
                goto out_free_lowertmp;
        } else if (!ufs->config.upperdir && stacklen == 1) {
@@ -1269,10 +1317,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_magic = OVERLAYFS_SUPER_MAGIC;
        sb->s_op = &ovl_super_operations;
-        if (IS_ENABLED(CONFIG_FS_POSIX_ACL))
+        sb->s_xattr = ovl_xattr_handlers;
-                sb->s_xattr = ovl_xattr_handlers;
-        else
-                sb->s_xattr = ovl_xattr_noacl_handlers;
        sb->s_root = root_dentry;
        sb->s_fs_info = ufs;
        sb->s_flags |= MS_POSIXACL;
diff --git a/fs/pipe.c b/fs/pipe.c
index 4b32928f5426..4ebe6b2e5217 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -144,10 +144,8 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
        struct page *page = buf->page;
        if (page_count(page) == 1) {
-                if (memcg_kmem_enabled()) {
+                if (memcg_kmem_enabled())
                        memcg_kmem_uncharge(page, 0);
-                        __ClearPageKmemcg(page);
-                }
                __SetPageLocked(page);
                return 0;
        }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 54e270262979..ac0df4dde823 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1556,18 +1556,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
 static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
        struct task_struct *task;
-        struct mm_struct *mm;
        struct file *exe_file;
        task = get_proc_task(d_inode(dentry));
        if (!task)
                return -ENOENT;
-        mm = get_task_mm(task);
+        exe_file = get_task_exe_file(task);
        put_task_struct(task);
-        if (!mm)
-                return -ENOENT;
-        exe_file = get_mm_exe_file(mm);
-        mmput(mm);
        if (exe_file) {
                *exe_path = exe_file->f_path;
                path_get(&exe_file->f_path);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a939f5ed7f89..5c89a07e3d7f 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 static ssize_t
 read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 {
+        char *buf = file->private_data;
        ssize_t acc = 0;
        size_t size, tsz;
        size_t elf_buflen;
@@ -500,23 +501,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
                        if (clear_user(buffer, tsz))
                                return -EFAULT;
                } else if (is_vmalloc_or_module_addr((void *)start)) {
-                        char * elf_buf;
+                        vread(buf, (char *)start, tsz);
-                        elf_buf = kzalloc(tsz, GFP_KERNEL);
-                        if (!elf_buf)
-                                return -ENOMEM;
-                        vread(elf_buf, (char *)start, tsz);
                        /* we have to zero-fill user buffer even if no read */
-                        if (copy_to_user(buffer, elf_buf, tsz)) {
+                        if (copy_to_user(buffer, buf, tsz))
-                                kfree(elf_buf);
                                return -EFAULT;
-                        }
-                        kfree(elf_buf);
                } else {
                        if (kern_addr_valid(start)) {
                                unsigned long n;
-                                n = copy_to_user(buffer, (char *)start, tsz);
+                                /*
+                                 * Using bounce buffer to bypass the
+                                 * hardened user copy kernel text checks.
+                                 */
+                                memcpy(buf, (char *) start, tsz);
+                                n = copy_to_user(buffer, buf, tsz);
                                /*
                                 * We cannot distinguish between fault on source
                                 * and fault on destination. When this happens
@@ -549,6 +547,11 @@ static int open_kcore(struct inode *inode, struct file *filp)
 {
        if (!capable(CAP_SYS_RAWIO))
                return -EPERM;
+        filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+        if (!filp->private_data)
+                return -ENOMEM;
        if (kcore_need_update)
                kcore_update_ram();
        if (i_size_read(inode) != proc_root_kcore->size) {
@@ -559,10 +562,16 @@ static int open_kcore(struct inode *inode, struct file *filp)
        return 0;
 }
+static int release_kcore(struct inode *inode, struct file *file) /* ->release for /proc/kcore; @inode is unused */
+{
+        kfree(file->private_data); /* frees the buffer that open_kcore() stored in ->private_data */
+        return 0;
+}
 static const struct file_operations proc_kcore_operations = {
        .read           = read_kcore,
        .open           = open_kcore,
+        .release        = release_kcore,
        .llseek         = default_llseek,
 };
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 09e18fdf61e5..b9a8c813e5e6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -46,7 +46,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                cached = 0;
        for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
-                pages[lru] = global_page_state(NR_LRU_BASE + lru);
+                pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
        available = si_mem_available();
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187d84ef9de9..f6fa99eca515 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
                mss->anonymous_thp += HPAGE_PMD_SIZE;
        else if (PageSwapBacked(page))
                mss->shmem_thp += HPAGE_PMD_SIZE;
+        else if (is_zone_device_page(page))
+                /* pass */;
        else
                VM_BUG_ON_PAGE(1, page);
        smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 183a212694bf..12af0490322f 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -27,9 +27,17 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/ramfs.h>
+#include <linux/sched.h>
 #include "internal.h"
+static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, /* ->get_unmapped_area for ramfs_file_operations */
+                unsigned long addr, unsigned long len, unsigned long pgoff,
+                unsigned long flags)
+{
+        return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); /* delegates to the current task's mm hook with the arguments unchanged */
+}
 const struct file_operations ramfs_file_operations = {
        .read_iter      = generic_file_read_iter,
        .write_iter     = generic_file_write_iter,
@@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = {
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .llseek         = generic_file_llseek,
+        .get_unmapped_area      = ramfs_mmu_get_unmapped_area,
 };
 const struct inode_operations ramfs_file_inode_operations = {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 19f532e7d35e..6dc4296eed62 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -223,8 +223,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
                size -= n;
                buf += n;
                copied += n;
-                if (!m->count)
+                if (!m->count) {
+                        m->from = 0;
                        m->index++;
+                }
                if (!size)
                        goto Done;
        }
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f35523d4fa3a..b803213d1307 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -114,9 +114,15 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
         * If buf != of->prealloc_buf, we don't know how
         * large it is, so cannot safely pass it to ->show
         */
-        if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
+        if (WARN_ON_ONCE(buf != of->prealloc_buf))
                return 0;
        len = ops->show(kobj, of->kn->priv, buf);
+        if (pos) {
+                if (len <= pos)
+                        return 0;
+                len -= pos;
+                memmove(buf, buf + pos, len);
+        }
        return min(count, len);
 }
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index b45345d701e7..51157da3f76e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -370,7 +370,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
        p = c->gap_lebs;
        do {
-                ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs);
+                ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs);
                written = layout_leb_in_gaps(c, p);
                if (written < 0) {
                        err = written;
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e237811f09ce..11a004114eba 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -575,7 +575,8 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
        dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name,
                inode->i_ino, dentry, size);
-        return  __ubifs_getxattr(inode, name, buffer, size);
+        name = xattr_full_name(handler, name);
+        return __ubifs_getxattr(inode, name, buffer, size);
 }
 static int ubifs_xattr_set(const struct xattr_handler *handler,
@@ -586,6 +587,8 @@ static int ubifs_xattr_set(const struct xattr_handler *handler,
        dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
                name, inode->i_ino, dentry, size);
+        name = xattr_full_name(handler, name);
        if (value)
                return __ubifs_setxattr(inode, name, value, size, flags);
        else
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 776ae2f325d1..05b5243d89f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1582,6 +1582,7 @@ xfs_alloc_ag_vextent_small(
        xfs_extlen_t    *flenp, /* result length */
        int             *stat)  /* status: 0-freelist, 1-normal/none */
 {
+        struct xfs_owner_info   oinfo;
        int             error;
        xfs_agblock_t   fbno;
        xfs_extlen_t    flen;
@@ -1624,6 +1625,18 @@ xfs_alloc_ag_vextent_small(
                                error0);
                        args->wasfromfl = 1;
                        trace_xfs_alloc_small_freelist(args);
+                        /*
+                         * If we're feeding an AGFL block to something that
+                         * doesn't live in the free space, we need to clear
+                         * out the OWN_AG rmap.
+                         */
+                        xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+                        error = xfs_rmap_free(args->tp, args->agbp, args->agno,
+                                        fbno, 1, &oinfo);
+                        if (error)
+                                goto error0;
                        *stat = 0;
                        return 0;
                }
@@ -2264,6 +2277,9 @@ xfs_alloc_log_agf(
                offsetof(xfs_agf_t, agf_longest),
                offsetof(xfs_agf_t, agf_btreeblks),
                offsetof(xfs_agf_t, agf_uuid),
+                offsetof(xfs_agf_t, agf_rmap_blocks),
+                /* needed so that we don't log the whole rest of the structure: */
+                offsetof(xfs_agf_t, agf_spare64),
                sizeof(xfs_agf_t)
        };
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index b5c213a051cd..08569792fe20 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1814,6 +1814,10 @@ xfs_btree_lookup(
        XFS_BTREE_STATS_INC(cur, lookup);
+        /* No such thing as a zero-level tree. */
+        if (cur->bc_nlevels == 0)
+                return -EFSCORRUPTED;
        block = NULL;
        keyno = 0;
@@ -4554,15 +4558,22 @@ xfs_btree_simple_query_range(
        if (error)
                goto out;
+        /* Nothing?  See if there's anything to the right. */
+        if (!stat) {
+                error = xfs_btree_increment(cur, 0, &stat);
+                if (error)
+                        goto out;
+        }
        while (stat) {
                /* Find the record. */
                error = xfs_btree_get_rec(cur, &recp, &stat);
                if (error || !stat)
                        break;
-                cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
                /* Skip if high_key(rec) < low_key. */
                if (firstrec) {
+                        cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
                        firstrec = false;
                        diff = cur->bc_ops->diff_two_keys(cur, low_key,
                                        &rec_key);
@@ -4571,6 +4582,7 @@ xfs_btree_simple_query_range(
                }
                /* Stop if high_key < low_key(rec). */
+                cur->bc_ops->init_key_from_rec(&rec_key, recp);
                diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
                if (diff > 0)
                        break;
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 054a2032fdb3..c221d0ecd52e 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -194,7 +194,7 @@ xfs_defer_trans_abort(
        /* Abort intent items. */
        list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
                trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
-                if (dfp->dfp_committed)
+                if (!dfp->dfp_done)
                        dfp->dfp_type->abort_intent(dfp->dfp_intent);
        }
@@ -290,7 +290,6 @@ xfs_defer_finish(
        struct xfs_defer_pending        *dfp;
        struct list_head                *li;
        struct list_head                *n;
-        void                            *done_item = NULL;
        void                            *state;
        int                             error = 0;
        void                            (*cleanup_fn)(struct xfs_trans *, void *, int);
@@ -309,19 +308,11 @@ xfs_defer_finish(
                if (error)
                        goto out;
-                /* Mark all pending intents as committed. */
-                list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) {
-                        if (dfp->dfp_committed)
-                                break;
-                        trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp);
-                        dfp->dfp_committed = true;
-                }
                /* Log an intent-done item for the first pending item. */
                dfp = list_first_entry(&dop->dop_pending,
                                struct xfs_defer_pending, dfp_list);
                trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
-                done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
+                dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
                                dfp->dfp_count);
                cleanup_fn = dfp->dfp_type->finish_cleanup;
@@ -331,7 +322,7 @@ xfs_defer_finish(
                        list_del(li);
                        dfp->dfp_count--;
                        error = dfp->dfp_type->finish_item(*tp, dop, li,
-                                        done_item, &state);
+                                        dfp->dfp_done, &state);
                        if (error) {
                                /*
                                 * Clean up after ourselves and jump out.
@@ -428,8 +419,8 @@ xfs_defer_add(
                dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
                                KM_SLEEP | KM_NOFS);
                dfp->dfp_type = defer_op_types[type];
-                dfp->dfp_committed = false;
                dfp->dfp_intent = NULL;
+                dfp->dfp_done = NULL;
                dfp->dfp_count = 0;
                INIT_LIST_HEAD(&dfp->dfp_work);
                list_add_tail(&dfp->dfp_list, &dop->dop_intake);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index cc3981c48296..e96533d178cf 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -30,8 +30,8 @@ struct xfs_defer_op_type;
 struct xfs_defer_pending {
        const struct xfs_defer_op_type  *dfp_type;      /* function pointers */
        struct list_head                dfp_list;       /* pending items */
-        bool                            dfp_committed;  /* committed trans? */
        void                            *dfp_intent;    /* log intent item */
+        void                            *dfp_done;      /* log done item */
        struct list_head                dfp_work;       /* work items */
        unsigned int                    dfp_count;      /* # extent items */
 };
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index f814d42c73b2..270fb5cf4fa1 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -640,12 +640,15 @@ typedef struct xfs_agf {
        __be32          agf_btreeblks;  /* # of blocks held in AGF btrees */
        uuid_t          agf_uuid;       /* uuid of filesystem */
+        __be32          agf_rmap_blocks;        /* rmapbt blocks used */
+        __be32          agf_padding;            /* padding */
        /*
         * reserve some contiguous space for future logged fields before we add
         * the unlogged fields. This makes the range logging via flags and
         * structure offsets much simpler.
         */
-        __be64          agf_spare64[16];
+        __be64          agf_spare64[15];
        /* unlogged fields, written during buffer writeback. */
        __be64          agf_lsn;        /* last write sequence */
@@ -670,7 +673,9 @@ typedef struct xfs_agf {
 #define XFS_AGF_LONGEST         0x00000400
 #define XFS_AGF_BTREEBLKS       0x00000800
 #define XFS_AGF_UUID            0x00001000
-#define XFS_AGF_NUM_BITS        13
+#define XFS_AGF_RMAP_BLOCKS     0x00002000
+#define XFS_AGF_SPARE64         0x00004000
+#define XFS_AGF_NUM_BITS        15
 #define XFS_AGF_ALL_BITS        ((1 << XFS_AGF_NUM_BITS) - 1)
 #define XFS_AGF_FLAGS \
@@ -686,7 +691,9 @@ typedef struct xfs_agf {
        { XFS_AGF_FREEBLKS,     "FREEBLKS" }, \
        { XFS_AGF_LONGEST,      "LONGEST" }, \
        { XFS_AGF_BTREEBLKS,    "BTREEBLKS" }, \
-        { XFS_AGF_UUID,         "UUID" }
+        { XFS_AGF_UUID,         "UUID" }, \
+        { XFS_AGF_RMAP_BLOCKS,  "RMAP_BLOCKS" }, \
+        { XFS_AGF_SPARE64,      "SPARE64" }
 /* disk block (xfs_daddr_t) in the AG */
 #define XFS_AGF_DADDR(mp)       ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index bc1faebc84ec..17b8eeb34ac8 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -98,6 +98,8 @@ xfs_rmapbt_alloc_block(
        union xfs_btree_ptr     *new,
        int                     *stat)
 {
+        struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+        struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
        int                     error;
        xfs_agblock_t           bno;
@@ -124,6 +126,8 @@ xfs_rmapbt_alloc_block(
        xfs_trans_agbtree_delta(cur->bc_tp, 1);
        new->s = cpu_to_be32(bno);
+        be32_add_cpu(&agf->agf_rmap_blocks, 1);
+        xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
        *stat = 1;
@@ -143,6 +147,8 @@ xfs_rmapbt_free_block(
        bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
        trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
                        bno, 1);
+        be32_add_cpu(&agf->agf_rmap_blocks, -1);
+        xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
        error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
        if (error)
                return error;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 0e3d4f5ec33c..4aecc5fefe96 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -583,7 +583,8 @@ xfs_sb_verify(
         * Only check the in progress field for the primary superblock as
         * mkfs.xfs doesn't clear it from secondary superblocks.
         */
-        return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+        return xfs_mount_validate_sb(mp, &sb,
+                                     bp->b_maps[0].bm_bn == XFS_SB_DADDR,
                                     check_version);
 }
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 47a318ce82e0..b5b9bffe3520 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -115,7 +115,6 @@ xfs_buf_ioacct_dec(
        if (!(bp->b_flags & _XBF_IN_FLIGHT))
                return;
-        ASSERT(bp->b_flags & XBF_ASYNC);
        bp->b_flags &= ~_XBF_IN_FLIGHT;
        percpu_counter_dec(&bp->b_target->bt_io_count);
 }
@@ -1612,7 +1611,7 @@ xfs_wait_buftarg(
         */
        while (percpu_counter_sum(&btp->bt_io_count))
                delay(100);
-        drain_workqueue(btp->bt_mount->m_buf_workqueue);
+        flush_workqueue(btp->bt_mount->m_buf_workqueue);
        /* loop until there is nothing left on the lru list. */
        while (list_lru_count(&btp->bt_lru)) {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ed95e5bb04e6..e612a0233710 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -741,9 +741,20 @@ xfs_file_dax_write(
         * page is inserted into the pagecache when we have to serve a write
         * fault on a hole.  It should never be dirtied and can simply be
         * dropped from the pagecache once we get real data for the page.
+         *
+         * XXX: This is racy against mmap, and there's nothing we can do about
+         * it. dax_do_io() should really do this invalidation internally as
+         * it will know if we've allocated over a hole for this specific IO and
+         * if so it needs to update the mapping tree and invalidate existing
+         * PTEs over the newly allocated range. Remove this invalidation when
+         * dax_do_io() is fixed up.
         */
        if (mapping->nrpages) {
-                ret = invalidate_inode_pages2(mapping);
+                loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+                ret = invalidate_inode_pages2_range(mapping,
+                                                    iocb->ki_pos >> PAGE_SHIFT,
+                                                    end >> PAGE_SHIFT);
                WARN_ON_ONCE(ret);
        }
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0f96847b90e1..0b7f986745c1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -248,6 +248,7 @@ xfs_growfs_data_private(
                        agf->agf_roots[XFS_BTNUM_RMAPi] =
                                                cpu_to_be32(XFS_RMAP_BLOCK(mp));
                        agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+                        agf->agf_rmap_blocks = cpu_to_be32(1);
                }
                agf->agf_flfirst = cpu_to_be32(1);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2114d53df433..2af0dda1c978 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -715,12 +715,16 @@ xfs_iomap_write_allocate(
                 * is in the delayed allocation extent on which we sit
                 * but before our buffer starts.
                 */
                nimaps = 0;
                while (nimaps == 0) {
                        nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+                        /*
-                        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
+                         * We have already reserved space for the extent and any
+                         * indirect blocks when creating the delalloc extent,
+                         * so there is no need to reserve space in this
+                         * transaction again.
+                         */
+                        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
                                        0, XFS_TRANS_RESERVE, &tp);
                        if (error)
                                return error;
@@ -1037,20 +1041,14 @@ xfs_file_iomap_begin(
                        return error;
                trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
-                xfs_bmbt_to_iomap(ip, iomap, &imap);
-        } else if (nimaps) {
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                trace_xfs_iomap_found(ip, offset, length, 0, &imap);
-                xfs_bmbt_to_iomap(ip, iomap, &imap);
        } else {
+                ASSERT(nimaps);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+                trace_xfs_iomap_found(ip, offset, length, 0, &imap);
-                iomap->blkno = IOMAP_NULL_BLOCK;
-                iomap->type = IOMAP_HOLE;
-                iomap->offset = offset;
-                iomap->length = length;
        }
+        xfs_bmbt_to_iomap(ip, iomap, &imap);
        return 0;
 }
@@ -1112,3 +1110,48 @@ struct iomap_ops xfs_iomap_ops = {
        .iomap_begin            = xfs_file_iomap_begin,
        .iomap_end              = xfs_file_iomap_end,
 };
+/*
+ * ->iomap_begin handler for the attribute fork, used by xfs_vn_fiemap() when
+ * FIEMAP_FLAG_XATTR is requested.  Looks up the attribute fork mapping that
+ * covers the byte range starting at @offset for @length bytes and converts
+ * it into @iomap.  @flags is not used.
+ *
+ * Returns 0 with @iomap filled in, -EIO if the filesystem has been shut
+ * down, -ENOENT if the inode has no attribute fork or the fork has no
+ * extents, or the error returned by xfs_bmapi_read().
+ */
+static int
+xfs_xattr_iomap_begin(
+        struct inode            *inode,
+        loff_t                  offset,
+        loff_t                  length,
+        unsigned                flags,
+        struct iomap            *iomap)
+{
+        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
+        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + length);
+        struct xfs_bmbt_irec    imap;
+        int                     nimaps = 1, error = 0;
+        unsigned                lockmode;
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -EIO;
+        /*
+         * NOTE(review): the helper used here is named for the data fork,
+         * while the lookup below reads the attribute fork; confirm whether
+         * xfs_ilock_attr_map_shared() is the intended helper.
+         */
+        lockmode = xfs_ilock_data_map_shared(ip);
+        /* if there is no attribute fork or it has no extents, return ENOENT */
+        if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+                error = -ENOENT;
+                goto out_unlock;
+        }
+        ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
+        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+                               &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK);
+out_unlock:
+        xfs_iunlock(ip, lockmode);
+        /* imap is only valid when the lookup ran and succeeded */
+        if (!error) {
+                ASSERT(nimaps);
+                xfs_bmbt_to_iomap(ip, iomap, &imap);
+        }
+        return error;
+}
+/* iomap ops for the attribute fork; only ->iomap_begin is provided. */
+struct iomap_ops xfs_xattr_iomap_ops = {
+        .iomap_begin            = xfs_xattr_iomap_begin,
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index e066d045e2ff..fb8aca3d69ab 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -35,5 +35,6 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
                struct xfs_bmbt_irec *);
 extern struct iomap_ops xfs_iomap_ops;
+extern struct iomap_ops xfs_xattr_iomap_ops;
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ab820f84ed50..b24c3102fa93 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1009,7 +1009,14 @@ xfs_vn_fiemap(
        int                     error;
        xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
-        error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+                fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
+                error = iomap_fiemap(inode, fieinfo, start, length,
+                                &xfs_xattr_iomap_ops);
+        } else {
+                error = iomap_fiemap(inode, fieinfo, start, length,
+                                &xfs_iomap_ops);
+        }
        xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
        return error;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 24ef83ef04de..fd6be45b3a1e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1574,9 +1574,16 @@ xfs_fs_fill_super(
                }
        }
-        if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+        if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+                if (mp->m_sb.sb_rblocks) {
+                        xfs_alert(mp,
+        "EXPERIMENTAL reverse mapping btree not compatible with realtime device!");
+                        error = -EINVAL;
+                        goto out_filestream_unmount;
+                }
                xfs_alert(mp,
        "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");
+        }
        error = xfs_mountfs(mp);
        if (error)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 551b7e26980c..d303a665dba9 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1298,7 +1298,6 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
 DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
 DEFINE_IOMAP_EVENT(xfs_iomap_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -2296,7 +2295,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
                __entry->dev = mp ? mp->m_super->s_dev : 0;
                __entry->type = dfp->dfp_type->type;
                __entry->intent = dfp->dfp_intent;
-                __entry->committed = dfp->dfp_committed;
+                __entry->committed = dfp->dfp_done != NULL;
                __entry->nr = dfp->dfp_count;
        ),
        TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n",