59 files changed, 1240 insertions, 590 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 4b0eff6da674..85737e96ab8b 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -189,11 +189,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
        case 1:
                _debug("extract FID count");
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("FID count: %u", call->count);
@@ -210,11 +207,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
                _debug("extract FID array");
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       call->count * 3 * 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                _debug("unmarshall FID array");
                call->request = kcalloc(call->count,
@@ -239,11 +233,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
        case 3:
                _debug("extract CB count");
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                tmp = ntohl(call->tmp);
                _debug("CB count: %u", tmp);
@@ -258,11 +249,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
                _debug("extract CB array");
                ret = afs_extract_data(call, skb, last, call->request,
                                       call->count * 3 * 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                _debug("unmarshall CB array");
                cb = call->request;
@@ -278,9 +266,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
                call->unmarshall++;
        case 5:
-                _debug("trailer");
+                ret = afs_data_complete(call, skb, last);
-                if (skb->len != 0)
+                if (ret < 0)
-                        return -EBADMSG;
+                        return ret;
                /* Record that the message was unmarshalled successfully so
                 * that the call destructor can know do the callback breaking
@@ -294,8 +282,6 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
                break;
        }
-        if (!last)
-                return 0;
        call->state = AFS_CALL_REPLYING;
@@ -335,13 +321,13 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
 {
        struct afs_server *server;
        struct in_addr addr;
+        int ret;
        _enter(",{%u},%d", skb->len, last);
-        if (skb->len > 0)
+        ret = afs_data_complete(call, skb, last);
-                return -EBADMSG;
+        if (ret < 0)
-        if (!last)
+                return ret;
-                return 0;
        /* no unmarshalling required */
        call->state = AFS_CALL_REPLYING;
@@ -371,8 +357,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
        _enter(",{%u},%d", skb->len, last);
+        /* There are some arguments that we ignore */
+        afs_data_consumed(call, skb);
        if (!last)
-                return 0;
+                return -EAGAIN;
        /* no unmarshalling required */
        call->state = AFS_CALL_REPLYING;
@@ -408,12 +396,13 @@ static void SRXAFSCB_Probe(struct work_struct *work)
 static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
                                bool last)
 {
+        int ret;
        _enter(",{%u},%d", skb->len, last);
-        if (skb->len > 0)
+        ret = afs_data_complete(call, skb, last);
-                return -EBADMSG;
+        if (ret < 0)
-        if (!last)
+                return ret;
-                return 0;
        /* no unmarshalling required */
        call->state = AFS_CALL_REPLYING;
@@ -460,10 +449,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        if (skb->len > 0)
+        ret = afs_data_complete(call, skb, last);
-                return -EBADMSG;
+        if (ret < 0)
-        if (!last)
+                return ret;
-                return 0;
        switch (call->unmarshall) {
        case 0:
@@ -509,8 +497,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
                break;
        }
-        if (!last)
+        ret = afs_data_complete(call, skb, last);
-                return 0;
+        if (ret < 0)
+                return ret;
        call->state = AFS_CALL_REPLYING;
@@ -588,12 +577,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
                                                 struct sk_buff *skb, bool last)
 {
+        int ret;
        _enter(",{%u},%d", skb->len, last);
-        if (skb->len > 0)
+        ret = afs_data_complete(call, skb, last);
-                return -EBADMSG;
+        if (ret < 0)
-        if (!last)
+                return ret;
-                return 0;
        /* no unmarshalling required */
        call->state = AFS_CALL_REPLYING;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index c2e930ec2888..9312b92e54be 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -240,15 +240,13 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter(",,%u", last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -335,11 +333,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
        case 1:
                _debug("extract data length (MSW)");
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("DATA length MSW: %u", call->count);
@@ -353,11 +348,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
        case 2:
                _debug("extract data length");
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("DATA length: %u", call->count);
@@ -375,11 +367,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
                        ret = afs_extract_data(call, skb, last, buffer,
                                               call->count);
                        kunmap_atomic(buffer);
-                        switch (ret) {
+                        if (ret < 0)
-                        case 0:         break;
+                                return ret;
-                        case -EAGAIN:   return 0;
-                        default:        return ret;
-                        }
                }
                call->offset = 0;
@@ -389,11 +378,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
        case 4:
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       (21 + 3 + 6) * 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                bp = call->buffer;
                xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
@@ -405,15 +391,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
                call->unmarshall++;
        case 5:
-                _debug("trailer");
+                ret = afs_data_complete(call, skb, last);
-                if (skb->len != 0)
+                if (ret < 0)
-                        return -EBADMSG;
+                        return ret;
                break;
        }
-        if (!last)
-                return 0;
        if (call->count < PAGE_SIZE) {
                _debug("clear");
                page = call->reply3;
@@ -537,9 +520,8 @@ static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
 {
        _enter(",{%u},%d", skb->len, last);
-        if (skb->len > 0)
+        /* shouldn't be any reply data */
-                return -EBADMSG; /* shouldn't be any reply data */
+        return afs_data_complete(call, skb, last);
-        return 0;
 }
 /*
@@ -622,15 +604,13 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -721,15 +701,13 @@ static int afs_deliver_fs_remove(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -804,15 +782,13 @@ static int afs_deliver_fs_link(struct afs_call *call,
 {
        struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -892,15 +868,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -999,15 +973,13 @@ static int afs_deliver_fs_rename(struct afs_call *call,
 {
        struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -1105,20 +1077,13 @@ static int afs_deliver_fs_store_data(struct afs_call *call,
 {
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter(",,%u", last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last) {
+        if (ret < 0)
-                _leave(" = 0 [more]");
+                return ret;
-                return 0;
-        }
-        if (call->reply_size != call->reply_max) {
-                _leave(" = -EBADMSG [%u != %u]",
-                       call->reply_size, call->reply_max);
-                return -EBADMSG;
-        }
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
@@ -1292,20 +1257,13 @@ static int afs_deliver_fs_store_status(struct afs_call *call,
        afs_dataversion_t *store_version;
        struct afs_vnode *vnode = call->reply;
        const __be32 *bp;
+        int ret;
        _enter(",,%u", last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last) {
+        if (ret < 0)
-                _leave(" = 0 [more]");
+                return ret;
-                return 0;
-        }
-        if (call->reply_size != call->reply_max) {
-                _leave(" = -EBADMSG [%u != %u]",
-                       call->reply_size, call->reply_max);
-                return -EBADMSG;
-        }
        /* unmarshall the reply once we've received all of it */
        store_version = NULL;
@@ -1504,11 +1462,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                _debug("extract status");
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       12 * 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                bp = call->buffer;
                xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
@@ -1518,11 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                /* extract the volume name length */
        case 2:
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("volname length: %u", call->count);
@@ -1537,11 +1489,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                if (call->count > 0) {
                        ret = afs_extract_data(call, skb, last, call->reply3,
                                               call->count);
-                        switch (ret) {
+                        if (ret < 0)
-                        case 0:         break;
+                                return ret;
-                        case -EAGAIN:   return 0;
-                        default:        return ret;
-                        }
                }
                p = call->reply3;
@@ -1561,11 +1510,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
        case 4:
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       call->count);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->offset = 0;
                call->unmarshall++;
@@ -1574,11 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                /* extract the offline message length */
        case 5:
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("offline msg length: %u", call->count);
@@ -1593,11 +1536,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                if (call->count > 0) {
                        ret = afs_extract_data(call, skb, last, call->reply3,
                                               call->count);
-                        switch (ret) {
+                        if (ret < 0)
-                        case 0:         break;
+                                return ret;
-                        case -EAGAIN:   return 0;
-                        default:        return ret;
-                        }
                }
                p = call->reply3;
@@ -1617,11 +1557,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
        case 7:
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       call->count);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->offset = 0;
                call->unmarshall++;
@@ -1630,11 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                /* extract the message of the day length */
        case 8:
                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->count = ntohl(call->tmp);
                _debug("motd length: %u", call->count);
@@ -1649,11 +1583,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
                if (call->count > 0) {
                        ret = afs_extract_data(call, skb, last, call->reply3,
                                               call->count);
-                        switch (ret) {
+                        if (ret < 0)
-                        case 0:         break;
+                                return ret;
-                        case -EAGAIN:   return 0;
-                        default:        return ret;
-                        }
                }
                p = call->reply3;
@@ -1673,26 +1604,20 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
        case 10:
                ret = afs_extract_data(call, skb, last, call->buffer,
                                       call->count);
-                switch (ret) {
+                if (ret < 0)
-                case 0:         break;
+                        return ret;
-                case -EAGAIN:   return 0;
-                default:        return ret;
-                }
                call->offset = 0;
                call->unmarshall++;
        no_motd_padding:
        case 11:
-                _debug("trailer %d", skb->len);
+                ret = afs_data_complete(call, skb, last);
-                if (skb->len != 0)
+                if (ret < 0)
-                        return -EBADMSG;
+                        return ret;
                break;
        }
-        if (!last)
-                return 0;
        _leave(" = 0 [done]");
        return 0;
 }
@@ -1764,15 +1689,13 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
                                    struct sk_buff *skb, bool last)
 {
        const __be32 *bp;
+        int ret;
        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 71d5982312f3..df976b2a7f40 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -609,17 +609,29 @@ extern void afs_proc_cell_remove(struct afs_cell *);
 */
 extern int afs_open_socket(void);
 extern void afs_close_socket(void);
+extern void afs_data_consumed(struct afs_call *, struct sk_buff *);
 extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
                         const struct afs_wait_mode *);
 extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
                                            size_t, size_t);
 extern void afs_flat_call_destructor(struct afs_call *);
-extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
+extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool);
 extern void afs_send_empty_reply(struct afs_call *);
 extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
 extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
                            size_t);
+static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb,
+                                    bool last)
+{
+        if (skb->len > 0)
+                return -EBADMSG;
+        afs_data_consumed(call, skb);
+        if (!last)
+                return -EAGAIN;
+        return 0;
+}
 /*
 * security.c
 */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 4832de84d52c..14d04c848465 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -150,10 +150,9 @@ void afs_close_socket(void)
 }
 /*
- * note that the data in a socket buffer is now delivered and that the buffer
+ * Note that the data in a socket buffer is now consumed.
- * should be freed
 */
-static void afs_data_delivered(struct sk_buff *skb)
+void afs_data_consumed(struct afs_call *call, struct sk_buff *skb)
 {
        if (!skb) {
                _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
@@ -161,9 +160,7 @@ static void afs_data_delivered(struct sk_buff *skb)
        } else {
                _debug("DLVR %p{%u} [%d]",
                       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
-                if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+                rxrpc_kernel_data_consumed(call->rxcall, skb);
-                        BUG();
-                rxrpc_kernel_data_delivered(skb);
        }
 }
@@ -489,9 +486,15 @@ static void afs_deliver_to_call(struct afs_call *call)
                        last = rxrpc_kernel_is_data_last(skb);
                        ret = call->type->deliver(call, skb, last);
                        switch (ret) {
+                        case -EAGAIN:
+                                if (last) {
+                                        _debug("short data");
+                                        goto unmarshal_error;
+                                }
+                                break;
                        case 0:
-                                if (last &&
+                                ASSERT(last);
-                                    call->state == AFS_CALL_AWAIT_REPLY)
+                                if (call->state == AFS_CALL_AWAIT_REPLY)
                                        call->state = AFS_CALL_COMPLETE;
                                break;
                        case -ENOTCONN:
@@ -501,6 +504,7 @@ static void afs_deliver_to_call(struct afs_call *call)
                                abort_code = RX_INVALID_OPERATION;
                                goto do_abort;
                        default:
+                        unmarshal_error:
                                abort_code = RXGEN_CC_UNMARSHAL;
                                if (call->state != AFS_CALL_AWAIT_REPLY)
                                        abort_code = RXGEN_SS_UNMARSHAL;
@@ -511,9 +515,7 @@ static void afs_deliver_to_call(struct afs_call *call)
                                call->state = AFS_CALL_ERROR;
                                break;
                        }
-                        afs_data_delivered(skb);
+                        break;
-                        skb = NULL;
-                        continue;
                case RXRPC_SKB_MARK_FINAL_ACK:
                        _debug("Rcv ACK");
                        call->state = AFS_CALL_COMPLETE;
@@ -685,15 +687,35 @@ static void afs_process_async_call(struct afs_call *call)
 }
 /*
- * empty a socket buffer into a flat reply buffer
+ * Empty a socket buffer into a flat reply buffer.
 */
-void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
+int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last)
 {
        size_t len = skb->len;
-        if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0)
+        if (len > call->reply_max - call->reply_size) {
-                BUG();
+                _leave(" = -EBADMSG [%zu > %u]",
-        call->reply_size += len;
+                       len, call->reply_max - call->reply_size);
+                return -EBADMSG;
+        }
+        if (len > 0) {
+                if (skb_copy_bits(skb, 0, call->buffer + call->reply_size,
+                                  len) < 0)
+                        BUG();
+                call->reply_size += len;
+        }
+        afs_data_consumed(call, skb);
+        if (!last)
+                return -EAGAIN;
+        if (call->reply_size != call->reply_max) {
+                _leave(" = -EBADMSG [%u != %u]",
+                       call->reply_size, call->reply_max);
+                return -EBADMSG;
+        }
+        return 0;
 }
 /*
@@ -745,7 +767,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
 }
 /*
- * grab the operation ID from an incoming cache manager call
+ * Grab the operation ID from an incoming cache manager call.  The socket
+ * buffer is discarded on error or if we don't yet have sufficient data.
 */
 static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
                                bool last)
@@ -766,12 +789,9 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
        call->offset += len;
        if (call->offset < 4) {
-                if (last) {
+                afs_data_consumed(call, skb);
-                        _leave(" = -EBADMSG [op ID short]");
+                _leave(" = -EAGAIN");
-                        return -EBADMSG;
+                return -EAGAIN;
-                }
-                _leave(" = 0 [incomplete]");
-                return 0;
        }
        call->state = AFS_CALL_AWAIT_REQUEST;
@@ -855,7 +875,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 }
 /*
- * extract a piece of data from the received data socket buffers
+ * Extract a piece of data from the received data socket buffers.
 */
 int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
                     bool last, void *buf, size_t count)
@@ -873,10 +893,7 @@ int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
        call->offset += len;
        if (call->offset < count) {
-                if (last) {
+                afs_data_consumed(call, skb);
-                        _leave(" = -EBADMSG [%d < %zu]", call->offset, count);
-                        return -EBADMSG;
-                }
                _leave(" = -EAGAIN");
                return -EAGAIN;
        }
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 340afd0cd182..f94d1abdc3eb 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -64,16 +64,13 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
        struct afs_cache_vlocation *entry;
        __be32 *bp;
        u32 tmp;
-        int loop;
+        int loop, ret;
        _enter(",,%u", last);
-        afs_transfer_reply(call, skb);
+        ret = afs_transfer_reply(call, skb, last);
-        if (!last)
+        if (ret < 0)
-                return 0;
+                return ret;
-        if (call->reply_size != call->reply_max)
-                return -EBADMSG;
        /* unmarshall the reply once we've received all of it */
        entry = call->reply;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c3cdde87cc8c..08ae99343d92 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -249,7 +249,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
                 * thaw_bdev drops it.
                 */
                sb = get_super(bdev);
-                drop_super(sb);
+                if (sb)
+                        drop_super(sb);
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                return sb;
        }
@@ -646,7 +647,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type,
 {
        struct dentry *dent;
        dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
-        if (dent)
+        if (!IS_ERR(dent))
                dent->d_sb->s_iflags |= SB_I_CGROUPWB;
        return dent;
 }
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2b88439c2ee8..455a6b2fd539 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode)
                        list_del(&ref2->list);
                        kmem_cache_free(btrfs_prelim_ref_cache, ref2);
+                        cond_resched();
                }
        }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2fe8f89091a3..eff3993c77b3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1028,6 +1028,7 @@ struct btrfs_fs_info {
        struct btrfs_workqueue *qgroup_rescan_workers;
        struct completion qgroup_rescan_completion;
        struct btrfs_work qgroup_rescan_work;
+        bool qgroup_rescan_running;     /* protected by qgroup_rescan_lock */
        /* filesystem state */
        unsigned long fs_state;
@@ -1079,6 +1080,8 @@ struct btrfs_fs_info {
        struct list_head pinned_chunks;
        int creating_free_space_tree;
+        /* Used to record internally whether fs has been frozen */
+        int fs_frozen;
 };
 struct btrfs_subvolume_writers {
@@ -2578,7 +2581,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   u64 root_objectid, u64 owner, u64 offset,
                                   struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
                         u64 min_alloc_size, u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index b6d210e7a993..ac02e041464b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        struct btrfs_delayed_ref_head *existing;
        struct btrfs_delayed_ref_head *head_ref = NULL;
        struct btrfs_delayed_ref_root *delayed_refs;
-        struct btrfs_qgroup_extent_record *qexisting;
        int count_mod = 1;
        int must_insert_reserved = 0;
@@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
                qrecord->num_bytes = num_bytes;
                qrecord->old_roots = NULL;
-                qexisting = btrfs_qgroup_insert_dirty_extent(fs_info,
+                if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info,
-                                                             delayed_refs,
+                                        delayed_refs, qrecord))
-                                                             qrecord);
-                if (qexisting)
                        kfree(qrecord);
        }
@@ -862,33 +859,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        return 0;
 }
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
-                                     struct btrfs_trans_handle *trans,
-                                     u64 ref_root, u64 bytenr, u64 num_bytes)
-{
-        struct btrfs_delayed_ref_root *delayed_refs;
-        struct btrfs_delayed_ref_head *ref_head;
-        int ret = 0;
-        if (!fs_info->quota_enabled || !is_fstree(ref_root))
-                return 0;
-        delayed_refs = &trans->transaction->delayed_refs;
-        spin_lock(&delayed_refs->lock);
-        ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
-        if (!ref_head) {
-                ret = -ENOENT;
-                goto out;
-        }
-        WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
-        ref_head->qgroup_ref_root = ref_root;
-        ref_head->qgroup_reserved = num_bytes;
-out:
-        spin_unlock(&delayed_refs->lock);
-        return ret;
-}
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5fca9534a271..43f3629760e9 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -250,9 +250,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, u64 reserved, int action,
                               struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
-                                     struct btrfs_trans_handle *trans,
-                                     u64 ref_root, u64 bytenr, u64 num_bytes);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 59febfb8d04a..54bc8c7c6bcd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -559,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root,
        u32 nritems = btrfs_header_nritems(leaf);
        int slot;
-        if (nritems == 0)
+        if (nritems == 0) {
+                struct btrfs_root *check_root;
+                key.objectid = btrfs_header_owner(leaf);
+                key.type = BTRFS_ROOT_ITEM_KEY;
+                key.offset = (u64)-1;
+                check_root = btrfs_get_fs_root(root->fs_info, &key, false);
+                /*
+                 * The only reason we also check NULL here is that during
+                 * open_ctree() some roots have not yet been set up.
+                 */
+                if (!IS_ERR_OR_NULL(check_root)) {
+                        /* if leaf is the root, then it's fine */
+                        if (leaf->start !=
+                            btrfs_root_bytenr(&check_root->root_item)) {
+                                CORRUPT("non-root leaf's nritems is 0",
+                                        leaf, root, 0);
+                                return -EIO;
+                        }
+                }
                return 0;
+        }
        /* Check the 0 item */
        if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
@@ -612,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root,
        return 0;
 }
+static int check_node(struct btrfs_root *root, struct extent_buffer *node)
+{
+        unsigned long nr = btrfs_header_nritems(node);
+        if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
+                btrfs_crit(root->fs_info,
+                           "corrupt node: block %llu root %llu nritems %lu",
+                           node->start, root->objectid, nr);
+                return -EIO;
+        }
+        return 0;
+}
 static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
                                      u64 phy_offset, struct page *page,
                                      u64 start, u64 end, int mirror)
@@ -682,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
                ret = -EIO;
        }
+        if (found_level > 0 && check_node(root, eb))
+                ret = -EIO;
        if (!ret)
                set_extent_buffer_uptodate(eb);
 err:
@@ -1618,8 +1655,8 @@ fail:
        return ret;
 }
-static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
-                                               u64 root_id)
+                                        u64 root_id)
 {
        struct btrfs_root *root;
@@ -2298,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
        fs_info->quota_enabled = 0;
        fs_info->pending_quota_state = 0;
        fs_info->qgroup_ulist = NULL;
+        fs_info->qgroup_rescan_running = false;
        mutex_init(&fs_info->qgroup_rescan_lock);
 }
@@ -2624,6 +2662,7 @@ int open_ctree(struct super_block *sb,
        atomic_set(&fs_info->qgroup_op_seq, 0);
        atomic_set(&fs_info->reada_works_cnt, 0);
        atomic64_set(&fs_info->tree_mod_seq, 0);
+        fs_info->fs_frozen = 0;
        fs_info->sb = sb;
        fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
        fs_info->metadata_ratio = 0;
@@ -3739,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
        if (btrfs_root_refs(&root->root_item) == 0)
                synchronize_srcu(&fs_info->subvol_srcu);
-        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
                btrfs_free_log(NULL, root);
+                if (root->reloc_root) {
+                        free_extent_buffer(root->reloc_root->node);
+                        free_extent_buffer(root->reloc_root->commit_root);
+                        btrfs_put_fs_root(root->reloc_root);
+                        root->reloc_root = NULL;
+                }
+        }
        if (root->free_ino_pinned)
                __btrfs_remove_free_space_cache(root->free_ino_pinned);
@@ -3851,7 +3897,7 @@ void close_ctree(struct btrfs_root *root)
        smp_mb();
        /* wait for the qgroup rescan worker to stop */
-        btrfs_qgroup_wait_for_completion(fs_info);
+        btrfs_qgroup_wait_for_completion(fs_info, false);
        /* wait for the uuid_scan task to finish */
        down(&fs_info->uuid_tree_rescan_sem);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b3207a0e09f7..f19a982f5a4f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
                                      struct btrfs_key *location);
 int btrfs_init_fs_root(struct btrfs_root *root);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+                                        u64 root_id);
 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
                         struct btrfs_root *root);
 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 61b494e8e604..0450dc410533 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -60,21 +60,6 @@ enum {
        CHUNK_ALLOC_FORCE = 2,
 };
-/*
- * Control how reservations are dealt with.
- *
- * RESERVE_FREE - freeing a reservation.
- * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
- *   ENOSPC accounting
- * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
- *   bytes_may_use as the ENOSPC accounting is done elsewhere
- */
-enum {
-        RESERVE_FREE = 0,
-        RESERVE_ALLOC = 1,
-        RESERVE_ALLOC_NO_ACCOUNT = 2,
-};
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root, u64 bytenr,
                              u64 num_bytes, int alloc);
@@ -104,9 +89,10 @@ static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
-static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                       u64 num_bytes, int reserve,
+                                    u64 ram_bytes, u64 num_bytes, int delalloc);
-                                       int delalloc);
+static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                     u64 num_bytes, int delalloc);
 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
                               u64 num_bytes);
 int btrfs_pin_extent(struct btrfs_root *root,
@@ -3501,7 +3487,6 @@ again:
                dcs = BTRFS_DC_SETUP;
        else if (ret == -ENOSPC)
                set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
-        btrfs_free_reserved_data_space(inode, 0, num_pages);
 out_put:
        iput(inode);
@@ -4472,6 +4457,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
        }
 }
+/*
+ * If force is CHUNK_ALLOC_FORCE:
+ *    - return 1 if it successfully allocates a chunk,
+ *    - return errors including -ENOSPC otherwise.
+ * If force is NOT CHUNK_ALLOC_FORCE:
+ *    - return 0 if it doesn't need to allocate a new chunk,
+ *    - return 1 if it successfully allocates a chunk,
+ *    - return errors including -ENOSPC otherwise.
+ */
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags, int force)
 {
@@ -4882,7 +4876,7 @@ static int flush_space(struct btrfs_root *root,
                                     btrfs_get_alloc_profile(root, 0),
                                     CHUNK_ALLOC_NO_FORCE);
                btrfs_end_transaction(trans, root);
-                if (ret == -ENOSPC)
+                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
                break;
        case COMMIT_TRANS:
@@ -6497,19 +6491,15 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 }
 /**
- * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * btrfs_add_reserved_bytes - update the block_group and space info counters
 * @cache:      The cache we are manipulating
+ * @ram_bytes:  The number of bytes of file content; this is the same as
+ *              @num_bytes except on the compression path.
 * @num_bytes:  The number of bytes in question
- * @reserve:    One of the reservation enums
 * @delalloc:   The blocks are allocated for the delalloc write
 *
- * This is called by the allocator when it reserves space, or by somebody who is
+ * This is called by the allocator when it reserves space.  On success it
- * freeing space that was never actually used on disk.  For example if you
+ * subtracts @ram_bytes from bytes_may_use so that we do the proper
- * reserve some space for a new leaf in transaction A and before transaction A
- * commits you free that leaf, you call this with reserve set to 0 in order to
- * clear the reservation.
- *
- * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
 * ENOSPC accounting.  For data we handle the reservation through clearing the
 * delalloc bits in the io_tree.  We have to do this since we could end up
 * allocating less disk space for the amount of data we have reserved in the
@@ -6519,44 +6509,63 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 * make the reservation and return -EAGAIN, otherwise this function always
 * succeeds.
 */
-static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                       u64 num_bytes, int reserve, int delalloc)
+                                    u64 ram_bytes, u64 num_bytes, int delalloc)
 {
        struct btrfs_space_info *space_info = cache->space_info;
        int ret = 0;
        spin_lock(&space_info->lock);
        spin_lock(&cache->lock);
-        if (reserve != RESERVE_FREE) {
+        if (cache->ro) {
-                if (cache->ro) {
+                ret = -EAGAIN;
-                        ret = -EAGAIN;
-                } else {
-                        cache->reserved += num_bytes;
-                        space_info->bytes_reserved += num_bytes;
-                        if (reserve == RESERVE_ALLOC) {
-                                trace_btrfs_space_reservation(cache->fs_info,
-                                                "space_info", space_info->flags,
-                                                num_bytes, 0);
-                                space_info->bytes_may_use -= num_bytes;
-                        }
-                        if (delalloc)
-                                cache->delalloc_bytes += num_bytes;
-                }
        } else {
-                if (cache->ro)
+                cache->reserved += num_bytes;
-                        space_info->bytes_readonly += num_bytes;
+                space_info->bytes_reserved += num_bytes;
-                cache->reserved -= num_bytes;
-                space_info->bytes_reserved -= num_bytes;
+                trace_btrfs_space_reservation(cache->fs_info,
+                                "space_info", space_info->flags,
+                                ram_bytes, 0);
+                space_info->bytes_may_use -= ram_bytes;
                if (delalloc)
-                        cache->delalloc_bytes -= num_bytes;
+                        cache->delalloc_bytes += num_bytes;
        }
        spin_unlock(&cache->lock);
        spin_unlock(&space_info->lock);
        return ret;
 }
+/**
+ * btrfs_free_reserved_bytes - update the block_group and space info counters
+ * @cache:      The cache we are manipulating
+ * @num_bytes:  The number of bytes in question
+ * @delalloc:   The blocks are allocated for the delalloc write
+ *
+ * This is called by somebody who is freeing space that was never actually used
+ * on disk.  For example if you reserve some space for a new leaf in transaction
+ * A and before transaction A commits you free that leaf, you call this in
+ * order to clear the reservation.
+ */
+static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                     u64 num_bytes, int delalloc)
+{
+        struct btrfs_space_info *space_info = cache->space_info;
+        int ret = 0;
+        spin_lock(&space_info->lock);
+        spin_lock(&cache->lock);
+        if (cache->ro)
+                space_info->bytes_readonly += num_bytes;
+        cache->reserved -= num_bytes;
+        space_info->bytes_reserved -= num_bytes;
+        if (delalloc)
+                cache->delalloc_bytes -= num_bytes;
+        spin_unlock(&cache->lock);
+        spin_unlock(&space_info->lock);
+        return ret;
+}
 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
 {
@@ -7191,7 +7200,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
                btrfs_add_free_space(cache, buf->start, buf->len);
-                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+                btrfs_free_reserved_bytes(cache, buf->len, 0);
                btrfs_put_block_group(cache);
                trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
                pin = 0;
@@ -7416,9 +7425,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
 * the free space extent currently.
 */
 static noinline int find_free_extent(struct btrfs_root *orig_root,
-                                     u64 num_bytes, u64 empty_size,
+                                u64 ram_bytes, u64 num_bytes, u64 empty_size,
-                                     u64 hint_byte, struct btrfs_key *ins,
+                                u64 hint_byte, struct btrfs_key *ins,
-                                     u64 flags, int delalloc)
+                                u64 flags, int delalloc)
 {
        int ret = 0;
        struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -7430,8 +7439,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = __get_raid_index(flags);
-        int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
-                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
        bool use_cluster = true;
@@ -7763,8 +7770,8 @@ checks:
                                             search_start - offset);
                BUG_ON(offset > search_start);
-                ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+                ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
-                                                  alloc_type, delalloc);
+                                num_bytes, delalloc);
                if (ret == -EAGAIN) {
                        btrfs_add_free_space(block_group, offset, num_bytes);
                        goto loop;
@@ -7936,7 +7943,7 @@ again:
        up_read(&info->groups_sem);
 }
-int btrfs_reserve_extent(struct btrfs_root *root,
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
@@ -7948,8 +7955,8 @@ int btrfs_reserve_extent(struct btrfs_root *root,
        flags = btrfs_get_alloc_profile(root, is_data);
 again:
        WARN_ON(num_bytes < root->sectorsize);
-        ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
+        ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
-                               flags, delalloc);
+                               hint_byte, ins, flags, delalloc);
        if (!ret && !is_data) {
                btrfs_dec_block_group_reservations(root->fs_info,
                                                   ins->objectid);
@@ -7958,6 +7965,7 @@ again:
                        num_bytes = min(num_bytes >> 1, ins->offset);
                        num_bytes = round_down(num_bytes, root->sectorsize);
                        num_bytes = max(num_bytes, min_alloc_size);
+                        ram_bytes = num_bytes;
                        if (num_bytes == min_alloc_size)
                                final_tried = true;
                        goto again;
@@ -7995,7 +8003,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
                if (btrfs_test_opt(root->fs_info, DISCARD))
                        ret = btrfs_discard_extent(root, start, len, NULL);
                btrfs_add_free_space(cache, start, len);
-                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+                btrfs_free_reserved_bytes(cache, len, delalloc);
                trace_btrfs_reserved_extent_free(root, start, len);
        }
@@ -8223,8 +8231,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        if (!block_group)
                return -EINVAL;
-        ret = btrfs_update_reserved_bytes(block_group, ins->offset,
+        ret = btrfs_add_reserved_bytes(block_group, ins->offset,
-                                          RESERVE_ALLOC_NO_ACCOUNT, 0);
+                                       ins->offset, 0);
        BUG_ON(ret); /* logic error */
        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
                                         0, owner, offset, ins, 1);
@@ -8368,7 +8376,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
        if (IS_ERR(block_rsv))
                return ERR_CAST(block_rsv);
-        ret = btrfs_reserve_extent(root, blocksize, blocksize,
+        ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
                                   empty_size, hint, &ins, 0, 0);
        if (ret)
                goto out_unuse;
@@ -8521,35 +8529,6 @@ reada:
        wc->reada_slot = slot;
 }
-/*
- * These may not be seen by the usual inc/dec ref code so we have to
- * add them here.
- */
-static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
-                                     struct btrfs_root *root, u64 bytenr,
-                                     u64 num_bytes)
-{
-        struct btrfs_qgroup_extent_record *qrecord;
-        struct btrfs_delayed_ref_root *delayed_refs;
-        qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
-        if (!qrecord)
-                return -ENOMEM;
-        qrecord->bytenr = bytenr;
-        qrecord->num_bytes = num_bytes;
-        qrecord->old_roots = NULL;
-        delayed_refs = &trans->transaction->delayed_refs;
-        spin_lock(&delayed_refs->lock);
-        if (btrfs_qgroup_insert_dirty_extent(trans->fs_info,
-                                             delayed_refs, qrecord))
-                kfree(qrecord);
-        spin_unlock(&delayed_refs->lock);
-        return 0;
-}
 static int account_leaf_items(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *eb)
@@ -8583,7 +8562,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-                ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+                ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+                                bytenr, num_bytes, GFP_NOFS);
                if (ret)
                        return ret;
        }
@@ -8732,8 +8712,9 @@ walk_down:
                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-                        ret = record_one_subtree_extent(trans, root, child_bytenr,
+                        ret = btrfs_qgroup_insert_dirty_extent(trans,
-                                                        root->nodesize);
+                                        root->fs_info, child_bytenr,
+                                        root->nodesize, GFP_NOFS);
                        if (ret)
                                goto out;
                }
@@ -9906,6 +9887,7 @@ static int find_first_block_group(struct btrfs_root *root,
                        } else {
                                ret = 0;
                        }
+                        free_extent_map(em);
                        goto out;
                }
                path->slots[0]++;
@@ -9942,6 +9924,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
                block_group->iref = 0;
                block_group->inode = NULL;
                spin_unlock(&block_group->lock);
+                ASSERT(block_group->io_ctl.inode == NULL);
                iput(inode);
                last = block_group->key.objectid + block_group->key.offset;
                btrfs_put_block_group(block_group);
@@ -9999,6 +9982,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                        free_excluded_extents(info->extent_root, block_group);
                btrfs_remove_free_space_cache(block_group);
+                ASSERT(list_empty(&block_group->dirty_list));
+                ASSERT(list_empty(&block_group->io_list));
+                ASSERT(list_empty(&block_group->bg_list));
+                ASSERT(atomic_read(&block_group->count) == 1);
                btrfs_put_block_group(block_group);
                spin_lock(&info->block_group_cache_lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bc2729a7612d..28cd88fccc7e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,6 +20,7 @@
 #define EXTENT_DAMAGED          (1U << 14)
 #define EXTENT_NORESERVE        (1U << 15)
 #define EXTENT_QGROUP_RESERVED  (1U << 16)
+#define EXTENT_CLEAR_DATA_RESV  (1U << 17)
 #define EXTENT_IOBITS           (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS          (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9404121fd5f7..fea31a4a6e36 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2033,6 +2033,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 */
                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                          &BTRFS_I(inode)->runtime_flags);
+                /*
+                 * An ordered extent might have started before and completed
+                 * already with io errors, in which case the inode was not
+                 * updated and we end up here. So check the inode's mapping
+                 * flags for any errors that might have happened while doing
+                 * writeback of file data.
+                 */
+                ret = btrfs_inode_check_errors(inode);
                inode_unlock(inode);
                goto out;
        }
@@ -2062,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        }
        trans->sync = true;
-        btrfs_init_log_ctx(&ctx);
+        btrfs_init_log_ctx(&ctx, inode);
        ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
        if (ret < 0) {
@@ -2667,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode,
        alloc_start = round_down(offset, blocksize);
        alloc_end = round_up(offset + len, blocksize);
+        cur_offset = alloc_start;
        /* Make sure we aren't being give some crap mode */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2759,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode,
        /* First, check if we exceed the qgroup limit */
        INIT_LIST_HEAD(&reserve_list);
-        cur_offset = alloc_start;
        while (1) {
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
@@ -2786,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode,
                                        last_byte - cur_offset);
                        if (ret < 0)
                                break;
+                } else {
+                        /*
+                         * We do not need to reserve an unwritten extent for
+                         * this range; free the reserved data space now,
+                         * otherwise it results in a false ENOSPC error.
+                         */
+                        btrfs_free_reserved_data_space(inode, cur_offset,
+                                last_byte - cur_offset);
                }
                free_extent_map(em);
                cur_offset = last_byte;
@@ -2803,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode,
                                        range->start,
                                        range->len, 1 << inode->i_blkbits,
                                        offset + len, &alloc_hint);
+                else
+                        btrfs_free_reserved_data_space(inode, range->start,
+                                                       range->len);
                list_del(&range->list);
                kfree(range);
        }
@@ -2837,18 +2856,11 @@ out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                             &cached_state, GFP_KERNEL);
 out:
-        /*
-         * As we waited the extent range, the data_rsv_map must be empty
-         * in the range, as written data range will be released from it.
-         * And for prealloacted extent, it will also be released when
-         * its metadata is written.
-         * So this is completely used as cleanup.
-         */
-        btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
        inode_unlock(inode);
        /* Let go of our reservation. */
-        btrfs_free_reserved_data_space(inode, alloc_start,
+        if (ret != 0)
-                                       alloc_end - alloc_start);
+                btrfs_free_reserved_data_space(inode, alloc_start,
+                                       alloc_end - cur_offset);
        return ret;
 }
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index aa6fabaee72e..359ee861b5a4 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -495,10 +495,9 @@ again:
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
                                              prealloc, prealloc, &alloc_hint);
        if (ret) {
-                btrfs_delalloc_release_space(inode, 0, prealloc);
+                btrfs_delalloc_release_metadata(inode, prealloc);
                goto out_put;
        }
-        btrfs_free_reserved_data_space(inode, 0, prealloc);
        ret = btrfs_write_out_ino_cache(root, trans, path, inode);
 out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2f5975954ccf..e6811c42e41e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -566,6 +566,8 @@ cont:
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
+                        btrfs_free_reserved_data_space_noquota(inode, start,
+                                                end - start + 1);
                        goto free_pages_out;
                }
        }
@@ -742,7 +744,7 @@ retry:
                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);
-                ret = btrfs_reserve_extent(root,
+                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
@@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode,
                                     EXTENT_DEFRAG, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
+                        btrfs_free_reserved_data_space_noquota(inode, start,
+                                                end - start + 1);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
@@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode,
                unsigned long op;
                cur_alloc_size = disk_num_bytes;
-                ret = btrfs_reserve_extent(root, cur_alloc_size,
+                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
@@ -1489,8 +1492,10 @@ out_check:
                extent_clear_unlock_delalloc(inode, cur_offset,
                                             cur_offset + num_bytes - 1,
                                             locked_page, EXTENT_LOCKED |
-                                             EXTENT_DELALLOC, PAGE_UNLOCK |
+                                             EXTENT_DELALLOC |
-                                             PAGE_SET_PRIVATE2);
+                                             EXTENT_CLEAR_DATA_RESV,
+                                             PAGE_UNLOCK | PAGE_SET_PRIVATE2);
                if (!nolock && nocow)
                        btrfs_end_write_no_snapshoting(root);
                cur_offset = extent_end;
@@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                        return;
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-                    && do_list && !(state->state & EXTENT_NORESERVE))
+                    && do_list && !(state->state & EXTENT_NORESERVE)
+                    && (*bits & (EXTENT_DO_ACCOUNTING |
+                    EXTENT_CLEAR_DATA_RESV)))
                        btrfs_free_reserved_data_space_noquota(inode,
                                        state->start, len);
@@ -3435,10 +3442,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                found_key.offset = 0;
                inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
                ret = PTR_ERR_OR_ZERO(inode);
-                if (ret && ret != -ESTALE)
+                if (ret && ret != -ENOENT)
                        goto out;
-                if (ret == -ESTALE && root == root->fs_info->tree_root) {
+                if (ret == -ENOENT && root == root->fs_info->tree_root) {
                        struct btrfs_root *dead_root;
                        struct btrfs_fs_info *fs_info = root->fs_info;
                        int is_dead_root = 0;
@@ -3474,7 +3481,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                 * Inode is already gone but the orphan item is still there,
                 * kill the orphan item.
                 */
-                if (ret == -ESTALE) {
+                if (ret == -ENOENT) {
                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
@@ -3633,7 +3640,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 /*
 * read an inode from the btree into the in-memory inode
 */
-static void btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode)
 {
        struct btrfs_path *path;
        struct extent_buffer *leaf;
@@ -3652,14 +3659,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
                filled = true;
        path = btrfs_alloc_path();
-        if (!path)
+        if (!path) {
+                ret = -ENOMEM;
                goto make_bad;
+        }
        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
-        if (ret)
+        if (ret) {
+                if (ret > 0)
+                        ret = -ENOENT;
                goto make_bad;
+        }
        leaf = path->nodes[0];
@@ -3812,11 +3824,12 @@ cache_acl:
        }
        btrfs_update_iflags(inode);
-        return;
+        return 0;
 make_bad:
        btrfs_free_path(path);
        make_bad_inode(inode);
+        return ret;
 }
 /*
@@ -4204,6 +4217,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        int err = 0;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
+        u64 last_unlink_trans;
        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
@@ -4226,11 +4240,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (err)
                goto out;
+        last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
        /* now the directory is empty */
        err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
                                 dentry->d_name.name, dentry->d_name.len);
-        if (!err)
+        if (!err) {
                btrfs_i_size_write(inode, 0);
+                /*
+                 * Propagate the last_unlink_trans value of the deleted dir to
+                 * its parent directory. This is to prevent an unrecoverable
+                 * log tree in the case we do something like this:
+                 * 1) create dir foo
+                 * 2) create snapshot under dir foo
+                 * 3) delete the snapshot
+                 * 4) rmdir foo
+                 * 5) mkdir foo
+                 * 6) fsync foo or some file inside foo
+                 */
+                if (last_unlink_trans >= trans->transid)
+                        BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+        }
 out:
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
@@ -5606,7 +5636,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
                return ERR_PTR(-ENOMEM);
        if (inode->i_state & I_NEW) {
-                btrfs_read_locked_inode(inode);
+                int ret;
+                ret = btrfs_read_locked_inode(inode);
                if (!is_bad_inode(inode)) {
                        inode_tree_add(inode);
                        unlock_new_inode(inode);
@@ -5615,7 +5647,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
                } else {
                        unlock_new_inode(inode);
                        iput(inode);
-                        inode = ERR_PTR(-ESTALE);
+                        ASSERT(ret < 0);
+                        inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
                }
        }
@@ -7225,7 +7258,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
        int ret;
        alloc_hint = get_extent_allocation_hint(inode, start, len);
-        ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
+        ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
                                   alloc_hint, &ins, 1, 1);
        if (ret)
                return ERR_PTR(ret);
@@ -7725,6 +7758,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                ret = PTR_ERR(em2);
                                goto unlock_err;
                        }
+                        /*
+                         * For inode marked NODATACOW or extent marked PREALLOC,
+                         * use the existing or preallocated extent, so does not
+                         * need to adjust btrfs_space_info's bytes_may_use.
+                         */
+                        btrfs_free_reserved_data_space_noquota(inode,
+                                        start, len);
                        goto unlock;
                }
        }
@@ -7759,7 +7799,6 @@ unlock:
                        i_size_write(inode, start + len);
                adjust_dio_outstanding_extents(inode, dio_data, len);
-                btrfs_free_reserved_data_space(inode, start, len);
                WARN_ON(dio_data->reserve < len);
                dio_data->reserve -= len;
                dio_data->unsubmitted_oe_range_end = start + len;
@@ -10280,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
        u64 last_alloc = (u64)-1;
        int ret = 0;
        bool own_trans = true;
+        u64 end = start + num_bytes - 1;
        if (trans)
                own_trans = false;
@@ -10301,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                 * sized chunks.
                 */
                cur_bytes = min(cur_bytes, last_alloc);
-                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
+                ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
-                                           *alloc_hint, &ins, 1, 0);
+                                min_size, 0, *alloc_hint, &ins, 1, 0);
                if (ret) {
                        if (own_trans)
                                btrfs_end_transaction(trans, root);
@@ -10388,6 +10428,9 @@ next:
                if (own_trans)
                        btrfs_end_transaction(trans, root);
        }
+        if (cur_offset < end)
+                btrfs_free_reserved_data_space(inode, cur_offset,
+                        end - cur_offset + 1);
        return ret;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14ed1e9e6bc8..b2a2da5893af 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5084,7 +5084,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        return btrfs_qgroup_wait_for_completion(root->fs_info);
+        return btrfs_qgroup_wait_for_completion(root->fs_info, true);
 }
 static long _btrfs_ioctl_set_received_subvol(struct file *file,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 93ee1c18ef9d..8db2e29fdcf4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
                goto out;
        fs_info->quota_enabled = 0;
        fs_info->pending_quota_state = 0;
-        btrfs_qgroup_wait_for_completion(fs_info);
+        btrfs_qgroup_wait_for_completion(fs_info, false);
        spin_lock(&fs_info->qgroup_lock);
        quota_root = fs_info->quota_root;
        fs_info->quota_root = NULL;
@@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
        return ret;
 }
-struct btrfs_qgroup_extent_record *
+int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
-btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+                                struct btrfs_delayed_ref_root *delayed_refs,
-                                 struct btrfs_delayed_ref_root *delayed_refs,
+                                struct btrfs_qgroup_extent_record *record)
-                                 struct btrfs_qgroup_extent_record *record)
 {
        struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
        struct rb_node *parent_node = NULL;
@@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
                else if (bytenr > entry->bytenr)
                        p = &(*p)->rb_right;
                else
-                        return entry;
+                        return 1;
        }
        rb_link_node(&record->node, parent_node, p);
        rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
-        return NULL;
+        return 0;
+}
+int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+                struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
+                gfp_t gfp_flag)
+{
+        struct btrfs_qgroup_extent_record *record;
+        struct btrfs_delayed_ref_root *delayed_refs;
+        int ret;
+        if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0)
+                return 0;
+        if (WARN_ON(trans == NULL))
+                return -EINVAL;
+        record = kmalloc(sizeof(*record), gfp_flag);
+        if (!record)
+                return -ENOMEM;
+        delayed_refs = &trans->transaction->delayed_refs;
+        record->bytenr = bytenr;
+        record->num_bytes = num_bytes;
+        record->old_roots = NULL;
+        spin_lock(&delayed_refs->lock);
+        ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs,
+                                                      record);
+        spin_unlock(&delayed_refs->lock);
+        if (ret > 0)
+                kfree(record);
+        return 0;
 }
 #define UPDATE_NEW      0
@@ -2303,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
        int err = -ENOMEM;
        int ret = 0;
+        mutex_lock(&fs_info->qgroup_rescan_lock);
+        fs_info->qgroup_rescan_running = true;
+        mutex_unlock(&fs_info->qgroup_rescan_lock);
        path = btrfs_alloc_path();
        if (!path)
                goto out;
@@ -2369,6 +2402,9 @@ out:
        }
 done:
+        mutex_lock(&fs_info->qgroup_rescan_lock);
+        fs_info->qgroup_rescan_running = false;
+        mutex_unlock(&fs_info->qgroup_rescan_lock);
        complete_all(&fs_info->qgroup_rescan_completion);
 }
@@ -2487,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
        return 0;
 }
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+                                     bool interruptible)
 {
        int running;
        int ret = 0;
        mutex_lock(&fs_info->qgroup_rescan_lock);
        spin_lock(&fs_info->qgroup_lock);
-        running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+        running = fs_info->qgroup_rescan_running;
        spin_unlock(&fs_info->qgroup_lock);
        mutex_unlock(&fs_info->qgroup_rescan_lock);
-        if (running)
+        if (!running)
+                return 0;
+        if (interruptible)
                ret = wait_for_completion_interruptible(
                                        &fs_info->qgroup_rescan_completion);
+        else
+                wait_for_completion(&fs_info->qgroup_rescan_completion);
        return ret;
 }
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 710887c06aaf..1bc64c864b62 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
                        struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
 void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+                                     bool interruptible);
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
@@ -63,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
 struct btrfs_delayed_extent_op;
 int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info);
-struct btrfs_qgroup_extent_record *
+/*
-btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+ * Insert one dirty extent record into @delayed_refs, informing qgroup to
-                                 struct btrfs_delayed_ref_root *delayed_refs,
+ * account that extent at commit trans time.
-                                 struct btrfs_qgroup_extent_record *record);
+ *
+ * No lock version, caller must acquire delayed ref lock and allocate memory.
+ *
+ * Return 0 on successful insert.
+ * Return >0 if a record already exists; the caller can free @record safely.
+ * Errors are not possible.
+ */
+int btrfs_qgroup_insert_dirty_extent_nolock(
+                struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_root *delayed_refs,
+                struct btrfs_qgroup_extent_record *record);
+/*
+ * Insert one dirty extent record into @delayed_refs, informing qgroup to
+ * account that extent at commit trans time.
+ *
+ * Better encapsulated version.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for error, like memory allocation failure or invalid parameter
+ * (NULL trans)
+ */
+int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+                struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
+                gfp_t gfp_flag);
 int
 btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
                            struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b26a5aea41b4..8a2c2a07987b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -31,6 +31,7 @@
 #include "async-thread.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "qgroup.h"
 /*
 * backref_node, mapping_node and tree_block start with this
@@ -3037,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode,
        u64 num_bytes;
        int nr = 0;
        int ret = 0;
+        u64 prealloc_start = cluster->start - offset;
+        u64 prealloc_end = cluster->end - offset;
+        u64 cur_offset;
        BUG_ON(cluster->start != cluster->boundary[0]);
        inode_lock(inode);
-        ret = btrfs_check_data_free_space(inode, cluster->start,
+        ret = btrfs_check_data_free_space(inode, prealloc_start,
-                                          cluster->end + 1 - cluster->start);
+                                          prealloc_end + 1 - prealloc_start);
        if (ret)
                goto out;
+        cur_offset = prealloc_start;
        while (nr < cluster->nr) {
                start = cluster->boundary[nr] - offset;
                if (nr + 1 < cluster->nr)
@@ -3055,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode,
                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
                num_bytes = end + 1 - start;
+                if (cur_offset < start)
+                        btrfs_free_reserved_data_space(inode, cur_offset,
+                                        start - cur_offset);
                ret = btrfs_prealloc_file_range(inode, 0, start,
                                                num_bytes, num_bytes,
                                                end + 1, &alloc_hint);
+                cur_offset = end + 1;
                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
                if (ret)
                        break;
                nr++;
        }
-        btrfs_free_reserved_data_space(inode, cluster->start,
+        if (cur_offset < prealloc_end)
-                                       cluster->end + 1 - cluster->start);
+                btrfs_free_reserved_data_space(inode, cur_offset,
+                                       prealloc_end + 1 - cur_offset);
 out:
        inode_unlock(inode);
        return ret;
@@ -3916,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc)
        return 0;
 }
+/*
+ * Qgroup fixer for data chunk relocation.
+ * The data relocation is done in the following steps
+ * 1) Copy data extents into data reloc tree
+ * 2) Create tree reloc tree(special snapshot) for related subvolumes
+ * 3) Modify file extents in tree reloc tree
+ * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
+ *
+ * The problem is that the data and tree reloc trees are not accounted to
+ * qgroup, and 4) will only inform qgroup to track tree block changes, not the
+ * file extents in the tree blocks.
+ *
+ * The good news is that the related data extents are all in the data reloc
+ * tree, so we only need to inform qgroup to track all file extents in the
+ * data reloc tree before committing the transaction.
+ */
+static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
+                                             struct reloc_control *rc)
+{
+        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+        struct inode *inode = rc->data_inode;
+        struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        int ret = 0;
+        if (!fs_info->quota_enabled)
+                return 0;
+        /*
+         * Only for stage where we update data pointers the qgroup fix is
+         * valid.
+         * For MOVING_DATA stage, we will miss the timing of swapping tree
+         * blocks, and won't fix it.
+         */
+        if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
+                return 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        key.objectid = btrfs_ino(inode);
+        key.type = BTRFS_EXTENT_DATA_KEY;
+        key.offset = 0;
+        ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
+        if (ret < 0)
+                goto out;
+        lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
+        while (1) {
+                struct btrfs_file_extent_item *fi;
+                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+                if (key.objectid > btrfs_ino(inode))
+                        break;
+                if (key.type != BTRFS_EXTENT_DATA_KEY)
+                        goto next;
+                fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                    struct btrfs_file_extent_item);
+                if (btrfs_file_extent_type(path->nodes[0], fi) !=
+                                BTRFS_FILE_EXTENT_REG)
+                        goto next;
+                ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
+                        btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
+                        btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
+                        GFP_NOFS);
+                if (ret < 0)
+                        break;
+next:
+                ret = btrfs_next_item(data_reloc_root, path);
+                if (ret < 0)
+                        break;
+                if (ret > 0) {
+                        ret = 0;
+                        break;
+                }
+        }
+        unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
        struct rb_root blocks = RB_ROOT;
@@ -4102,10 +4196,16 @@ restart:
        /* get rid of pinned extents */
        trans = btrfs_join_transaction(rc->extent_root);
-        if (IS_ERR(trans))
+        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
-        else
+                goto out_free;
-                btrfs_commit_transaction(trans, rc->extent_root);
+        }
+        err = qgroup_fix_relocated_data_extents(trans, rc);
+        if (err < 0) {
+                btrfs_abort_transaction(trans, err);
+                goto out_free;
+        }
+        btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
        btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
        btrfs_free_path(path);
@@ -4468,10 +4568,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        unset_reloc_control(rc);
        trans = btrfs_join_transaction(rc->extent_root);
-        if (IS_ERR(trans))
+        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
-        else
+                goto out_free;
-                err = btrfs_commit_transaction(trans, rc->extent_root);
+        }
+        err = qgroup_fix_relocated_data_extents(trans, rc);
+        if (err < 0) {
+                btrfs_abort_transaction(trans, err);
+                goto out_free;
+        }
+        err = btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
        kfree(rc);
 out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7fd7e1830cfe..091296062456 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                root_key.objectid = key.offset;
                key.offset++;
+                /*
+                 * The root might have been inserted already, as before we look
+                 * for orphan roots, log replay might have happened, which
+                 * triggers a transaction commit and qgroup accounting, which
+                 * in turn reads and inserts fs roots while doing backref
+                 * walking.
+                 */
+                root = btrfs_lookup_fs_root(tree_root->fs_info,
+                                            root_key.objectid);
+                if (root) {
+                        WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+                                          &root->state));
+                        if (btrfs_root_refs(&root->root_item) == 0)
+                                btrfs_add_dead_root(root);
+                        continue;
+                }
                root = btrfs_read_fs_root(tree_root, &root_key);
                err = PTR_ERR_OR_ZERO(root);
                if (err && err != -ENOENT) {
@@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
                err = btrfs_insert_fs_root(root->fs_info, root);
-                /*
-                 * The root might have been inserted already, as before we look
-                 * for orphan roots, log replay might have happened, which
-                 * triggers a transaction commit and qgroup accounting, which
-                 * in turn reads and inserts fs roots while doing backref
-                 * walking.
-                 */
-                if (err == -EEXIST)
-                        err = 0;
                if (err) {
+                        BUG_ON(err == -EEXIST);
                        btrfs_free_fs_root(root);
                        break;
                }
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b71dd298385c..efe129fe2678 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -231,7 +231,6 @@ struct pending_dir_move {
        u64 parent_ino;
        u64 ino;
        u64 gen;
-        bool is_orphan;
        struct list_head update_refs;
 };
@@ -274,6 +273,39 @@ struct name_cache_entry {
        char name[];
 };
+static void inconsistent_snapshot_error(struct send_ctx *sctx,
+                                        enum btrfs_compare_tree_result result,
+                                        const char *what)
+{
+        const char *result_string;
+        switch (result) {
+        case BTRFS_COMPARE_TREE_NEW:
+                result_string = "new";
+                break;
+        case BTRFS_COMPARE_TREE_DELETED:
+                result_string = "deleted";
+                break;
+        case BTRFS_COMPARE_TREE_CHANGED:
+                result_string = "updated";
+                break;
+        case BTRFS_COMPARE_TREE_SAME:
+                ASSERT(0);
+                result_string = "unchanged";
+                break;
+        default:
+                ASSERT(0);
+                result_string = "unexpected";
+        }
+        btrfs_err(sctx->send_root->fs_info,
+                  "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
+                  result_string, what, sctx->cmp_key->objectid,
+                  sctx->send_root->root_key.objectid,
+                  (sctx->parent_root ?
+                   sctx->parent_root->root_key.objectid : 0));
+}
 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
 static struct waiting_dir_move *
@@ -1861,7 +1893,8 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
         * was already unlinked/moved, so we can safely assume that we will not
         * overwrite anything at this point in time.
         */
-        if (other_inode > sctx->send_progress) {
+        if (other_inode > sctx->send_progress ||
+            is_waiting_for_move(sctx, other_inode)) {
                ret = get_inode_info(sctx->parent_root, other_inode, NULL,
                                who_gen, NULL, NULL, NULL, NULL);
                if (ret < 0)
@@ -2502,6 +2535,8 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
        ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+        if (ret > 0)
+                ret = -ENOENT;
        if (ret < 0)
                goto out;
@@ -2947,6 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
                }
                if (loc.objectid > send_progress) {
+                        struct orphan_dir_info *odi;
+                        odi = get_orphan_dir_info(sctx, dir);
+                        free_orphan_dir_info(sctx, odi);
                        ret = 0;
                        goto out;
                }
@@ -3047,7 +3086,6 @@ static int add_pending_dir_move(struct send_ctx *sctx,
        pm->parent_ino = parent_ino;
        pm->ino = ino;
        pm->gen = ino_gen;
-        pm->is_orphan = is_orphan;
        INIT_LIST_HEAD(&pm->list);
        INIT_LIST_HEAD(&pm->update_refs);
        RB_CLEAR_NODE(&pm->node);
@@ -3113,6 +3151,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
        return NULL;
 }
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+                     u64 ino, u64 gen, u64 *ancestor_ino)
+{
+        int ret = 0;
+        u64 parent_inode = 0;
+        u64 parent_gen = 0;
+        u64 start_ino = ino;
+        *ancestor_ino = 0;
+        while (ino != BTRFS_FIRST_FREE_OBJECTID) {
+                fs_path_reset(name);
+                if (is_waiting_for_rm(sctx, ino))
+                        break;
+                if (is_waiting_for_move(sctx, ino)) {
+                        if (*ancestor_ino == 0)
+                                *ancestor_ino = ino;
+                        ret = get_first_ref(sctx->parent_root, ino,
+                                            &parent_inode, &parent_gen, name);
+                } else {
+                        ret = __get_cur_name_and_parent(sctx, ino, gen,
+                                                        &parent_inode,
+                                                        &parent_gen, name);
+                        if (ret > 0) {
+                                ret = 0;
+                                break;
+                        }
+                }
+                if (ret < 0)
+                        break;
+                if (parent_inode == start_ino) {
+                        ret = 1;
+                        if (*ancestor_ino == 0)
+                                *ancestor_ino = ino;
+                        break;
+                }
+                ino = parent_inode;
+                gen = parent_gen;
+        }
+        return ret;
+}
 static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 {
        struct fs_path *from_path = NULL;
@@ -3123,6 +3203,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
        u64 parent_ino, parent_gen;
        struct waiting_dir_move *dm = NULL;
        u64 rmdir_ino = 0;
+        u64 ancestor;
+        bool is_orphan;
        int ret;
        name = fs_path_alloc();
@@ -3135,9 +3217,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
        dm = get_waiting_dir_move(sctx, pm->ino);
        ASSERT(dm);
        rmdir_ino = dm->rmdir_ino;
+        is_orphan = dm->orphanized;
        free_waiting_dir_move(sctx, dm);
-        if (pm->is_orphan) {
+        if (is_orphan) {
                ret = gen_unique_name(sctx, pm->ino,
                                      pm->gen, from_path);
        } else {
@@ -3155,6 +3238,24 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
                goto out;
        sctx->send_progress = sctx->cur_ino + 1;
+        ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+        if (ret < 0)
+                goto out;
+        if (ret) {
+                LIST_HEAD(deleted_refs);
+                ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+                ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+                                           &pm->update_refs, &deleted_refs,
+                                           is_orphan);
+                if (ret < 0)
+                        goto out;
+                if (rmdir_ino) {
+                        dm = get_waiting_dir_move(sctx, pm->ino);
+                        ASSERT(dm);
+                        dm->rmdir_ino = rmdir_ino;
+                }
+                goto out;
+        }
        fs_path_reset(name);
        to_path = name;
        name = NULL;
@@ -3174,7 +3275,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
                        /* already deleted */
                        goto finish;
                }
-                ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+                ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
                if (ret < 0)
                        goto out;
                if (!ret)
@@ -3204,8 +3305,18 @@ finish:
         * and old parent(s).
         */
        list_for_each_entry(cur, &pm->update_refs, list) {
-                if (cur->dir == rmdir_ino)
+                /*
+                 * The parent inode might have been deleted in the send snapshot
+                 */
+                ret = get_inode_info(sctx->send_root, cur->dir, NULL,
+                                     NULL, NULL, NULL, NULL, NULL);
+                if (ret == -ENOENT) {
+                        ret = 0;
                        continue;
+                }
+                if (ret < 0)
+                        goto out;
                ret = send_utimes(sctx, cur->dir, cur->dir_gen);
                if (ret < 0)
                        goto out;
@@ -3325,6 +3436,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
        u64 left_gen;
        u64 right_gen;
        int ret = 0;
+        struct waiting_dir_move *wdm;
        if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
                return 0;
@@ -3383,7 +3495,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
                goto out;
        }
-        if (is_waiting_for_move(sctx, di_key.objectid)) {
+        wdm = get_waiting_dir_move(sctx, di_key.objectid);
+        if (wdm && !wdm->orphanized) {
                ret = add_pending_dir_move(sctx,
                                           sctx->cur_ino,
                                           sctx->cur_inode_gen,
@@ -3470,7 +3583,8 @@ static int wait_for_parent_move(struct send_ctx *sctx,
                        ret = is_ancestor(sctx->parent_root,
                                          sctx->cur_ino, sctx->cur_inode_gen,
                                          ino, path_before);
-                        break;
+                        if (ret)
+                                break;
                }
                fs_path_reset(path_before);
@@ -3643,11 +3757,26 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                                goto out;
                        if (ret) {
                                struct name_cache_entry *nce;
+                                struct waiting_dir_move *wdm;
                                ret = orphanize_inode(sctx, ow_inode, ow_gen,
                                                cur->full_path);
                                if (ret < 0)
                                        goto out;
+                                /*
+                                 * If ow_inode has its rename operation delayed
+                                 * make sure that its orphanized name is used in
+                                 * the source path when performing its rename
+                                 * operation.
+                                 */
+                                if (is_waiting_for_move(sctx, ow_inode)) {
+                                        wdm = get_waiting_dir_move(sctx,
+                                                                   ow_inode);
+                                        ASSERT(wdm);
+                                        wdm->orphanized = true;
+                                }
                                /*
                                 * Make sure we clear our orphanized inode's
                                 * name from the name cache. This is because the
@@ -3663,6 +3792,19 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                                        name_cache_delete(sctx, nce);
                                        kfree(nce);
                                }
+                                /*
+                                 * ow_inode might currently be an ancestor of
+                                 * cur_ino, therefore compute valid_path (the
+                                 * current path of cur_ino) again because it
+                                 * might contain the pre-orphanization name of
+                                 * ow_inode, which is no longer valid.
+                                 */
+                                fs_path_reset(valid_path);
+                                ret = get_cur_path(sctx, sctx->cur_ino,
+                                           sctx->cur_inode_gen, valid_path);
+                                if (ret < 0)
+                                        goto out;
                        } else {
                                ret = send_unlink(sctx, cur->full_path);
                                if (ret < 0)
@@ -5602,7 +5744,10 @@ static int changed_ref(struct send_ctx *sctx,
 {
        int ret = 0;
-        BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+        if (sctx->cur_ino != sctx->cmp_key->objectid) {
+                inconsistent_snapshot_error(sctx, result, "reference");
+                return -EIO;
+        }
        if (!sctx->cur_inode_new_gen &&
            sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
@@ -5627,7 +5772,10 @@ static int changed_xattr(struct send_ctx *sctx,
 {
        int ret = 0;
-        BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+        if (sctx->cur_ino != sctx->cmp_key->objectid) {
+                inconsistent_snapshot_error(sctx, result, "xattr");
+                return -EIO;
+        }
        if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
                if (result == BTRFS_COMPARE_TREE_NEW)
@@ -5651,7 +5799,10 @@ static int changed_extent(struct send_ctx *sctx,
 {
        int ret = 0;
-        BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+        if (sctx->cur_ino != sctx->cmp_key->objectid) {
+                inconsistent_snapshot_error(sctx, result, "extent");
+                return -EIO;
+        }
        if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
                if (result != BTRFS_COMPARE_TREE_DELETED)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 864ce334f696..4071fe2bd098 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2241,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb)
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+        root->fs_info->fs_frozen = 1;
+        /*
+         * We don't need a barrier here, we'll wait for any transaction that
+         * could be in progress on other threads (and do delayed iputs that
+         * we want to avoid on a frozen filesystem), or do the commit
+         * ourselves.
+         */
        trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans)) {
                /* no transaction, don't bother */
@@ -2251,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb)
        return btrfs_commit_transaction(trans, root);
 }
+/*
+ * Clear the fs_frozen flag that btrfs_freeze() sets on the fs_info.
+ * Always returns 0.
+ */
+static int btrfs_unfreeze(struct super_block *sb)
+{
+        struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+        root->fs_info->fs_frozen = 0;
+        return 0;
+}
 static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2299,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = {
        .statfs         = btrfs_statfs,
        .remount_fs     = btrfs_remount,
        .freeze_fs      = btrfs_freeze,
+        .unfreeze_fs    = btrfs_unfreeze,
 };
 static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9cca0a721961..95d41919d034 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2278,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
+        /*
+         * If the fs has been frozen we must not run delayed iputs here,
+         * otherwise we would deadlock on SB_FREEZE_FS.
+         */
        if (current != root->fs_info->transaction_kthread &&
-            current != root->fs_info->cleaner_kthread)
+            current != root->fs_info->cleaner_kthread &&
+            !root->fs_info->fs_frozen)
                btrfs_run_delayed_iputs(root);
        return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d31a0c4f56be..e935035ac034 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
 #include "backref.h"
 #include "hash.h"
 #include "compression.h"
+#include "qgroup.h"
 /* magic values for the inode_only field in btrfs_log_inode:
 *
@@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                ins.type = BTRFS_EXTENT_ITEM_KEY;
                offset = key->offset - btrfs_file_extent_offset(eb, item);
+                /*
+                 * Manually record the dirty extent: here we do a shallow copy
+                 * of the file extent item and skip the normal backref update,
+                 * modifying the extent tree all by ourselves.
+                 * So we need to manually record the dirty extent for qgroup,
+                 * as the owner of the file extent changes from the log tree
+                 * (doesn't affect qgroup) to the fs/file tree (affects qgroup)
+                 */
+                ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+                                btrfs_file_extent_disk_bytenr(eb, item),
+                                btrfs_file_extent_disk_num_bytes(eb, item),
+                                GFP_NOFS);
+                if (ret < 0)
+                        goto out;
                if (ins.objectid > 0) {
                        u64 csum_start;
                        u64 csum_end;
@@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         */
        mutex_unlock(&root->log_mutex);
-        btrfs_init_log_ctx(&root_log_ctx);
+        btrfs_init_log_ctx(&root_log_ctx, NULL);
        mutex_lock(&log_root_tree->log_mutex);
        atomic_inc(&log_root_tree->log_batch);
@@ -4469,7 +4485,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                         const int slot,
                                         const struct btrfs_key *key,
-                                         struct inode *inode)
+                                         struct inode *inode,
+                                         u64 *other_ino)
 {
        int ret;
        struct btrfs_path *search_path;
@@ -4528,7 +4545,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                           search_path, parent,
                                           name, this_name_len, 0);
                if (di && !IS_ERR(di)) {
-                        ret = 1;
+                        struct btrfs_key di_key;
+                        btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+                                                  di, &di_key);
+                        if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+                                ret = 1;
+                                *other_ino = di_key.objectid;
+                        } else {
+                                ret = -EAGAIN;
+                        }
                        goto out;
                } else if (IS_ERR(di)) {
                        ret = PTR_ERR(di);
@@ -4722,16 +4748,72 @@ again:
                if ((min_key.type == BTRFS_INODE_REF_KEY ||
                     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
                    BTRFS_I(inode)->generation == trans->transid) {
+                        u64 other_ino = 0;
                        ret = btrfs_check_ref_name_override(path->nodes[0],
                                                            path->slots[0],
-                                                            &min_key, inode);
+                                                            &min_key, inode,
+                                                            &other_ino);
                        if (ret < 0) {
                                err = ret;
                                goto out_unlock;
-                        } else if (ret > 0) {
+                        } else if (ret > 0 && ctx &&
-                                err = 1;
+                                   other_ino != btrfs_ino(ctx->inode)) {
-                                btrfs_set_log_full_commit(root->fs_info, trans);
+                                struct btrfs_key inode_key;
-                                goto out_unlock;
+                                struct inode *other_inode;
+                                if (ins_nr > 0) {
+                                        ins_nr++;
+                                } else {
+                                        ins_nr = 1;
+                                        ins_start_slot = path->slots[0];
+                                }
+                                ret = copy_items(trans, inode, dst_path, path,
+                                                 &last_extent, ins_start_slot,
+                                                 ins_nr, inode_only,
+                                                 logged_isize);
+                                if (ret < 0) {
+                                        err = ret;
+                                        goto out_unlock;
+                                }
+                                ins_nr = 0;
+                                btrfs_release_path(path);
+                                inode_key.objectid = other_ino;
+                                inode_key.type = BTRFS_INODE_ITEM_KEY;
+                                inode_key.offset = 0;
+                                other_inode = btrfs_iget(root->fs_info->sb,
+                                                         &inode_key, root,
+                                                         NULL);
+                                /*
+                                 * If the other inode that had a conflicting dir
+                                 * entry was deleted in the current transaction,
+                                 * we don't need to do more work or fall back to
+                                 * a transaction commit.
+                                 */
+                                if (IS_ERR(other_inode) &&
+                                    PTR_ERR(other_inode) == -ENOENT) {
+                                        goto next_key;
+                                } else if (IS_ERR(other_inode)) {
+                                        err = PTR_ERR(other_inode);
+                                        goto out_unlock;
+                                }
+                                /*
+                                 * We are safe logging the other inode without
+                                 * acquiring its i_mutex as long as we log with
+                                 * the LOG_INODE_EXISTS mode. We're safe against
+                                 * concurrent renames of the other inode as well
+                                 * because during a rename we pin the log and
+                                 * update the log with the new name before we
+                                 * unpin it.
+                                 */
+                                err = btrfs_log_inode(trans, root, other_inode,
+                                                      LOG_INODE_EXISTS,
+                                                      0, LLONG_MAX, ctx);
+                                iput(other_inode);
+                                if (err)
+                                        goto out_unlock;
+                                else
+                                        goto next_key;
                        }
                }
@@ -4799,7 +4881,7 @@ next_slot:
                        ins_nr = 0;
                }
                btrfs_release_path(path);
+next_key:
                if (min_key.offset < (u64)-1) {
                        min_key.offset++;
                } else if (min_key.type < max_key.type) {
@@ -4993,8 +5075,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
                if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                        break;
-                if (IS_ROOT(parent))
+                if (IS_ROOT(parent)) {
+                        inode = d_inode(parent);
+                        if (btrfs_must_commit_transaction(trans, inode))
+                                ret = 1;
                        break;
+                }
                parent = dget_parent(parent);
                dput(old_parent);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index a9f1b75d080d..ab858e31ccbc 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -30,15 +30,18 @@ struct btrfs_log_ctx {
        int log_transid;
        int io_err;
        bool log_new_dentries;
+        struct inode *inode;
        struct list_head list;
 };
-static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
+                                      struct inode *inode)
 {
        ctx->log_ret = 0;
        ctx->log_transid = 0;
        ctx->io_err = 0;
        ctx->log_new_dentries = false;
+        ctx->inode = inode;
        INIT_LIST_HEAD(&ctx->list);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 51f125508771..035efce603a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -834,10 +834,6 @@ static void __free_device(struct work_struct *work)
        struct btrfs_device *device;
        device = container_of(work, struct btrfs_device, rcu_work);
-        if (device->bdev)
-                blkdev_put(device->bdev, device->mode);
        rcu_string_free(device->name);
        kfree(device);
 }
@@ -852,6 +848,17 @@ static void free_device(struct rcu_head *head)
        schedule_work(&device->rcu_work);
 }
+/*
+ * Drop the block device reference held by @device, if there is one.
+ * For a writeable device the bdev is synced and invalidated before
+ * blkdev_put() is called with device->mode. Does nothing when
+ * device->bdev is NULL; device->bdev itself is not cleared here.
+ */
+static void btrfs_close_bdev(struct btrfs_device *device)
+{
+        if (device->bdev && device->writeable) {
+                sync_blockdev(device->bdev);
+                invalidate_bdev(device->bdev);
+        }
+        if (device->bdev)
+                blkdev_put(device->bdev, device->mode);
+}
 static void btrfs_close_one_device(struct btrfs_device *device)
 {
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
@@ -870,10 +877,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
        if (device->missing)
                fs_devices->missing_devices--;
-        if (device->bdev && device->writeable) {
+        btrfs_close_bdev(device);
-                sync_blockdev(device->bdev);
-                invalidate_bdev(device->bdev);
-        }
        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
@@ -1932,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
                btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
        }
+        btrfs_close_bdev(device);
        call_rcu(&device->rcu, free_device);
        num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
@@ -2025,6 +2031,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
                /* zero out the old super if it is writable */
                btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
        }
+        btrfs_close_bdev(srcdev);
        call_rcu(&srcdev->rcu, free_device);
        /*
@@ -2080,6 +2089,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
         * the device_list_mutex lock.
         */
        btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+        btrfs_close_bdev(tgtdev);
        call_rcu(&tgtdev->rcu, free_device);
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 99115cae1652..16e6ded0b7f2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1347,9 +1347,12 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
 {
        struct inode *inode = &ci->vfs_inode;
        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
-        struct ceph_mds_session *session = *psession;
+        struct ceph_mds_session *session = NULL;
        int mds;
        dout("ceph_flush_snaps %p\n", inode);
+        if (psession)
+                session = *psession;
 retry:
        spin_lock(&ci->i_ceph_lock);
        if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fa59a85226b2..f72d4ae303b2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2759,6 +2759,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
        } else {
                path = NULL;
                pathlen = 0;
+                pathbase = 0;
        }
        spin_lock(&ci->i_ceph_lock);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index eea64912c9c0..466f7d60edc2 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -607,20 +607,54 @@ static const struct file_operations format2_fops;
 static const struct file_operations format3_fops;
 static const struct file_operations format4_fops;
-static int table_open(struct inode *inode, struct file *file)
+static int table_open1(struct inode *inode, struct file *file)
 {
        struct seq_file *seq;
-        int ret = -1;
+        int ret;
-        if (file->f_op == &format1_fops)
+        ret = seq_open(file, &format1_seq_ops);
-                ret = seq_open(file, &format1_seq_ops);
+        if (ret)
-        else if (file->f_op == &format2_fops)
+                return ret;
-                ret = seq_open(file, &format2_seq_ops);
-        else if (file->f_op == &format3_fops)
+        seq = file->private_data;
-                ret = seq_open(file, &format3_seq_ops);
+        seq->private = inode->i_private; /* the dlm_ls */
-        else if (file->f_op == &format4_fops)
+        return 0;
-                ret = seq_open(file, &format4_seq_ops);
+}
+static int table_open2(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq;
+        int ret;
+        ret = seq_open(file, &format2_seq_ops);
+        if (ret)
+                return ret;
+        seq = file->private_data;
+        seq->private = inode->i_private; /* the dlm_ls */
+        return 0;
+}
+static int table_open3(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq;
+        int ret;
+        ret = seq_open(file, &format3_seq_ops);
+        if (ret)
+                return ret;
+        seq = file->private_data;
+        seq->private = inode->i_private; /* the dlm_ls */
+        return 0;
+}
+static int table_open4(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq;
+        int ret;
+        ret = seq_open(file, &format4_seq_ops);
        if (ret)
                return ret;
@@ -631,7 +665,7 @@ static int table_open(struct inode *inode, struct file *file)
 static const struct file_operations format1_fops = {
        .owner   = THIS_MODULE,
-        .open    = table_open,
+        .open    = table_open1,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release
@@ -639,7 +673,7 @@ static const struct file_operations format1_fops = {
 static const struct file_operations format2_fops = {
        .owner   = THIS_MODULE,
-        .open    = table_open,
+        .open    = table_open2,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release
@@ -647,7 +681,7 @@ static const struct file_operations format2_fops = {
 static const struct file_operations format3_fops = {
        .owner   = THIS_MODULE,
-        .open    = table_open,
+        .open    = table_open3,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release
@@ -655,7 +689,7 @@ static const struct file_operations format3_fops = {
 static const struct file_operations format4_fops = {
        .owner   = THIS_MODULE,
-        .open    = table_open,
+        .open    = table_open4,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d64d2a515cb2..ccb401eebc11 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1699,11 +1699,11 @@ static int f2fs_write_end(struct file *file,
        trace_f2fs_write_end(inode, pos, len, copied);
        set_page_dirty(page);
-        f2fs_put_page(page, 1);
        if (pos + copied > i_size_read(inode))
                f2fs_i_size_write(inode, pos + copied);
+        f2fs_put_page(page, 1);
        f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
        return copied;
 }
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 675fa79d86f6..14f5fe2b841e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -538,7 +538,7 @@ struct f2fs_nm_info {
        /* NAT cache management */
        struct radix_tree_root nat_root;/* root of the nat entry cache */
        struct radix_tree_root nat_set_root;/* root of the nat set cache */
-        struct percpu_rw_semaphore nat_tree_lock;       /* protect nat_tree_lock */
+        struct rw_semaphore nat_tree_lock;      /* protect nat_tree_lock */
        struct list_head nat_entries;   /* cached nat entry list (clean) */
        unsigned int nat_cnt;           /* the # of cached nat entries */
        unsigned int dirty_nat_cnt;     /* total num of nat entries in set */
@@ -787,7 +787,7 @@ struct f2fs_sb_info {
        struct f2fs_checkpoint *ckpt;           /* raw checkpoint pointer */
        struct inode *meta_inode;               /* cache meta blocks */
        struct mutex cp_mutex;                  /* checkpoint procedure lock */
-        struct percpu_rw_semaphore cp_rwsem;            /* blocking FS operations */
+        struct rw_semaphore cp_rwsem;           /* blocking FS operations */
        struct rw_semaphore node_write;         /* locking node writes */
        wait_queue_head_t cp_wait;
        unsigned long last_time[MAX_TIME];      /* to store time in jiffies */
@@ -1074,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
 static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
 {
-        percpu_down_read(&sbi->cp_rwsem);
+        down_read(&sbi->cp_rwsem);
 }
 static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
 {
-        percpu_up_read(&sbi->cp_rwsem);
+        up_read(&sbi->cp_rwsem);
 }
 static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
 {
-        percpu_down_write(&sbi->cp_rwsem);
+        down_write(&sbi->cp_rwsem);
 }
 static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 {
-        percpu_up_write(&sbi->cp_rwsem);
+        up_write(&sbi->cp_rwsem);
 }
 static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0e493f63ea41..47abb96098e4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2086,15 +2086,19 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
        if (unlikely(f2fs_readonly(src->i_sb)))
                return -EROFS;
-        if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode))
+        if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode))
-                return -EISDIR;
+                return -EINVAL;
        if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
                return -EOPNOTSUPP;
        inode_lock(src);
-        if (src != dst)
+        if (src != dst) {
-                inode_lock(dst);
+                if (!inode_trylock(dst)) {
+                        ret = -EBUSY;
+                        goto out;
+                }
+        }
        ret = -EINVAL;
        if (pos_in + len > src->i_size || pos_in + len < pos_in)
@@ -2152,6 +2156,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
 out_unlock:
        if (src != dst)
                inode_unlock(dst);
+out:
        inode_unlock(src);
        return ret;
 }
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b2fa4b615925..f75d197d5beb 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -206,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
        struct nat_entry *e;
        bool need = false;
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, nid);
        if (e) {
                if (!get_nat_flag(e, IS_CHECKPOINTED) &&
                                !get_nat_flag(e, HAS_FSYNCED_INODE))
                        need = true;
        }
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        return need;
 }
@@ -223,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
        struct nat_entry *e;
        bool is_cp = true;
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, nid);
        if (e && !get_nat_flag(e, IS_CHECKPOINTED))
                is_cp = false;
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        return is_cp;
 }
@@ -237,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
        struct nat_entry *e;
        bool need_update = true;
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, ino);
        if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
                        (get_nat_flag(e, IS_CHECKPOINTED) ||
                         get_nat_flag(e, HAS_FSYNCED_INODE)))
                need_update = false;
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        return need_update;
 }
@@ -284,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct nat_entry *e;
-        percpu_down_write(&nm_i->nat_tree_lock);
+        down_write(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, ni->nid);
        if (!e) {
                e = grab_nat_entry(nm_i, ni->nid);
@@ -334,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
                        set_nat_flag(e, HAS_FSYNCED_INODE, true);
                set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
        }
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
 }
 int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -342,7 +342,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        int nr = nr_shrink;
-        percpu_down_write(&nm_i->nat_tree_lock);
+        if (!down_write_trylock(&nm_i->nat_tree_lock))
+                return 0;
        while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
                struct nat_entry *ne;
@@ -351,7 +352,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
                __del_from_nat_cache(nm_i, ne);
                nr_shrink--;
        }
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
        return nr - nr_shrink;
 }
@@ -373,13 +374,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
        ni->nid = nid;
        /* Check nat cache */
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, nid);
        if (e) {
                ni->ino = nat_get_ino(e);
                ni->blk_addr = nat_get_blkaddr(e);
                ni->version = nat_get_version(e);
-                percpu_up_read(&nm_i->nat_tree_lock);
+                up_read(&nm_i->nat_tree_lock);
                return;
        }
@@ -403,11 +404,11 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
        node_info_from_raw_nat(ni, &ne);
        f2fs_put_page(page, 1);
 cache:
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        /* cache nat entry */
-        percpu_down_write(&nm_i->nat_tree_lock);
+        down_write(&nm_i->nat_tree_lock);
        cache_nat_entry(sbi, nid, &ne);
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
 }
 /*
@@ -1788,7 +1789,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
        ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
                                                        META_NAT, true);
-        percpu_down_read(&nm_i->nat_tree_lock);
+        down_read(&nm_i->nat_tree_lock);
        while (1) {
                struct page *page = get_current_nat_page(sbi, nid);
@@ -1820,7 +1821,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
                        remove_free_nid(nm_i, nid);
        }
        up_read(&curseg->journal_rwsem);
-        percpu_up_read(&nm_i->nat_tree_lock);
+        up_read(&nm_i->nat_tree_lock);
        ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
                                        nm_i->ra_nid_pages, META_NAT, false);
@@ -2209,7 +2210,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
        if (!nm_i->dirty_nat_cnt)
                return;
-        percpu_down_write(&nm_i->nat_tree_lock);
+        down_write(&nm_i->nat_tree_lock);
        /*
         * if there are no enough space in journal to store dirty nat
@@ -2232,7 +2233,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
        list_for_each_entry_safe(set, tmp, &sets, set_list)
                __flush_nat_entry_set(sbi, set);
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
        f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
 }
@@ -2268,8 +2269,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
        mutex_init(&nm_i->build_lock);
        spin_lock_init(&nm_i->free_nid_list_lock);
-        if (percpu_init_rwsem(&nm_i->nat_tree_lock))
+        init_rwsem(&nm_i->nat_tree_lock);
-                return -ENOMEM;
        nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
        nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -2326,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
        spin_unlock(&nm_i->free_nid_list_lock);
        /* destroy nat cache */
-        percpu_down_write(&nm_i->nat_tree_lock);
+        down_write(&nm_i->nat_tree_lock);
        while ((found = __gang_lookup_nat_cache(nm_i,
                                        nid, NATVEC_SIZE, natvec))) {
                unsigned idx;
@@ -2351,9 +2351,8 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
                        kmem_cache_free(nat_entry_set_slab, setvec[idx]);
                }
        }
-        percpu_up_write(&nm_i->nat_tree_lock);
+        up_write(&nm_i->nat_tree_lock);
-        percpu_free_rwsem(&nm_i->nat_tree_lock);
        kfree(nm_i->nat_bitmap);
        sbi->nm_info = NULL;
        kfree(nm_i);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1b86d3f638ef..7f863a645ab1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -706,8 +706,6 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
                percpu_counter_destroy(&sbi->nr_pages[i]);
        percpu_counter_destroy(&sbi->alloc_valid_block_count);
        percpu_counter_destroy(&sbi->total_valid_inode_count);
-        percpu_free_rwsem(&sbi->cp_rwsem);
 }
 static void f2fs_put_super(struct super_block *sb)
@@ -1483,9 +1481,6 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
 {
        int i, err;
-        if (percpu_init_rwsem(&sbi->cp_rwsem))
-                return -ENOMEM;
        for (i = 0; i < NR_COUNT_TYPE; i++) {
                err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL);
                if (err)
@@ -1686,6 +1681,7 @@ try_onemore:
                sbi->write_io[i].bio = NULL;
        }
+        init_rwsem(&sbi->cp_rwsem);
        init_waitqueue_head(&sbi->cp_wait);
        init_sb_info(sbi);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4d09d4441e3e..05713a5da083 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1949,6 +1949,12 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 {
        struct backing_dev_info *bdi;
+        /*
+         * If we are expecting writeback progress we must submit plugged IO.
+         */
+        if (blk_needs_flush_plug(current))
+                blk_schedule_flush_plug(current);
        if (!nr_pages)
                nr_pages = get_nr_dirty_pages();
diff --git a/fs/iomap.c b/fs/iomap.c
index 48141b8eff5f..0342254646e3 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -84,8 +84,11 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
         * Now the data has been copied, commit the range we've copied.  This
         * should not fail unless the filesystem has had a fatal error.
         */
-        ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
+        if (ops->iomap_end) {
-                        flags, &iomap);
+                ret = ops->iomap_end(inode, pos, length,
+                                     written > 0 ? written : 0,
+                                     flags, &iomap);
+        }
        return written ? written : ret;
 }
@@ -194,12 +197,9 @@ again:
                if (mapping_writably_mapped(inode->i_mapping))
                        flush_dcache_page(page);
-                pagefault_disable();
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
-                pagefault_enable();
                flush_dcache_page(page);
-                mark_page_accessed(page);
                status = iomap_write_end(inode, pos, bytes, copied, page);
                if (unlikely(status < 0))
@@ -470,13 +470,18 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
        if (ret)
                return ret;
-        ret = filemap_write_and_wait(inode->i_mapping);
+        if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
-        if (ret)
+                ret = filemap_write_and_wait(inode->i_mapping);
-                return ret;
+                if (ret)
+                        return ret;
+        }
        while (len > 0) {
                ret = iomap_apply(inode, start, len, 0, ops, &ctx,
                                iomap_fiemap_actor);
+                /* inode with no (attribute) mapping will give ENOENT */
+                if (ret == -ENOENT)
+                        break;
                if (ret < 0)
                        return ret;
                if (ret == 0)
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 33da841a21bb..6f4752734804 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -338,6 +338,8 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
        case 0:
                break;
        case -NFS4ERR_EXPIRED:
+        case -NFS4ERR_ADMIN_REVOKED:
+        case -NFS4ERR_DELEG_REVOKED:
        case -NFS4ERR_STALE_STATEID:
        case -NFS4ERR_OLD_STATEID:
        case -NFS4ERR_BAD_STATEID:
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 324bfdc21250..9bf64eacba5b 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -396,6 +396,10 @@ extern void nfs4_schedule_state_renewal(struct nfs_client *);
 extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
 extern void nfs4_kill_renewd(struct nfs_client *);
 extern void nfs4_renew_state(struct work_struct *);
+extern void nfs4_set_lease_period(struct nfs_client *clp,
+                unsigned long lease,
+                unsigned long lastrenewed);
 /* nfs4state.c */
 struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a036e93bdf96..1949bbd806eb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4237,12 +4237,9 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
                err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
                trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
                if (err == 0) {
-                        struct nfs_client *clp = server->nfs_client;
+                        nfs4_set_lease_period(server->nfs_client,
+                                        fsinfo->lease_time * HZ,
-                        spin_lock(&clp->cl_lock);
+                                        now);
-                        clp->cl_lease_time = fsinfo->lease_time * HZ;
-                        clp->cl_last_renewal = now;
-                        spin_unlock(&clp->cl_lock);
                        break;
                }
                err = nfs4_handle_exception(server, err, &exception);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e1ba58c3d1ad..82e77198d17e 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -136,6 +136,26 @@ nfs4_kill_renewd(struct nfs_client *clp)
        cancel_delayed_work_sync(&clp->cl_renewd);
 }
+/**
+ * nfs4_set_lease_period - Sets the lease period on a nfs_client
+ *
+ * @clp: pointer to nfs_client
+ * @lease: new value for lease period
+ * @lastrenewed: time at which lease was last renewed
+ *
+ * Stores @lease in clp->cl_lease_time and @lastrenewed in
+ * clp->cl_last_renewal while holding clp->cl_lock, then, after dropping the
+ * lock, caps the maximum reconnect timeout of clp->cl_rpcclient at half of
+ * @lease.  The callers in this patch pass @lease as lease_time * HZ and
+ * @lastrenewed as a timestamp sampled before the lease-time request was sent.
+ */
+void nfs4_set_lease_period(struct nfs_client *clp,
+                unsigned long lease,
+                unsigned long lastrenewed)
+{
+        spin_lock(&clp->cl_lock);
+        clp->cl_lease_time = lease;
+        clp->cl_last_renewal = lastrenewed;
+        spin_unlock(&clp->cl_lock);
+        /* Cap maximum reconnect timeout at 1/2 lease period */
+        rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1);
+}
 /*
 * Local variables:
 *   c-basic-offset: 8
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 834b875900d6..cada00aa5096 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -277,20 +277,17 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
 {
        int status;
        struct nfs_fsinfo fsinfo;
+        unsigned long now;
        if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
                nfs4_schedule_state_renewal(clp);
                return 0;
        }
+        now = jiffies;
        status = nfs4_proc_get_lease_time(clp, &fsinfo);
        if (status == 0) {
-                /* Update lease time and schedule renewal */
+                nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now);
-                spin_lock(&clp->cl_lock);
-                clp->cl_lease_time = fsinfo.lease_time * HZ;
-                clp->cl_last_renewal = jiffies;
-                spin_unlock(&clp->cl_lock);
                nfs4_schedule_state_renewal(clp);
        }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8410ca275db1..a204d7e109d4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4903,6 +4903,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        return nfs_ok;
 }
+static __be32
+nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s)
+{       /*
+         * Free the lock stateid @s on behalf of FREE_STATEID.
+         *
+         * Called from nfsd4_free_stateid() for NFS4_LOCK_STID after it has
+         * bumped s->sc_count and dropped cl->cl_lock.  With stp->st_mutex
+         * held: return the check_stateid_generation() status if it is
+         * non-zero, return nfserr_locks_held if check_for_locks() reports
+         * locks for this file and lockowner, otherwise release the lock
+         * stateid and return nfs_ok.  Every path unlocks the mutex and
+         * drops one reference on @s via nfs4_put_stid().
+         */
+        struct nfs4_ol_stateid *stp = openlockstateid(s);
+        __be32 ret;
+        mutex_lock(&stp->st_mutex);
+        ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+        if (ret)
+                goto out;
+        ret = nfserr_locks_held;
+        if (check_for_locks(stp->st_stid.sc_file,
+                            lockowner(stp->st_stateowner)))
+                goto out;
+        release_lock_stateid(stp);
+        ret = nfs_ok;
+out:
+        mutex_unlock(&stp->st_mutex);
+        nfs4_put_stid(s);
+        return ret;
+}
 __be32
 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                   struct nfsd4_free_stateid *free_stateid)
@@ -4910,7 +4936,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        stateid_t *stateid = &free_stateid->fr_stateid;
        struct nfs4_stid *s;
        struct nfs4_delegation *dp;
-        struct nfs4_ol_stateid *stp;
        struct nfs4_client *cl = cstate->session->se_client;
        __be32 ret = nfserr_bad_stateid;
@@ -4929,18 +4954,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                ret = nfserr_locks_held;
                break;
        case NFS4_LOCK_STID:
-                ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+                atomic_inc(&s->sc_count);
-                if (ret)
-                        break;
-                stp = openlockstateid(s);
-                ret = nfserr_locks_held;
-                if (check_for_locks(stp->st_stid.sc_file,
-                                    lockowner(stp->st_stateowner)))
-                        break;
-                WARN_ON(!unhash_lock_stateid(stp));
                spin_unlock(&cl->cl_lock);
-                nfs4_put_stid(s);
+                ret = nfsd4_free_lock_stateid(stateid, s);
-                ret = nfs_ok;
                goto out;
        case NFS4_REVOKED_DELEG_STID:
                dp = delegstateid(s);
@@ -5507,7 +5523,7 @@ static __be32
 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
                            struct nfs4_ol_stateid *ost,
                            struct nfsd4_lock *lock,
-                            struct nfs4_ol_stateid **lst, bool *new)
+                            struct nfs4_ol_stateid **plst, bool *new)
 {
        __be32 status;
        struct nfs4_file *fi = ost->st_stid.sc_file;
@@ -5515,7 +5531,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
        struct nfs4_client *cl = oo->oo_owner.so_client;
        struct inode *inode = d_inode(cstate->current_fh.fh_dentry);
        struct nfs4_lockowner *lo;
+        struct nfs4_ol_stateid *lst;
        unsigned int strhashval;
+        bool hashed;
        lo = find_lockowner_str(cl, &lock->lk_new_owner);
        if (!lo) {
@@ -5531,12 +5549,27 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
                        goto out;
        }
-        *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
+retry:
-        if (*lst == NULL) {
+        lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
+        if (lst == NULL) {
                status = nfserr_jukebox;
                goto out;
        }
+        mutex_lock(&lst->st_mutex);
+        /* See if it's still hashed to avoid race with FREE_STATEID */
+        spin_lock(&cl->cl_lock);
+        hashed = !list_empty(&lst->st_perfile);
+        spin_unlock(&cl->cl_lock);
+        if (!hashed) {
+                mutex_unlock(&lst->st_mutex);
+                nfs4_put_stid(&lst->st_stid);
+                goto retry;
+        }
        status = nfs_ok;
+        *plst = lst;
 out:
        nfs4_put_stateowner(&lo->lo_owner);
        return status;
@@ -5603,8 +5636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out;
                status = lookup_or_create_lock_state(cstate, open_stp, lock,
                                                        &lock_stp, &new);
-                if (status == nfs_ok)
-                        mutex_lock(&lock_stp->st_mutex);
        } else {
                status = nfs4_preprocess_seqid_op(cstate,
                                       lock->lk_old_lock_seqid,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ba944123167b..ff476e654b8f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1252,10 +1252,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (IS_ERR(dchild))
                return nfserrno(host_err);
        err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
-        if (err) {
+        /*
-                dput(dchild);
+         * We unconditionally drop our ref to dchild as fh_compose will have
+         * already grabbed its own ref for it.
+         */
+        dput(dchild);
+        if (err)
                return err;
-        }
        return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
                                        rdev, resfhp);
 }
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 09e18fdf61e5..b9a8c813e5e6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -46,7 +46,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                cached = 0;
        for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
-                pages[lru] = global_page_state(NR_LRU_BASE + lru);
+                pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
        available = si_mem_available();
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 19f532e7d35e..6dc4296eed62 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -223,8 +223,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
                size -= n;
                buf += n;
                copied += n;
-                if (!m->count)
+                if (!m->count) {
+                        m->from = 0;
                        m->index++;
+                }
                if (!size)
                        goto Done;
        }
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index b45345d701e7..51157da3f76e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -370,7 +370,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
        p = c->gap_lebs;
        do {
-                ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs);
+                ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs);
                written = layout_leb_in_gaps(c, p);
                if (written < 0) {
                        err = written;
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e237811f09ce..11a004114eba 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -575,7 +575,8 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
        dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name,
                inode->i_ino, dentry, size);
-        return  __ubifs_getxattr(inode, name, buffer, size);
+        name = xattr_full_name(handler, name);
+        return __ubifs_getxattr(inode, name, buffer, size);
 }
 static int ubifs_xattr_set(const struct xattr_handler *handler,
@@ -586,6 +587,8 @@ static int ubifs_xattr_set(const struct xattr_handler *handler,
        dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
                name, inode->i_ino, dentry, size);
+        name = xattr_full_name(handler, name);
        if (value)
                return __ubifs_setxattr(inode, name, value, size, flags);
        else
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 776ae2f325d1..3dd8f1d54498 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1582,6 +1582,7 @@ xfs_alloc_ag_vextent_small(
        xfs_extlen_t    *flenp, /* result length */
        int             *stat)  /* status: 0-freelist, 1-normal/none */
 {
+        struct xfs_owner_info   oinfo;
        int             error;
        xfs_agblock_t   fbno;
        xfs_extlen_t    flen;
@@ -1624,6 +1625,18 @@ xfs_alloc_ag_vextent_small(
                                error0);
                        args->wasfromfl = 1;
                        trace_xfs_alloc_small_freelist(args);
+                        /*
+                         * If we're feeding an AGFL block to something that
+                         * doesn't live in the free space, we need to clear
+                         * out the OWN_AG rmap.
+                         */
+                        xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+                        error = xfs_rmap_free(args->tp, args->agbp, args->agno,
+                                        fbno, 1, &oinfo);
+                        if (error)
+                                goto error0;
                        *stat = 0;
                        return 0;
                }
@@ -2264,6 +2277,7 @@ xfs_alloc_log_agf(
                offsetof(xfs_agf_t, agf_longest),
                offsetof(xfs_agf_t, agf_btreeblks),
                offsetof(xfs_agf_t, agf_uuid),
+                offsetof(xfs_agf_t, agf_rmap_blocks),
                sizeof(xfs_agf_t)
        };
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index f814d42c73b2..e6a8bea0f7ba 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -640,12 +640,15 @@ typedef struct xfs_agf {
        __be32          agf_btreeblks;  /* # of blocks held in AGF btrees */
        uuid_t          agf_uuid;       /* uuid of filesystem */
+        __be32          agf_rmap_blocks;        /* rmapbt blocks used */
+        __be32          agf_padding;            /* padding */
        /*
         * reserve some contiguous space for future logged fields before we add
         * the unlogged fields. This makes the range logging via flags and
         * structure offsets much simpler.
         */
-        __be64          agf_spare64[16];
+        __be64          agf_spare64[15];
        /* unlogged fields, written during buffer writeback. */
        __be64          agf_lsn;        /* last write sequence */
@@ -670,7 +673,8 @@ typedef struct xfs_agf {
 #define XFS_AGF_LONGEST         0x00000400
 #define XFS_AGF_BTREEBLKS       0x00000800
 #define XFS_AGF_UUID            0x00001000
-#define XFS_AGF_NUM_BITS        13
+#define XFS_AGF_RMAP_BLOCKS     0x00002000
+#define XFS_AGF_NUM_BITS        14
 #define XFS_AGF_ALL_BITS        ((1 << XFS_AGF_NUM_BITS) - 1)
 #define XFS_AGF_FLAGS \
@@ -686,7 +690,8 @@ typedef struct xfs_agf {
        { XFS_AGF_FREEBLKS,     "FREEBLKS" }, \
        { XFS_AGF_LONGEST,      "LONGEST" }, \
        { XFS_AGF_BTREEBLKS,    "BTREEBLKS" }, \
-        { XFS_AGF_UUID,         "UUID" }
+        { XFS_AGF_UUID,         "UUID" }, \
+        { XFS_AGF_RMAP_BLOCKS,  "RMAP_BLOCKS" }
 /* disk block (xfs_daddr_t) in the AG */
 #define XFS_AGF_DADDR(mp)       ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index bc1faebc84ec..17b8eeb34ac8 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -98,6 +98,8 @@ xfs_rmapbt_alloc_block(
        union xfs_btree_ptr     *new,
        int                     *stat)
 {
+        struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+        struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
        int                     error;
        xfs_agblock_t           bno;
@@ -124,6 +126,8 @@ xfs_rmapbt_alloc_block(
        xfs_trans_agbtree_delta(cur->bc_tp, 1);
        new->s = cpu_to_be32(bno);
+        be32_add_cpu(&agf->agf_rmap_blocks, 1);
+        xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
        *stat = 1;
@@ -143,6 +147,8 @@ xfs_rmapbt_free_block(
        bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
        trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
                        bno, 1);
+        be32_add_cpu(&agf->agf_rmap_blocks, -1);
+        xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
        error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
        if (error)
                return error;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 47a318ce82e0..607cc29bba21 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -115,7 +115,6 @@ xfs_buf_ioacct_dec(
        if (!(bp->b_flags & _XBF_IN_FLIGHT))
                return;
-        ASSERT(bp->b_flags & XBF_ASYNC);
        bp->b_flags &= ~_XBF_IN_FLIGHT;
        percpu_counter_dec(&bp->b_target->bt_io_count);
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ed95e5bb04e6..e612a0233710 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -741,9 +741,20 @@ xfs_file_dax_write(
         * page is inserted into the pagecache when we have to serve a write
         * fault on a hole.  It should never be dirtied and can simply be
         * dropped from the pagecache once we get real data for the page.
+         *
+         * XXX: This is racy against mmap, and there's nothing we can do about
+         * it. dax_do_io() should really do this invalidation internally as
+         * it will know if we've allocated over a hole for this specific IO and
+         * if so it needs to update the mapping tree and invalidate existing
+         * PTEs over the newly allocated range. Remove this invalidation when
+         * dax_do_io() is fixed up.
         */
        if (mapping->nrpages) {
-                ret = invalidate_inode_pages2(mapping);
+                loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+                ret = invalidate_inode_pages2_range(mapping,
+                                                    iocb->ki_pos >> PAGE_SHIFT,
+                                                    end >> PAGE_SHIFT);
                WARN_ON_ONCE(ret);
        }
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0f96847b90e1..0b7f986745c1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -248,6 +248,7 @@ xfs_growfs_data_private(
                        agf->agf_roots[XFS_BTNUM_RMAPi] =
                                                cpu_to_be32(XFS_RMAP_BLOCK(mp));
                        agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+                        agf->agf_rmap_blocks = cpu_to_be32(1);
                }
                agf->agf_flfirst = cpu_to_be32(1);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2114d53df433..2af0dda1c978 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -715,12 +715,16 @@ xfs_iomap_write_allocate(
                 * is in the delayed allocation extent on which we sit
                 * but before our buffer starts.
                 */
                nimaps = 0;
                while (nimaps == 0) {
                        nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+                        /*
-                        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
+                         * We have already reserved space for the extent and any
+                         * indirect blocks when creating the delalloc extent,
+                         * there is no need to reserve space in this transaction
+                         * again.
+                         */
+                        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
                                        0, XFS_TRANS_RESERVE, &tp);
                        if (error)
                                return error;
@@ -1037,20 +1041,14 @@ xfs_file_iomap_begin(
                        return error;
                trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
-                xfs_bmbt_to_iomap(ip, iomap, &imap);
-        } else if (nimaps) {
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                trace_xfs_iomap_found(ip, offset, length, 0, &imap);
-                xfs_bmbt_to_iomap(ip, iomap, &imap);
        } else {
+                ASSERT(nimaps);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+                trace_xfs_iomap_found(ip, offset, length, 0, &imap);
-                iomap->blkno = IOMAP_NULL_BLOCK;
-                iomap->type = IOMAP_HOLE;
-                iomap->offset = offset;
-                iomap->length = length;
        }
+        xfs_bmbt_to_iomap(ip, iomap, &imap);
        return 0;
 }
@@ -1112,3 +1110,48 @@ struct iomap_ops xfs_iomap_ops = {
        .iomap_begin            = xfs_file_iomap_begin,
        .iomap_end              = xfs_file_iomap_end,
 };
+/*
+ * ->iomap_begin callback for FIEMAP_FLAG_XATTR requests: look up the
+ * attribute-fork mapping for the block range covering
+ * [offset, offset + length) and describe it in @iomap.
+ *
+ * Returns -EIO if the filesystem has been shut down, -ENOENT if the inode
+ * has no attribute fork or the fork holds no extents (iomap_fiemap() treats
+ * -ENOENT as "nothing to report"), otherwise the result of xfs_bmapi_read().
+ * @flags is not used.
+ */
+static int
+xfs_xattr_iomap_begin(
+        struct inode            *inode,
+        loff_t                  offset,
+        loff_t                  length,
+        unsigned                flags,
+        struct iomap            *iomap)
+{
+        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
+        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + length);
+        struct xfs_bmbt_irec    imap;
+        int                     nimaps = 1, error = 0;
+        unsigned                lockmode;
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -EIO;
+        /*
+         * NOTE(review): this takes the data-fork map lock although the lookup
+         * below is on the attribute fork - confirm whether the attr-fork
+         * variant of the lock helper should be used here instead.
+         */
+        lockmode = xfs_ilock_data_map_shared(ip);
+        /* if there is no attribute fork, or it has no extents, return ENOENT */
+        if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+                error = -ENOENT;
+                goto out_unlock;
+        }
+        ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
+        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+                               &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK);
+out_unlock:
+        xfs_iunlock(ip, lockmode);
+        /* imap is only valid when the lookup itself succeeded */
+        if (!error) {
+                ASSERT(nimaps);
+                xfs_bmbt_to_iomap(ip, iomap, &imap);
+        }
+        return error;
+}
+struct iomap_ops xfs_xattr_iomap_ops = {        /* passed to iomap_fiemap() for FIEMAP_FLAG_XATTR */
+        .iomap_begin            = xfs_xattr_iomap_begin,        /* no ->iomap_end is set */
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index e066d045e2ff..fb8aca3d69ab 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -35,5 +35,6 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
                struct xfs_bmbt_irec *);
 extern struct iomap_ops xfs_iomap_ops;
+extern struct iomap_ops xfs_xattr_iomap_ops;
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ab820f84ed50..b24c3102fa93 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1009,7 +1009,14 @@ xfs_vn_fiemap(
        int                     error;
        xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
-        error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+                fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
+                error = iomap_fiemap(inode, fieinfo, start, length,
+                                &xfs_xattr_iomap_ops);
+        } else {
+                error = iomap_fiemap(inode, fieinfo, start, length,
+                                &xfs_iomap_ops);
+        }
        xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
        return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 551b7e26980c..7e88bec3f359 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1298,7 +1298,6 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
 DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
 DEFINE_IOMAP_EVENT(xfs_iomap_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),