Diffstat (limited to 'fs')
44 files changed, 355 insertions, 209 deletions
@@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	 */
 	ret = retry(iocb);
 
-	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
+		/*
+		 * There's no easy way to restart the syscall since other AIO's
+		 * may be already running. Just fail this IO with EINTR.
+		 */
+		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+			     ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
+			ret = -EINTR;
 		aio_complete(iocb, ret, 0);
+	}
 out:
 	spin_lock_irq(&ctx->ctx_lock);
 
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index bc87b9c1d27e..0fcd2640c23f 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -3,6 +3,7 @@ config CEPH_FS
 	depends on INET && EXPERIMENTAL
 	select LIBCRC32C
 	select CRYPTO_AES
+	select CRYPTO
 	help
 	  Choose Y or M here to include support for mounting the
 	  experimental Ceph distributed file system. Ceph is an extremely
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4cfce1ee31fa..efbc604001c8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -411,8 +411,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	if (i_size < page_off + len)
 		len = i_size - page_off;
 
-	dout("writepage %p page %p index %lu on %llu~%u\n",
-	     inode, page, page->index, page_off, len);
+	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
+	     inode, page, page->index, page_off, len, snapc);
 
 	writeback_stat = atomic_long_inc_return(&client->writeback_count);
 	if (writeback_stat >
@@ -766,7 +766,8 @@ get_more_pages:
 			/* ok */
 			if (locked_pages == 0) {
 				/* prepare async write request */
-				offset = page->index << PAGE_CACHE_SHIFT;
+				offset = (unsigned long long)page->index
+					<< PAGE_CACHE_SHIFT;
 				len = wsize;
 				req = ceph_osdc_new_request(&client->osdc,
 					    &ci->i_layout,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a2069b6680ae..5e9da996a151 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -814,7 +814,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
 		used |= CEPH_CAP_PIN;
 	if (ci->i_rd_ref)
 		used |= CEPH_CAP_FILE_RD;
-	if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+	if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
 		used |= CEPH_CAP_FILE_CACHE;
 	if (ci->i_wr_ref)
 		used |= CEPH_CAP_FILE_WR;
@@ -1195,10 +1195,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 * asynchronously back to the MDS once sync writes complete and dirty
 * data is written out.
 *
+ * Unless @again is true, skip cap_snaps that were already sent to
+ * the MDS (i.e., during this session).
+ *
 * Called under i_lock.  Takes s_mutex as needed.
 */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
-			struct ceph_mds_session **psession)
+			struct ceph_mds_session **psession,
+			int again)
 		__releases(ci->vfs_inode->i_lock)
 		__acquires(ci->vfs_inode->i_lock)
 {
@@ -1227,7 +1231,7 @@ retry:
 		 * pages to be written out.
 		 */
 		if (capsnap->dirty_pages || capsnap->writing)
-			continue;
+			break;
 
 		/*
 		 * if cap writeback already occurred, we should have dropped
@@ -1240,6 +1244,13 @@ retry:
 			dout("no auth cap (migrating?), doing nothing\n");
 			goto out;
 		}
+
+		/* only flush each capsnap once */
+		if (!again && !list_empty(&capsnap->flushing_item)) {
+			dout("already flushed %p, skipping\n", capsnap);
+			continue;
+		}
+
 		mds = ci->i_auth_cap->session->s_mds;
 		mseq = ci->i_auth_cap->mseq;
 
@@ -1276,8 +1287,8 @@ retry:
 			 &session->s_cap_snaps_flushing);
 		spin_unlock(&inode->i_lock);
 
-		dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
-		     inode, capsnap, next_follows, capsnap->size);
+		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
+		     inode, capsnap, capsnap->follows, capsnap->flush_tid);
 		send_cap_msg(session, ceph_vino(inode).ino, 0,
 			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
 			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
@@ -1314,7 +1325,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 	struct inode *inode = &ci->vfs_inode;
 
 	spin_lock(&inode->i_lock);
-	__ceph_flush_snaps(ci, NULL);
+	__ceph_flush_snaps(ci, NULL, 0);
 	spin_unlock(&inode->i_lock);
 }
 
@@ -1477,7 +1488,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 
 		/* flush snaps first time around only */
 		if (!list_empty(&ci->i_cap_snaps))
-			__ceph_flush_snaps(ci, &session);
+			__ceph_flush_snaps(ci, &session, 0);
 		goto retry_locked;
 retry:
 	spin_lock(&inode->i_lock);
@@ -1894,7 +1905,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 		if (cap && cap->session == session) {
 			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
 			     cap, capsnap);
-			__ceph_flush_snaps(ci, &session);
+			__ceph_flush_snaps(ci, &session, 1);
 		} else {
 			pr_err("%p auth cap %p not mds%d ???\n", inode,
 			       cap, session->s_mds);
@@ -2272,7 +2283,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
-	int seq = le32_to_cpu(grant->seq);
+	unsigned seq = le32_to_cpu(grant->seq);
+	unsigned issue_seq = le32_to_cpu(grant->issue_seq);
 	int newcaps = le32_to_cpu(grant->caps);
 	int issued, implemented, used, wanted, dirty;
 	u64 size = le64_to_cpu(grant->size);
@@ -2284,8 +2296,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	int revoked_rdcache = 0;
 	int queue_invalidate = 0;
 
-	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-	     inode, cap, mds, seq, ceph_cap_string(newcaps));
+	dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
+	     inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
 	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
 	     inode->i_size);
 
@@ -2381,6 +2393,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	}
 
 	cap->seq = seq;
+	cap->issue_seq = issue_seq;
 
 	/* file layout may have changed */
 	ci->i_layout = grant->layout;
@@ -2763,15 +2776,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		if (op == CEPH_CAP_OP_IMPORT)
 			__queue_cap_release(session, vino.ino, cap_id,
 					    mseq, seq);
-
-		/*
-		 * send any full release message to try to move things
-		 * along for the mds (who clearly thinks we still have this
-		 * cap).
-		 */
-		ceph_add_cap_releases(mdsc, session);
-		ceph_send_cap_releases(mdsc, session);
-		goto done;
+		goto flush_cap_releases;
 	}
 
 	/* these will work even if we don't have a cap yet */
@@ -2799,7 +2804,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		dout(" no cap on %p ino %llx.%llx from mds%d\n",
 		     inode, ceph_ino(inode), ceph_snap(inode), mds);
 		spin_unlock(&inode->i_lock);
-		goto done;
+		goto flush_cap_releases;
 	}
 
 	/* note that each of these drops i_lock for us */
@@ -2823,6 +2828,17 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		       ceph_cap_op_name(op));
 	}
 
+	goto done;
+
+flush_cap_releases:
+	/*
+	 * send any full release message to try to move things
+	 * along for the mds (who clearly thinks we still have this
+	 * cap).
+	 */
+	ceph_add_cap_releases(mdsc, session);
+	ceph_send_cap_releases(mdsc, session);
+
 done:
 	mutex_unlock(&session->s_mutex);
 done_unlocked:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 6e4f43ff23ec..a1986eb52045 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1021,11 +1021,15 @@ out_touch:
 static void ceph_dentry_release(struct dentry *dentry)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	struct inode *parent_inode = dentry->d_parent->d_inode;
-	u64 snapid = ceph_snap(parent_inode);
+	struct inode *parent_inode = NULL;
+	u64 snapid = CEPH_NOSNAP;
 
+	if (!IS_ROOT(dentry)) {
+		parent_inode = dentry->d_parent->d_inode;
+		if (parent_inode)
+			snapid = ceph_snap(parent_inode);
+	}
 	dout("dentry_release %p parent %p\n", dentry, parent_inode);
-
 	if (parent_inode && snapid != CEPH_SNAPDIR) {
 		struct ceph_inode_info *ci = ceph_inode(parent_inode);
 
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 4480cb1c63e7..e38423e82f2e 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -42,32 +42,37 @@ struct ceph_nfs_confh {
 static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 			  int connectable)
 {
+	int type;
 	struct ceph_nfs_fh *fh = (void *)rawfh;
 	struct ceph_nfs_confh *cfh = (void *)rawfh;
 	struct dentry *parent = dentry->d_parent;
 	struct inode *inode = dentry->d_inode;
-	int type;
+	int connected_handle_length = sizeof(*cfh)/4;
+	int handle_length = sizeof(*fh)/4;
 
 	/* don't re-export snaps */
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EINVAL;
 
-	if (*max_len >= sizeof(*cfh)) {
+	if (*max_len >= connected_handle_length) {
 		dout("encode_fh %p connectable\n", dentry);
 		cfh->ino = ceph_ino(dentry->d_inode);
 		cfh->parent_ino = ceph_ino(parent->d_inode);
 		cfh->parent_name_hash = parent->d_name.hash;
-		*max_len = sizeof(*cfh);
+		*max_len = connected_handle_length;
 		type = 2;
-	} else if (*max_len > sizeof(*fh)) {
-		if (connectable)
-			return -ENOSPC;
+	} else if (*max_len >= handle_length) {
+		if (connectable) {
+			*max_len = connected_handle_length;
+			return 255;
+		}
 		dout("encode_fh %p\n", dentry);
 		fh->ino = ceph_ino(dentry->d_inode);
-		*max_len = sizeof(*fh);
+		*max_len = handle_length;
 		type = 1;
 	} else {
-		return -ENOSPC;
+		*max_len = handle_length;
+		return 255;
 	}
 	return type;
 }
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8c044a4f0457..66e4da6dba22 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -697,7 +697,7 @@ more:
 		 * start_request so that a tid has been assigned.
 		 */
 		spin_lock(&ci->i_unsafe_lock);
-		list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
+		list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
 		spin_unlock(&ci->i_unsafe_lock);
 		ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
 	}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e7cca414da03..62377ec37edf 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -845,7 +845,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 * the caller) if we fail.
 */
 static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
-				    bool *prehash)
+				    bool *prehash, bool set_offset)
 {
 	struct dentry *realdn;
 
@@ -877,7 +877,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 	}
 	if ((!prehash || *prehash) && d_unhashed(dn))
 		d_rehash(dn);
-	ceph_set_dentry_offset(dn);
+	if (set_offset)
+		ceph_set_dentry_offset(dn);
 out:
 	return dn;
 }
@@ -1062,7 +1063,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			d_delete(dn);
 			goto done;
 		}
-		dn = splice_dentry(dn, in, &have_lease);
+		dn = splice_dentry(dn, in, &have_lease, true);
 		if (IS_ERR(dn)) {
 			err = PTR_ERR(dn);
 			goto done;
@@ -1105,7 +1106,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			goto done;
 		}
 		dout(" linking snapped dir %p to dn %p\n", in, dn);
-		dn = splice_dentry(dn, in, NULL);
+		dn = splice_dentry(dn, in, NULL, true);
 		if (IS_ERR(dn)) {
 			err = PTR_ERR(dn);
 			goto done;
@@ -1237,7 +1238,7 @@ retry_lookup:
 				err = PTR_ERR(in);
 				goto out;
 			}
-			dn = splice_dentry(dn, in, NULL);
+			dn = splice_dentry(dn, in, NULL, false);
 			if (IS_ERR(dn))
 				dn = NULL;
 		}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f091b1351786..fad95f8f2608 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2374,6 +2374,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 					       num_fcntl_locks,
 					       num_flock_locks);
 		unlock_kernel();
+	} else {
+		err = ceph_pagelist_append(pagelist, &rec, reclen);
 	}
 
 out_free:
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index dfced1dacbcd..3b5571b8ce22 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -549,7 +549,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 */
 static void __cancel_request(struct ceph_osd_request *req)
 {
-	if (req->r_sent) {
+	if (req->r_sent && req->r_osd) {
 		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
 		req->r_sent = 0;
 	}
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index b6859f47d364..46a368b6dce5 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -5,10 +5,18 @@
 
 #include "pagelist.h"
 
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+	struct page *page = list_entry(pl->head.prev, struct page,
+				       lru);
+	kunmap(page);
+}
+
 int ceph_pagelist_release(struct ceph_pagelist *pl)
 {
 	if (pl->mapped_tail)
-		kunmap(pl->mapped_tail);
+		ceph_pagelist_unmap_tail(pl);
+
 	while (!list_empty(&pl->head)) {
 		struct page *page = list_first_entry(&pl->head, struct page,
 						     lru);
@@ -26,7 +34,7 @@ static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
 	pl->room += PAGE_SIZE;
 	list_add_tail(&page->lru, &pl->head);
 	if (pl->mapped_tail)
-		kunmap(pl->mapped_tail);
+		ceph_pagelist_unmap_tail(pl);
 	pl->mapped_tail = kmap(page);
 	return 0;
 }
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4868b9dcac5a..190b6c4a6f2b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -119,6 +119,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
 	INIT_LIST_HEAD(&realm->children);
 	INIT_LIST_HEAD(&realm->child_item);
 	INIT_LIST_HEAD(&realm->empty_item);
+	INIT_LIST_HEAD(&realm->dirty_item);
 	INIT_LIST_HEAD(&realm->inodes_with_caps);
 	spin_lock_init(&realm->inodes_with_caps_lock);
 	__insert_snap_realm(&mdsc->snap_realms, realm);
@@ -467,7 +468,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 	INIT_LIST_HEAD(&capsnap->ci_item);
 	INIT_LIST_HEAD(&capsnap->flushing_item);
 
-	capsnap->follows = snapc->seq - 1;
+	capsnap->follows = snapc->seq;
 	capsnap->issued = __ceph_caps_issued(ci, NULL);
 	capsnap->dirty = dirty;
 
@@ -604,6 +605,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 	struct ceph_snap_realm *realm;
 	int invalidate = 0;
 	int err = -ENOMEM;
+	LIST_HEAD(dirty_realms);
 
 	dout("update_snap_trace deletion=%d\n", deletion);
 more:
@@ -626,24 +628,6 @@ more:
 		}
 	}
 
-	if (le64_to_cpu(ri->seq) > realm->seq) {
-		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
-		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
-		/*
-		 * if the realm seq has changed, queue a cap_snap for every
-		 * inode with open caps.  we do this _before_ we update
-		 * the realm info so that we prepare for writeback under the
-		 * _previous_ snap context.
-		 *
-		 * ...unless it's a snap deletion!
-		 */
-		if (!deletion)
-			queue_realm_cap_snaps(realm);
-	} else {
-		dout("update_snap_trace %llx %p seq %lld unchanged\n",
-		     realm->ino, realm, realm->seq);
-	}
-
 	/* ensure the parent is correct */
 	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
 	if (err < 0)
@@ -651,6 +635,8 @@ more:
 	invalidate += err;
 
 	if (le64_to_cpu(ri->seq) > realm->seq) {
+		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
 		/* update realm parameters, snap lists */
 		realm->seq = le64_to_cpu(ri->seq);
 		realm->created = le64_to_cpu(ri->created);
@@ -668,9 +654,17 @@ more:
 		if (err < 0)
 			goto fail;
 
+		/* queue realm for cap_snap creation */
+		list_add(&realm->dirty_item, &dirty_realms);
+
 		invalidate = 1;
 	} else if (!realm->cached_context) {
+		dout("update_snap_trace %llx %p seq %lld new\n",
+		     realm->ino, realm, realm->seq);
 		invalidate = 1;
+	} else {
+		dout("update_snap_trace %llx %p seq %lld unchanged\n",
+		     realm->ino, realm, realm->seq);
 	}
 
 	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
@@ -683,6 +677,14 @@ more:
 	if (invalidate)
 		rebuild_snap_realms(realm);
 
+	/*
+	 * queue cap snaps _after_ we've built the new snap contexts,
+	 * so that i_head_snapc can be set appropriately.
+	 */
+	list_for_each_entry(realm, &dirty_realms, dirty_item) {
+		queue_realm_cap_snaps(realm);
+	}
+
 	__cleanup_empty_realms(mdsc);
 	return 0;
 
@@ -715,7 +717,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
 		igrab(inode);
 		spin_unlock(&mdsc->snap_flush_lock);
 		spin_lock(&inode->i_lock);
-		__ceph_flush_snaps(ci, &session);
+		__ceph_flush_snaps(ci, &session, 0);
 		spin_unlock(&inode->i_lock);
 		iput(inode);
 		spin_lock(&mdsc->snap_flush_lock);
@@ -816,6 +818,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			};
 			struct inode *inode = ceph_find_inode(sb, vino);
 			struct ceph_inode_info *ci;
+			struct ceph_snap_realm *oldrealm;
 
 			if (!inode)
 				continue;
@@ -841,18 +844,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			dout(" will move %p to split realm %llx %p\n",
 			     inode, realm->ino, realm);
 			/*
-			 * Remove the inode from the realm's inode
-			 * list, but don't add it to the new realm
-			 * yet.  We don't want the cap_snap to be
-			 * queued (again) by ceph_update_snap_trace()
-			 * below.  Queue it _now_, under the old context.
+			 * Move the inode to the new realm
 			 */
 			spin_lock(&realm->inodes_with_caps_lock);
 			list_del_init(&ci->i_snap_realm_item);
+			list_add(&ci->i_snap_realm_item,
+				 &realm->inodes_with_caps);
+			oldrealm = ci->i_snap_realm;
+			ci->i_snap_realm = realm;
 			spin_unlock(&realm->inodes_with_caps_lock);
 			spin_unlock(&inode->i_lock);
 
-			ceph_queue_cap_snap(ci);
+			ceph_get_snap_realm(mdsc, realm);
+			ceph_put_snap_realm(mdsc, oldrealm);
 
 			iput(inode);
 			continue;
@@ -880,43 +884,9 @@ skip_inode:
 	ceph_update_snap_trace(mdsc, p, e,
 			       op == CEPH_SNAP_OP_DESTROY);
 
-	if (op == CEPH_SNAP_OP_SPLIT) {
-		/*
-		 * ok, _now_ add the inodes into the new realm.
-		 */
-		for (i = 0; i < num_split_inos; i++) {
-			struct ceph_vino vino = {
-				.ino = le64_to_cpu(split_inos[i]),
-				.snap = CEPH_NOSNAP,
-			};
-			struct inode *inode = ceph_find_inode(sb, vino);
-			struct ceph_inode_info *ci;
-
-			if (!inode)
-				continue;
-			ci = ceph_inode(inode);
-			spin_lock(&inode->i_lock);
-			if (list_empty(&ci->i_snap_realm_item)) {
-				struct ceph_snap_realm *oldrealm =
-					ci->i_snap_realm;
-
-				dout(" moving %p to split realm %llx %p\n",
-				     inode, realm->ino, realm);
-				spin_lock(&realm->inodes_with_caps_lock);
-				list_add(&ci->i_snap_realm_item,
-					 &realm->inodes_with_caps);
-				ci->i_snap_realm = realm;
-				spin_unlock(&realm->inodes_with_caps_lock);
-				ceph_get_snap_realm(mdsc, realm);
-				ceph_put_snap_realm(mdsc, oldrealm);
-			}
-			spin_unlock(&inode->i_lock);
-			iput(inode);
-		}
-
+	if (op == CEPH_SNAP_OP_SPLIT)
 		/* we took a reference when we created the realm, above */
 		ceph_put_snap_realm(mdsc, realm);
-	}
 
 	__cleanup_empty_realms(mdsc);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c33897ae5725..b87638e84c4b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -690,6 +690,8 @@ struct ceph_snap_realm {
 
 	struct list_head empty_item;     /* if i have ref==0 */
 
+	struct list_head dirty_item;     /* if realm needs new context */
+
 	/* the current set of snaps for this realm */
 	struct ceph_snap_context *cached_context;
 
@@ -826,7 +828,8 @@ extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 				       struct ceph_snap_context *snapc);
 extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
-			       struct ceph_mds_session **psession);
+			       struct ceph_mds_session **psession,
+			       int again);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 			    struct ceph_mds_session *session);
 extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index f80a4f25123c..143d393881cb 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -40,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = {
 #endif
 		/* permit direct mmap, for read, write or exec */
 		BDI_CAP_MAP_DIRECT |
-		BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP),
+		BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
+		/* no writeback happens */
+		BDI_CAP_NO_ACCT_AND_WRITEBACK),
 };
 
 static struct kobj_map *cdev_map;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd37..7e83b356cc9e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -232,7 +232,7 @@ static int
 small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	       void **request_buf)
 {
-	int rc = 0;
+	int rc;
 
 	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
@@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	if (tcon != NULL)
 		cifs_stats_inc(&tcon->num_smbs_sent);
 
-	return rc;
+	return 0;
 }
 
 int
@@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
 
 /* If the return code is zero, this function must fill in request_buf pointer */
 static int
-smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
-	 void **request_buf /* returned */ ,
-	 void **response_buf /* returned */ )
+__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+	   void **request_buf, void **response_buf)
 {
-	int rc = 0;
-
-	rc = cifs_reconnect_tcon(tcon, smb_command);
-	if (rc)
-		return rc;
-
 	*request_buf = cifs_buf_get();
 	if (*request_buf == NULL) {
 		/* BB should we add a retry in here if not a writepage? */
@@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	if (tcon != NULL)
 		cifs_stats_inc(&tcon->num_smbs_sent);
 
-	return rc;
+	return 0;
+}
+
+/* If the return code is zero, this function must fill in request_buf pointer */
+static int
+smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+	 void **request_buf, void **response_buf)
+{
+	int rc;
+
+	rc = cifs_reconnect_tcon(tcon, smb_command);
+	if (rc)
+		return rc;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
+}
+
+static int
+smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
+		      void **request_buf, void **response_buf)
+{
+	if (tcon->ses->need_reconnect || tcon->need_reconnect)
+		return -EHOSTDOWN;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
 }
 
 static int validate_t2(struct smb_t2_rsp *pSMB)
@@ -4534,8 +4551,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
 
 	cFYI(1, "In QFSUnixInfo");
 QFSUnixRetry:
-	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
+	rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
+				   (void **) &pSMB, (void **) &pSMBr);
 	if (rc)
 		return rc;
 
@@ -4604,8 +4621,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
 	cFYI(1, "In SETFSUnixInfo");
 SETFSUnixRetry:
 	/* BB switch to small buf init to save memory */
-	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
+	rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
+				   (void **) &pSMB, (void **) &pSMBr);
 	if (rc)
 		return rc;
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 93f77d438d3c..53cce8cc2224 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -801,6 +801,8 @@ retry_iget5_locked:
 		inode->i_flags |= S_NOATIME | S_NOCMTIME;
 		if (inode->i_state & I_NEW) {
 			inode->i_ino = hash;
+			if (S_ISREG(inode->i_mode))
+				inode->i_data.backing_dev_info = sb->s_bdi;
 #ifdef CONFIG_CIFS_FSCACHE
 			/* initialize per-inode cache cookie pointer */
 			CIFS_I(inode)->fscache = NULL;
diff --git a/fs/compat.c b/fs/compat.c
index 718c7062aec1..0644a154672b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1153,7 +1153,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 {
 	compat_ssize_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov;
+	struct iovec *iov = iovstack;
 	ssize_t ret;
 	io_fn_t fn;
 	iov_fn_t fnv;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index eb7368ebd8cd..3eadd97324b1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -54,6 +54,9 @@ struct page_collect {
 	unsigned nr_pages;
 	unsigned long length;
 	loff_t pg_first; /* keep 64bit also in 32-arches */
+	bool read_4_write; /* This means two things: that the read is sync
+			    * And the pages should not be unlocked.
+			    */
 };
 
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -71,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
+	pcol->read_4_write = false;
 }
 
 static void _pcol_reset(struct page_collect *pcol)
@@ -347,7 +351,8 @@ static int readpage_strip(void *data, struct page *page)
 		if (PageError(page))
 			ClearPageError(page);
 
-		unlock_page(page);
+		if (!pcol->read_4_write)
+			unlock_page(page);
 		EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
 			     " splitting\n", inode->i_ino, page->index);
 
@@ -428,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync)
 	/* readpage_strip might call read_exec(,is_sync==false) at several
 	 * places but not if we have a single page.
 	 */
+	pcol.read_4_write = is_sync;
 	ret = readpage_strip(&pcol, page);
 	if (ret) {
 		EXOFS_ERR("_readpage => %d\n", ret);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 81e086d8aa57..ab38fef1c9a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -52,8 +52,6 @@ struct wb_writeback_work {
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
 
-#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
-
 /*
 * We don't actually have pdflush, but this one is exported though /proc...
 */
@@ -71,6 +69,16 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 	return test_bit(BDI_writeback_running, &bdi->state);
 }
 
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (strcmp(sb->s_type->name, "bdev") == 0)
+		return inode->i_mapping->backing_dev_info;
+
+	return sb->s_bdi;
+}
+
 static void bdi_queue_work(struct backing_dev_info *bdi,
 			   struct wb_writeback_work *work)
 {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index d367af1514ef..cde755cca564 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1354,7 +1354,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
 	loff_t file_size;
 	unsigned int num;
 	unsigned int offset;
-	size_t total_len;
+	size_t total_len = 0;
 
 	req = fuse_get_req(fc);
 	if (IS_ERR(req))
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a4206..c16f8d8331b5 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
 static inline void
 fh_unlock(struct svc_fh *fhp)
 {
-	BUG_ON(!fhp->fh_dentry);
-
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
 		mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..b388443c3a09 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
 
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
-source "fs/notify/fanotify/Kconfig"
+#source "fs/notify/fanotify/Kconfig"
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index a76e0aa5cd3f..391915093fe1 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
 	}
 
 	inode->i_mode = new_mode;
+	inode->i_ctime = CURRENT_TIME;
 	di->i_mode = cpu_to_le16(inode->i_mode);
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1361997cf205..cbe2f057cc28 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 			   size_t caller_veclen, u8 target_node, int *status)
 {
-	int ret;
+	int ret = 0;
 	struct o2net_msg *msg = NULL;
 	size_t veclen, caller_bytes = 0;
 	struct kvec *vec = NULL;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f04ebcfffc4a..c49f6de0e7ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 		goto out_commit;
 	}
 
+	cpos = split_hash;
+	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+				       data_ac, meta_ac, new_dx_leaves,
+				       num_dx_leaves);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
 	for (i = 0; i < num_dx_leaves; i++) {
 		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
 					      orig_dx_leaves[i],
@@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 			mlog_errno(ret);
 			goto out_commit;
 		}
-	}
 
-	cpos = split_hash;
-	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
-				       data_ac, meta_ac, new_dx_leaves,
-				       num_dx_leaves);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
+		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+					      new_dx_leaves[i],
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
 	}
 
 	ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4b6ae2c13b47..765298908f1d 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1030,6 +1030,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
 			 struct dlm_lock_resource *res);
 void dlm_clean_master_list(struct dlm_ctxt *dlm,
 			   u8 dead_node);
+void dlm_force_free_mles(struct dlm_ctxt *dlm);
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
 int __dlm_lockres_unused(struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 5efdd37dfe48..901ca52bf86b 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
 	spin_lock(&dlm->track_lock);
 	if (oldres)
 		track_list = &oldres->tracking;
-	else
+	else {
 		track_list = &dlm->tracking_list;
+		if (list_empty(track_list)) {
+			dl = NULL;
+			spin_unlock(&dlm->track_lock);
+			goto bail;
+		}
+	}
 
 	list_for_each_entry(res, track_list, tracking) {
 		if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
 	} else
 		dl = NULL;
 
+bail:
 	/* passed to seq_show */
 	return dl;
 }
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 153abb5abef0..11a5c87fd7f7 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -693,6 +693,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
 
 		dlm_mark_domain_leaving(dlm);
 		dlm_leave_domain(dlm);
+		dlm_force_free_mles(dlm);
 		dlm_complete_dlm_shutdown(dlm);
 	}
 	dlm_put(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ffb4c68dafa4..f564b0e5f80d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -3433,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
 	wake_up(&res->wq);
 	wake_up(&dlm->migration_wq);
 }
+
+void dlm_force_free_mles(struct dlm_ctxt *dlm)
+{
+	int i;
+	struct hlist_head *bucket;
+	struct dlm_master_list_entry *mle;
+	struct hlist_node *tmp, *list;
+
+	/*
+	 * We notified all other nodes that we are exiting the domain and
+	 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
+	 * around we force free them and wake any processes that are waiting
+	 * on the mles
+	 */
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+
+	BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
+	BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each_safe(list, tmp, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+			if (mle->type != DLM_MLE_BLOCK) {
+				mlog(ML_ERROR, "bad mle: %p\n", mle);
+				dlm_print_one_mle(mle);
+			}
+			atomic_set(&mle->woken, 1);
+			wake_up(&mle->wq);
+
+			__dlm_unlink_mle(dlm, mle);
+			__dlm_mle_detach_hb_events(dlm, mle);
+			__dlm_put_mle(mle);
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d6..1d596d8c4a4a 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
 	OI_LS_PARENT,
 	OI_LS_RENAME1,
 	OI_LS_RENAME2,
+	OI_LS_REFLINK_TARGET,
 };
 
 int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 33f1c9a8258d..fa31d05e41b7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -235,18 +235,31 @@
 #define OCFS2_HAS_REFCOUNT_FL   (0x0010)
 
 /* Inode attributes, keep in sync with EXT2 */
-#define OCFS2_SECRM_FL		(0x00000001)	/* Secure deletion */
-#define OCFS2_UNRM_FL		(0x00000002)	/* Undelete */
-#define OCFS2_COMPR_FL		(0x00000004)	/* Compress file */
-#define OCFS2_SYNC_FL		(0x00000008)	/* Synchronous updates */
-#define OCFS2_IMMUTABLE_FL	(0x00000010)	/* Immutable file */
-#define OCFS2_APPEND_FL		(0x00000020)	/* writes to file may only append */
-#define OCFS2_NODUMP_FL		(0x00000040)	/* do not dump file */
-#define OCFS2_NOATIME_FL	(0x00000080)	/* do not update atime */
-#define OCFS2_DIRSYNC_FL	(0x00010000)	/* dirsync behaviour (directories only) */
-
-#define OCFS2_FL_VISIBLE	(0x000100FF)	/* User visible flags */
-#define OCFS2_FL_MODIFIABLE	(0x000100FF)	/* User modifiable flags */
+#define OCFS2_SECRM_FL		FS_SECRM_FL	/* Secure deletion */
+#define OCFS2_UNRM_FL		FS_UNRM_FL	/* Undelete */
+#define OCFS2_COMPR_FL		FS_COMPR_FL	/* Compress file */
+#define OCFS2_SYNC_FL		FS_SYNC_FL	/* Synchronous updates */
+#define OCFS2_IMMUTABLE_FL	FS_IMMUTABLE_FL	/* Immutable file */
+#define OCFS2_APPEND_FL		FS_APPEND_FL	/* writes to file may only append */
+#define OCFS2_NODUMP_FL		FS_NODUMP_FL	/* do not dump file */
+#define OCFS2_NOATIME_FL	FS_NOATIME_FL	/* do not update atime */
+/* Reserved for compression usage... */
+#define OCFS2_DIRTY_FL		FS_DIRTY_FL
+#define OCFS2_COMPRBLK_FL	FS_COMPRBLK_FL	/* One or more compressed clusters */
+#define OCFS2_NOCOMP_FL		FS_NOCOMP_FL	/* Don't compress */
+#define OCFS2_ECOMPR_FL		FS_ECOMPR_FL	/* Compression error */
+/* End compression flags --- maybe not all used */
+#define OCFS2_BTREE_FL		FS_BTREE_FL	/* btree format dir */
+#define OCFS2_INDEX_FL		FS_INDEX_FL	/* hash-indexed directory */
+#define OCFS2_IMAGIC_FL		FS_IMAGIC_FL	/* AFS directory */
+#define OCFS2_JOURNAL_DATA_FL	FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define OCFS2_NOTAIL_FL		FS_NOTAIL_FL	/* file tail should not be merged */
+#define OCFS2_DIRSYNC_FL	FS_DIRSYNC_FL	/* dirsync behaviour (directories only) */
+#define OCFS2_TOPDIR_FL		FS_TOPDIR_FL	/* Top of directory hierarchies*/
+#define OCFS2_RESERVED_FL	FS_RESERVED_FL	/* reserved for ext2 lib */
+
+#define OCFS2_FL_VISIBLE	FS_FL_USER_VISIBLE	/* User visible flags */
+#define OCFS2_FL_MODIFIABLE	FS_FL_USER_MODIFIABLE	/* User modifiable flags */
 
 /*
 * Extent record flags (e_node.leaf.flags)
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 2d3420af1a83..5d241505690b 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h | |||
@@ -23,10 +23,10 @@ | |||
23 | /* | 23 | /* |
24 | * ioctl commands | 24 | * ioctl commands |
25 | */ | 25 | */ |
26 | #define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) | 26 | #define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS |
27 | #define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) | 27 | #define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS |
28 | #define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) | 28 | #define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS |
29 | #define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) | 29 | #define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Space reservation / allocation / free ioctls and argument structure | 32 | * Space reservation / allocation / free ioctls and argument structure |
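
With OCFS2's ioctl numbers now literally the generic FS_IOC_* ones, the usual chattr/lsattr style sequence applies unchanged. A small userspace sketch of that usage (the mount path is hypothetical, and the int-sized flags variable follows the convention lsattr uses):

/* getflags.c - query and set inode attribute flags through the generic
 * FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls, which OCFS2 now reuses directly.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "/mnt/ocfs2/testfile"; /* hypothetical path */
    int flags = 0;              /* lsattr-style: the kernel copies an int here */
    int fd = open(path, O_RDONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
        perror("FS_IOC_GETFLAGS");
        return 1;
    }
    printf("flags: %#x (noatime=%d)\n", flags, !!(flags & FS_NOATIME_FL));

    flags |= FS_NOATIME_FL;     /* ask for no atime updates */
    if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
        perror("FS_IOC_SETFLAGS");

    close(fd);
    return 0;
}
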
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 0afeda83120f..efdd75607406 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
@@ -4201,8 +4201,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry, | |||
4201 | goto out; | 4201 | goto out; |
4202 | } | 4202 | } |
4203 | 4203 | ||
4204 | mutex_lock(&new_inode->i_mutex); | 4204 | mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); |
4205 | ret = ocfs2_inode_lock(new_inode, &new_bh, 1); | 4205 | ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, |
4206 | OI_LS_REFLINK_TARGET); | ||
4206 | if (ret) { | 4207 | if (ret) { |
4207 | mlog_errno(ret); | 4208 | mlog_errno(ret); |
4208 | goto out_unlock; | 4209 | goto out_unlock; |
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index d8b6e4259b80..3e78db361bc7 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c | |||
@@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, | |||
732 | struct ocfs2_alloc_reservation *resv, | 732 | struct ocfs2_alloc_reservation *resv, |
733 | int *cstart, int *clen) | 733 | int *cstart, int *clen) |
734 | { | 734 | { |
735 | unsigned int wanted = *clen; | ||
736 | |||
737 | if (resv == NULL || ocfs2_resmap_disabled(resmap)) | 735 | if (resv == NULL || ocfs2_resmap_disabled(resmap)) |
738 | return -ENOSPC; | 736 | return -ENOSPC; |
739 | 737 | ||
740 | spin_lock(&resv_lock); | 738 | spin_lock(&resv_lock); |
741 | 739 | ||
742 | /* | ||
743 | * We don't want to over-allocate for temporary | ||
744 | * windows. Otherwise, we run the risk of fragmenting the | ||
745 | * allocation space. | ||
746 | */ | ||
747 | wanted = ocfs2_resv_window_bits(resmap, resv); | ||
748 | if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) | ||
749 | wanted = *clen; | ||
750 | |||
751 | if (ocfs2_resv_empty(resv)) { | 740 | if (ocfs2_resv_empty(resv)) { |
752 | mlog(0, "empty reservation, find new window\n"); | 741 | /* |
742 | * We don't want to over-allocate for temporary | ||
743 | * windows. Otherwise, we run the risk of fragmenting the | ||
744 | * allocation space. | ||
745 | */ | ||
746 | unsigned int wanted = ocfs2_resv_window_bits(resmap, resv); | ||
753 | 747 | ||
748 | if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) | ||
749 | wanted = *clen; | ||
750 | |||
751 | mlog(0, "empty reservation, find new window\n"); | ||
754 | /* | 752 | /* |
755 | * Try to get a window here. If it works, we must fall | 753 | * Try to get a window here. If it works, we must fall |
756 | * through and test the bitmap. This avoids some | 754 | * through and test the bitmap. This avoids some |
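
The reordering above means the window sizing only happens when the reservation is actually empty and a new window has to be found; an existing window is reused as-is. A userspace mock of the sizing rule, with stand-in names rather than the real ocfs2 structures:

#include <stdio.h>

#define RESV_FLAG_TMP 0x1                 /* stand-in for OCFS2_RESV_FLAG_TMP */

struct mock_resv {
    unsigned int r_flags;
};

/* Stand-in for ocfs2_resv_window_bits(): the preferred window size. */
static unsigned int window_bits(void)
{
    return 2048;
}

/* Sizing rule, applied only when the reservation is empty and a new
 * window is being searched for, mirroring the hunk above. */
static unsigned int bits_wanted(const struct mock_resv *resv, unsigned int clen)
{
    unsigned int wanted = window_bits();

    /* Temporary windows must not over-allocate, and we never ask for
     * fewer bits than the caller needs right now. */
    if ((resv->r_flags & RESV_FLAG_TMP) || wanted < clen)
        wanted = clen;
    return wanted;
}

int main(void)
{
    struct mock_resv tmp = { .r_flags = RESV_FLAG_TMP };
    struct mock_resv normal = { .r_flags = 0 };

    printf("temporary resv, need 8    -> ask for %u\n", bits_wanted(&tmp, 8));
    printf("normal resv,    need 8    -> ask for %u\n", bits_wanted(&normal, 8));
    printf("normal resv,    need 4096 -> ask for %u\n", bits_wanted(&normal, 4096));
    return 0;
}
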
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8a286f54dca1..849c2f0e0a0e 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -357,7 +357,7 @@ out: | |||
357 | static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, | 357 | static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, |
358 | struct ocfs2_group_desc *bg, | 358 | struct ocfs2_group_desc *bg, |
359 | struct ocfs2_chain_list *cl, | 359 | struct ocfs2_chain_list *cl, |
360 | u64 p_blkno, u32 clusters) | 360 | u64 p_blkno, unsigned int clusters) |
361 | { | 361 | { |
362 | struct ocfs2_extent_list *el = &bg->bg_list; | 362 | struct ocfs2_extent_list *el = &bg->bg_list; |
363 | struct ocfs2_extent_rec *rec; | 363 | struct ocfs2_extent_rec *rec; |
@@ -369,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, | |||
369 | rec->e_blkno = cpu_to_le64(p_blkno); | 369 | rec->e_blkno = cpu_to_le64(p_blkno); |
370 | rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / | 370 | rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / |
371 | le16_to_cpu(cl->cl_bpc)); | 371 | le16_to_cpu(cl->cl_bpc)); |
372 | rec->e_leaf_clusters = cpu_to_le32(clusters); | 372 | rec->e_leaf_clusters = cpu_to_le16(clusters); |
373 | le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); | 373 | le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); |
374 | le16_add_cpu(&bg->bg_free_bits_count, | 374 | le16_add_cpu(&bg->bg_free_bits_count, |
375 | clusters * le16_to_cpu(cl->cl_bpc)); | 375 | clusters * le16_to_cpu(cl->cl_bpc)); |
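
The important fix above is the cpu_to_le32() to cpu_to_le16() change: e_leaf_clusters is a 16-bit little-endian on-disk field, so byte-swapping the value as a 32-bit quantity and then truncating it into the field stores the wrong bytes on big-endian machines (little-endian hosts hide the bug because the swap is a no-op there). A standalone sketch of the failure mode, with simple byte-swap helpers standing in for the kernel macros:

#include <stdio.h>
#include <stdint.h>

/* Simulate what cpu_to_le32()/cpu_to_le16() do on a BIG-endian CPU,
 * where the helpers actually byte-swap. */
static uint32_t bswap32(uint32_t x)
{
    return ((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
           ((x & 0x00ff0000u) >> 8)  | ((x & 0xff000000u) >> 24);
}

static uint16_t bswap16(uint16_t x)
{
    return (uint16_t)((x >> 8) | (x << 8));
}

int main(void)
{
    uint32_t clusters = 5;

    /* buggy: 32-bit swap, silently truncated into the 16-bit field */
    uint16_t on_disk_bad  = (uint16_t)bswap32(clusters);
    /* fixed: 16-bit swap, as cpu_to_le16() would produce */
    uint16_t on_disk_good = bswap16((uint16_t)clusters);

    printf("cpu_to_le32 then truncate: %#06x (reads back as 0)\n", on_disk_bad);
    printf("cpu_to_le16:               %#06x (reads back as 5)\n", on_disk_good);
    return 0;
}
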
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 32499d213fc4..9975457c981f 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c | |||
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry, | |||
128 | } | 128 | } |
129 | 129 | ||
130 | /* Fast symlinks can't be large */ | 130 | /* Fast symlinks can't be large */ |
131 | len = strlen(target); | 131 | len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb)); |
132 | link = kzalloc(len + 1, GFP_NOFS); | 132 | link = kzalloc(len + 1, GFP_NOFS); |
133 | if (!link) { | 133 | if (!link) { |
134 | status = -ENOMEM; | 134 | status = -ENOMEM; |
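
strlen() trusts that the in-inode symlink body is NUL-terminated; a corrupted fast symlink that fills the whole area without a terminator would let the scan run past the on-disk buffer. strnlen() bounds the scan at ocfs2_fast_symlink_chars(). A userspace illustration of the same pattern (the 16-byte area size is arbitrary):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define FAST_SYMLINK_AREA 16      /* stand-in for ocfs2_fast_symlink_chars() */

int main(void)
{
    /* An on-disk symlink body that is NOT NUL-terminated. */
    char raw[FAST_SYMLINK_AREA];
    memset(raw, 'A', sizeof(raw));

    /* strlen(raw) would read past 'raw' here; strnlen() stops at the
     * area size and never walks off the buffer. */
    size_t len = strnlen(raw, sizeof(raw));

    char *link = calloc(1, len + 1);      /* kzalloc(len + 1, ...) analogue */
    if (!link)
        return 1;
    memcpy(link, raw, len);

    printf("len=%zu link=\"%s\"\n", len, link);
    free(link);
    return 0;
}
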
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d03469f61801..06fa5e77c40e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode, | |||
1286 | xis.inode_bh = xbs.inode_bh = di_bh; | 1286 | xis.inode_bh = xbs.inode_bh = di_bh; |
1287 | di = (struct ocfs2_dinode *)di_bh->b_data; | 1287 | di = (struct ocfs2_dinode *)di_bh->b_data; |
1288 | 1288 | ||
1289 | down_read(&oi->ip_xattr_sem); | ||
1290 | ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, | 1289 | ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, |
1291 | buffer_size, &xis); | 1290 | buffer_size, &xis); |
1292 | if (ret == -ENODATA && di->i_xattr_loc) | 1291 | if (ret == -ENODATA && di->i_xattr_loc) |
1293 | ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, | 1292 | ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, |
1294 | buffer_size, &xbs); | 1293 | buffer_size, &xbs); |
1295 | up_read(&oi->ip_xattr_sem); | ||
1296 | 1294 | ||
1297 | return ret; | 1295 | return ret; |
1298 | } | 1296 | } |
@@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode, | |||
1316 | mlog_errno(ret); | 1314 | mlog_errno(ret); |
1317 | return ret; | 1315 | return ret; |
1318 | } | 1316 | } |
1317 | down_read(&OCFS2_I(inode)->ip_xattr_sem); | ||
1319 | ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, | 1318 | ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, |
1320 | name, buffer, buffer_size); | 1319 | name, buffer, buffer_size); |
1320 | up_read(&OCFS2_I(inode)->ip_xattr_sem); | ||
1321 | 1321 | ||
1322 | ocfs2_inode_unlock(inode, 0); | 1322 | ocfs2_inode_unlock(inode, 0); |
1323 | 1323 | ||
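
After this hunk, ocfs2_xattr_get_nolock() finally matches its name: it assumes the caller already holds ip_xattr_sem, and the locked wrapper ocfs2_xattr_get() takes the semaphore around it. A small pthread analogue of that calling convention, with invented names, just to show the split:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_rwlock_t xattr_sem = PTHREAD_RWLOCK_INITIALIZER;
static char stored_value[64] = "bar";

/* _nolock variant: the caller must already hold xattr_sem for read. */
static int xattr_get_nolock(const char *name, char *buf, size_t len)
{
    if (strcmp(name, "user.foo") != 0)
        return -1;                         /* -ENODATA analogue */
    snprintf(buf, len, "%s", stored_value);
    return 0;
}

/* Locked wrapper: takes the lock around the nolock helper, like
 * ocfs2_xattr_get() now does around ocfs2_xattr_get_nolock(). */
static int xattr_get(const char *name, char *buf, size_t len)
{
    int ret;

    pthread_rwlock_rdlock(&xattr_sem);
    ret = xattr_get_nolock(name, buf, len);
    pthread_rwlock_unlock(&xattr_sem);
    return ret;
}

int main(void)
{
    char buf[64];

    if (xattr_get("user.foo", buf, sizeof(buf)) == 0)
        printf("user.foo = %s\n", buf);
    return 0;
}
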
diff --git a/fs/proc/base.c b/fs/proc/base.c index a1c43e7c8a7b..8e4addaa5424 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -2675,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
2675 | INF("auxv", S_IRUSR, proc_pid_auxv), | 2675 | INF("auxv", S_IRUSR, proc_pid_auxv), |
2676 | ONE("status", S_IRUGO, proc_pid_status), | 2676 | ONE("status", S_IRUGO, proc_pid_status), |
2677 | ONE("personality", S_IRUSR, proc_pid_personality), | 2677 | ONE("personality", S_IRUSR, proc_pid_personality), |
2678 | INF("limits", S_IRUSR, proc_pid_limits), | 2678 | INF("limits", S_IRUGO, proc_pid_limits), |
2679 | #ifdef CONFIG_SCHED_DEBUG | 2679 | #ifdef CONFIG_SCHED_DEBUG |
2680 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), | 2680 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), |
2681 | #endif | 2681 | #endif |
@@ -3011,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = { | |||
3011 | INF("auxv", S_IRUSR, proc_pid_auxv), | 3011 | INF("auxv", S_IRUSR, proc_pid_auxv), |
3012 | ONE("status", S_IRUGO, proc_pid_status), | 3012 | ONE("status", S_IRUGO, proc_pid_status), |
3013 | ONE("personality", S_IRUSR, proc_pid_personality), | 3013 | ONE("personality", S_IRUSR, proc_pid_personality), |
3014 | INF("limits", S_IRUSR, proc_pid_limits), | 3014 | INF("limits", S_IRUGO, proc_pid_limits), |
3015 | #ifdef CONFIG_SCHED_DEBUG | 3015 | #ifdef CONFIG_SCHED_DEBUG |
3016 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), | 3016 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), |
3017 | #endif | 3017 | #endif |
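
Relaxing the mode from S_IRUSR to S_IRUGO makes /proc/<pid>/limits readable by everyone, so monitoring tools no longer need to run as the task owner or root to inspect another process's rlimits. A quick userspace check (PID 1 is just an example of a process you normally do not own):

#include <stdio.h>

int main(void)
{
    /* Before this change, opening another user's limits file failed with
     * EACCES unless you owned the task; with S_IRUGO it is readable. */
    FILE *f = fopen("/proc/1/limits", "r");
    char line[256];

    if (!f) {
        perror("fopen /proc/1/limits");
        return 1;
    }
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout);
    fclose(f);
    return 0;
}
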
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 271afc48b9a5..1dbca4e8cc16 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -363,13 +363,13 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
363 | mss->referenced += PAGE_SIZE; | 363 | mss->referenced += PAGE_SIZE; |
364 | mapcount = page_mapcount(page); | 364 | mapcount = page_mapcount(page); |
365 | if (mapcount >= 2) { | 365 | if (mapcount >= 2) { |
366 | if (pte_dirty(ptent)) | 366 | if (pte_dirty(ptent) || PageDirty(page)) |
367 | mss->shared_dirty += PAGE_SIZE; | 367 | mss->shared_dirty += PAGE_SIZE; |
368 | else | 368 | else |
369 | mss->shared_clean += PAGE_SIZE; | 369 | mss->shared_clean += PAGE_SIZE; |
370 | mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; | 370 | mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; |
371 | } else { | 371 | } else { |
372 | if (pte_dirty(ptent)) | 372 | if (pte_dirty(ptent) || PageDirty(page)) |
373 | mss->private_dirty += PAGE_SIZE; | 373 | mss->private_dirty += PAGE_SIZE; |
374 | else | 374 | else |
375 | mss->private_clean += PAGE_SIZE; | 375 | mss->private_clean += PAGE_SIZE; |
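
The smaps change counts a page as dirty when either the PTE dirty bit or the page's own dirty flag is set; previously a page whose PTE had been cleaned while the struct page was still dirty was reported under the *_Clean counters. A compact mock of the per-page accounting, with PAGE_SIZE and the stats structure reduced to stand-ins:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL
#define PSS_SHIFT 12

struct mem_size_stats {                 /* subset of the kernel struct */
    unsigned long shared_clean, shared_dirty;
    unsigned long private_clean, private_dirty;
    unsigned long long pss;
};

/* One page's worth of smaps accounting, mirroring smaps_pte_range(). */
static void account_page(struct mem_size_stats *mss, int mapcount,
                         bool pte_dirty, bool page_dirty)
{
    bool dirty = pte_dirty || page_dirty;   /* the change in this hunk */

    if (mapcount >= 2) {
        if (dirty)
            mss->shared_dirty += PAGE_SIZE;
        else
            mss->shared_clean += PAGE_SIZE;
        mss->pss += ((unsigned long long)PAGE_SIZE << PSS_SHIFT) / mapcount;
    } else {
        if (dirty)
            mss->private_dirty += PAGE_SIZE;
        else
            mss->private_clean += PAGE_SIZE;
        mss->pss += (unsigned long long)PAGE_SIZE << PSS_SHIFT;
    }
}

int main(void)
{
    struct mem_size_stats mss = { 0 };

    account_page(&mss, 1, false, true);     /* PTE clean, page flag dirty */
    printf("private_dirty=%lu private_clean=%lu\n",
           mss.private_dirty, mss.private_clean);
    return 0;
}
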
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 91c817ff02c3..2367fb3f70bc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c | |||
@@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, | |||
163 | 163 | ||
164 | static const struct file_operations proc_vmcore_operations = { | 164 | static const struct file_operations proc_vmcore_operations = { |
165 | .read = read_vmcore, | 165 | .read = read_vmcore, |
166 | .llseek = generic_file_llseek, | 166 | .llseek = default_llseek, |
167 | }; | 167 | }; |
168 | 168 | ||
169 | static struct vmcore* __init get_new_element(void) | 169 | static struct vmcore* __init get_new_element(void) |
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index f53505de0712..5cbb81e134ac 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c | |||
@@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, | |||
170 | int reiserfs_unpack(struct inode *inode, struct file *filp) | 170 | int reiserfs_unpack(struct inode *inode, struct file *filp) |
171 | { | 171 | { |
172 | int retval = 0; | 172 | int retval = 0; |
173 | int depth; | ||
173 | int index; | 174 | int index; |
174 | struct page *page; | 175 | struct page *page; |
175 | struct address_space *mapping; | 176 | struct address_space *mapping; |
@@ -188,8 +189,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) | |||
188 | /* we need to make sure nobody is changing the file size beneath | 189 | /* we need to make sure nobody is changing the file size beneath |
189 | ** us | 190 | ** us |
190 | */ | 191 | */ |
191 | mutex_lock(&inode->i_mutex); | 192 | reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); |
192 | reiserfs_write_lock(inode->i_sb); | 193 | depth = reiserfs_write_lock_once(inode->i_sb); |
193 | 194 | ||
194 | write_from = inode->i_size & (blocksize - 1); | 195 | write_from = inode->i_size & (blocksize - 1); |
195 | /* if we are on a block boundary, we are already unpacked. */ | 196 | /* if we are on a block boundary, we are already unpacked. */ |
@@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) | |||
224 | 225 | ||
225 | out: | 226 | out: |
226 | mutex_unlock(&inode->i_mutex); | 227 | mutex_unlock(&inode->i_mutex); |
227 | reiserfs_write_unlock(inode->i_sb); | 228 | reiserfs_write_unlock_once(inode->i_sb, depth); |
228 | return retval; | 229 | return retval; |
229 | } | 230 | } |
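
The reiserfs hunk swaps the plain lock calls for the recursion-aware variants: reiserfs_write_lock_once() returns a depth so the write lock is only taken (and later released) by the outermost caller, and i_mutex is taken through the _safe helper so it is not acquired while the write lock is held. A pthread sketch of the lock-once/unlock-once idiom, offered as an analogue rather than the real kernel implementation:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;
static __thread int write_lock_depth;   /* how many times this thread holds it */

/* Take the lock only if this thread does not hold it yet; return the
 * previous depth so the matching unlock knows whether to release. */
static int write_lock_once(void)
{
    int depth = write_lock_depth;

    if (depth == 0)
        pthread_mutex_lock(&write_lock);
    write_lock_depth++;
    return depth;
}

static void write_unlock_once(int depth)
{
    write_lock_depth--;
    if (depth == 0)                     /* this caller was the one that locked */
        pthread_mutex_unlock(&write_lock);
}

static void inner(void)
{
    int depth = write_lock_once();      /* already held: no deadlock, no re-lock */
    printf("inner: depth on entry = %d\n", depth);
    write_unlock_once(depth);
}

int main(void)
{
    int depth = write_lock_once();
    inner();
    write_unlock_once(depth);
    printf("done, lock fully released\n");
    return 0;
}
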
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index d59c4a65d492..81976ffed7d6 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c | |||
@@ -668,14 +668,11 @@ xfs_inode_set_reclaim_tag( | |||
668 | xfs_perag_put(pag); | 668 | xfs_perag_put(pag); |
669 | } | 669 | } |
670 | 670 | ||
671 | void | 671 | STATIC void |
672 | __xfs_inode_clear_reclaim_tag( | 672 | __xfs_inode_clear_reclaim( |
673 | xfs_mount_t *mp, | ||
674 | xfs_perag_t *pag, | 673 | xfs_perag_t *pag, |
675 | xfs_inode_t *ip) | 674 | xfs_inode_t *ip) |
676 | { | 675 | { |
677 | radix_tree_tag_clear(&pag->pag_ici_root, | ||
678 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); | ||
679 | pag->pag_ici_reclaimable--; | 676 | pag->pag_ici_reclaimable--; |
680 | if (!pag->pag_ici_reclaimable) { | 677 | if (!pag->pag_ici_reclaimable) { |
681 | /* clear the reclaim tag from the perag radix tree */ | 678 | /* clear the reclaim tag from the perag radix tree */ |
@@ -689,6 +686,17 @@ __xfs_inode_clear_reclaim_tag( | |||
689 | } | 686 | } |
690 | } | 687 | } |
691 | 688 | ||
689 | void | ||
690 | __xfs_inode_clear_reclaim_tag( | ||
691 | xfs_mount_t *mp, | ||
692 | xfs_perag_t *pag, | ||
693 | xfs_inode_t *ip) | ||
694 | { | ||
695 | radix_tree_tag_clear(&pag->pag_ici_root, | ||
696 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); | ||
697 | __xfs_inode_clear_reclaim(pag, ip); | ||
698 | } | ||
699 | |||
692 | /* | 700 | /* |
693 | * Inodes in different states need to be treated differently, and the return | 701 | * Inodes in different states need to be treated differently, and the return |
694 | * value of xfs_iflush is not sufficient to get this right. The following table | 702 | * value of xfs_iflush is not sufficient to get this right. The following table |
@@ -838,6 +846,7 @@ reclaim: | |||
838 | if (!radix_tree_delete(&pag->pag_ici_root, | 846 | if (!radix_tree_delete(&pag->pag_ici_root, |
839 | XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) | 847 | XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) |
840 | ASSERT(0); | 848 | ASSERT(0); |
849 | __xfs_inode_clear_reclaim(pag, ip); | ||
841 | write_unlock(&pag->pag_ici_lock); | 850 | write_unlock(&pag->pag_ici_lock); |
842 | 851 | ||
843 | /* | 852 | /* |
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ed575fb4b495..7e206fc1fa36 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c | |||
@@ -405,9 +405,15 @@ xlog_cil_push( | |||
405 | new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); | 405 | new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); |
406 | new_ctx->ticket = xlog_cil_ticket_alloc(log); | 406 | new_ctx->ticket = xlog_cil_ticket_alloc(log); |
407 | 407 | ||
408 | /* lock out transaction commit, but don't block on background push */ | 408 | /* |
409 | * Lock out transaction commit, but don't block for background pushes | ||
410 | * unless we are well over the CIL space limit. See the definition of | ||
411 | * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic | ||
412 | * used here. | ||
413 | */ | ||
409 | if (!down_write_trylock(&cil->xc_ctx_lock)) { | 414 | if (!down_write_trylock(&cil->xc_ctx_lock)) { |
410 | if (!push_seq) | 415 | if (!push_seq && |
416 | cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) | ||
411 | goto out_free_ticket; | 417 | goto out_free_ticket; |
412 | down_write(&cil->xc_ctx_lock); | 418 | down_write(&cil->xc_ctx_lock); |
413 | } | 419 | } |
@@ -422,7 +428,7 @@ xlog_cil_push( | |||
422 | goto out_skip; | 428 | goto out_skip; |
423 | 429 | ||
424 | /* check for a previously pushed sequence */ | 430 | /* check for a previously pushed sequence */ |
425 | if (push_seq < cil->xc_ctx->sequence) | 431 | if (push_seq && push_seq < cil->xc_ctx->sequence) |
426 | goto out_skip; | 432 | goto out_skip; |
427 | 433 | ||
428 | /* | 434 | /* |
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ced52b98b322..edcdfe01617f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h | |||
@@ -426,13 +426,13 @@ struct xfs_cil { | |||
426 | }; | 426 | }; |
427 | 427 | ||
428 | /* | 428 | /* |
429 | * The amount of log space we should the CIL to aggregate is difficult to size. | 429 | * The amount of log space we allow the CIL to aggregate is difficult to size. |
430 | * Whatever we chose we have to make we can get a reservation for the log space | 430 | * Whatever we choose, we have to make sure we can get a reservation for the |
431 | * effectively, that it is large enough to capture sufficient relogging to | 431 | * log space effectively, that it is large enough to capture sufficient |
432 | * reduce log buffer IO significantly, but it is not too large for the log or | 432 | * relogging to reduce log buffer IO significantly, but it is not too large for |
433 | * induces too much latency when writing out through the iclogs. We track both | 433 | * the log or induces too much latency when writing out through the iclogs. We |
434 | * space consumed and the number of vectors in the checkpoint context, so we | 434 | * track both space consumed and the number of vectors in the checkpoint |
435 | * need to decide which to use for limiting. | 435 | * context, so we need to decide which to use for limiting. |
436 | * | 436 | * |
437 | * Every log buffer we write out during a push needs a header reserved, which | 437 | * Every log buffer we write out during a push needs a header reserved, which |
438 | * is at least one sector and more for v2 logs. Hence we need a reservation of | 438 | * is at least one sector and more for v2 logs. Hence we need a reservation of |
@@ -459,16 +459,21 @@ struct xfs_cil { | |||
459 | * checkpoint transaction ticket is specific to the checkpoint context, rather | 459 | * checkpoint transaction ticket is specific to the checkpoint context, rather |
460 | * than the CIL itself. | 460 | * than the CIL itself. |
461 | * | 461 | * |
462 | * With dynamic reservations, we can basically make up arbitrary limits for the | 462 | * With dynamic reservations, we can effectively make up arbitrary limits for |
463 | * checkpoint size so long as they don't violate any other size rules. Hence | 463 | * the checkpoint size so long as they don't violate any other size rules. |
464 | * the initial maximum size for the checkpoint transaction will be set to a | 464 | * Recovery imposes a rule that no transaction exceed half the log, so we are |
465 | * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit | 465 | * limited by that. Furthermore, the log transaction reservation subsystem |
466 | * right now based on the latency of writing out a large amount of data through | 466 | * tries to keep 25% of the log free, so we need to keep below that limit or we |
467 | * the circular iclog buffers. | 467 | * risk running out of free log space to start any new transactions. |
468 | * | ||
469 | * In order to keep background CIL push efficient, we will set a lower | ||
470 | * threshold at which background pushing is attempted without blocking current | ||
471 | * transaction commits. A separate, higher bound defines when CIL pushes are | ||
472 | * enforced to ensure we stay within our maximum checkpoint size bounds. | ||
473 | * This keeps the background push threshold low, yet gives us plenty of space for aggregation on large logs. | ||
468 | */ | 474 | */ |
469 | 475 | #define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) | |
470 | #define XLOG_CIL_SPACE_LIMIT(log) \ | 476 | #define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) |
471 | (min((log->l_logsize >> 2), (8 * 1024 * 1024))) | ||
472 | 477 | ||
473 | /* | 478 | /* |
474 | * The reservation head lsn is not made up of a cycle number and block number. | 479 | * The reservation head lsn is not made up of a cycle number and block number. |
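
The new macros replace the old fixed cap (a quarter of the log, at most 8MB) with two size-relative thresholds: background pushes aim for 1/8 of the log, and the hard limit at 3/16 is where even the xlog_cil_push() fast path above stops backing off and blocks. Evaluating both for a few log sizes makes the relationship concrete; the struct below is a local stand-in that carries just the one field the macros read:

#include <stdio.h>

struct xlog_mock { unsigned long l_logsize; };   /* just enough for the macros */

#define XLOG_CIL_SPACE_LIMIT(log)      ((log)->l_logsize >> 3)        /* 1/8  */
#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * ((log)->l_logsize >> 4))  /* 3/16 */

int main(void)
{
    unsigned long sizes_mb[] = { 32, 128, 2048 };

    for (unsigned i = 0; i < sizeof(sizes_mb) / sizeof(sizes_mb[0]); i++) {
        struct xlog_mock log = { .l_logsize = sizes_mb[i] << 20 };

        printf("%4lu MB log: background push at %3lu MB, hard limit %3lu MB\n",
               sizes_mb[i],
               XLOG_CIL_SPACE_LIMIT(&log) >> 20,
               XLOG_CIL_HARD_SPACE_LIMIT(&log) >> 20);
    }
    return 0;
}

This is also why the xfs_log_cil.c hunk lets a background push return early only while space_used is still under XLOG_CIL_HARD_SPACE_LIMIT(): once the hard limit is crossed, the pusher waits for the context lock instead of giving up.
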