58 files changed, 1014 insertions, 430 deletions
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d61e3b28ce37..36d961f342af 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -146,7 +146,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
                while (rdir->head < rdir->tail) {
                        p9stat_init(&st);
                        err = p9stat_read(rdir->buf + rdir->head,
-                                                buflen - rdir->head, &st,
+                                                rdir->tail - rdir->head, &st,
                                                fid->clnt->proto_version);
                        if (err) {
                                P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0d1d966b0fe4..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2304,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
        return ret;
 }
+/*
+ * min_slot controls the lowest index we're willing to push to the
+ * right.  We'll push slots down to and including min_slot, but no lower
+ */
 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      int data_size, int empty,
                                      struct extent_buffer *right,
-                                      int free_space, u32 left_nritems)
+                                      int free_space, u32 left_nritems,
+                                      u32 min_slot)
 {
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *upper = path->nodes[1];
@@ -2327,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        if (empty)
                nr = 0;
        else
-                nr = 1;
+                nr = max_t(u32, 1, min_slot);
        if (path->slots[0] >= left_nritems)
                push_space += data_size;
@@ -2469,10 +2474,14 @@ out_unlock:
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf.  It won't
+ * push any slot lower than min_slot
 */
 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-                           *root, struct btrfs_path *path, int data_size,
+                           *root, struct btrfs_path *path,
-                           int empty)
+                           int min_data_size, int data_size,
+                           int empty, u32 min_slot)
 {
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *right;
@@ -2514,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        if (left_nritems == 0)
                goto out_unlock;
-        return __push_leaf_right(trans, root, path, data_size, empty,
+        return __push_leaf_right(trans, root, path, min_data_size, empty,
-                                right, free_space, left_nritems);
+                                right, free_space, left_nritems, min_slot);
 out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
@@ -2525,12 +2534,17 @@ out_unlock:
 /*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
+ * items
 */
 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct btrfs_path *path, int data_size,
                                     int empty, struct extent_buffer *left,
-                                     int free_space, int right_nritems)
+                                     int free_space, u32 right_nritems,
+                                     u32 max_slot)
 {
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
@@ -2549,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        slot = path->slots[1];
        if (empty)
-                nr = right_nritems;
+                nr = min(right_nritems, max_slot);
        else
-                nr = right_nritems - 1;
+                nr = min(right_nritems - 1, max_slot);
        for (i = 0; i < nr; i++) {
                item = btrfs_item_nr(right, i);
@@ -2712,10 +2726,14 @@ out:
 /*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
+ * items
 */
 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
-                          *root, struct btrfs_path *path, int data_size,
+                          *root, struct btrfs_path *path, int min_data_size,
-                          int empty)
+                          int data_size, int empty, u32 max_slot)
 {
        struct extent_buffer *right = path->nodes[0];
        struct extent_buffer *left;
@@ -2761,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
                goto out;
        }
-        return __push_leaf_left(trans, root, path, data_size,
+        return __push_leaf_left(trans, root, path, min_data_size,
-                               empty, left, free_space, right_nritems);
+                               empty, left, free_space, right_nritems,
+                               max_slot);
 out:
        btrfs_tree_unlock(left);
        free_extent_buffer(left);
@@ -2855,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 }
 /*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf.  A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ *          A                 B                 C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves.  If all goes well we can avoid the double split
+ * completely.
+ *
+ * returns < 0 if one of the pushes reported an error, 0 if at least one
+ * push worked or the leaf already satisfies us after the right push (our
+ * slot is at either end of the leaf, or the leaf has data_size bytes
+ * free), and 1 if neither push made any progress.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path,
+                                          int data_size)
+{
+        int ret;
+        int progress = 0;
+        int slot;
+        u32 nritems;
+        slot = path->slots[0];
+        /*
+         * try to push all the items after our slot into the
+         * right leaf
+         */
+        ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
+        if (ret < 0)
+                return ret;
+        if (ret == 0)
+                progress++;
+        nritems = btrfs_header_nritems(path->nodes[0]);
+        /*
+         * our goal is to get our slot at the start or end of a leaf.  If
+         * we've done so we're done.  We're also done if the leaf now has
+         * at least data_size bytes free.
+         */
+        if (path->slots[0] == 0 || path->slots[0] == nritems)
+                return 0;
+        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+                return 0;
+        /* try to push all the items before our slot into the left leaf */
+        slot = path->slots[0];
+        ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
+        if (ret < 0)
+                return ret;
+        if (ret == 0)
+                progress++;
+        if (progress)
+                return 0;
+        return 1;
+}
+/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
@@ -2876,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
        int wret;
        int split;
        int num_doubles = 0;
+        int tried_avoid_double = 0;
        l = path->nodes[0];
        slot = path->slots[0];
@@ -2884,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
                return -EOVERFLOW;
        /* first try to make some room by pushing left and right */
-        if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+        if (data_size) {
-                wret = push_leaf_right(trans, root, path, data_size, 0);
+                wret = push_leaf_right(trans, root, path, data_size,
+                                       data_size, 0, 0);
                if (wret < 0)
                        return wret;
                if (wret) {
-                        wret = push_leaf_left(trans, root, path, data_size, 0);
+                        wret = push_leaf_left(trans, root, path, data_size,
+                                              data_size, 0, (u32)-1);
                        if (wret < 0)
                                return wret;
                }
@@ -2923,6 +3003,8 @@ again:
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+                                        if (data_size && !tried_avoid_double)
+                                                goto push_for_double;
                                        split = 2;
                                }
                        }
@@ -2939,6 +3021,8 @@ again:
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+                                        if (data_size && !tried_avoid_double)
+                                                goto push_for_double;
                                        split = 2 ;
                                }
                        }
@@ -3019,6 +3103,13 @@ again:
        }
        return ret;
+push_for_double:
+        push_for_double_split(trans, root, path, data_size);
+        tried_avoid_double = 1;
+        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+                return 0;
+        goto again;
 }
 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3915,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                        extent_buffer_get(leaf);
                        btrfs_set_path_blocking(path);
-                        wret = push_leaf_left(trans, root, path, 1, 1);
+                        wret = push_leaf_left(trans, root, path, 1, 1,
+                                              1, (u32)-1);
                        if (wret < 0 && wret != -ENOSPC)
                                ret = wret;
                        if (path->nodes[0] == leaf &&
                            btrfs_header_nritems(leaf)) {
-                                wret = push_leaf_right(trans, root, path, 1, 1);
+                                wret = push_leaf_right(trans, root, path, 1,
+                                                       1, 1, 0);
                                if (wret < 0 && wret != -ENOSPC)
                                        ret = wret;
                        }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4dbaf89b1337..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1458,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
         */
        /* the destination must be opened for writing */
-        if (!(file->f_mode & FMODE_WRITE))
+        if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
                return -EINVAL;
        ret = mnt_want_write(file->f_path.mnt);
@@ -1511,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        /* determine range to clone */
        ret = -EINVAL;
-        if (off >= src->i_size || off + len > src->i_size)
+        if (off + len > src->i_size || off + len < off)
                goto out_unlock;
        if (len == 0)
                olen = len = src->i_size - off;
@@ -1578,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        u64 disko = 0, diskl = 0;
                        u64 datao = 0, datal = 0;
                        u8 comp;
+                        u64 endoff;
                        size = btrfs_item_size_nr(leaf, slot);
                        read_extent_buffer(leaf, buf,
@@ -1712,9 +1713,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        btrfs_release_path(root, path);
                        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-                        if (new_key.offset + datal > inode->i_size)
-                                btrfs_i_size_write(inode,
+                        /*
-                                                   new_key.offset + datal);
+                         * we round up to the block size at eof when
+                         * determining which extents to clone above,
+                         * but shouldn't round up the file size
+                         */
+                        endoff = new_key.offset + datal;
+                        if (endoff > off+olen)
+                                endoff = off+olen;
+                        if (endoff > inode->i_size)
+                                btrfs_i_size_write(inode, endoff);
                        BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
                        ret = btrfs_update_inode(trans, root, inode);
                        BUG_ON(ret);
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 04b8280582a9..bc87b9c1d27e 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -2,7 +2,7 @@ config CEPH_FS
        tristate "Ceph distributed file system (EXPERIMENTAL)"
        depends on INET && EXPERIMENTAL
        select LIBCRC32C
-        select CONFIG_CRYPTO_AES
+        select CRYPTO_AES
        help
          Choose Y or M here to include support for mounting the
          experimental Ceph distributed file system.  Ceph is an extremely
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 3fe49042d8ad..6d44053ecff1 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
                remove_ticket_handler(ac, th);
        }
+        if (xi->auth_authorizer.buf)
+                ceph_buffer_put(xi->auth_authorizer.buf);
        kfree(ac->private);
        ac->private = NULL;
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 74144d6389f0..b81be9a56487 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -627,7 +627,7 @@ retry:
        if (fmode >= 0)
                __ceph_get_fmode(ci, fmode);
        spin_unlock(&inode->i_lock);
-        wake_up(&ci->i_cap_wq);
+        wake_up_all(&ci->i_cap_wq);
        return 0;
 }
@@ -1181,7 +1181,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        }
        if (wake)
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
        return delayed;
 }
@@ -2153,7 +2153,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
        else if (flushsnaps)
                ceph_flush_snaps(ci);
        if (wake)
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
        if (put)
                iput(inode);
 }
@@ -2229,7 +2229,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
                iput(inode);
        } else if (complete_capsnap) {
                ceph_flush_snaps(ci);
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
        }
        if (drop_capsnap)
                iput(inode);
@@ -2405,7 +2405,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        if (queue_invalidate)
                ceph_queue_invalidate(inode);
        if (wake)
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
        if (check_caps == 1)
                ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
@@ -2460,7 +2460,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                                         struct ceph_inode_info,
                                         i_flushing_item)->vfs_inode);
                mdsc->num_cap_flushing--;
-                wake_up(&mdsc->cap_flushing_wq);
+                wake_up_all(&mdsc->cap_flushing_wq);
                dout(" inode %p now !flushing\n", inode);
                if (ci->i_dirty_caps == 0) {
@@ -2472,7 +2472,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                }
        }
        spin_unlock(&mdsc->cap_dirty_lock);
-        wake_up(&ci->i_cap_wq);
+        wake_up_all(&ci->i_cap_wq);
 out:
        spin_unlock(&inode->i_lock);
@@ -2984,6 +2984,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
                memcpy(*p, dentry->d_name.name, dentry->d_name.len);
                *p += dentry->d_name.len;
                rel->dname_seq = cpu_to_le32(di->lease_seq);
+                __ceph_mdsc_drop_dentry_lease(dentry);
        }
        spin_unlock(&dentry->d_lock);
        return ret;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f85719310db2..f94ed3c7f6a5 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -266,6 +266,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
        spin_lock(&inode->i_lock);
        if ((filp->f_pos == 2 || fi->dentry) &&
            !ceph_test_opt(client, NOASYNCREADDIR) &&
+            ceph_snap(inode) != CEPH_SNAPDIR &&
            (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
            __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
                err = __dcache_readdir(filp, dirent, filldir);
@@ -1013,18 +1014,22 @@ out_touch:
 /*
 * When a dentry is released, clear the dir I_COMPLETE if it was part
- * of the current dir gen.
+ * of the current dir gen or if this is in the snapshot namespace.
 */
 static void ceph_dentry_release(struct dentry *dentry)
 {
        struct ceph_dentry_info *di = ceph_dentry(dentry);
        struct inode *parent_inode = dentry->d_parent->d_inode;
+        u64 snapid = ceph_snap(parent_inode);
-        if (parent_inode) {
+        dout("dentry_release %p parent %p\n", dentry, parent_inode);
+        if (parent_inode && snapid != CEPH_SNAPDIR) {
                struct ceph_inode_info *ci = ceph_inode(parent_inode);
                spin_lock(&parent_inode->i_lock);
-                if (ci->i_shared_gen == di->lease_shared_gen) {
+                if (ci->i_shared_gen == di->lease_shared_gen ||
+                    snapid <= CEPH_MAXSNAP) {
                        dout(" clearing %p complete (d_release)\n",
                             parent_inode);
                        ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
@@ -1241,7 +1246,9 @@ struct dentry_operations ceph_dentry_ops = {
 struct dentry_operations ceph_snapdir_dentry_ops = {
        .d_revalidate = ceph_snapdir_d_revalidate,
+        .d_release = ceph_dentry_release,
 };
 struct dentry_operations ceph_snap_dentry_ops = {
+        .d_release = ceph_dentry_release,
 };
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6251a1574b94..7c08698fad3e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file)
        kmem_cache_free(ceph_file_cachep, cf);
        /* wake up anyone waiting for caps on this inode */
-        wake_up(&ci->i_cap_wq);
+        wake_up_all(&ci->i_cap_wq);
        return 0;
 }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8f9b9fe8ef9f..389f9dbd9949 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1199,8 +1199,10 @@ retry_lookup:
                                goto out;
                        }
                        err = ceph_init_dentry(dn);
-                        if (err < 0)
+                        if (err < 0) {
+                                dput(dn);
                                goto out;
+                        }
                } else if (dn->d_inode &&
                           (ceph_ino(dn->d_inode) != vino.ino ||
                            ceph_snap(dn->d_inode) != vino.snap)) {
@@ -1499,7 +1501,7 @@ retry:
        if (wrbuffer_refs == 0)
                ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
        if (wake)
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3ab79f6c4ce8..dd440bd438a9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -868,7 +868,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-        wake_up(&ci->i_cap_wq);
+        wake_up_all(&ci->i_cap_wq);
        if (arg) {
                spin_lock(&inode->i_lock);
                ci->i_wanted_max_size = 0;
@@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        ceph_encode_filepath(&p, end, ino1, path1);
        ceph_encode_filepath(&p, end, ino2, path2);
+        /* make note of release offset, in case we need to replay */
+        req->r_request_release_offset = p - msg->front.iov_base;
        /* cap releases */
        releases = 0;
        if (req->r_inode_drop)
@@ -1561,7 +1564,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
        if (req->r_callback)
                req->r_callback(mdsc, req);
        else
-                complete(&req->r_completion);
+                complete_all(&req->r_completion);
 }
 /*
@@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
             req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+        if (req->r_got_unsafe) {
+                /*
+                 * Replay.  Do not regenerate message (and rebuild
+                 * paths, etc.); just use the original message.
+                 * Rebuilding paths will break for renames because
+                 * d_move mangles the src name.
+                 */
+                msg = req->r_request;
+                rhead = msg->front.iov_base;
+                flags = le32_to_cpu(rhead->flags);
+                flags |= CEPH_MDS_FLAG_REPLAY;
+                rhead->flags = cpu_to_le32(flags);
+                if (req->r_target_inode)
+                        rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+                rhead->num_retry = req->r_attempts - 1;
+                /* remove cap/dentry releases from message */
+                rhead->num_releases = 0;
+                msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
+                msg->front.iov_len = req->r_request_release_offset;
+                return 0;
+        }
        if (req->r_request) {
                ceph_msg_put(req->r_request);
                req->r_request = NULL;
@@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        rhead->flags = cpu_to_le32(flags);
        rhead->num_fwd = req->r_num_fwd;
        rhead->num_retry = req->r_attempts - 1;
+        rhead->ino = 0;
        dout(" r_locked_dir = %p\n", req->r_locked_dir);
-        if (req->r_target_inode && req->r_got_unsafe)
-                rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
-        else
-                rhead->ino = 0;
        return 0;
 }
@@ -1907,7 +1932,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        if (head->safe) {
                req->r_got_safe = true;
                __unregister_request(mdsc, req);
-                complete(&req->r_safe_completion);
+                complete_all(&req->r_safe_completion);
                if (req->r_got_unsafe) {
                        /*
@@ -1922,7 +1947,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        /* last unsafe request during umount? */
                        if (mdsc->stopping && !__get_oldest_req(mdsc))
-                                complete(&mdsc->safe_umount_waiters);
+                                complete_all(&mdsc->safe_umount_waiters);
                        mutex_unlock(&mdsc->mutex);
                        goto out;
                }
@@ -2101,7 +2126,7 @@ static void handle_session(struct ceph_mds_session *session,
                        pr_info("mds%d reconnect denied\n", session->s_mds);
                remove_session_caps(session);
                wake = 1; /* for good measure */
-                complete(&mdsc->session_close_waiters);
+                complete_all(&mdsc->session_close_waiters);
                kick_requests(mdsc, mds);
                break;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index b292fa42a66d..952410c60d09 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -188,6 +188,7 @@ struct ceph_mds_request {
        int r_old_inode_drop, r_old_inode_unless;
        struct ceph_msg  *r_request;  /* original request */
+        int r_request_release_offset;
        struct ceph_msg  *r_reply;
        struct ceph_mds_reply_info_parsed r_reply_info;
        int r_err;
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 9ad43a310a41..15167b2daa55 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con);
 * nicely render a sockaddr as a string.
 */
 #define MAX_ADDR_STR 20
-static char addr_str[MAX_ADDR_STR][40];
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
 static DEFINE_SPINLOCK(addr_str_lock);
 static int last_addr_str;
@@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss)
        int i;
        char *s;
        struct sockaddr_in *in4 = (void *)ss;
-        unsigned char *quad = (void *)&in4->sin_addr.s_addr;
        struct sockaddr_in6 *in6 = (void *)ss;
        spin_lock(&addr_str_lock);
@@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss)
        switch (ss->ss_family) {
        case AF_INET:
-                sprintf(s, "%u.%u.%u.%u:%u",
+                snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
-                        (unsigned int)quad[0],
+                         (unsigned int)ntohs(in4->sin_port));
-                        (unsigned int)quad[1],
-                        (unsigned int)quad[2],
-                        (unsigned int)quad[3],
-                        (unsigned int)ntohs(in4->sin_port));
                break;
        case AF_INET6:
-                sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
+                snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
-                        in6->sin6_addr.s6_addr16[0],
+                         (unsigned int)ntohs(in6->sin6_port));
-                        in6->sin6_addr.s6_addr16[1],
-                        in6->sin6_addr.s6_addr16[2],
-                        in6->sin6_addr.s6_addr16[3],
-                        in6->sin6_addr.s6_addr16[4],
-                        in6->sin6_addr.s6_addr16[5],
-                        in6->sin6_addr.s6_addr16[6],
-                        in6->sin6_addr.s6_addr16[7],
-                        (unsigned int)ntohs(in6->sin6_port));
                break;
        default:
@@ -215,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock,
 */
 static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 {
-        struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
+        struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
        struct socket *sock;
        int ret;
        BUG_ON(con->sock);
-        ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+        ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+                               IPPROTO_TCP, &sock);
        if (ret)
                return ERR_PTR(ret);
        con->sock = sock;
@@ -234,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
        dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
-        ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
+        ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+                                 O_NONBLOCK);
        if (ret == -EINPROGRESS) {
                dout("connect %s EINPROGRESS sk_state = %u\n",
                     pr_addr(&con->peer_addr.in_addr),
@@ -1009,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end,
                struct sockaddr_in *in4 = (void *)ss;
                struct sockaddr_in6 *in6 = (void *)ss;
                int port;
+                char delim = ',';
+                if (*p == '[') {
+                        delim = ']';
+                        p++;
+                }
                memset(ss, 0, sizeof(*ss));
                if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
-                             ',', &ipend)) {
+                             delim, &ipend))
                        ss->ss_family = AF_INET;
-                } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+                else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
-                                    ',', &ipend)) {
+                                  delim, &ipend))
                        ss->ss_family = AF_INET6;
-                } else {
+                else
                        goto bad;
-                }
                p = ipend;
+                if (delim == ']') {
+                        if (*p != ']') {
+                                dout("missing matching ']'\n");
+                                goto bad;
+                        }
+                        p++;
+                }
                /* port? */
                if (p < end && *p == ':') {
                        port = 0;
@@ -1055,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end,
        return 0;
 bad:
-        pr_err("parse_ips bad ip '%s'\n", c);
+        pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
        return -EINVAL;
 }
@@ -2015,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 {
        mutex_lock(&con->mutex);
        if (!list_empty(&msg->list_head)) {
-                dout("con_revoke %p msg %p\n", con, msg);
+                dout("con_revoke %p msg %p - was on queue\n", con, msg);
                list_del_init(&msg->list_head);
                ceph_msg_put(msg);
                msg->hdr.seq = 0;
-                if (con->out_msg == msg) {
+        }
-                        ceph_msg_put(con->out_msg);
+        if (con->out_msg == msg) {
-                        con->out_msg = NULL;
+                dout("con_revoke %p msg %p - was sending\n", con, msg);
-                }
+                con->out_msg = NULL;
                if (con->out_kvec_is_msg) {
                        con->out_skip = con->out_kvec_bytes;
                        con->out_kvec_is_msg = false;
                }
-        } else {
+                ceph_msg_put(msg);
-                dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
+                msg->hdr.seq = 0;
        }
        mutex_unlock(&con->mutex);
 }
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index cc115eafae11..54fe01c50706 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -345,7 +345,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 out:
        mutex_unlock(&monc->mutex);
-        wake_up(&client->auth_wq);
+        wake_up_all(&client->auth_wq);
 }
 /*
@@ -462,7 +462,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
        }
        mutex_unlock(&monc->mutex);
        if (req) {
-                complete(&req->completion);
+                complete_all(&req->completion);
                put_generic_request(req);
        }
        return;
@@ -718,7 +718,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
                                     monc->m_auth->front_max);
        if (ret < 0) {
                monc->client->auth_err = ret;
-                wake_up(&monc->client->auth_wq);
+                wake_up_all(&monc->client->auth_wq);
        } else if (ret > 0) {
                __send_prepared_auth_request(monc, ret);
        } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 92b7251a53f1..e38522347898 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -862,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
        if (req->r_callback)
                req->r_callback(req, msg);
        else
-                complete(&req->r_completion);
+                complete_all(&req->r_completion);
        if (flags & CEPH_OSD_FLAG_ONDISK) {
                if (req->r_safe_callback)
                        req->r_safe_callback(req, msg);
-                complete(&req->r_safe_completion);  /* fsync waiter */
+                complete_all(&req->r_safe_completion);  /* fsync waiter */
        }
 done:
@@ -1083,7 +1083,7 @@ done:
        if (newmap)
                kick_requests(osdc, NULL);
        up_read(&osdc->map_sem);
-        wake_up(&osdc->client->auth_wq);
+        wake_up_all(&osdc->client->auth_wq);
        return;
 bad:
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 50ce64ebd330..416d46adbf87 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
                if (ev > CEPH_PG_POOL_VERSION) {
                        pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
                                   ev, CEPH_PG_POOL_VERSION);
+                        kfree(pi);
                        goto bad;
                }
                __decode_pool(p, pi);
@@ -830,12 +831,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                /* remove any? */
                while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
                                                node)->pgid, pgid) <= 0) {
-                        struct rb_node *cur = rbp;
+                        struct ceph_pg_mapping *cur =
+                                rb_entry(rbp, struct ceph_pg_mapping, node);
+                        
                        rbp = rb_next(rbp);
-                        dout(" removed pg_temp %llx\n",
+                        dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-                             *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+                        rb_erase(&cur->node, &map->pg_temp);
-                                               node)->pgid);
+                        kfree(cur);
-                        rb_erase(cur, &map->pg_temp);
                }
                if (pglen) {
@@ -851,19 +853,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                        for (j = 0; j < pglen; j++)
                                pg->osds[j] = ceph_decode_32(p);
                        err = __insert_pg_mapping(pg, &map->pg_temp);
-                        if (err)
+                        if (err) {
+                                kfree(pg);
                                goto bad;
+                        }
                        dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
                             pglen);
                }
        }
        while (rbp) {
-                struct rb_node *cur = rbp;
+                struct ceph_pg_mapping *cur =
+                        rb_entry(rbp, struct ceph_pg_mapping, node);
                rbp = rb_next(rbp);
-                dout(" removed pg_temp %llx\n",
+                dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-                     *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+                rb_erase(&cur->node, &map->pg_temp);
-                                       node)->pgid);
+                kfree(cur);
-                rb_erase(cur, &map->pg_temp);
        }
        /* ignore the rest */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 484e52bb40bb..2cb1a70214d7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -923,7 +923,7 @@ init_cifs(void)
                goto out_unregister_filesystem;
 #endif
 #ifdef CONFIG_CIFS_DFS_UPCALL
-        rc = register_key_type(&key_type_dns_resolver);
+        rc = cifs_init_dns_resolver();
        if (rc)
                goto out_unregister_key_type;
 #endif
@@ -935,7 +935,7 @@ init_cifs(void)
 out_unregister_resolver_key:
 #ifdef CONFIG_CIFS_DFS_UPCALL
-        unregister_key_type(&key_type_dns_resolver);
+        cifs_exit_dns_resolver();
 out_unregister_key_type:
 #endif
 #ifdef CONFIG_CIFS_UPCALL
@@ -961,7 +961,7 @@ exit_cifs(void)
        cifs_proc_clean();
 #ifdef CONFIG_CIFS_DFS_UPCALL
        cifs_dfs_release_automount_timer();
-        unregister_key_type(&key_type_dns_resolver);
+        cifs_exit_dns_resolver();
 #endif
 #ifdef CONFIG_CIFS_UPCALL
        unregister_key_type(&cifs_spnego_key_type);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 4db2c5e7283f..49315cbf742d 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -24,12 +24,16 @@
 */
 #include <linux/slab.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
 #include <keys/user-type.h>
 #include "dns_resolve.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
+static const struct cred *dns_resolver_cache;
 /* Checks if supplied name is IP address
 * returns:
 *              1 - name is IP
@@ -94,6 +98,7 @@ struct key_type key_type_dns_resolver = {
 int
 dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 {
+        const struct cred *saved_cred;
        int rc = -EAGAIN;
        struct key *rkey = ERR_PTR(-EAGAIN);
        char *name;
@@ -133,8 +138,15 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
                goto skip_upcall;
        }
+        saved_cred = override_creds(dns_resolver_cache);
        rkey = request_key(&key_type_dns_resolver, name, "");
+        revert_creds(saved_cred);
        if (!IS_ERR(rkey)) {
+                if (!(rkey->perm & KEY_USR_VIEW)) {
+                        down_read(&rkey->sem);
+                        rkey->perm |= KEY_USR_VIEW;
+                        up_read(&rkey->sem);
+                }
                len = rkey->type_data.x[0];
                data = rkey->payload.data;
        } else {
@@ -165,4 +177,61 @@ out:
        return rc;
 }
+int __init cifs_init_dns_resolver(void)
+{
+        struct cred *cred;
+        struct key *keyring;
+        int ret;
+        printk(KERN_NOTICE "Registering the %s key type\n",
+               key_type_dns_resolver.name);
+        /* create an override credential set with a special thread keyring in
+         * which DNS requests are cached
+         *
+         * this is used to prevent malicious redirections from being installed
+         * with add_key().
+         */
+        cred = prepare_kernel_cred(NULL);
+        if (!cred)
+                return -ENOMEM;
+        keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
+                            (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                            KEY_USR_VIEW | KEY_USR_READ,
+                            KEY_ALLOC_NOT_IN_QUOTA);
+        if (IS_ERR(keyring)) {
+                ret = PTR_ERR(keyring);
+                goto failed_put_cred;
+        }
+        ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+        if (ret < 0)
+                goto failed_put_key;
+        ret = register_key_type(&key_type_dns_resolver);
+        if (ret < 0)
+                goto failed_put_key;
+        /* instruct request_key() to use this special keyring as a cache for
+         * the results it looks up */
+        cred->thread_keyring = keyring;
+        cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+        dns_resolver_cache = cred;
+        return 0;
+failed_put_key:
+        key_put(keyring);
+failed_put_cred:
+        put_cred(cred);
+        return ret;
+}
+void __exit cifs_exit_dns_resolver(void)
+{
+        key_revoke(dns_resolver_cache->thread_keyring);
+        unregister_key_type(&key_type_dns_resolver);
+        put_cred(dns_resolver_cache);
+        printk(KERN_NOTICE "Unregistered %s key type\n",
+               key_type_dns_resolver.name);
+}
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 966e9288930b..26b9eaa9f5ee 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,8 @@
 #define _DNS_RESOLVE_H
 #ifdef __KERNEL__
-#include <linux/key-type.h>
+extern int __init cifs_init_dns_resolver(void);
-extern struct key_type key_type_dns_resolver;
+extern void __exit cifs_exit_dns_resolver(void);
 extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
 #endif /* KERNEL */
diff --git a/fs/dcache.c b/fs/dcache.c
index c8c78ba07827..86d4db15473e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -896,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 *
 * In this case we return -1 to tell the caller that we baled.
 */
-static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        if (nr) {
                if (!(gfp_mask & __GFP_FS))
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 2d8dbce9d485..46c4dd8dfcc3 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
 static struct hlist_head *ecryptfs_daemon_hash;
 struct mutex ecryptfs_daemon_hash_mux;
-static int ecryptfs_hash_buckets;
+static int ecryptfs_hash_bits;
 #define ecryptfs_uid_hash(uid) \
-        hash_long((unsigned long)uid, ecryptfs_hash_buckets)
+        hash_long((unsigned long)uid, ecryptfs_hash_bits)
 static u32 ecryptfs_msg_counter;
 static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void)
        }
        mutex_init(&ecryptfs_daemon_hash_mux);
        mutex_lock(&ecryptfs_daemon_hash_mux);
-        ecryptfs_hash_buckets = 1;
+        ecryptfs_hash_bits = 1;
-        while (ecryptfs_number_of_users >> ecryptfs_hash_buckets)
+        while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
-                ecryptfs_hash_buckets++;
+                ecryptfs_hash_bits++;
        ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head)
-                                        * ecryptfs_hash_buckets), GFP_KERNEL);
+                                        * (1 << ecryptfs_hash_bits)),
+                                       GFP_KERNEL);
        if (!ecryptfs_daemon_hash) {
                rc = -ENOMEM;
                printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
                mutex_unlock(&ecryptfs_daemon_hash_mux);
                goto out;
        }
-        for (i = 0; i < ecryptfs_hash_buckets; i++)
+        for (i = 0; i < (1 << ecryptfs_hash_bits); i++)
                INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]);
        mutex_unlock(&ecryptfs_daemon_hash_mux);
        ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx)
@@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void)
                int i;
                mutex_lock(&ecryptfs_daemon_hash_mux);
-                for (i = 0; i < ecryptfs_hash_buckets; i++) {
+                for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
                        int rc;
                        hlist_for_each_entry(daemon, elem,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 4a48c0f4b402..84da64b551b2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1041,6 +1041,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
        if (gfs2_is_stuffed(ip)) {
                u64 dsize = size + sizeof(struct gfs2_inode);
+                ip->i_disksize = size;
                ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8295c5b5d4a9..6b48d7c268b2 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
        unsigned totlen = be16_to_cpu(dent->de_rec_len);
        if (gfs2_dirent_sentinel(dent))
-                actual = GFS2_DIRENT_SIZE(0);
+                actual = 0;
        if (totlen - actual >= required)
                return 1;
        return 0;
@@ -1231,6 +1231,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
        return 0;
 }
+static void *gfs2_alloc_sort_buffer(unsigned size)
+{
+        void *ptr = NULL;
+        if (size < KMALLOC_MAX_SIZE)
+                ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
+        if (!ptr)
+                ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
+        return ptr;
+}
+static void gfs2_free_sort_buffer(void *ptr)
+{
+        if (is_vmalloc_addr(ptr))
+                vfree(ptr);
+        else
+                kfree(ptr);
+}
 static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
                              filldir_t filldir, int *copied, unsigned *depth,
                              u64 leaf_no)
@@ -1271,7 +1290,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
         * 99 is the maximum number of entries that can fit in a single
         * leaf block.
         */
-        larr = vmalloc((leaves + entries + 99) * sizeof(void *));
+        larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
        if (!larr)
                goto out;
        darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1282,7 +1301,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
        do {
                error = get_leaf(ip, lfn, &bh);
                if (error)
-                        goto out_kfree;
+                        goto out_free;
                lf = (struct gfs2_leaf *)bh->b_data;
                lfn = be64_to_cpu(lf->lf_next);
                if (lf->lf_entries) {
@@ -1291,7 +1310,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
                                                gfs2_dirent_gather, NULL, &g);
                        error = PTR_ERR(dent);
                        if (IS_ERR(dent))
-                                goto out_kfree;
+                                goto out_free;
                        if (entries2 != g.offset) {
                                fs_warn(sdp, "Number of entries corrupt in dir "
                                                "leaf %llu, entries2 (%u) != "
@@ -1300,7 +1319,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
                                        entries2, g.offset);
                                        
                                error = -EIO;
-                                goto out_kfree;
+                                goto out_free;
                        }
                        error = 0;
                        larr[leaf++] = bh;
@@ -1312,10 +1331,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
        BUG_ON(entries2 != entries);
        error = do_filldir_main(ip, offset, opaque, filldir, darr,
                                entries, copied);
-out_kfree:
+out_free:
        for(i = 0; i < leaf; i++)
                brelse(larr[i]);
-        vfree(larr);
+        gfs2_free_sort_buffer(larr);
 out:
        return error;
 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ddcdbf493536..0898f3ec8212 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -706,8 +706,18 @@ static void glock_work_func(struct work_struct *work)
 {
        unsigned long delay = 0;
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+        struct gfs2_holder *gh;
        int drop_ref = 0;
+        if (unlikely(test_bit(GLF_FROZEN, &gl->gl_flags))) {
+                spin_lock(&gl->gl_spin);
+                gh = find_first_waiter(gl);
+                if (gh && (gh->gh_flags & LM_FLAG_NOEXP) &&
+                    test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+                        set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+                spin_unlock(&gl->gl_spin);
+        }
        if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
                finish_xmote(gl, gl->gl_reply);
                drop_ref = 1;
@@ -1348,7 +1358,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 }
-static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        struct gfs2_glock *gl;
        int may_demote;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b5612cbb62a5..f03afd9c44bc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
 {
        struct inode *inode;
        struct gfs2_inode *ip;
-        struct gfs2_glock *io_gl;
+        struct gfs2_glock *io_gl = NULL;
        int error;
        inode = gfs2_iget(sb, no_addr);
@@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
                ip->i_iopen_gh.gh_gl->gl_object = ip;
                gfs2_glock_put(io_gl);
+                io_gl = NULL;
                if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
                        goto gfs2_nfsbypass;
@@ -228,7 +229,8 @@ gfs2_nfsbypass:
 fail_glock:
        gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
-        gfs2_glock_put(io_gl);
+        if (io_gl)
+                gfs2_glock_put(io_gl);
 fail_put:
        if (inode->i_state & I_NEW)
                ip->i_gl->gl_object = NULL;
@@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 {
        struct gfs2_sbd *sdp;
        struct gfs2_inode *ip;
-        struct gfs2_glock *io_gl;
+        struct gfs2_glock *io_gl = NULL;
        int error;
        struct gfs2_holder gh;
        struct inode *inode;
@@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
        ip->i_iopen_gh.gh_gl->gl_object = ip;
        gfs2_glock_put(io_gl);
+        io_gl = NULL;
        inode->i_mode = DT2IF(DT_UNKNOWN);
@@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 fail_glock:
        gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
-        gfs2_glock_put(io_gl);
+        if (io_gl)
+                gfs2_glock_put(io_gl);
 fail_put:
        ip->i_gl->gl_object = NULL;
        gfs2_glock_put(ip->i_gl);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 49667d68769e..8f02d3db8f42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
-int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        struct gfs2_quota_data *qd;
        struct gfs2_sbd *sdp;
@@ -694,10 +694,8 @@ get_a_page:
                if (!buffer_mapped(bh))
                        goto unlock_out;
                /* If it's a newly allocated disk block for quota, zero it */
-                if (buffer_new(bh)) {
+                if (buffer_new(bh))
-                        memset(bh->b_data, 0, bh->b_size);
+                        zero_user(page, pos - blocksize, bh->b_size);
-                        set_buffer_uptodate(bh);
-                }
        }
        if (PageUptodate(page))
@@ -723,7 +721,7 @@ get_a_page:
        /* If quota straddles page boundary, we need to update the rest of the
         * quota at the beginning of the next page */
-        if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
+        if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
                ptr = ptr + nbytes;
                nbytes = sizeof(struct gfs2_quota) - nbytes;
                offset = 0;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 195f60c8bd14..e7d236ca48bd 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
        return ret;
 }
-extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/inode.c b/fs/inode.c
index 2bee20ae3d65..722860b323a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan)
 * This function is passed the number of inodes to scan, and it returns the
 * total number of remaining possibly-reclaimable inodes.
 */
-static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        if (nr) {
                /*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index bc2ff5932769..036880895bfc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
        struct page *new_page;
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
-        struct jbd2_buffer_trigger_type *triggers;
        journal_t *journal = transaction->t_journal;
        /*
@@ -328,21 +327,21 @@ repeat:
                done_copy_out = 1;
                new_page = virt_to_page(jh_in->b_frozen_data);
                new_offset = offset_in_page(jh_in->b_frozen_data);
-                triggers = jh_in->b_frozen_triggers;
        } else {
                new_page = jh2bh(jh_in)->b_page;
                new_offset = offset_in_page(jh2bh(jh_in)->b_data);
-                triggers = jh_in->b_triggers;
        }
        mapped_data = kmap_atomic(new_page, KM_USER0);
        /*
-         * Fire any commit trigger.  Do this before checking for escaping,
+         * Fire data frozen trigger if data wasn't already frozen.  Do this
-         * as the trigger may modify the magic offset.  If a copy-out
+         * before checking for escaping, as the trigger may modify the magic
-         * happens afterwards, it will have the correct data in the buffer.
+         * offset.  If a copy-out happens afterwards, it will have the correct
+         * data in the buffer.
         */
-        jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
+        if (!done_copy_out)
-                                   triggers);
+                jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+                                           jh_in->b_triggers);
        /*
         * Check for escaping
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e214d68620ac..b8e0806681bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -725,6 +725,9 @@ done:
                page = jh2bh(jh)->b_page;
                offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
                source = kmap_atomic(page, KM_USER0);
+                /* Fire data frozen trigger just before we copy the data */
+                jbd2_buffer_frozen_trigger(jh, source + offset,
+                                           jh->b_triggers);
                memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
                kunmap_atomic(source, KM_USER0);
@@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
        jh->b_triggers = type;
 }
-void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
                                struct jbd2_buffer_trigger_type *triggers)
 {
        struct buffer_head *bh = jh2bh(jh);
-        if (!triggers || !triggers->t_commit)
+        if (!triggers || !triggers->t_frozen)
                return;
-        triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+        triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
 }
 void jbd2_buffer_abort_trigger(struct journal_head *jh,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index a2d58c96f1b4..d258e261bdc7 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
 static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
-        /* success of check_xattr_ref_inode() means taht inode (ic) dose not have
+        /* success of check_xattr_ref_inode() means that inode (ic) does not have
         * duplicate name/value pairs. If duplicate name/value pair would be found,
         * one will be removed.
         */
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a9..e28f21b95344 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache)
 * What the mbcache registers as to get shrunk dynamically.
 */
-static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 static struct shrinker mb_cache_shrinker = {
        .shrink = mb_cache_shrink_fn,
@@ -191,13 +191,14 @@ forget:
 * This function is called by the kernel memory management when memory
 * gets low.
 *
+ * @shrink: (ignored)
 * @nr_to_scan: Number of objects to scan
 * @gfp_mask: (ignored)
 *
 * Returns the number of objects which are present in the cache.
 */
 static int
-mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
        LIST_HEAD(free_list);
        struct list_head *l, *ltmp;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 782b431ef91c..e60416d3f818 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1710,7 +1710,7 @@ static void nfs_access_free_list(struct list_head *head)
        }
 }
-int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
        LIST_HEAD(head);
        struct nfs_inode *nfsi;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d8bd619e386c..e70f44b9b3f4 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[];
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 /* dir.c */
-extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
+extern int nfs_access_cache_shrinker(struct shrinker *shrink,
+                                        int nr_to_scan, gfp_t gfp_mask);
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca20cc18..356e976772bf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
                        dump_stack();
                        goto bail;
                }
-                past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-                mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
-                     (unsigned long long)past_eof);
-                if (create && (iblock >= past_eof))
-                        set_buffer_new(bh_result);
        }
+        past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+        mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+             (unsigned long long)past_eof);
+        if (create && (iblock >= past_eof))
+                set_buffer_new(bh_result);
 bail:
        if (err < 0)
                err = -EIO;
@@ -459,36 +458,6 @@ int walk_page_buffers(	handle_t *handle,
        return ret;
 }
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
-                                                         struct page *page,
-                                                         unsigned from,
-                                                         unsigned to)
-{
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        handle_t *handle;
-        int ret = 0;
-        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-        if (IS_ERR(handle)) {
-                ret = -ENOMEM;
-                mlog_errno(ret);
-                goto out;
-        }
-        if (ocfs2_should_order_data(inode)) {
-                ret = ocfs2_jbd2_file_inode(handle, inode);
-                if (ret < 0)
-                        mlog_errno(ret);
-        }
-out:
-        if (ret) {
-                if (!IS_ERR(handle))
-                        ocfs2_commit_trans(osb, handle);
-                handle = ERR_PTR(ret);
-        }
-        return handle;
-}
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 {
        sector_t status;
@@ -1131,23 +1100,37 @@ out:
 */
 static int ocfs2_grab_pages_for_write(struct address_space *mapping,
                                      struct ocfs2_write_ctxt *wc,
-                                      u32 cpos, loff_t user_pos, int new,
+                                      u32 cpos, loff_t user_pos,
+                                      unsigned user_len, int new,
                                      struct page *mmap_page)
 {
        int ret = 0, i;
-        unsigned long start, target_index, index;
+        unsigned long start, target_index, end_index, index;
        struct inode *inode = mapping->host;
+        loff_t last_byte;
        target_index = user_pos >> PAGE_CACHE_SHIFT;
        /*
         * Figure out how many pages we'll be manipulating here. For
         * non allocating write, we just change the one
-         * page. Otherwise, we'll need a whole clusters worth.
+         * page. Otherwise, we'll need a whole cluster's worth.  If we're
+         * writing past i_size, we only need enough pages to cover the
+         * last page of the write.
         */
        if (new) {
                wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
                start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+                /*
+                 * We need the index *past* the last page we could possibly
+                 * touch.  This is the page past the end of the write or
+                 * i_size, whichever is greater.
+                 */
+                last_byte = max(user_pos + user_len, i_size_read(inode));
+                BUG_ON(last_byte < 1);
+                end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+                if ((start + wc->w_num_pages) > end_index)
+                        wc->w_num_pages = end_index - start;
        } else {
                wc->w_num_pages = 1;
                start = target_index;
@@ -1620,21 +1603,20 @@ out:
 * write path can treat it as an non-allocating write, which has no
 * special case code for sparse/nonsparse files.
 */
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
-                                        unsigned len,
+                                        struct buffer_head *di_bh,
+                                        loff_t pos, unsigned len,
                                        struct ocfs2_write_ctxt *wc)
 {
        int ret;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        loff_t newsize = pos + len;
-        if (ocfs2_sparse_alloc(osb))
+        BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
-                return 0;
        if (newsize <= i_size_read(inode))
                return 0;
-        ret = ocfs2_extend_no_holes(inode, newsize, pos);
+        ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
        if (ret)
                mlog_errno(ret);
@@ -1644,6 +1626,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
        return ret;
 }
+/*
+ * Sparse-alloc counterpart of ocfs2_expand_nonsparse_inode(): if the write
+ * position lies beyond the current i_size, hand the inode to
+ * ocfs2_zero_extend() with pos as the zeroing limit.
+ *
+ * Returns 0 when pos is not past i_size, otherwise the result of
+ * ocfs2_zero_extend().  Must only be used on sparse-alloc filesystems.
+ */
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+                           loff_t pos)
+{
+        BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+        /* Nothing to do unless the write begins past the current i_size. */
+        if (pos <= i_size_read(inode))
+                return 0;
+        return ocfs2_zero_extend(inode, di_bh, pos);
+}
 int ocfs2_write_begin_nolock(struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned flags,
                             struct page **pagep, void **fsdata,
@@ -1679,7 +1673,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                }
        }
-        ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+        if (ocfs2_sparse_alloc(osb))
+                ret = ocfs2_zero_tail(inode, di_bh, pos);
+        else
+                ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+                                                   wc);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -1789,7 +1787,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
         * that we can zero and flush if we error after adding the
         * extent.
         */
-        ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+        ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
                                         cluster_of_pages, mmap_page);
        if (ret) {
                mlog_errno(ret);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6b5a492e1749..153abb5abef0 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
        struct dlm_ctxt *dlm = NULL;
        struct dlm_ctxt *new_ctxt = NULL;
-        if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+        if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
                ret = -ENAMETOOLONG;
                mlog(ML_ERROR, "domain name length too long\n");
                goto leave;
@@ -1709,6 +1709,7 @@ retry:
                }
                if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+                        spin_unlock(&dlm_domain_lock);
                        mlog(ML_ERROR,
                             "Requested locking protocol version is not "
                             "compatible with already registered domain "
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 4a7506a4e314..94b97fc6a88e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2808,14 +2808,8 @@ again:
                mlog(0, "trying again...\n");
                goto again;
        }
-        /* now that we are sure the MIGRATING state is there, drop
-         * the unneded state which blocked threads trying to DIRTY */
-        spin_lock(&res->spinlock);
-        BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
-        BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
-        res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
-        spin_unlock(&res->spinlock);
+        ret = 0;
        /* did the target go down or die? */
        spin_lock(&dlm->spinlock);
        if (!test_bit(target, dlm->domain_map)) {
@@ -2826,9 +2820,21 @@ again:
        spin_unlock(&dlm->spinlock);
        /*
+         * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+         * another try; otherwise, we are sure the MIGRATING state is there,
+         * drop the unneeded state which blocked threads trying to DIRTY
+         */
+        spin_lock(&res->spinlock);
+        BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+        res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+        if (!ret)
+                BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+        spin_unlock(&res->spinlock);
+        /*
         * at this point:
         *
-         *   o the DLM_LOCK_RES_MIGRATING flag is set
+         *   o the DLM_LOCK_RES_MIGRATING flag is set if target not down
         *   o there are no pending asts on this lockres
         *   o all processes trying to reserve an ast on this
         *     lockres must wait for the MIGRATING flag to clear
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f8b75ce4be70..9dfaac73b36d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
        if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
                int bit;
-                bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+                bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
                if (bit >= O2NM_MAX_NODES || bit < 0)
                        dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
                else
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea64c447..2b10b36d1577 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -724,28 +724,55 @@ leave:
        return status;
 }
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ *
+ * Returns NULL when the inode does not need ordered data, a started
+ * transaction handle once the inode has been filed with it, or an
+ * ERR_PTR() carrying the failure code.  On the error path a handle that
+ * was successfully started is committed before the error is returned.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        handle_t *handle = NULL;
+        int ret = 0;
+        if (!ocfs2_should_order_data(inode))
+                goto out;
+        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+        if (IS_ERR(handle)) {
+                /* Propagate the real failure instead of assuming -ENOMEM. */
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_jbd2_file_inode(handle, inode);
+        if (ret < 0)
+                mlog_errno(ret);
+out:
+        if (ret) {
+                /* Only a handle that actually started can be committed. */
+                if (!IS_ERR(handle))
+                        ocfs2_commit_trans(osb, handle);
+                handle = ERR_PTR(ret);
+        }
+        return handle;
+}
 /* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
-                                 u64 size)
+                                 u64 abs_to)
 {
         struct address_space *mapping = inode->i_mapping;
         struct page *page;
-        unsigned long index;
+        unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
-        unsigned int offset;
         handle_t *handle = NULL;
-        int ret;
+        int ret = 0;
+        unsigned zero_from, zero_to, block_start, block_end;
-        offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+        /*
+         * The caller must pass a non-empty byte range that stays within
+         * the single page 'index' and starts on a block boundary.
+         * i_blkbits is a shift count, so the alignment mask has to be
+         * built from (1 << i_blkbits), not from i_blkbits itself.
+         */
+        BUG_ON(abs_from >= abs_to);
-        /* ugh.  in prepare/commit_write, if from==to==start of block, we
+        BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
-        ** skip the prepare.  make sure we never send an offset for the start
+        BUG_ON(abs_from & ((1 << inode->i_blkbits) - 1));
-        ** of a block
-        */
-        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
-                offset++;
-        }
-        index = size >> PAGE_CACHE_SHIFT;
         page = grab_cache_page(mapping, index);
         if (!page) {
@@ -754,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode,
                 goto out;
         }
-        ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
+        /* Get the offsets within the page that we want to zero */
-        if (ret < 0) {
+        zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
-                mlog_errno(ret);
+        zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
-                goto out_unlock;
+        if (!zero_to)
-        }
+                zero_to = PAGE_CACHE_SIZE;
-        if (ocfs2_should_order_data(inode)) {
+        mlog(0,
-                handle = ocfs2_start_walk_page_trans(inode, page, offset,
+             "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
-                                                     offset);
+             (unsigned long long)abs_from, (unsigned long long)abs_to,
-                if (IS_ERR(handle)) {
+             index, zero_from, zero_to);
-                        ret = PTR_ERR(handle);
-                        handle = NULL;
+        /* We know that zero_from is block aligned */
+        for (block_start = zero_from; block_start < zero_to;
+             block_start = block_end) {
+                block_end = block_start + (1 << inode->i_blkbits);
+                /*
+                 * block_start is block-aligned.  Bump it by one to
+                 * force ocfs2_{prepare,commit}_write() to zero the
+                 * whole block.
+                 */
+                ret = ocfs2_prepare_write_nolock(inode, page,
+                                                 block_start + 1,
+                                                 block_start + 1);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        /*
+                         * NOTE(review): this jump bypasses the commit
+                         * below; if a handle was started for an earlier
+                         * block of this page it looks like it is never
+                         * committed -- confirm.
+                         */
                         goto out_unlock;
                 }
-        }
-        /* must not update i_size! */
+                if (!handle) {
-        ret = block_commit_write(page, offset, offset);
+                        handle = ocfs2_zero_start_ordered_transaction(inode);
-        if (ret < 0)
+                        if (IS_ERR(handle)) {
-                mlog_errno(ret);
+                                ret = PTR_ERR(handle);
-        else
+                                handle = NULL;
-                ret = 0;
+                                break;
+                        }
+                }
+                /* must not update i_size! */
+                ret = block_commit_write(page, block_start + 1,
+                                         block_start + 1);
+                if (ret < 0)
+                        mlog_errno(ret);
+                else
+                        ret = 0;
+        }
         if (handle)
                 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out_unlock:
         unlock_page(page);
         page_cache_release(page);
@@ -786,22 +838,114 @@ out:
         return ret;
 }
-static int ocfs2_zero_extend(struct inode *inode,
+/*
-                             u64 zero_to_size)
+ * Find the next range to zero.  We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache.  We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed.  range_start and range_end return the next zeroing
+ * range.  A subsequent call should pass the previous range_end as its
+ * zero_start.  If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over.  Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+                                       struct buffer_head *di_bh,
+                                       u64 zero_start, u64 zero_end,
+                                       u64 *range_start, u64 *range_end)
 {
-        int ret = 0;
+        int rc = 0, needs_cow = 0;
-        u64 start_off;
+        u32 p_cpos, zero_clusters = 0;
-        struct super_block *sb = inode->i_sb;
+        u32 zero_cpos =
+                zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+        u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+        unsigned int num_clusters = 0;
+        unsigned int ext_flags = 0;
-        start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+        while (zero_cpos < last_cpos) { /* pass 1: advance to the first extent with p_cpos != 0 that is not UNWRITTEN */
-        while (start_off < zero_to_size) {
+                rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
-                ret = ocfs2_write_zero_page(inode, start_off);
+                                        &num_clusters, &ext_flags);
-                if (ret < 0) {
+                if (rc) {
-                        mlog_errno(ret);
+                        mlog_errno(rc);
+                        goto out;
+                }
+                if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+                        zero_clusters = num_clusters;
+                        if (ext_flags & OCFS2_EXT_REFCOUNTED)
+                                needs_cow = 1;
+                        break;
+                }
+                zero_cpos += num_clusters; /* skip this extent and look at the next one */
+        }
+        if (!zero_clusters) { /* pass 1 found nothing: report "nothing to do" as *range_end == 0 */
+                *range_end = 0;
+                goto out;
+        }
+        while ((zero_cpos + zero_clusters) < last_cpos) { /* pass 2: grow the range over following extents that also qualify */
+                rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+                                        &p_cpos, &num_clusters,
+                                        &ext_flags);
+                if (rc) {
+                        mlog_errno(rc);
                         goto out;
                 }
-                start_off += sb->s_blocksize;
+                if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+                        break;
+                if (ext_flags & OCFS2_EXT_REFCOUNTED)
+                        needs_cow = 1;
+                zero_clusters += num_clusters;
+        }
+        if ((zero_cpos + zero_clusters) > last_cpos) /* clamp: the last extent may run past last_cpos */
+                zero_clusters = last_cpos - zero_cpos;
+        if (needs_cow) { /* at least one extent in the range had OCFS2_EXT_REFCOUNTED set */
+                rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
+                                        UINT_MAX);
+                if (rc) {
+                        mlog_errno(rc);
+                        goto out;
+                }
+        }
+        *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); /* results are reported in bytes, cluster-aligned */
+        *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+                                             zero_cpos + zero_clusters);
+out:
+        return rc;
+}
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+                                   u64 range_end)
+{
+        int rc = 0;
+        u64 next_pos;
+        u64 zero_pos = range_start;
+        mlog(0, "range_start = %llu, range_end = %llu\n",
+             (unsigned long long)range_start,
+             (unsigned long long)range_end);
+        BUG_ON(range_start >= range_end);
+        while (zero_pos < range_end) {
+                next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+                if (next_pos > range_end)
+                        next_pos = range_end;
+                rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+                if (rc < 0) {
+                        mlog_errno(rc);
+                        break;
+                }
+                zero_pos = next_pos;
                /*
                 * Very large extends have the potential to lock up
@@ -810,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
                cond_resched();
        }
-out:
+        return rc;
+}
+/*
+ * Zero the allocated ranges between the block-aligned i_size and
+ * zero_to_size.  Ranges are obtained one at a time from
+ * ocfs2_zero_extend_get_range(), trimmed to the requested window and
+ * passed to ocfs2_zero_extend_range().
+ *
+ * Returns 0 on success or the first non-zero result of either helper.
+ */
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+                      loff_t zero_to_size)
+{
+        u64 zero_pos, chunk_start = 0, chunk_end = 0;
+        int ret = 0;
+        zero_pos = ocfs2_align_bytes_to_blocks(inode->i_sb,
+                                               i_size_read(inode));
+        mlog(0, "zero_start %llu for i_size %llu\n",
+             (unsigned long long)zero_pos,
+             (unsigned long long)i_size_read(inode));
+        /* Each pass resumes where the previous range ended. */
+        for (; zero_pos < zero_to_size; zero_pos = chunk_end) {
+                ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_pos,
+                                                  zero_to_size,
+                                                  &chunk_start,
+                                                  &chunk_end);
+                if (ret) {
+                        mlog_errno(ret);
+                        break;
+                }
+                /* A zero range end means there is nothing left to zero. */
+                if (!chunk_end)
+                        break;
+                /* Keep the range inside [zero_pos, zero_to_size). */
+                if (chunk_start < zero_pos)
+                        chunk_start = zero_pos;
+                if (chunk_end > zero_to_size)
+                        chunk_end = zero_to_size;
+                ret = ocfs2_zero_extend_range(inode, chunk_start,
+                                              chunk_end);
+                if (ret) {
+                        mlog_errno(ret);
+                        break;
+                }
+        }
         return ret;
 }
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+                          u64 new_i_size, u64 zero_to)
 {
        int ret;
        u32 clusters_to_add;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
+        /*
+         * Only quota files call this without a bh, and they can't be
+         * refcounted.
+         */
+        BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+        BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
        clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
        if (clusters_to_add < oi->ip_clusters)
                clusters_to_add = 0;
@@ -840,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
         * still need to zero the area between the old i_size and the
         * new i_size.
         */
-        ret = ocfs2_zero_extend(inode, zero_to);
+        ret = ocfs2_zero_extend(inode, di_bh, zero_to);
        if (ret < 0)
                mlog_errno(ret);
@@ -862,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode,
                goto out;
        if (i_size_read(inode) == new_i_size)
-                goto out;
+                goto out;
        BUG_ON(new_i_size < i_size_read(inode));
        /*
-         * Fall through for converting inline data, even if the fs
-         * supports sparse files.
-         *
-         * The check for inline data here is legal - nobody can add
-         * the feature since we have i_mutex. We must check it again
-         * after acquiring ip_alloc_sem though, as paths like mmap
-         * might have raced us to converting the inode to extents.
-         */
-        if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-            && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-                goto out_update_size;
-        /*
         * The alloc sem blocks people in read/write from reading our
         * allocation until we're done changing it. We depend on
         * i_mutex to block other extend/truncate calls while we're
-         * here.
+         * here.  We even have to hold it for sparse files because there
+         * might be some tail zeroing.
         */
        down_write(&oi->ip_alloc_sem);
@@ -899,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
                if (ret) {
                        up_write(&oi->ip_alloc_sem);
                        mlog_errno(ret);
                        goto out;
                }
        }
-        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+        if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-                ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+                ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+        else
+                ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+                                            new_i_size);
        up_write(&oi->ip_alloc_sem);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70e..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 int ocfs2_simple_size_update(struct inode *inode,
                             struct buffer_head *di_bh,
                             u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
-                          u64 zero_to);
+                          u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+                      loff_t zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
                  struct kstat *stat);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 47878cf16418..625de9d7088c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
        return container_of(triggers, struct ocfs2_triggers, ot_triggers);
 }
-static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
                                 struct buffer_head *bh,
                                 void *data, size_t size)
 {
@@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
 * Quota blocks have their own trigger because the struct ocfs2_block_check
 * offset depends on the blocksize.
 */
-static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
                                 struct buffer_head *bh,
                                 void *data, size_t size)
 {
@@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
 * Directory blocks also have their own trigger because the
 * struct ocfs2_block_check offset depends on the blocksize.
 */
-static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
                                 struct buffer_head *bh,
                                 void *data, size_t size)
 {
@@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
 static struct ocfs2_triggers di_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_dinode, i_check),
@@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = {
 static struct ocfs2_triggers eb_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_extent_block, h_check),
@@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = {
 static struct ocfs2_triggers rb_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_refcount_block, rf_check),
@@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = {
 static struct ocfs2_triggers gd_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_group_desc, bg_check),
@@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = {
 static struct ocfs2_triggers db_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_db_commit_trigger,
+                .t_frozen = ocfs2_db_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
 };
 static struct ocfs2_triggers xb_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_xattr_block, xb_check),
@@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = {
 static struct ocfs2_triggers dq_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_dq_commit_trigger,
+                .t_frozen = ocfs2_dq_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
 };
 static struct ocfs2_triggers dr_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = {
 static struct ocfs2_triggers dl_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_dx_leaf, dl_check),
@@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
        mutex_lock(&os->os_lock);
        ocfs2_queue_orphan_scan(osb);
        if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
-                schedule_delayed_work(&os->os_orphan_scan_work,
+                queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
                                      ocfs2_orphan_scan_timeout());
        mutex_unlock(&os->os_lock);
 }
@@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
                atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
        else {
                atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
-                schedule_delayed_work(&os->os_orphan_scan_work,
+                queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
-                                      ocfs2_orphan_scan_timeout());
+                                   ocfs2_orphan_scan_timeout());
        }
 }
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 3d7419682dc0..ec6adbf8f551 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -118,6 +118,7 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
 {
        unsigned int la_mb;
        unsigned int gd_mb;
+        unsigned int la_max_mb;
        unsigned int megs_per_slot;
        struct super_block *sb = osb->sb;
@@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
        if (megs_per_slot < la_mb)
                la_mb = megs_per_slot;
+        /* We can't store more bits than we can in a block. */
+        la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+                                                ocfs2_local_alloc_size(sb) * 8);
+        if (la_mb > la_max_mb)
+                la_mb = la_max_mb;
        return la_mb;
 }
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe00511..4607923eb24c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
                 * locking allocators ranks above a transaction start
                 */
                WARN_ON(journal_current_handle());
-                status = ocfs2_extend_no_holes(gqinode,
+                status = ocfs2_extend_no_holes(gqinode, NULL,
                        gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
                        gqinode->i_size);
                if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4d184d..dc78764ccc4c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
        u64 p_blkno;
        /* We are protected by dqio_sem so no locking needed */
-        status = ocfs2_extend_no_holes(lqinode,
+        status = ocfs2_extend_no_holes(lqinode, NULL,
                                       lqinode->i_size + 2 * sb->s_blocksize,
                                       lqinode->i_size);
        if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
                return ocfs2_local_quota_add_chunk(sb, type, offset);
        /* We are protected by dqio_sem so no locking needed */
-        status = ocfs2_extend_no_holes(lqinode,
+        status = ocfs2_extend_no_holes(lqinode, NULL,
                                       lqinode->i_size + sb->s_blocksize,
                                       lqinode->i_size);
        if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36f6518..3ac5aa733e9c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
        offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
        end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+        /*
+         * We only duplicate pages until we reach the page that contains i_size - 1.
+         * So trim 'end' to i_size.
+         */
+        if (end > i_size_read(context->inode))
+                end = i_size_read(context->inode);
        while (offset < end) {
                page_index = offset >> PAGE_CACHE_SHIFT;
@@ -4166,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
        struct inode *inode = old_dentry->d_inode;
        struct buffer_head *new_bh = NULL;
+        if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+                ret = -EINVAL;
+                mlog_errno(ret);
+                goto out;
+        }
        ret = filemap_fdatawrite(inode->i_mapping);
        if (ret) {
                mlog_errno(ret);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f4c2a9eb8c4d..a8e6a95a353f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                     le16_to_cpu(bg->bg_free_bits_count));
        le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
                     le16_to_cpu(bg->bg_bits));
-        cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg->bg_blkno);
+        cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
        if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
                le16_add_cpu(&cl->cl_next_free_rec, 1);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e97b34842cfe..d03469f61801 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
                                         struct ocfs2_xattr_value_buf *vb,
                                         struct ocfs2_xattr_set_ctxt *ctxt)
 {
-        int status = 0;
+        int status = 0, credits;
        handle_t *handle = ctxt->handle;
        enum ocfs2_alloc_restarted why;
        u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
@@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
        ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
-        status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+        while (clusters_to_add) {
-                              OCFS2_JOURNAL_ACCESS_WRITE);
+                status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-        if (status < 0) {
+                                       OCFS2_JOURNAL_ACCESS_WRITE);
-                mlog_errno(status);
+                if (status < 0) {
-                goto leave;
+                        mlog_errno(status);
-        }
+                        break;
+                }
-        prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+                prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
-        status = ocfs2_add_clusters_in_btree(handle,
+                status = ocfs2_add_clusters_in_btree(handle,
-                                             &et,
+                                                     &et,
-                                             &logical_start,
+                                                     &logical_start,
-                                             clusters_to_add,
+                                                     clusters_to_add,
-                                             0,
+                                                     0,
-                                             ctxt->data_ac,
+                                                     ctxt->data_ac,
-                                             ctxt->meta_ac,
+                                                     ctxt->meta_ac,
-                                             &why);
+                                                     &why);
-        if (status < 0) {
+                if ((status < 0) && (status != -EAGAIN)) {
-                mlog_errno(status);
+                        if (status != -ENOSPC)
-                goto leave;
+                                mlog_errno(status);
-        }
+                        break;
+                }
-        ocfs2_journal_dirty(handle, vb->vb_bh);
+                ocfs2_journal_dirty(handle, vb->vb_bh);
-        clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
+                clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+                                         prev_clusters;
-        /*
+                if (why != RESTART_NONE && clusters_to_add) {
-         * We should have already allocated enough space before the transaction,
+                        /*
-         * so no need to restart.
+                         * We can only fail in case the alloc file doesn't give
-         */
+                         * up enough clusters.
-        BUG_ON(why != RESTART_NONE || clusters_to_add);
+                         */
+                        BUG_ON(why == RESTART_META);
-leave:
+                        mlog(0, "restarting xattr value extension for %u"
+                             " clusters,.\n", clusters_to_add);
+                        credits = ocfs2_calc_extend_credits(inode->i_sb,
+                                                            &vb->vb_xv->xr_list,
+                                                            clusters_to_add);
+                        status = ocfs2_extend_trans(handle, credits);
+                        if (status < 0) {
+                                status = -ENOMEM;
+                                mlog_errno(status);
+                                break;
+                        }
+                }
+        }
        return status;
 }
@@ -6788,16 +6804,15 @@ out:
        return ret;
 }
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
                                u64 blkno, u64 new_blkno, u32 clusters,
+                                u32 *cpos, int num_buckets,
                                struct ocfs2_alloc_context *meta_ac,
                                struct ocfs2_alloc_context *data_ac,
                                struct ocfs2_reflink_xattr_tree_args *args)
 {
        int i, j, ret = 0;
        struct super_block *sb = args->reflink->old_inode->i_sb;
-        u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
-        u32 num_buckets = clusters * bpc;
        int bpb = args->old_bucket->bu_blocks;
        struct ocfs2_xattr_value_buf vb = {
                .vb_access = ocfs2_journal_access,
@@ -6816,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
                        break;
                }
-                /*
-                 * The real bucket num in this series of blocks is stored
-                 * in the 1st bucket.
-                 */
-                if (i == 0)
-                        num_buckets = le16_to_cpu(
-                                bucket_xh(args->old_bucket)->xh_num_buckets);
                ret = ocfs2_xattr_bucket_journal_access(handle,
                                                args->new_bucket,
                                                OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6837,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
                               bucket_block(args->old_bucket, j),
                               sb->s_blocksize);
+                /*
+                 * Record the start cpos so that we can use it to initialize
+                 * our xattr tree. We also set xh_num_buckets for the new
+                 * bucket.
+                 */
+                if (i == 0) {
+                        *cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+                                            xh_entries[0].xe_name_hash);
+                        bucket_xh(args->new_bucket)->xh_num_buckets =
+                                cpu_to_le16(num_buckets);
+                }
                ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
                ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6866,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
                }
                ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
                ocfs2_xattr_bucket_relse(args->old_bucket);
                ocfs2_xattr_bucket_relse(args->new_bucket);
        }
@@ -6874,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
        ocfs2_xattr_bucket_relse(args->new_bucket);
        return ret;
 }
+/*
+ * Reflink a run of xattr buckets starting at 'blkno' into the new inode's
+ * xattr tree 'et'.
+ *
+ * The number of buckets to copy is read from xh_num_buckets in the first old
+ * bucket.  Clusters are then claimed from 'data_ac' one request at a time;
+ * for each claimed run we copy as many buckets as fit in it (at most
+ * bpc * num_clusters) with ocfs2_reflink_xattr_bucket() and insert the run
+ * into 'et'.  The first inserted extent uses the caller's 'cpos'; later
+ * extents use the cpos reported back by ocfs2_reflink_xattr_bucket().
+ *
+ * The loop ends when either 'len' clusters or all buckets have been handled.
+ * Returns 0 on success or the error code of the first helper that failed.
+ */
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+                                struct inode *inode,
+                                struct ocfs2_reflink_xattr_tree_args *args,
+                                struct ocfs2_extent_tree *et,
+                                struct ocfs2_alloc_context *meta_ac,
+                                struct ocfs2_alloc_context *data_ac,
+                                u64 blkno, u32 cpos, u32 len)
+{
+        int ret, first_inserted = 0;
+        u32 p_cluster, num_clusters, reflink_cpos = 0;
+        u64 new_blkno;
+        unsigned int num_buckets, reflink_buckets;
+        unsigned int bpc =
+                ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+        /* The bucket count for the whole run is kept in its first bucket. */
+        ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+        ocfs2_xattr_bucket_relse(args->old_bucket);
+        while (len && num_buckets) {
+                ret = ocfs2_claim_clusters(handle, data_ac,
+                                           1, &p_cluster, &num_clusters);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+                /* Never copy more buckets than the claimed clusters hold. */
+                reflink_buckets = min(num_buckets, bpc * num_clusters);
+                ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+                                                 new_blkno, num_clusters,
+                                                 &reflink_cpos, reflink_buckets,
+                                                 meta_ac, data_ac, args);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                /*
+                 * For the 1st allocated cluster, we make it use the same cpos
+                 * so that the xattr tree looks the same as the original one
+                 * in most cases.
+                 */
+                if (!first_inserted) {
+                        reflink_cpos = cpos;
+                        first_inserted = 1;
+                }
+                ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+                                          num_clusters, 0, meta_ac);
+                if (ret) {
+                        /*
+                         * Stop here: carrying on would claim and copy more
+                         * clusters, and a later successful iteration would
+                         * overwrite 'ret' and hide this failure.
+                         */
+                        mlog_errno(ret);
+                        goto out;
+                }
+                mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+                     (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+                len -= num_clusters;
+                blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+                num_buckets -= reflink_buckets;
+        }
+out:
+        return ret;
+}
 /*
 * Create the same xattr extent record in the new inode's xattr tree.
 */
@@ -6885,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
                                   void *para)
 {
        int ret, credits = 0;
-        u32 p_cluster, num_clusters;
-        u64 new_blkno;
        handle_t *handle;
        struct ocfs2_reflink_xattr_tree_args *args =
                        (struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6895,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
        struct ocfs2_alloc_context *data_ac = NULL;
        struct ocfs2_extent_tree et;
+        mlog(0, "reflink xattr buckets %llu len %u\n",
+             (unsigned long long)blkno, len);
        ocfs2_init_xattr_tree_extent_tree(&et,
                                          INODE_CACHE(args->reflink->new_inode),
                                          args->new_blk_bh);
@@ -6914,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
                goto out;
        }
-        ret = ocfs2_claim_clusters(handle, data_ac,
+        ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
-                                   len, &p_cluster, &num_clusters);
+                                          meta_ac, data_ac,
-        if (ret) {
+                                          blkno, cpos, len);
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-        mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
-             (unsigned long long)blkno, (unsigned long long)new_blkno, len);
-        ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
-                                          meta_ac, data_ac, args);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
-             (unsigned long long)new_blkno, len, cpos);
-        ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
-                                  len, 0, meta_ac);
        if (ret)
                mlog_errno(ret);
-out_commit:
        ocfs2_commit_trans(osb, handle);
 out:
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 3e73de5967ff..fc8497643fd0 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state)
        } *label;
        unsigned char *data;
        Sector sect;
+        sector_t labelsect;
        res = 0;
        blocksize = bdev_logical_block_size(bdev);
@@ -98,10 +99,19 @@ int ibm_partition(struct parsed_partitions *state)
                goto out_freeall;
        /*
+         * Special case for FBA disks: label sector does not depend on
+         * blocksize.
+         */
+        if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+            (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+                labelsect = info->label_block;
+        else
+                labelsect = info->label_block * (blocksize >> 9);
+        /*
         * Get volume label, extract name and type.
         */
-        data = read_part_sector(state, info->label_block*(blocksize/512),
+        data = read_part_sector(state, labelsect, &sect);
-                                &sect);
        if (data == NULL)
                goto out_readerr;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 12c233da1b6b..437d2ca2de97 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -676,7 +676,7 @@ static void prune_dqcache(int count)
 * This is called from kswapd when we think we need some
 * more memory
 */
-static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        if (nr) {
                spin_lock(&dq_list_lock);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index f71246bebfe4..a7ac78f8e67a 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
        struct sysfs_dirent *target_sd = NULL;
        struct sysfs_dirent *sd = NULL;
        struct sysfs_addrm_cxt acxt;
+        enum kobj_ns_type ns_type;
        int error;
        BUG_ON(!name);
@@ -58,16 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
        if (!sd)
                goto out_put;
-        if (sysfs_ns_type(parent_sd))
+        ns_type = sysfs_ns_type(parent_sd);
+        if (ns_type)
                sd->s_ns = target->ktype->namespace(target);
        sd->s_symlink.target_sd = target_sd;
        target_sd = NULL;       /* reference is now owned by the symlink */
        sysfs_addrm_start(&acxt, parent_sd);
-        if (warn)
+        /* Symlinks must be between directories with the same ns_type */
-                error = sysfs_add_one(&acxt, sd);
+        if (!ns_type ||
-        else
+            (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
-                error = __sysfs_add_one(&acxt, sd);
+                if (warn)
+                        error = sysfs_add_one(&acxt, sd);
+                else
+                        error = __sysfs_add_one(&acxt, sd);
+        } else {
+                error = -EINVAL;
+                WARN(1, KERN_WARNING
+                        "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
+                        parent_sd->s_name,
+                        sd->s_name,
+                        sd->s_symlink.target_sd->s_parent->s_name,
+                        sd->s_symlink.target_sd->s_name);
+        }
        sysfs_addrm_finish(&acxt);
        if (error)
@@ -122,7 +136,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
 {
        const void *ns = NULL;
        spin_lock(&sysfs_assoc_lock);
-        if (targ->sd)
+        if (targ->sd && sysfs_ns_type(kobj->sd))
                ns = targ->sd->s_ns;
        spin_unlock(&sysfs_assoc_lock);
        sysfs_hash_and_remove(kobj->sd, ns, name);
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 02feb59cefca..0b201114a5ad 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,7 +277,7 @@ static int kick_a_thread(void)
        return 0;
 }
-int ubifs_shrinker(int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        int freed, contention = 0;
        long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 2eef553d50c8..04310878f449 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 /* shrinker.c */
-int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 /* commit.c */
 int ubifs_bg_thread(void *info);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 649ade8ef598..2ee3f7a60163 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -45,7 +45,7 @@
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(int, gfp_t);
+STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
 static struct shrinker xfs_buf_shake = {
        .shrink = xfsbufd_wakeup,
@@ -340,7 +340,7 @@ _xfs_buf_lookup_pages(
                                        __func__, gfp_mask);
                        XFS_STATS_INC(xb_page_retries);
-                        xfsbufd_wakeup(0, gfp_mask);
+                        xfsbufd_wakeup(NULL, 0, gfp_mask);
                        congestion_wait(BLK_RW_ASYNC, HZ/50);
                        goto retry;
                }
@@ -1762,6 +1762,7 @@ xfs_buf_runall_queues(
 STATIC int
 xfsbufd_wakeup(
+        struct shrinker         *shrink,
        int                     priority,
        gfp_t                   mask)
 {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f2d1718c9165..80938c736c27 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1883,7 +1883,6 @@ init_xfs_fs(void)
                goto out_cleanup_procfs;
        vfs_initquota();
-        xfs_inode_shrinker_init();
        error = register_filesystem(&xfs_fs_type);
        if (error)
@@ -1911,7 +1910,6 @@ exit_xfs_fs(void)
 {
        vfs_exitquota();
        unregister_filesystem(&xfs_fs_type);
-        xfs_inode_shrinker_destroy();
        xfs_sysctl_unregister();
        xfs_cleanup_procfs();
        xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index ef7f0218bccb..a51a07c3a70c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -144,6 +144,41 @@ restart:
        return last_error;
 }
+/*
+ * Hand back the next per-ag structure for an inode cache walk and advance
+ * *first beyond it. A reclaim walk only visits AGs whose entry in the perag
+ * radix tree carries the reclaim tag; any other walk takes the AG at *first
+ * and steps forward by one.
+ */
+static struct xfs_perag *
+xfs_inode_ag_iter_next_pag(
+        struct xfs_mount        *mp,
+        xfs_agnumber_t          *first,
+        int                     tag)
+{
+        struct xfs_perag        *pag = NULL;
+        int                     nr_found;
+        int                     refcount;
+        if (tag != XFS_ICI_RECLAIM_TAG) {
+                /* untagged walk: look up the AG at *first, then move on */
+                pag = xfs_perag_get(mp, *first);
+                *first += 1;
+                return pag;
+        }
+        spin_lock(&mp->m_perag_lock);
+        nr_found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+                        (void **)&pag, *first, 1, tag);
+        if (nr_found <= 0) {
+                spin_unlock(&mp->m_perag_lock);
+                return NULL;
+        }
+        /* the next lookup resumes just past the AG we found */
+        *first = pag->pag_agno + 1;
+        /* open coded pag reference increment, taken under m_perag_lock */
+        refcount = atomic_inc_return(&pag->pag_ref);
+        spin_unlock(&mp->m_perag_lock);
+        trace_xfs_perag_get_reclaim(mp, pag->pag_agno, refcount, _RET_IP_);
+        return pag;
+}
 int
 xfs_inode_ag_iterator(
        struct xfs_mount        *mp,
@@ -154,16 +189,15 @@ xfs_inode_ag_iterator(
        int                     exclusive,
        int                     *nr_to_scan)
 {
+        struct xfs_perag        *pag;
        int                     error = 0;
        int                     last_error = 0;
        xfs_agnumber_t          ag;
        int                     nr;
        nr = nr_to_scan ? *nr_to_scan : INT_MAX;
-        for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+        ag = 0;
-                struct xfs_perag        *pag;
+        while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
-                pag = xfs_perag_get(mp, ag);
                error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
                                                exclusive, &nr);
                xfs_perag_put(pag);
@@ -640,6 +674,17 @@ __xfs_inode_set_reclaim_tag(
        radix_tree_tag_set(&pag->pag_ici_root,
                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
                           XFS_ICI_RECLAIM_TAG);
+        if (!pag->pag_ici_reclaimable) {
+                /* propagate the reclaim tag up into the perag radix tree */
+                spin_lock(&ip->i_mount->m_perag_lock);
+                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                                XFS_ICI_RECLAIM_TAG);
+                spin_unlock(&ip->i_mount->m_perag_lock);
+                trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+                                                        -1, _RET_IP_);
+        }
        pag->pag_ici_reclaimable++;
 }
@@ -674,6 +719,16 @@ __xfs_inode_clear_reclaim_tag(
        radix_tree_tag_clear(&pag->pag_ici_root,
                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
        pag->pag_ici_reclaimable--;
+        if (!pag->pag_ici_reclaimable) {
+                /* clear the reclaim tag from the perag radix tree */
+                spin_lock(&ip->i_mount->m_perag_lock);
+                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                                XFS_ICI_RECLAIM_TAG);
+                spin_unlock(&ip->i_mount->m_perag_lock);
+                trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+                                                        -1, _RET_IP_);
+        }
 }
 /*
@@ -828,83 +883,52 @@ xfs_reclaim_inodes(
 /*
 * Shrinker infrastructure.
- *
- * This is all far more complex than it needs to be. It adds a global list of
- * mounts because the shrinkers can only call a global context. We need to make
- * the shrinkers pass a context to avoid the need for global state.
 */
-static LIST_HEAD(xfs_mount_list);
-static struct rw_semaphore xfs_mount_list_lock;
 static int
 xfs_reclaim_inode_shrink(
+        struct shrinker *shrink,
        int             nr_to_scan,
        gfp_t           gfp_mask)
 {
        struct xfs_mount *mp;
        struct xfs_perag *pag;
        xfs_agnumber_t  ag;
-        int             reclaimable = 0;
+        int             reclaimable;
+        mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
        if (nr_to_scan) {
                if (!(gfp_mask & __GFP_FS))
                        return -1;
-                down_read(&xfs_mount_list_lock);
+                xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
-                list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
-                        xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
                                        XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
-                        if (nr_to_scan <= 0)
+                /* if we don't exhaust the scan, don't bother coming back */
-                                break;
+                if (nr_to_scan > 0)
-                }
+                        return -1;
-                up_read(&xfs_mount_list_lock);
+       }
-        }
-        down_read(&xfs_mount_list_lock);
+        reclaimable = 0;
-        list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+        ag = 0;
-                for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+        while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
-                        pag = xfs_perag_get(mp, ag);
+                                        XFS_ICI_RECLAIM_TAG))) {
-                        reclaimable += pag->pag_ici_reclaimable;
+                reclaimable += pag->pag_ici_reclaimable;
-                        xfs_perag_put(pag);
+                xfs_perag_put(pag);
-                }
        }
-        up_read(&xfs_mount_list_lock);
        return reclaimable;
 }
-static struct shrinker xfs_inode_shrinker = {
-        .shrink = xfs_reclaim_inode_shrink,
-        .seeks = DEFAULT_SEEKS,
-};
-void __init
-xfs_inode_shrinker_init(void)
-{
-        init_rwsem(&xfs_mount_list_lock);
-        register_shrinker(&xfs_inode_shrinker);
-}
-void
-xfs_inode_shrinker_destroy(void)
-{
-        ASSERT(list_empty(&xfs_mount_list));
-        unregister_shrinker(&xfs_inode_shrinker);
-}
 void
 xfs_inode_shrinker_register(
        struct xfs_mount        *mp)
 {
-        down_write(&xfs_mount_list_lock);
+        mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
-        list_add_tail(&mp->m_mplist, &xfs_mount_list);
+        mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
-        up_write(&xfs_mount_list_lock);
+        register_shrinker(&mp->m_inode_shrink);
 }
 void
 xfs_inode_shrinker_unregister(
        struct xfs_mount        *mp)
 {
-        down_write(&xfs_mount_list_lock);
+        unregister_shrinker(&mp->m_inode_shrink);
-        list_del(&mp->m_mplist);
-        up_write(&xfs_mount_list_lock);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index cdcbaaca9880..e28139aaa4aa 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -55,8 +55,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp,
        int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
        int flags, int tag, int write_lock, int *nr_to_scan);
-void xfs_inode_shrinker_init(void);
-void xfs_inode_shrinker_destroy(void);
 void xfs_inode_shrinker_register(struct xfs_mount *mp);
 void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 73d5aa117384..302820690904 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,10 @@ DEFINE_EVENT(xfs_perag_class, name,	\
                 unsigned long caller_ip),                                      \
        TP_ARGS(mp, agno, refcount, caller_ip))
 DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
 TRACE_EVENT(xfs_attr_list_node_descend,
        TP_PROTO(struct xfs_attr_list_context *ctx,
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8c117ff2e3ab..67c018392d62 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -69,7 +69,7 @@ STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 STATIC int      xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int      xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int      xfs_qm_shake(int, gfp_t);
+STATIC int      xfs_qm_shake(struct shrinker *, int, gfp_t);
 static struct shrinker xfs_qm_shaker = {
        .shrink = xfs_qm_shake,
@@ -2117,7 +2117,10 @@ xfs_qm_shake_freelist(
 */
 /* ARGSUSED */
 STATIC int
-xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
+xfs_qm_shake(
+        struct shrinker *shrink,
+        int             nr_to_scan,
+        gfp_t           gfp_mask)
 {
        int     ndqused, nfree, n;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d2c7eed4eda..5761087ee8ea 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,7 +259,7 @@ typedef struct xfs_mount {
        wait_queue_head_t       m_wait_single_sync_task;
        __int64_t               m_update_flags; /* sb flags we need to update
                                                   on the next remount,rw */
-        struct list_head        m_mplist;       /* inode shrinker mount list */
+        struct shrinker         m_inode_shrink; /* inode reclaim shrinker */
 } xfs_mount_t;
 /*