115 files changed, 1740 insertions, 996 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 358563689064..6406f896bf95 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -242,7 +242,8 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
        }
        kfree(wnames);
 fid_out:
-        v9fs_fid_add(dentry, fid);
+        if (!IS_ERR(fid))
+                v9fs_fid_add(dentry, fid);
 err_out:
        up_read(&v9ses->rename_sem);
        return fid;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 16c8a2a98c1b..899f168fd19c 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -292,9 +292,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
        fid = filp->private_data;
        P9_DPRINTK(P9_DEBUG_VFS,
-                        "inode: %p filp: %p fid: %d\n", inode, filp, fid->fid);
+                        "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
+                        inode, filp, fid ? fid->fid : -1);
        filemap_write_and_wait(inode->i_mapping);
-        p9_client_clunk(fid);
+        if (fid)
+                p9_client_clunk(fid);
        return 0;
 }
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c7c23eab9440..9e670d527646 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -730,7 +730,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
                goto error;
        }
-        dentry->d_op = &v9fs_cached_dentry_operations;
+        if (v9ses->cache)
+                dentry->d_op = &v9fs_cached_dentry_operations;
+        else
+                dentry->d_op = &v9fs_dentry_operations;
        d_instantiate(dentry, inode);
        err = v9fs_fid_add(dentry, fid);
        if (err < 0)
@@ -1128,6 +1131,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
                generic_fillattr(dentry->d_inode, stat);
+        p9stat_free(st);
        kfree(st);
        return 0;
 }
@@ -1489,6 +1493,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
        retval = strnlen(buffer, buflen);
 done:
+        p9stat_free(st);
        kfree(st);
        return retval;
 }
@@ -1942,7 +1947,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .unlink = v9fs_vfs_unlink,
        .mkdir = v9fs_vfs_mkdir,
        .rmdir = v9fs_vfs_rmdir,
-        .mknod = v9fs_vfs_mknod_dotl,
+        .mknod = v9fs_vfs_mknod,
        .rename = v9fs_vfs_rename,
        .getattr = v9fs_vfs_getattr,
        .setattr = v9fs_vfs_setattr,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f9311077de68..1d12ba0ed3db 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -122,6 +122,10 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
        fid = v9fs_session_init(v9ses, dev_name, data);
        if (IS_ERR(fid)) {
                retval = PTR_ERR(fid);
+                /*
+                 * we need to call session_close to tear down some
+                 * of the data structures set up by session_init
+                 */
                goto close_session;
        }
@@ -144,7 +148,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
                retval = -ENOMEM;
                goto release_sb;
        }
        sb->s_root = root;
        if (v9fs_proto_dotl(v9ses)) {
@@ -152,7 +155,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
                if (IS_ERR(st)) {
                        retval = PTR_ERR(st);
-                        goto clunk_fid;
+                        goto release_sb;
                }
                v9fs_stat2inode_dotl(st, root->d_inode);
@@ -162,7 +165,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
                st = p9_client_stat(fid);
                if (IS_ERR(st)) {
                        retval = PTR_ERR(st);
-                        goto clunk_fid;
+                        goto release_sb;
                }
                root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
@@ -174,19 +177,24 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
        v9fs_fid_add(root, fid);
-P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
+        P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
        simple_set_mnt(mnt, sb);
        return 0;
 clunk_fid:
        p9_client_clunk(fid);
 close_session:
        v9fs_session_close(v9ses);
        kfree(v9ses);
        return retval;
 release_sb:
+        /*
+         * we will do the session_close and root dentry release
+         * in the call below. But we need to clunk the fid, because we
+         * haven't attached it to the dentry, so it won't get clunked
+         * automatically.
+         */
+        p9_client_clunk(fid);
        deactivate_locked_super(sb);
        return retval;
 }
diff --git a/fs/aio.c b/fs/aio.c
index 3006b5bc33d6..250b0a73c8a8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
         */
        ret = retry(iocb);
-        if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
+        if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
+                /*
+                 * There's no easy way to restart the syscall since other AIOs
+                 * may already be running. Just fail this IO with EINTR.
+                 */
+                if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+                             ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
+                        ret = -EINTR;
                aio_complete(iocb, ret, 0);
+        }
 out:
        spin_lock_irq(&ctx->ctx_lock);
@@ -1659,6 +1667,9 @@ long do_io_submit(aio_context_t ctx_id, long nr,
        if (unlikely(nr < 0))
                return -EINVAL;
+        if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
+                nr = LONG_MAX/sizeof(*iocbpp);
        if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
                return -EFAULT;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f96eff04e11a..a6395bdb26ae 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
                if (!dump_write(file, dump_start, dump_size))
                        goto end_coredump;
        }
-/* Finally dump the task struct.  Not be used by gdb, but could be useful */
-        set_fs(KERNEL_DS);
-        if (!dump_write(file, current, sizeof(*current)))
-                goto end_coredump;
 end_coredump:
        set_fs(fs);
        return has_dumped;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index a7528b913936..fd0cc0bf9a40 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -724,7 +724,7 @@ static int __init init_misc_binfmt(void)
 {
        int err = register_filesystem(&bm_fs_type);
        if (!err) {
-                err = register_binfmt(&misc_format);
+                err = insert_binfmt(&misc_format);
                if (err)
                        unregister_filesystem(&bm_fs_type);
        }
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 612a5c38d3c1..4d0ff5ee27b8 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -413,10 +413,10 @@ int bio_integrity_prep(struct bio *bio)
        /* Allocate kernel buffer for protection data */
        len = sectors * blk_integrity_tuple_size(bi);
-        buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+        buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
        if (unlikely(buf == NULL)) {
                printk(KERN_ERR "could not allocate integrity buffer\n");
-                return -EIO;
+                return -ENOMEM;
        }
        end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index bc87b9c1d27e..0fcd2640c23f 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -3,6 +3,7 @@ config CEPH_FS
        depends on INET && EXPERIMENTAL
        select LIBCRC32C
        select CRYPTO_AES
+        select CRYPTO
        help
          Choose Y or M here to include support for mounting the
          experimental Ceph distributed file system.  Ceph is an extremely
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5598a0d02295..efbc604001c8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
        /* dirty the head */
        spin_lock(&inode->i_lock);
-        if (ci->i_wrbuffer_ref_head == 0)
+        if (ci->i_head_snapc == NULL)
                ci->i_head_snapc = ceph_get_snap_context(snapc);
        ++ci->i_wrbuffer_ref_head;
        if (ci->i_wrbuffer_ref == 0)
@@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page)
        spin_lock_irq(&mapping->tree_lock);
        if (page->mapping) {    /* Race with truncate? */
                WARN_ON_ONCE(!PageUptodate(page));
+                account_page_dirtied(page, page->mapping);
-                if (mapping_cap_account_dirty(mapping)) {
-                        __inc_zone_page_state(page, NR_FILE_DIRTY);
-                        __inc_bdi_stat(mapping->backing_dev_info,
-                                        BDI_RECLAIMABLE);
-                        task_io_account_write(PAGE_CACHE_SIZE);
-                }
                radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
@@ -352,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
                        break;
                }
        }
-        if (!snapc && ci->i_head_snapc) {
+        if (!snapc && ci->i_wrbuffer_ref_head) {
                snapc = ceph_get_snap_context(ci->i_head_snapc);
                dout(" head snapc %p has %d dirty pages\n",
                     snapc, ci->i_wrbuffer_ref_head);
@@ -417,8 +411,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        if (i_size < page_off + len)
                len = i_size - page_off;
-        dout("writepage %p page %p index %lu on %llu~%u\n",
+        dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
-             inode, page, page->index, page_off, len);
+             inode, page, page->index, page_off, len, snapc);
        writeback_stat = atomic_long_inc_return(&client->writeback_count);
        if (writeback_stat >
@@ -772,7 +766,8 @@ get_more_pages:
                        /* ok */
                        if (locked_pages == 0) {
                                /* prepare async write request */
-                                offset = page->index << PAGE_CACHE_SHIFT;
+                                offset = (unsigned long long)page->index
+                                        << PAGE_CACHE_SHIFT;
                                len = wsize;
                                req = ceph_osdc_new_request(&client->osdc,
                                            &ci->i_layout,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 582e0b2caf8a..a2d002cbdec2 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -376,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
                th = get_ticket_handler(ac, service);
-                if (!th) {
+                if (IS_ERR(th)) {
                        *pneed |= service;
                        continue;
                }
@@ -399,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
        struct ceph_x_ticket_handler *th =
                get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+        if (IS_ERR(th))
+                return PTR_ERR(th);
        ceph_x_validate_tickets(ac, &need);
        dout("build_request want %x have %x need %x\n",
@@ -450,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
                        return -ERANGE;
                head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-                BUG_ON(!th);
                ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
                if (ret)
                        return ret;
@@ -505,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
        case CEPHX_GET_PRINCIPAL_SESSION_KEY:
                th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-                BUG_ON(!th);
+                if (IS_ERR(th))
+                        return PTR_ERR(th);
                ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
                                               buf + sizeof(*head), end);
                break;
@@ -563,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
        void *end = p + sizeof(au->reply_buf);
        th = get_ticket_handler(ac, au->service);
-        if (!th)
+        if (IS_ERR(th))
-                return -EIO;  /* hrm! */
+                return PTR_ERR(th);
        ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
        if (ret < 0)
                return ret;
@@ -626,7 +629,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
        struct ceph_x_ticket_handler *th;
        th = get_ticket_handler(ac, peer_type);
-        if (th && !IS_ERR(th))
+        if (!IS_ERR(th))
                remove_ticket_handler(ac, th);
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7bf182b03973..5e9da996a151 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -814,7 +814,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
                used |= CEPH_CAP_PIN;
        if (ci->i_rd_ref)
                used |= CEPH_CAP_FILE_RD;
-        if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+        if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
                used |= CEPH_CAP_FILE_CACHE;
        if (ci->i_wr_ref)
                used |= CEPH_CAP_FILE_WR;
@@ -1082,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        gid_t gid;
        struct ceph_mds_session *session;
        u64 xattr_version = 0;
+        struct ceph_buffer *xattr_blob = NULL;
        int delayed = 0;
        u64 flush_tid = 0;
        int i;
@@ -1142,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
                for (i = 0; i < CEPH_CAP_BITS; i++)
                        if (flushing & (1 << i))
                                ci->i_cap_flush_tid[i] = flush_tid;
+                follows = ci->i_head_snapc->seq;
+        } else {
+                follows = 0;
        }
        keep = cap->implemented;
@@ -1155,14 +1160,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        mtime = inode->i_mtime;
        atime = inode->i_atime;
        time_warp_seq = ci->i_time_warp_seq;
-        follows = ci->i_snap_realm->cached_context->seq;
        uid = inode->i_uid;
        gid = inode->i_gid;
        mode = inode->i_mode;
-        if (dropping & CEPH_CAP_XATTR_EXCL) {
+        if (flushing & CEPH_CAP_XATTR_EXCL) {
                __ceph_build_xattrs_blob(ci);
-                xattr_version = ci->i_xattrs.version + 1;
+                xattr_blob = ci->i_xattrs.blob;
+                xattr_version = ci->i_xattrs.version;
        }
        spin_unlock(&inode->i_lock);
@@ -1170,9 +1175,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
                op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
                size, max_size, &mtime, &atime, time_warp_seq,
-                uid, gid, mode,
+                uid, gid, mode, xattr_version, xattr_blob,
-                xattr_version,
-                (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
                follows);
        if (ret < 0) {
                dout("error sending cap msg, must requeue %p\n", inode);
@@ -1192,10 +1195,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 * asynchronously back to the MDS once sync writes complete and dirty
 * data is written out.
 *
+ * Unless @again is true, skip cap_snaps that were already sent to
+ * the MDS (i.e., during this session).
+ *
 * Called under i_lock.  Takes s_mutex as needed.
 */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
-                        struct ceph_mds_session **psession)
+                        struct ceph_mds_session **psession,
+                        int again)
                __releases(ci->vfs_inode->i_lock)
                __acquires(ci->vfs_inode->i_lock)
 {
@@ -1224,7 +1231,7 @@ retry:
                 * pages to be written out.
                 */
                if (capsnap->dirty_pages || capsnap->writing)
-                        continue;
+                        break;
                /*
                 * if cap writeback already occurred, we should have dropped
@@ -1237,6 +1244,13 @@ retry:
                        dout("no auth cap (migrating?), doing nothing\n");
                        goto out;
                }
+                /* only flush each capsnap once */
+                if (!again && !list_empty(&capsnap->flushing_item)) {
+                        dout("already flushed %p, skipping\n", capsnap);
+                        continue;
+                }
                mds = ci->i_auth_cap->session->s_mds;
                mseq = ci->i_auth_cap->mseq;
@@ -1273,8 +1287,8 @@ retry:
                              &session->s_cap_snaps_flushing);
                spin_unlock(&inode->i_lock);
-                dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
+                dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
-                     inode, capsnap, next_follows, capsnap->size);
+                     inode, capsnap, capsnap->follows, capsnap->flush_tid);
                send_cap_msg(session, ceph_vino(inode).ino, 0,
                             CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
                             capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
@@ -1282,7 +1296,7 @@ retry:
                             &capsnap->mtime, &capsnap->atime,
                             capsnap->time_warp_seq,
                             capsnap->uid, capsnap->gid, capsnap->mode,
-                             0, NULL,
+                             capsnap->xattr_version, capsnap->xattr_blob,
                             capsnap->follows);
                next_follows = capsnap->follows + 1;
@@ -1311,7 +1325,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
        struct inode *inode = &ci->vfs_inode;
        spin_lock(&inode->i_lock);
-        __ceph_flush_snaps(ci, NULL);
+        __ceph_flush_snaps(ci, NULL, 0);
        spin_unlock(&inode->i_lock);
 }
@@ -1332,7 +1346,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
             ceph_cap_string(was | mask));
        ci->i_dirty_caps |= mask;
        if (was == 0) {
-                dout(" inode %p now dirty\n", &ci->vfs_inode);
+                if (!ci->i_head_snapc)
+                        ci->i_head_snapc = ceph_get_snap_context(
+                                ci->i_snap_realm->cached_context);
+                dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
+                        ci->i_head_snapc);
                BUG_ON(!list_empty(&ci->i_dirty_item));
                spin_lock(&mdsc->cap_dirty_lock);
                list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1470,7 +1488,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
        /* flush snaps first time around only */
        if (!list_empty(&ci->i_cap_snaps))
-                __ceph_flush_snaps(ci, &session);
+                __ceph_flush_snaps(ci, &session, 0);
        goto retry_locked;
 retry:
        spin_lock(&inode->i_lock);
@@ -1887,7 +1905,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
                if (cap && cap->session == session) {
                        dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
                             cap, capsnap);
-                        __ceph_flush_snaps(ci, &session);
+                        __ceph_flush_snaps(ci, &session, 1);
                } else {
                        pr_err("%p auth cap %p not mds%d ???\n", inode,
                               cap, session->s_mds);
@@ -2190,7 +2208,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
        if (ci->i_head_snapc == snapc) {
                ci->i_wrbuffer_ref_head -= nr;
-                if (!ci->i_wrbuffer_ref_head) {
+                if (ci->i_wrbuffer_ref_head == 0 &&
+                    ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+                        BUG_ON(!ci->i_head_snapc);
                        ceph_put_snap_context(ci->i_head_snapc);
                        ci->i_head_snapc = NULL;
                }
@@ -2263,7 +2283,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int mds = session->s_mds;
-        int seq = le32_to_cpu(grant->seq);
+        unsigned seq = le32_to_cpu(grant->seq);
+        unsigned issue_seq = le32_to_cpu(grant->issue_seq);
        int newcaps = le32_to_cpu(grant->caps);
        int issued, implemented, used, wanted, dirty;
        u64 size = le64_to_cpu(grant->size);
@@ -2275,8 +2296,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        int revoked_rdcache = 0;
        int queue_invalidate = 0;
-        dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+        dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
-             inode, cap, mds, seq, ceph_cap_string(newcaps));
+             inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
        dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
                inode->i_size);
@@ -2372,6 +2393,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        }
        cap->seq = seq;
+        cap->issue_seq = issue_seq;
        /* file layout may have changed */
        ci->i_layout = grant->layout;
@@ -2483,6 +2505,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                        dout(" inode %p now clean\n", inode);
                        BUG_ON(!list_empty(&ci->i_dirty_item));
                        drop = 1;
+                        if (ci->i_wrbuffer_ref_head == 0) {
+                                BUG_ON(!ci->i_head_snapc);
+                                ceph_put_snap_context(ci->i_head_snapc);
+                                ci->i_head_snapc = NULL;
+                        }
                } else {
                        BUG_ON(list_empty(&ci->i_dirty_item));
                }
@@ -2749,15 +2776,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                if (op == CEPH_CAP_OP_IMPORT)
                        __queue_cap_release(session, vino.ino, cap_id,
                                            mseq, seq);
+                goto flush_cap_releases;
-                /*
-                 * send any full release message to try to move things
-                 * along for the mds (who clearly thinks we still have this
-                 * cap).
-                 */
-                ceph_add_cap_releases(mdsc, session);
-                ceph_send_cap_releases(mdsc, session);
-                goto done;
        }
        /* these will work even if we don't have a cap yet */
@@ -2785,7 +2804,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                dout(" no cap on %p ino %llx.%llx from mds%d\n",
                     inode, ceph_ino(inode), ceph_snap(inode), mds);
                spin_unlock(&inode->i_lock);
-                goto done;
+                goto flush_cap_releases;
        }
        /* note that each of these drops i_lock for us */
@@ -2809,6 +2828,17 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                       ceph_cap_op_name(op));
        }
+        goto done;
+flush_cap_releases:
+        /*
+         * send any full release message to try to move things
+         * along for the mds (who clearly thinks we still have this
+         * cap).
+         */
+        ceph_add_cap_releases(mdsc, session);
+        ceph_send_cap_releases(mdsc, session);
 done:
        mutex_unlock(&session->s_mutex);
 done_unlocked:
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 360c4f22718d..6fd8b20a8611 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -171,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p)
                } else if (req->r_dentry) {
                        path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
                                                    &pathbase, 0);
+                        if (IS_ERR(path))
+                                path = NULL;
                        spin_lock(&req->r_dentry->d_lock);
                        seq_printf(s, " #%llx/%.*s (%s)",
                                   ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -187,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p)
                if (req->r_old_dentry) {
                        path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
                                                    &pathbase, 0);
+                        if (IS_ERR(path))
+                                path = NULL;
                        spin_lock(&req->r_old_dentry->d_lock);
                        seq_printf(s, " #%llx/%.*s (%s)",
                           ceph_ino(req->r_old_dentry->d_parent->d_inode),
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 67bbb41d5526..a1986eb52045 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -46,7 +46,7 @@ int ceph_init_dentry(struct dentry *dentry)
        else
                dentry->d_op = &ceph_snap_dentry_ops;
-        di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+        di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
        if (!di)
                return -ENOMEM;          /* oh well */
@@ -1021,11 +1021,15 @@ out_touch:
 static void ceph_dentry_release(struct dentry *dentry)
 {
        struct ceph_dentry_info *di = ceph_dentry(dentry);
-        struct inode *parent_inode = dentry->d_parent->d_inode;
+        struct inode *parent_inode = NULL;
-        u64 snapid = ceph_snap(parent_inode);
+        u64 snapid = CEPH_NOSNAP;
+        if (!IS_ROOT(dentry)) {
+                parent_inode = dentry->d_parent->d_inode;
+                if (parent_inode)
+                        snapid = ceph_snap(parent_inode);
+        }
        dout("dentry_release %p parent %p\n", dentry, parent_inode);
        if (parent_inode && snapid != CEPH_SNAPDIR) {
                struct ceph_inode_info *ci = ceph_inode(parent_inode);
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 4480cb1c63e7..e38423e82f2e 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -42,32 +42,37 @@ struct ceph_nfs_confh {
 static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
                          int connectable)
 {
+        int type;
        struct ceph_nfs_fh *fh = (void *)rawfh;
        struct ceph_nfs_confh *cfh = (void *)rawfh;
        struct dentry *parent = dentry->d_parent;
        struct inode *inode = dentry->d_inode;
-        int type;
+        int connected_handle_length = sizeof(*cfh)/4;
+        int handle_length = sizeof(*fh)/4;
        /* don't re-export snaps */
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EINVAL;
-        if (*max_len >= sizeof(*cfh)) {
+        if (*max_len >= connected_handle_length) {
                dout("encode_fh %p connectable\n", dentry);
                cfh->ino = ceph_ino(dentry->d_inode);
                cfh->parent_ino = ceph_ino(parent->d_inode);
                cfh->parent_name_hash = parent->d_name.hash;
-                *max_len = sizeof(*cfh);
+                *max_len = connected_handle_length;
                type = 2;
-        } else if (*max_len > sizeof(*fh)) {
+        } else if (*max_len >= handle_length) {
-                if (connectable)
+                if (connectable) {
-                        return -ENOSPC;
+                        *max_len = connected_handle_length;
+                        return 255;
+                }
                dout("encode_fh %p\n", dentry);
                fh->ino = ceph_ino(dentry->d_inode);
-                *max_len = sizeof(*fh);
+                *max_len = handle_length;
                type = 1;
        } else {
-                return -ENOSPC;
+                *max_len = handle_length;
+                return 255;
        }
        return type;
 }
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8c044a4f0457..66e4da6dba22 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -697,7 +697,7 @@ more:
                         * start_request so that a tid has been assigned.
                         */
                        spin_lock(&ci->i_unsafe_lock);
-                        list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
+                        list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
                        spin_unlock(&ci->i_unsafe_lock);
                        ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
                }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5d893d31e399..62377ec37edf 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,6 +677,7 @@ static int fill_inode(struct inode *inode,
                if (ci->i_files == 0 && ci->i_subdirs == 0 &&
                    ceph_snap(inode) == CEPH_NOSNAP &&
                    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+                    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
                    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
                        dout(" marking %p complete (empty)\n", inode);
                        ci->i_ceph_flags |= CEPH_I_COMPLETE;
@@ -844,7 +845,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 * the caller) if we fail.
 */
 static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
-                                    bool *prehash)
+                                    bool *prehash, bool set_offset)
 {
        struct dentry *realdn;
@@ -876,7 +877,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
        }
        if ((!prehash || *prehash) && d_unhashed(dn))
                d_rehash(dn);
-        ceph_set_dentry_offset(dn);
+        if (set_offset)
+                ceph_set_dentry_offset(dn);
 out:
        return dn;
 }
@@ -1061,7 +1063,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                                d_delete(dn);
                                goto done;
                        }
-                        dn = splice_dentry(dn, in, &have_lease);
+                        dn = splice_dentry(dn, in, &have_lease, true);
                        if (IS_ERR(dn)) {
                                err = PTR_ERR(dn);
                                goto done;
@@ -1104,7 +1106,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                        goto done;
                }
                dout(" linking snapped dir %p to dn %p\n", in, dn);
-                dn = splice_dentry(dn, in, NULL);
+                dn = splice_dentry(dn, in, NULL, true);
                if (IS_ERR(dn)) {
                        err = PTR_ERR(dn);
                        goto done;
@@ -1229,14 +1231,14 @@ retry_lookup:
                        in = dn->d_inode;
                } else {
                        in = ceph_get_inode(parent->d_sb, vino);
-                        if (in == NULL) {
+                        if (IS_ERR(in)) {
                                dout("new_inode badness\n");
                                d_delete(dn);
                                dput(dn);
-                                err = -ENOMEM;
+                                err = PTR_ERR(in);
                                goto out;
                        }
-                        dn = splice_dentry(dn, in, NULL);
+                        dn = splice_dentry(dn, in, NULL, false);
                        if (IS_ERR(dn))
                                dn = NULL;
                }
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae85af06454f..ff4e753aae92 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -82,7 +82,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
                length = fl->fl_end - fl->fl_start + 1;
        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                (u64)fl->fl_pid, (u64)fl->fl_nspid,
+                                (u64)fl->fl_pid,
+                                (u64)(unsigned long)fl->fl_nspid,
                                lock_cmd, fl->fl_start,
                                length, wait);
        if (!err) {
@@ -92,7 +93,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
                        /* undo! This should only happen if the kernel detects
                         * local deadlock. */
                        ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                          (u64)fl->fl_pid, (u64)fl->fl_nspid,
+                                          (u64)fl->fl_pid,
+                                          (u64)(unsigned long)fl->fl_nspid,
                                          CEPH_LOCK_UNLOCK, fl->fl_start,
                                          length, 0);
                        dout("got %d on posix_lock_file, undid lock", err);
@@ -132,7 +134,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
                length = fl->fl_end - fl->fl_start + 1;
        err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
-                                file, (u64)fl->fl_pid, (u64)fl->fl_nspid,
+                                file, (u64)fl->fl_pid,
+                                (u64)(unsigned long)fl->fl_nspid,
                                lock_cmd, fl->fl_start,
                                length, wait);
        if (!err) {
@@ -141,7 +144,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
                        ceph_lock_message(CEPH_LOCK_FLOCK,
                                          CEPH_MDS_OP_SETFILELOCK,
                                          file, (u64)fl->fl_pid,
-                                          (u64)fl->fl_nspid,
+                                          (u64)(unsigned long)fl->fl_nspid,
                                          CEPH_LOCK_UNLOCK, fl->fl_start,
                                          length, 0);
                        dout("got %d on flock_lock_file_wait, undid lock", err);
@@ -235,7 +238,8 @@ int lock_to_ceph_filelock(struct file_lock *lock,
        cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
        cephlock->client = cpu_to_le64(0);
        cephlock->pid = cpu_to_le64(lock->fl_pid);
-        cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid);
+        cephlock->pid_namespace =
+                cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
        switch (lock->fl_type) {
        case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a75ddbf9fe37..fad95f8f2608 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -560,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 *
 * Called under mdsc->mutex.
 */
+/*
+ * Helper for __choose_mds(): walk up from @dentry until we reach a dentry
+ * whose inode is not snapped (ceph_snap() == CEPH_NOSNAP) or the root of
+ * the tree, and return it.  No reference is taken on the result.
+ */
+static struct dentry *get_nonsnap_parent(struct dentry *dentry)
+{
+        while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+                dentry = dentry->d_parent;
+        return dentry;
+}
 static int __choose_mds(struct ceph_mds_client *mdsc,
                        struct ceph_mds_request *req)
 {
@@ -590,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        if (req->r_inode) {
                inode = req->r_inode;
        } else if (req->r_dentry) {
-                if (req->r_dentry->d_inode) {
+                struct inode *dir = req->r_dentry->d_parent->d_inode;
+                if (dir->i_sb != mdsc->client->sb) {
+                        /* not this fs! */
+                        inode = req->r_dentry->d_inode;
+                } else if (ceph_snap(dir) != CEPH_NOSNAP) {
+                        /* direct snapped/virtual snapdir requests
+                         * based on parent dir inode */
+                        struct dentry *dn =
+                                get_nonsnap_parent(req->r_dentry->d_parent);
+                        inode = dn->d_inode;
+                        dout("__choose_mds using nonsnap parent %p\n", inode);
+                } else if (req->r_dentry->d_inode) {
+                        /* dentry target */
                        inode = req->r_dentry->d_inode;
                } else {
-                        inode = req->r_dentry->d_parent->d_inode;
+                        /* dir + name */
+                        inode = dir;
                        hash = req->r_dentry->d_name.hash;
                        is_hash = true;
                }
        }
        dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
             (int)hash, mode);
        if (!inode)
@@ -2208,7 +2230,7 @@ static void handle_session(struct ceph_mds_session *session,
                        pr_info("mds%d reconnect denied\n", session->s_mds);
                remove_session_caps(session);
                wake = 1; /* for good measure */
-                complete_all(&mdsc->session_close_waiters);
+                wake_up_all(&mdsc->session_close_wq);
                kick_requests(mdsc, mds);
                break;
@@ -2302,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
                path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
-                        BUG_ON(err);
+                        goto out_dput;
                }
        } else {
                path = NULL;
@@ -2310,7 +2332,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
        }
        err = ceph_pagelist_encode_string(pagelist, path, pathlen);
        if (err)
-                goto out;
+                goto out_free;
        spin_lock(&inode->i_lock);
        cap->seq = 0;        /* reset cap seq */
@@ -2352,10 +2374,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
                                                num_fcntl_locks,
                                                num_flock_locks);
                unlock_kernel();
+        } else {
+                err = ceph_pagelist_append(pagelist, &rec, reclen);
        }
-out:
+out_free:
        kfree(path);
+out_dput:
        dput(dentry);
        return err;
 }
@@ -2876,7 +2901,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
                return -ENOMEM;
        init_completion(&mdsc->safe_umount_waiters);
-        init_completion(&mdsc->session_close_waiters);
+        init_waitqueue_head(&mdsc->session_close_wq);
        INIT_LIST_HEAD(&mdsc->waiting_for_map);
        mdsc->sessions = NULL;
        mdsc->max_sessions = 0;
@@ -3021,6 +3046,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
        wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
 }
+/*
+ * true if all sessions are closed, or we force unmount
+ *
+ * Used as the wait_event condition in ceph_mdsc_close_sessions().  Takes
+ * mdsc->mutex itself, so it must be called without that mutex held.
+ */
+static bool done_closing_sessions(struct ceph_mds_client *mdsc)
+{
+        bool done = true;
+        int i;
+        if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+                return true;
+        mutex_lock(&mdsc->mutex);
+        for (i = 0; i < mdsc->max_sessions; i++) {
+                if (mdsc->sessions[i]) {
+                        /* one live session is enough to keep waiting */
+                        done = false;
+                        break;
+                }
+        }
+        mutex_unlock(&mdsc->mutex);
+        return done;
+}
 /*
 * called after sb is ro.
@@ -3029,45 +3071,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
        struct ceph_mds_session *session;
        int i;
-        int n;
        struct ceph_client *client = mdsc->client;
-        unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
+        unsigned long timeout = client->mount_args->mount_timeout * HZ;
        dout("close_sessions\n");
-        mutex_lock(&mdsc->mutex);
        /* close sessions */
-        started = jiffies;
+        mutex_lock(&mdsc->mutex);
-        while (time_before(jiffies, started + timeout)) {
+        for (i = 0; i < mdsc->max_sessions; i++) {
-                dout("closing sessions\n");
+                session = __ceph_lookup_mds_session(mdsc, i);
-                n = 0;
+                if (!session)
-                for (i = 0; i < mdsc->max_sessions; i++) {
+                        continue;
-                        session = __ceph_lookup_mds_session(mdsc, i);
-                        if (!session)
-                                continue;
-                        mutex_unlock(&mdsc->mutex);
-                        mutex_lock(&session->s_mutex);
-                        __close_session(mdsc, session);
-                        mutex_unlock(&session->s_mutex);
-                        ceph_put_mds_session(session);
-                        mutex_lock(&mdsc->mutex);
-                        n++;
-                }
-                if (n == 0)
-                        break;
-                if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
-                        break;
-                dout("waiting for sessions to close\n");
                mutex_unlock(&mdsc->mutex);
-                wait_for_completion_timeout(&mdsc->session_close_waiters,
+                mutex_lock(&session->s_mutex);
-                                            timeout);
+                __close_session(mdsc, session);
+                mutex_unlock(&session->s_mutex);
+                ceph_put_mds_session(session);
                mutex_lock(&mdsc->mutex);
        }
+        mutex_unlock(&mdsc->mutex);
+        dout("waiting for sessions to close\n");
+        wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
+                           timeout);
        /* tear down remaining sessions */
+        mutex_lock(&mdsc->mutex);
        for (i = 0; i < mdsc->max_sessions; i++) {
                if (mdsc->sessions[i]) {
                        session = get_session(mdsc->sessions[i]);
@@ -3080,9 +3109,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
                        mutex_lock(&mdsc->mutex);
                }
        }
        WARN_ON(!list_empty(&mdsc->cap_delay_list));
        mutex_unlock(&mdsc->mutex);
        ceph_cleanup_empty_realms(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ab7e89f5e344..c98267ce6d2a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -234,7 +234,8 @@ struct ceph_mds_client {
        struct mutex            mutex;         /* all nested structures */
        struct ceph_mdsmap      *mdsmap;
-        struct completion       safe_umount_waiters, session_close_waiters;
+        struct completion       safe_umount_waiters;
+        wait_queue_head_t       session_close_wq;
        struct list_head        waiting_for_map;
        struct ceph_mds_session **sessions;    /* NULL for mds if no session */
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index bed6391e52c7..3b5571b8ce22 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -549,7 +549,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 */
 static void __cancel_request(struct ceph_osd_request *req)
 {
-        if (req->r_sent) {
+        if (req->r_sent && req->r_osd) {
                ceph_con_revoke(&req->r_osd->o_con, req->r_request);
                req->r_sent = 0;
        }
@@ -661,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc,
        reqhead->reassert_version = req->r_reassert_version;
        req->r_stamp = jiffies;
-        list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
+        list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
        ceph_msg_get(req->r_request); /* send consumes a ref */
        ceph_con_send(&req->r_osd->o_con, req->r_request);
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index b6859f47d364..46a368b6dce5 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -5,10 +5,18 @@
 #include "pagelist.h"
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+        /* kunmap() takes a page: use the last entry on pl->head */
+        struct page *tail = list_entry(pl->head.prev, struct page, lru);
+        kunmap(tail);
+}
 int ceph_pagelist_release(struct ceph_pagelist *pl)
 {
        if (pl->mapped_tail)
-                kunmap(pl->mapped_tail);
+                ceph_pagelist_unmap_tail(pl);
        while (!list_empty(&pl->head)) {
                struct page *page = list_first_entry(&pl->head, struct page,
                                                     lru);
@@ -26,7 +34,8 @@ static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
        pl->room += PAGE_SIZE;
-        list_add_tail(&page->lru, &pl->head);
+        /* unmap the old tail before the new page becomes the list tail */
        if (pl->mapped_tail)
-                kunmap(pl->mapped_tail);
+                ceph_pagelist_unmap_tail(pl);
+        list_add_tail(&page->lru, &pl->head);
        pl->mapped_tail = kmap(page);
        return 0;
 }
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index c0b26b6badba..190b6c4a6f2b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -119,6 +119,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
        INIT_LIST_HEAD(&realm->children);
        INIT_LIST_HEAD(&realm->child_item);
        INIT_LIST_HEAD(&realm->empty_item);
+        INIT_LIST_HEAD(&realm->dirty_item);
        INIT_LIST_HEAD(&realm->inodes_with_caps);
        spin_lock_init(&realm->inodes_with_caps_lock);
        __insert_snap_realm(&mdsc->snap_realms, realm);
@@ -435,7 +436,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
        struct inode *inode = &ci->vfs_inode;
        struct ceph_cap_snap *capsnap;
-        int used;
+        int used, dirty;
        capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
        if (!capsnap) {
@@ -445,6 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
        spin_lock(&inode->i_lock);
        used = __ceph_caps_used(ci);
+        dirty = __ceph_caps_dirty(ci);
        if (__ceph_have_pending_cap_snap(ci)) {
                /* there is no point in queuing multiple "pending" cap_snaps,
                   as no new writes are allowed to start when pending, so any
@@ -452,27 +454,37 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
                   cap_snap.  lucky us. */
                dout("queue_cap_snap %p already pending\n", inode);
                kfree(capsnap);
-        } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+        } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
+                   (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+                             CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
                struct ceph_snap_context *snapc = ci->i_head_snapc;
+                dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
+                     capsnap, snapc);
                igrab(inode);
+                
                atomic_set(&capsnap->nref, 1);
                capsnap->ci = ci;
                INIT_LIST_HEAD(&capsnap->ci_item);
                INIT_LIST_HEAD(&capsnap->flushing_item);
-                capsnap->follows = snapc->seq - 1;
+                capsnap->follows = snapc->seq;
                capsnap->issued = __ceph_caps_issued(ci, NULL);
-                capsnap->dirty = __ceph_caps_dirty(ci);
+                capsnap->dirty = dirty;
                capsnap->mode = inode->i_mode;
                capsnap->uid = inode->i_uid;
                capsnap->gid = inode->i_gid;
-                /* fixme? */
+                if (dirty & CEPH_CAP_XATTR_EXCL) {
-                capsnap->xattr_blob = NULL;
+                        __ceph_build_xattrs_blob(ci);
-                capsnap->xattr_len = 0;
+                        capsnap->xattr_blob =
+                                ceph_buffer_get(ci->i_xattrs.blob);
+                        capsnap->xattr_version = ci->i_xattrs.version;
+                } else {
+                        capsnap->xattr_blob = NULL;
+                        capsnap->xattr_version = 0;
+                }
                /* dirty page count moved from _head to this cap_snap;
                   all subsequent writes page dirties occur _after_ this
@@ -480,7 +492,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
                capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
                ci->i_wrbuffer_ref_head = 0;
                capsnap->context = snapc;
-                ci->i_head_snapc = NULL;
+                ci->i_head_snapc =
+                        ceph_get_snap_context(ci->i_snap_realm->cached_context);
+                dout(" new snapc is %p\n", ci->i_head_snapc);
                list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
                if (used & CEPH_CAP_FILE_WR) {
@@ -539,6 +553,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
        return 1;  /* caller may want to ceph_flush_snaps */
 }
+/*
+ * Queue cap_snaps for snap writeback for this realm and its children.
+ * Called under snap_rwsem, so realm topology won't change.
+ *
+ * Calls ceph_queue_cap_snap() for every inode on realm->inodes_with_caps
+ * for which igrab() returns non-NULL, then recurses into each entry of
+ * realm->children.  inodes_with_caps_lock is released around
+ * ceph_queue_cap_snap() and retaken before the walk continues.  The
+ * reference on the inode just visited is kept across that window and is
+ * only dropped (iput) on the next iteration, or after the loop, always
+ * with the spinlock released.
+ */
+static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+{
+        struct ceph_inode_info *ci;
+        struct inode *lastinode = NULL;
+        struct ceph_snap_realm *child;
+        dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+        spin_lock(&realm->inodes_with_caps_lock);
+        list_for_each_entry(ci, &realm->inodes_with_caps,
+                            i_snap_realm_item) {
+                struct inode *inode = igrab(&ci->vfs_inode);
+                if (!inode)
+                        continue;
+                spin_unlock(&realm->inodes_with_caps_lock);
+                if (lastinode)
+                        iput(lastinode);
+                lastinode = inode;
+                ceph_queue_cap_snap(ci);
+                spin_lock(&realm->inodes_with_caps_lock);
+        }
+        spin_unlock(&realm->inodes_with_caps_lock);
+        if (lastinode)
+                iput(lastinode);
+        dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
+        list_for_each_entry(child, &realm->children, child_item)
+                queue_realm_cap_snaps(child);
+        dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+}
 /*
 * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
@@ -556,6 +605,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
        struct ceph_snap_realm *realm;
        int invalidate = 0;
        int err = -ENOMEM;
+        LIST_HEAD(dirty_realms);
        dout("update_snap_trace deletion=%d\n", deletion);
 more:
@@ -578,45 +628,6 @@ more:
                }
        }
-        if (le64_to_cpu(ri->seq) > realm->seq) {
-                dout("update_snap_trace updating %llx %p %lld -> %lld\n",
-                     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
-                /*
-                 * if the realm seq has changed, queue a cap_snap for every
-                 * inode with open caps.  we do this _before_ we update
-                 * the realm info so that we prepare for writeback under the
-                 * _previous_ snap context.
-                 *
-                 * ...unless it's a snap deletion!
-                 */
-                if (!deletion) {
-                        struct ceph_inode_info *ci;
-                        struct inode *lastinode = NULL;
-                        spin_lock(&realm->inodes_with_caps_lock);
-                        list_for_each_entry(ci, &realm->inodes_with_caps,
-                                            i_snap_realm_item) {
-                                struct inode *inode = igrab(&ci->vfs_inode);
-                                if (!inode)
-                                        continue;
-                                spin_unlock(&realm->inodes_with_caps_lock);
-                                if (lastinode)
-                                        iput(lastinode);
-                                lastinode = inode;
-                                ceph_queue_cap_snap(ci);
-                                spin_lock(&realm->inodes_with_caps_lock);
-                        }
-                        spin_unlock(&realm->inodes_with_caps_lock);
-                        if (lastinode)
-                                iput(lastinode);
-                        dout("update_snap_trace cap_snaps queued\n");
-                }
-        } else {
-                dout("update_snap_trace %llx %p seq %lld unchanged\n",
-                     realm->ino, realm, realm->seq);
-        }
        /* ensure the parent is correct */
        err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
        if (err < 0)
@@ -624,6 +635,8 @@ more:
        invalidate += err;
        if (le64_to_cpu(ri->seq) > realm->seq) {
+                dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+                     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
                /* update realm parameters, snap lists */
                realm->seq = le64_to_cpu(ri->seq);
                realm->created = le64_to_cpu(ri->created);
@@ -641,9 +654,17 @@ more:
                if (err < 0)
                        goto fail;
+                /* queue realm for cap_snap creation */
+                list_add(&realm->dirty_item, &dirty_realms);
                invalidate = 1;
        } else if (!realm->cached_context) {
+                dout("update_snap_trace %llx %p seq %lld new\n",
+                     realm->ino, realm, realm->seq);
                invalidate = 1;
+        } else {
+                dout("update_snap_trace %llx %p seq %lld unchanged\n",
+                     realm->ino, realm, realm->seq);
        }
        dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
@@ -656,6 +677,14 @@ more:
        if (invalidate)
                rebuild_snap_realms(realm);
+        /*
+         * queue cap snaps _after_ we've built the new snap contexts,
+         * so that i_head_snapc can be set appropriately.
+         *
+         * dirty_realms lives on our stack, so unlink each realm as we
+         * go rather than leaving dirty_item pointing at a dead frame.
+         */
+        while (!list_empty(&dirty_realms)) {
+                realm = list_first_entry(&dirty_realms,
+                                         struct ceph_snap_realm, dirty_item);
+                list_del_init(&realm->dirty_item);
+                queue_realm_cap_snaps(realm);
+        }
        __cleanup_empty_realms(mdsc);
        return 0;
@@ -688,7 +717,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
                igrab(inode);
                spin_unlock(&mdsc->snap_flush_lock);
                spin_lock(&inode->i_lock);
-                __ceph_flush_snaps(ci, &session);
+                __ceph_flush_snaps(ci, &session, 0);
                spin_unlock(&inode->i_lock);
                iput(inode);
                spin_lock(&mdsc->snap_flush_lock);
@@ -789,6 +818,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
                        };
                        struct inode *inode = ceph_find_inode(sb, vino);
                        struct ceph_inode_info *ci;
+                        struct ceph_snap_realm *oldrealm;
                        if (!inode)
                                continue;
@@ -814,18 +844,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
                        dout(" will move %p to split realm %llx %p\n",
                             inode, realm->ino, realm);
                        /*
-                         * Remove the inode from the realm's inode
+                         * Move the inode to the new realm
-                         * list, but don't add it to the new realm
-                         * yet.  We don't want the cap_snap to be
-                         * queued (again) by ceph_update_snap_trace()
-                         * below.  Queue it _now_, under the old context.
                         */
                        spin_lock(&realm->inodes_with_caps_lock);
                        list_del_init(&ci->i_snap_realm_item);
+                        list_add(&ci->i_snap_realm_item,
+                                 &realm->inodes_with_caps);
+                        oldrealm = ci->i_snap_realm;
+                        ci->i_snap_realm = realm;
                        spin_unlock(&realm->inodes_with_caps_lock);
                        spin_unlock(&inode->i_lock);
-                        ceph_queue_cap_snap(ci);
+                        ceph_get_snap_realm(mdsc, realm);
+                        ceph_put_snap_realm(mdsc, oldrealm);
                        iput(inode);
                        continue;
@@ -853,43 +884,9 @@ skip_inode:
        ceph_update_snap_trace(mdsc, p, e,
                               op == CEPH_SNAP_OP_DESTROY);
-        if (op == CEPH_SNAP_OP_SPLIT) {
+        if (op == CEPH_SNAP_OP_SPLIT)
-                /*
-                 * ok, _now_ add the inodes into the new realm.
-                 */
-                for (i = 0; i < num_split_inos; i++) {
-                        struct ceph_vino vino = {
-                                .ino = le64_to_cpu(split_inos[i]),
-                                .snap = CEPH_NOSNAP,
-                        };
-                        struct inode *inode = ceph_find_inode(sb, vino);
-                        struct ceph_inode_info *ci;
-                        if (!inode)
-                                continue;
-                        ci = ceph_inode(inode);
-                        spin_lock(&inode->i_lock);
-                        if (list_empty(&ci->i_snap_realm_item)) {
-                                struct ceph_snap_realm *oldrealm =
-                                        ci->i_snap_realm;
-                                dout(" moving %p to split realm %llx %p\n",
-                                     inode, realm->ino, realm);
-                                spin_lock(&realm->inodes_with_caps_lock);
-                                list_add(&ci->i_snap_realm_item,
-                                         &realm->inodes_with_caps);
-                                ci->i_snap_realm = realm;
-                                spin_unlock(&realm->inodes_with_caps_lock);
-                                ceph_get_snap_realm(mdsc, realm);
-                                ceph_put_snap_realm(mdsc, oldrealm);
-                        }
-                        spin_unlock(&inode->i_lock);
-                        iput(inode);
-                }
                /* we took a reference when we created the realm, above */
                ceph_put_snap_realm(mdsc, realm);
-        }
        __cleanup_empty_realms(mdsc);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2482d696f0de..b87638e84c4b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -216,8 +216,7 @@ struct ceph_cap_snap {
        uid_t uid;
        gid_t gid;
-        void *xattr_blob;
+        struct ceph_buffer *xattr_blob;
-        int xattr_len;
        u64 xattr_version;
        u64 size;
@@ -229,8 +228,11 @@ struct ceph_cap_snap {
 static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
 {
-        if (atomic_dec_and_test(&capsnap->nref))
+        if (atomic_dec_and_test(&capsnap->nref)) {  /* last ref dropped */
+                if (capsnap->xattr_blob)  /* may be NULL, see ceph_queue_cap_snap */
+                        ceph_buffer_put(capsnap->xattr_blob);
                kfree(capsnap);
+        }
 }
 /*
@@ -342,7 +344,8 @@ struct ceph_inode_info {
        unsigned i_cap_exporting_issued;
        struct ceph_cap_reservation i_cap_migration_resv;
        struct list_head i_cap_snaps;   /* snapped state pending flush to mds */
-        struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 */
+        struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 or
+                                                    dirty|flushing caps */
        unsigned i_snap_caps;           /* cap bits for snapped files */
        int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
@@ -687,6 +690,8 @@ struct ceph_snap_realm {
        struct list_head empty_item;     /* if i have ref==0 */
+        struct list_head dirty_item;     /* if realm needs new context */
        /* the current set of snaps for this realm */
        struct ceph_snap_context *cached_context;
@@ -823,7 +828,8 @@ extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
                                       struct ceph_snap_context *snapc);
 extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
-                               struct ceph_mds_session **psession);
+                               struct ceph_mds_session **psession,
+                               int again);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
                            struct ceph_mds_session *session);
 extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 097a2654c00f..9578af610b73 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -485,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
                ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
                ci->i_xattrs.prealloc_blob = NULL;
                ci->i_xattrs.dirty = false;
+                ci->i_xattrs.version++;
        }
 }
diff --git a/fs/char_dev.c b/fs/char_dev.c
index f80a4f25123c..143d393881cb 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -40,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = {
 #endif
                /* permit direct mmap, for read, write or exec */
                BDI_CAP_MAP_DIRECT |
-                BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP),
+                BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
+                /* no writeback happens */
+                BDI_CAP_NO_ACCT_AND_WRITEBACK),
 };
 static struct kobj_map *cdev_map;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6f..7fe6b52df507 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
 *     This is a compressed table of upper and lower case conversion.
 *
 */
+#ifndef _CIFS_UNICODE_H
+#define _CIFS_UNICODE_H
 #include <asm/byteorder.h>
 #include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
 #endif                          /* UNIUPR_NOUPPER */
 #ifndef UNIUPR_NOLOWER
-extern signed char UniLowerTable[512];
+extern signed char CifsUniLowerTable[512];
-extern struct UniCaseRange UniLowerRange[];
+extern const struct UniCaseRange CifsUniLowerRange[];
 #endif                          /* UNIUPR_NOLOWER */
 #ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
 * UniTolower:  Convert a unicode character to lower case
 */
 static inline wchar_t
-UniTolower(wchar_t uc)
+UniTolower(register wchar_t uc)
 {
-        register struct UniCaseRange *rp;
+        register const struct UniCaseRange *rp;
-        if (uc < sizeof(UniLowerTable)) {
+        if (uc < sizeof(CifsUniLowerTable)) {
                /* Latin characters */
-                return uc + UniLowerTable[uc];  /* Use base tables */
+                return uc + CifsUniLowerTable[uc];      /* Use base tables */
        } else {
-                rp = UniLowerRange;     /* Use range tables */
+                rp = CifsUniLowerRange; /* Use range tables */
                while (rp->start) {
                        if (uc < rp->start)     /* Before start of range */
                                return uc;      /* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
 }
 #endif
+#endif /* _CIFS_UNICODE_H */
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e519..0ac7c5a8633a 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
 /*
 * Latin lower case
 */
-static signed char CifsUniLowerTable[512] = {
+signed char CifsUniLowerTable[512] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
 /*
 * Lower Case Range
 */
-static const struct UniCaseRange CifsUniLowerRange[] = {
+const struct UniCaseRange CifsUniLowerRange[] = {
-        0x0380, 0x03ab, UniCaseRangeL0380,
+        {0x0380, 0x03ab, UniCaseRangeL0380},
-        0x0400, 0x042f, UniCaseRangeL0400,
+        {0x0400, 0x042f, UniCaseRangeL0400},
-        0x0490, 0x04cb, UniCaseRangeL0490,
+        {0x0490, 0x04cb, UniCaseRangeL0490},
-        0x1e00, 0x1ff7, UniCaseRangeL1e00,
+        {0x1e00, 0x1ff7, UniCaseRangeL1e00},
-        0xff20, 0xff3a, UniCaseRangeLff20,
+        {0xff20, 0xff3a, UniCaseRangeLff20},
-        0, 0, 0
+        {0}
 };
 #endif
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 847628dfdc44..35042d8f7338 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -223,63 +223,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
        return 0;
 }
-int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
-                               const struct nls_table *nls_info)
-{
-        char temp_hash[16];
-        struct HMACMD5Context ctx;
-        char *ucase_buf;
-        __le16 *unicode_buf;
-        unsigned int i, user_name_len, dom_name_len;
-        if (ses == NULL)
-                return -EINVAL;
-        E_md4hash(ses->password, temp_hash);
-        hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
-        user_name_len = strlen(ses->userName);
-        if (user_name_len > MAX_USERNAME_SIZE)
-                return -EINVAL;
-        if (ses->domainName == NULL)
-                return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
-        dom_name_len = strlen(ses->domainName);
-        if (dom_name_len > MAX_USERNAME_SIZE)
-                return -EINVAL;
-        ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
-        if (ucase_buf == NULL)
-                return -ENOMEM;
-        unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
-        if (unicode_buf == NULL) {
-                kfree(ucase_buf);
-                return -ENOMEM;
-        }
-        for (i = 0; i < user_name_len; i++)
-                ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
-        ucase_buf[i] = 0;
-        user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
-                                      MAX_USERNAME_SIZE*2, nls_info);
-        unicode_buf[user_name_len] = 0;
-        user_name_len++;
-        for (i = 0; i < dom_name_len; i++)
-                ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
-        ucase_buf[i] = 0;
-        dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
-                                     MAX_USERNAME_SIZE*2, nls_info);
-        unicode_buf[user_name_len + dom_name_len] = 0;
-        hmac_md5_update((const unsigned char *) unicode_buf,
-                (user_name_len+dom_name_len)*2, &ctx);
-        hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
-        kfree(ucase_buf);
-        kfree(unicode_buf);
-        return 0;
-}
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
                        char *lnm_session_key)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1f5450814087..1d60c655e3e0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -87,8 +87,9 @@ extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
                        struct TCP_Server_Info *server);
 extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
+extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
 extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
-                                unsigned short int port);
+                                const unsigned short int port);
 extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
                            const struct cifsTconInfo *, int /* length of
@@ -365,8 +366,6 @@ extern int cifs_verify_signature(struct smb_hdr *,
                                __u32 expected_sequence_number);
 extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
                                 const char *pass);
-extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *,
-                        const struct nls_table *);
 extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
 extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
                             const struct nls_table *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd37..7e83b356cc9e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -232,7 +232,7 @@ static int
 small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
                void **request_buf)
 {
-        int rc = 0;
+        int rc;
        rc = cifs_reconnect_tcon(tcon, smb_command);
        if (rc)
@@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
        if (tcon != NULL)
                cifs_stats_inc(&tcon->num_smbs_sent);
-        return rc;
+        return 0;
 }
 int
@@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
 /* If the return code is zero, this function must fill in request_buf pointer */
 static int
-smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
-         void **request_buf /* returned */ ,
+                        void **request_buf, void **response_buf)
-         void **response_buf /* returned */ )
 {
-        int rc = 0;
-        rc = cifs_reconnect_tcon(tcon, smb_command);
-        if (rc)
-                return rc;
        *request_buf = cifs_buf_get();
        if (*request_buf == NULL) {
                /* BB should we add a retry in here if not a writepage? */
@@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
        if (tcon != NULL)
                cifs_stats_inc(&tcon->num_smbs_sent);
-        return rc;
+        return 0;
+}
+/* If the return code is zero, this function must fill in request_buf pointer */
+static int
+smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+         void **request_buf, void **response_buf)
+{
+        int rc;
+        rc = cifs_reconnect_tcon(tcon, smb_command);
+        if (rc)
+                return rc;
+        return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
+}
+static int
+smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
+                        void **request_buf, void **response_buf)
+{
+        if (tcon->ses->need_reconnect || tcon->need_reconnect)
+                return -EHOSTDOWN;
+        return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
 }
 static int validate_t2(struct smb_t2_rsp *pSMB)
@@ -4534,8 +4551,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
        cFYI(1, "In QFSUnixInfo");
 QFSUnixRetry:
-        rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+        rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
-                      (void **) &pSMBr);
+                                   (void **) &pSMB, (void **) &pSMBr);
        if (rc)
                return rc;
@@ -4604,8 +4621,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
        cFYI(1, "In SETFSUnixInfo");
 SETFSUnixRetry:
        /* BB switch to small buf init to save memory */
-        rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+        rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
-                      (void **) &pSMBr);
+                                        (void **) &pSMB, (void **) &pSMBr);
        if (rc)
                return rc;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 95c2ea67edfb..88c84a38bccb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -400,7 +400,9 @@ incomplete_rcv:
                        cFYI(1, "call to reconnect done");
                        csocket = server->ssocket;
                        continue;
-                } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
+                } else if (length == -ERESTARTSYS ||
+                           length == -EAGAIN ||
+                           length == -EINTR) {
                        msleep(1); /* minimum sleep to prevent looping
                                allowing socket to clear and app threads to set
                                tcpStatus CifsNeedReconnect if server hung */
@@ -414,18 +416,6 @@ incomplete_rcv:
                        } else
                                continue;
                } else if (length <= 0) {
-                        if (server->tcpStatus == CifsNew) {
-                                cFYI(1, "tcp session abend after SMBnegprot");
-                                /* some servers kill the TCP session rather than
-                                   returning an SMB negprot error, in which
-                                   case reconnecting here is not going to help,
-                                   and so simply return error to mount */
-                                break;
-                        }
-                        if (!try_to_freeze() && (length == -EINTR)) {
-                                cFYI(1, "cifsd thread killed");
-                                break;
-                        }
                        cFYI(1, "Reconnect after unexpected peek error %d",
                                length);
                        cifs_reconnect(server);
@@ -466,27 +456,19 @@ incomplete_rcv:
                           an error on SMB negprot response */
                        cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
                                pdu_length);
-                        if (server->tcpStatus == CifsNew) {
+                        /* give server a second to clean up  */
-                                /* if nack on negprot (rather than
+                        msleep(1000);
-                                ret of smb negprot error) reconnecting
+                        /* always try 445 first on reconnect, since some
-                                not going to help, ret error to mount */
+                         * servers NACK us if we ever connected to port 139
-                                break;
+                         * (the NACK is because we do not begin with an
-                        } else {
+                         * RFC1001 session initialize frame)
-                                /* give server a second to
+                         */
-                                clean up before reconnect attempt */
+                        cifs_set_port((struct sockaddr *)
-                                msleep(1000);
+                                        &server->addr.sockAddr, CIFS_PORT);
-                                /* always try 445 first on reconnect
+                        cifs_reconnect(server);
-                                since we get NACK on some if we ever
+                        csocket = server->ssocket;
-                                connected to port 139 (the NACK is
+                        wake_up(&server->response_q);
-                                since we do not begin with RFC1001
+                        continue;
-                                session initialize frame) */
-                                server->addr.sockAddr.sin_port =
-                                        htons(CIFS_PORT);
-                                cifs_reconnect(server);
-                                csocket = server->ssocket;
-                                wake_up(&server->response_q);
-                                continue;
-                        }
                } else if (temp != (char) 0) {
                        cERROR(1, "Unknown RFC 1002 frame");
                        cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
@@ -522,8 +504,7 @@ incomplete_rcv:
                     total_read += length) {
                        length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
                                                pdu_length - total_read, 0);
-                        if ((server->tcpStatus == CifsExiting) ||
+                        if (server->tcpStatus == CifsExiting) {
-                            (length == -EINTR)) {
                                /* then will exit */
                                reconnect = 2;
                                break;
@@ -534,8 +515,9 @@ incomplete_rcv:
                                /* Now we will reread sock */
                                reconnect = 1;
                                break;
-                        } else if ((length == -ERESTARTSYS) ||
+                        } else if (length == -ERESTARTSYS ||
-                                   (length == -EAGAIN)) {
+                                   length == -EAGAIN ||
+                                   length == -EINTR) {
                                msleep(1); /* minimum sleep to prevent looping,
                                              allowing socket to clear and app
                                              threads to set tcpStatus
@@ -1673,7 +1655,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
                                    MAX_USERNAME_SIZE))
                                continue;
                        if (strlen(vol->username) != 0 &&
-                            strncmp(ses->password, vol->password,
+                            ses->password != NULL &&
+                            strncmp(ses->password,
+                                    vol->password ? vol->password : "",
                                    MAX_PASSWORD_SIZE))
                                continue;
                }
@@ -1722,9 +1706,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
        if (ses) {
                cFYI(1, "Existing smb sess found (status=%d)", ses->status);
-                /* existing SMB ses has a server reference already */
-                cifs_put_tcp_session(server);
                mutex_lock(&ses->session_mutex);
                rc = cifs_negotiate_protocol(xid, ses);
                if (rc) {
@@ -1747,6 +1728,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
                        }
                }
                mutex_unlock(&ses->session_mutex);
+                /* existing SMB ses has a server reference already */
+                cifs_put_tcp_session(server);
                FreeXid(xid);
                return ses;
        }
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 578d88c5b46e..f9ed0751cc12 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto cifs_create_out;
-                return rc;
        }
        if (oplockEnabled)
@@ -365,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
        if (buf == NULL) {
-                kfree(full_path);
+                rc = -ENOMEM;
-                FreeXid(xid);
+                goto cifs_create_out;
-                return -ENOMEM;
        }
        /*
@@ -496,6 +494,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
        struct cifsTconInfo *pTcon;
        char *full_path = NULL;
        struct inode *newinode = NULL;
+        int oplock = 0;
+        u16 fileHandle;
+        FILE_ALL_INFO *buf = NULL;
+        unsigned int bytes_written;
+        struct win_dev *pdev;
        if (!old_valid_dev(device_number))
                return -EINVAL;
@@ -506,9 +509,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
        pTcon = cifs_sb->tcon;
        full_path = build_path_from_dentry(direntry);
-        if (full_path == NULL)
+        if (full_path == NULL) {
                rc = -ENOMEM;
-        else if (pTcon->unix_ext) {
+                goto mknod_out;
+        }
+        if (pTcon->unix_ext) {
                struct cifs_unix_set_info_args args = {
                        .mode   = mode & ~current_umask(),
                        .ctime  = NO_CHANGE_64,
@@ -527,87 +533,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
                                            cifs_sb->local_nls,
                                            cifs_sb->mnt_cifs_flags &
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+                if (rc)
+                        goto mknod_out;
-                if (!rc) {
+                rc = cifs_get_inode_info_unix(&newinode, full_path,
-                        rc = cifs_get_inode_info_unix(&newinode, full_path,
                                                inode->i_sb, xid);
-                        if (pTcon->nocase)
+                if (pTcon->nocase)
-                                direntry->d_op = &cifs_ci_dentry_ops;
+                        direntry->d_op = &cifs_ci_dentry_ops;
-                        else
+                else
-                                direntry->d_op = &cifs_dentry_ops;
+                        direntry->d_op = &cifs_dentry_ops;
-                        if (rc == 0)
-                                d_instantiate(direntry, newinode);
-                }
-        } else {
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-                        int oplock = 0;
-                        u16 fileHandle;
-                        FILE_ALL_INFO *buf;
-                        cFYI(1, "sfu compat create special file");
+                if (rc == 0)
+                        d_instantiate(direntry, newinode);
+                goto mknod_out;
+        }
-                        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
-                        if (buf == NULL) {
+                goto mknod_out;
-                                kfree(full_path);
-                                rc = -ENOMEM;
-                                FreeXid(xid);
-                                return rc;
-                        }
-                        rc = CIFSSMBOpen(xid, pTcon, full_path,
-                                         FILE_CREATE, /* fail if exists */
+        cFYI(1, "sfu compat create special file");
-                                         GENERIC_WRITE /* BB would
-                                          WRITE_OWNER | WRITE_DAC be better? */,
+        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-                                         /* Create a file and set the
+        if (buf == NULL) {
-                                            file attribute to SYSTEM */
+                kfree(full_path);
-                                         CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
+                rc = -ENOMEM;
-                                         &fileHandle, &oplock, buf,
+                FreeXid(xid);
-                                         cifs_sb->local_nls,
+                return rc;
-                                         cifs_sb->mnt_cifs_flags &
-                                            CIFS_MOUNT_MAP_SPECIAL_CHR);
-                        /* BB FIXME - add handling for backlevel servers
-                           which need legacy open and check for all
-                           calls to SMBOpen for fallback to SMBLeagcyOpen */
-                        if (!rc) {
-                                /* BB Do not bother to decode buf since no
-                                   local inode yet to put timestamps in,
-                                   but we can reuse it safely */
-                                unsigned int bytes_written;
-                                struct win_dev *pdev;
-                                pdev = (struct win_dev *)buf;
-                                if (S_ISCHR(mode)) {
-                                        memcpy(pdev->type, "IntxCHR", 8);
-                                        pdev->major =
-                                              cpu_to_le64(MAJOR(device_number));
-                                        pdev->minor =
-                                              cpu_to_le64(MINOR(device_number));
-                                        rc = CIFSSMBWrite(xid, pTcon,
-                                                fileHandle,
-                                                sizeof(struct win_dev),
-                                                0, &bytes_written, (char *)pdev,
-                                                NULL, 0);
-                                } else if (S_ISBLK(mode)) {
-                                        memcpy(pdev->type, "IntxBLK", 8);
-                                        pdev->major =
-                                              cpu_to_le64(MAJOR(device_number));
-                                        pdev->minor =
-                                              cpu_to_le64(MINOR(device_number));
-                                        rc = CIFSSMBWrite(xid, pTcon,
-                                                fileHandle,
-                                                sizeof(struct win_dev),
-                                                0, &bytes_written, (char *)pdev,
-                                                NULL, 0);
-                                } /* else if(S_ISFIFO */
-                                CIFSSMBClose(xid, pTcon, fileHandle);
-                                d_drop(direntry);
-                        }
-                        kfree(buf);
-                        /* add code here to set EAs */
-                }
        }
+        /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
+        rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
+                         GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
+                         &fileHandle, &oplock, buf, cifs_sb->local_nls,
+                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if (rc)
+                goto mknod_out;
+        /* BB Do not bother to decode buf since there is no local inode yet to
+         * put timestamps in, but we can reuse it safely */
+        pdev = (struct win_dev *)buf;
+        if (S_ISCHR(mode)) {
+                memcpy(pdev->type, "IntxCHR", 8);
+                pdev->major =
+                      cpu_to_le64(MAJOR(device_number));
+                pdev->minor =
+                      cpu_to_le64(MINOR(device_number));
+                rc = CIFSSMBWrite(xid, pTcon,
+                        fileHandle,
+                        sizeof(struct win_dev),
+                        0, &bytes_written, (char *)pdev,
+                        NULL, 0);
+        } else if (S_ISBLK(mode)) {
+                memcpy(pdev->type, "IntxBLK", 8);
+                pdev->major =
+                      cpu_to_le64(MAJOR(device_number));
+                pdev->minor =
+                      cpu_to_le64(MINOR(device_number));
+                rc = CIFSSMBWrite(xid, pTcon,
+                        fileHandle,
+                        sizeof(struct win_dev),
+                        0, &bytes_written, (char *)pdev,
+                        NULL, 0);
+        } /* else if (S_ISFIFO) */
+        CIFSSMBClose(xid, pTcon, fileHandle);
+        d_drop(direntry);
+        /* FIXME: add code here to set EAs */
+mknod_out:
        kfree(full_path);
+        kfree(buf);
        FreeXid(xid);
        return rc;
 }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index db11fdef0e92..de748c652d11 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -242,8 +242,7 @@ int cifs_open(struct inode *inode, struct file *file)
        full_path = build_path_from_dentry(file->f_path.dentry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto out;
-                return rc;
        }
        cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4bc47e5b5f29..53cce8cc2224 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -801,6 +801,8 @@ retry_iget5_locked:
                        inode->i_flags |= S_NOATIME | S_NOCMTIME;
                if (inode->i_state & I_NEW) {
                        inode->i_ino = hash;
+                        if (S_ISREG(inode->i_mode))
+                                inode->i_data.backing_dev_info = sb->s_bdi;
 #ifdef CONFIG_CIFS_FSCACHE
                        /* initialize per-inode cache cookie pointer */
                        CIFS_I(inode)->fscache = NULL;
@@ -834,7 +836,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
                                                xid, NULL);
        if (!inode)
-                return ERR_PTR(-ENOMEM);
+                return ERR_PTR(rc);
 #ifdef CONFIG_CIFS_FSCACHE
        /* populate tcon->resource_id */
@@ -1462,29 +1464,18 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 {
        char *fromName = NULL;
        char *toName = NULL;
-        struct cifs_sb_info *cifs_sb_source;
+        struct cifs_sb_info *cifs_sb;
-        struct cifs_sb_info *cifs_sb_target;
        struct cifsTconInfo *tcon;
        FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
        FILE_UNIX_BASIC_INFO *info_buf_target;
        int xid, rc, tmprc;
-        cifs_sb_target = CIFS_SB(target_dir->i_sb);
+        cifs_sb = CIFS_SB(source_dir->i_sb);
-        cifs_sb_source = CIFS_SB(source_dir->i_sb);
+        tcon = cifs_sb->tcon;
-        tcon = cifs_sb_source->tcon;
        xid = GetXid();
        /*
-         * BB: this might be allowed if same server, but different share.
-         * Consider adding support for this
-         */
-        if (tcon != cifs_sb_target->tcon) {
-                rc = -EXDEV;
-                goto cifs_rename_exit;
-        }
-        /*
         * we already have the rename sem so we do not need to
         * grab it again here to protect the path integrity
         */
@@ -1519,17 +1510,16 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
                info_buf_target = info_buf_source + 1;
                tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
                                        info_buf_source,
-                                        cifs_sb_source->local_nls,
+                                        cifs_sb->local_nls,
-                                        cifs_sb_source->mnt_cifs_flags &
+                                        cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
                if (tmprc != 0)
                        goto unlink_target;
-                tmprc = CIFSSMBUnixQPathInfo(xid, tcon,
+                tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName,
-                                        toName, info_buf_target,
+                                        info_buf_target,
-                                        cifs_sb_target->local_nls,
+                                        cifs_sb->local_nls,
-                                        /* remap based on source sb */
+                                        cifs_sb->mnt_cifs_flags &
-                                        cifs_sb_source->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
                if (tmprc == 0 && (info_buf_source->UniqueId ==
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index f97851119e6c..9aad47a2d62f 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -206,26 +206,30 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
 }
 int
-cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
+cifs_set_port(struct sockaddr *addr, const unsigned short int port)
-                   const unsigned short int port)
 {
-        if (!cifs_convert_address(dst, src, len))
+        switch (addr->sa_family) {
-                return 0;
-        switch (dst->sa_family) {
        case AF_INET:
-                ((struct sockaddr_in *)dst)->sin_port = htons(port);
+                ((struct sockaddr_in *)addr)->sin_port = htons(port);
                break;
        case AF_INET6:
-                ((struct sockaddr_in6 *)dst)->sin6_port = htons(port);
+                ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
                break;
        default:
                return 0;
        }
        return 1;
 }
+int
+cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
+                   const unsigned short int port)
+{
+        if (!cifs_convert_address(dst, src, len))
+                return 0;
+        return cifs_set_port(dst, port);
+}
 /*****************************************************************************
 convert a NT status code to a dos class/code
 *****************************************************************************/
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index de89645777c7..116af7546cf0 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -184,8 +184,8 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
        }
        /* adjust outsize. is this useful ?? */
-        req->uc_outSize = nbytes;       
+        req->uc_outSize = nbytes;
-        req->uc_flags |= REQ_WRITE;
+        req->uc_flags |= CODA_REQ_WRITE;
        count = nbytes;
        /* Convert filedescriptor into a file handle */
diff --git a/fs/compat.c b/fs/compat.c
index 718c7062aec1..0644a154672b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1153,7 +1153,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 {
        compat_ssize_t tot_len;
        struct iovec iovstack[UIO_FASTIOV];
-        struct iovec *iov;
+        struct iovec *iov = iovstack;
        ssize_t ret;
        io_fn_t fn;
        iov_fn_t fnv;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 51f270b479b6..48d74c7391d1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -634,7 +634,7 @@ static int dio_send_cur_page(struct dio *dio)
        int ret = 0;
        if (dio->bio) {
-                loff_t cur_offset = dio->block_in_file << dio->blkbits;
+                loff_t cur_offset = dio->cur_page_fs_offset;
                loff_t bio_next_offset = dio->logical_offset_in_bio +
                        dio->bio->bi_size;
@@ -659,7 +659,7 @@ static int dio_send_cur_page(struct dio *dio)
                 * Submit now if the underlying fs is about to perform a
                 * metadata read
                 */
-                if (dio->boundary)
+                else if (dio->boundary)
                        dio_bio_submit(dio);
        }
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index a2e3b562e65d..cbadc1bee6e7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
 static struct list_head key_tfm_list;
 struct mutex key_tfm_list_mutex;
-int ecryptfs_init_crypto(void)
+int __init ecryptfs_init_crypto(void)
 {
        mutex_init(&key_tfm_list_mutex);
        INIT_LIST_HEAD(&key_tfm_list);
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
                                (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
                                 + encoded_name_no_prefix_size);
                        (*encoded_name)[(*encoded_name_size)] = '\0';
-                        (*encoded_name_size)++;
                } else {
                        rc = -EOPNOTSUPP;
                }
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 6c55113e7222..3fbc94203380 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -349,7 +349,7 @@ out:
 /**
 * ecryptfs_new_lower_dentry
- * @ename: The name of the new dentry.
+ * @name: The name of the new dentry.
 * @lower_dir_dentry: Parent directory of the new dentry.
 * @nd: nameidata from last lookup.
 *
@@ -386,20 +386,19 @@ ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
 * ecryptfs_lookup_one_lower
 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
 * @lower_dir_dentry: lower parent directory
+ * @name: lower file name
 *
 * Get the lower dentry from vfs. If lower dentry does not exist yet,
 * create it.
 */
 static struct dentry *
 ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
-                          struct dentry *lower_dir_dentry)
+                          struct dentry *lower_dir_dentry, struct qstr *name)
 {
        struct nameidata nd;
        struct vfsmount *lower_mnt;
-        struct qstr *name;
        int err;
-        name = &ecryptfs_dentry->d_name;
        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
                                    ecryptfs_dentry->d_parent));
        err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
@@ -434,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        size_t encrypted_and_encoded_name_size;
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
        struct dentry *lower_dir_dentry, *lower_dentry;
+        struct qstr lower_name;
        int rc = 0;
        ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -444,9 +444,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                goto out_d_drop;
        }
        lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
+        lower_name.name = ecryptfs_dentry->d_name.name;
+        lower_name.len = ecryptfs_dentry->d_name.len;
+        lower_name.hash = ecryptfs_dentry->d_name.hash;
+        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+                                                    &lower_name);
+                if (rc < 0)
+                        goto out_d_drop;
+        }
        lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-                                                 lower_dir_dentry);
+                                                 lower_dir_dentry, &lower_name);
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
@@ -471,8 +479,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                       "filename; rc = [%d]\n", __func__, rc);
                goto out_d_drop;
        }
+        lower_name.name = encrypted_and_encoded_name;
+        lower_name.len = encrypted_and_encoded_name_size;
+        lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
+        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+                                                    &lower_name);
+                if (rc < 0)
+                        goto out_d_drop;
+        }
        lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-                                                 lower_dir_dentry);
+                                                 lower_dir_dentry, &lower_name);
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506ef..73811cfa2ea4 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
        if (!s) {
                printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
                       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+                rc = -ENOMEM;
                goto out;
        }
        s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
        if (!s) {
                printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
                       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+                rc = -ENOMEM;
                goto out;
        }
        s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aafa..0851ab6980f5 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
        return 0;
 }
-int ecryptfs_init_kthread(void)
+int __init ecryptfs_init_kthread(void)
 {
        int rc = 0;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index bcb68c0cb1f0..ab2248090515 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -473,7 +473,7 @@ sleep:
        return rc;
 }
-int ecryptfs_init_messaging(void)
+int __init ecryptfs_init_messaging(void)
 {
        int i;
        int rc = 0;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd4..00208c3d7e92 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = {
 *
 * Returns zero on success; non-zero otherwise
 */
-int ecryptfs_init_ecryptfs_miscdev(void)
+int __init ecryptfs_init_ecryptfs_miscdev(void)
 {
        int rc;
diff --git a/fs/exec.c b/fs/exec.c
index 2d9455282744..03278c984ba0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -376,6 +376,9 @@ static int count(const char __user * const __user * argv, int max)
                        argv++;
                        if (i++ >= max)
                                return -E2BIG;
+                        if (fatal_signal_pending(current))
+                                return -ERESTARTNOHAND;
                        cond_resched();
                }
        }
@@ -419,6 +422,12 @@ static int copy_strings(int argc, const char __user *const __user *argv,
                while (len > 0) {
                        int offset, bytes_to_copy;
+                        if (fatal_signal_pending(current)) {
+                                ret = -ERESTARTNOHAND;
+                                goto out;
+                        }
+                        cond_resched();
                        offset = pos % PAGE_SIZE;
                        if (offset == 0)
                                offset = PAGE_SIZE;
@@ -594,6 +603,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
 #else
        stack_top = arch_align_stack(stack_top);
        stack_top = PAGE_ALIGN(stack_top);
+        if (unlikely(stack_top < mmap_min_addr) ||
+            unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
+                return -ENOMEM;
        stack_shift = vma->vm_end - stack_top;
        bprm->p -= stack_shift;
@@ -2000,3 +2014,41 @@ fail_creds:
 fail:
        return;
 }
+/*
+ * Core dumping helper functions.  These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ *
+ * dump_write(file, addr, nr): evaluates to nonzero only if @addr passes
+ * access_ok(VERIFY_READ, addr, nr) and file->f_op->write() reports
+ * exactly @nr bytes written; a failed check, a short write or an error
+ * all yield 0.  &file->f_pos is passed as the write position.
+ *
+ * dump_seek(file, off): moves the dump forward by @off bytes.  Returns 1
+ * on success and 0 on failure.  When the file has an ->llseek that is
+ * not no_llseek, it is called with SEEK_CUR; otherwise the gap is
+ * filled by writing zero bytes, at most PAGE_SIZE per dump_write() call.
+ */
+int dump_write(struct file *file, const void *addr, int nr)
+{
+        return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+int dump_seek(struct file *file, loff_t off)
+{
+        int ret = 1;    /* stays 1 unless a write in the fallback loop fails */
+        if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+                if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+                        return 0;
+        } else {
+                char *buf = (char *)get_zeroed_page(GFP_KERNEL);        /* one zero-filled page, reused for every chunk */
+                if (!buf)
+                        return 0;       /* allocation failed: report failure */
+                while (off > 0) {
+                        unsigned long n = off;
+                        if (n > PAGE_SIZE)
+                                n = PAGE_SIZE;  /* never write more than the buffer holds */
+                        if (!dump_write(file, buf, n)) {
+                                ret = 0;
+                                break;  /* leave the loop but still reach free_page() */
+                        }
+                        off -= n;
+                }
+                free_page((unsigned long)buf);
+        }
+        return ret;
+}
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index eb7368ebd8cd..3eadd97324b1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -54,6 +54,9 @@ struct page_collect {
        unsigned nr_pages;
        unsigned long length;
        loff_t pg_first; /* keep 64bit also in 32-arches */
+        bool read_4_write; /* This means two things: that the read is sync
+                            * And the pages should not be unlocked.
+                            */
 };
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -71,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
        pcol->nr_pages = 0;
        pcol->length = 0;
        pcol->pg_first = -1;
+        pcol->read_4_write = false;
 }
 static void _pcol_reset(struct page_collect *pcol)
@@ -347,7 +351,8 @@ static int readpage_strip(void *data, struct page *page)
                if (PageError(page))
                        ClearPageError(page);
-                unlock_page(page);
+                if (!pcol->read_4_write)
+                        unlock_page(page);
                EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
                             " splitting\n", inode->i_ino, page->index);
@@ -428,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync)
        /* readpage_strip might call read_exec(,is_sync==false) at several
         * places but not if we have a single page.
         */
+        pcol.read_4_write = is_sync;
        ret = readpage_strip(&pcol, page);
        if (ret) {
                EXOFS_ERR("_readpage => %d\n", ret);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6769fd0f35b8..f8cc34f542c3 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -769,11 +769,15 @@ EXPORT_SYMBOL(kill_fasync);
 static int __init fcntl_init(void)
 {
-        /* please add new bits here to ensure allocation uniqueness */
+        /*
-        BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+         * Please add new bits here to ensure allocation uniqueness.
+         * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
+         * is defined as O_NONBLOCK on some platforms and not on others.
+         */
+        BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
                O_RDONLY        | O_WRONLY      | O_RDWR        |
                O_CREAT         | O_EXCL        | O_NOCTTY      |
-                O_TRUNC         | O_APPEND      | O_NONBLOCK    |
+                O_TRUNC         | O_APPEND      | /* O_NONBLOCK | */
                __O_SYNC        | O_DSYNC       | FASYNC        |
                O_DIRECT        | O_LARGEFILE   | O_DIRECTORY   |
                O_NOFOLLOW      | O_NOATIME     | O_CLOEXEC     |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7d9d06ba184b..ab38fef1c9a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -52,8 +52,6 @@ struct wb_writeback_work {
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
-#define inode_to_bdi(inode)     ((inode)->i_mapping->backing_dev_info)
 /*
 * We don't actually have pdflush, but this one is exported though /proc...
 */
@@ -71,6 +69,16 @@ int writeback_in_progress(struct backing_dev_info *bdi)
        return test_bit(BDI_writeback_running, &bdi->state);
 }
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+        struct super_block *sb = inode->i_sb;
+        if (strcmp(sb->s_type->name, "bdev") == 0)      /* filesystem type is named "bdev" */
+                return inode->i_mapping->backing_dev_info;      /* use the bdi attached to the inode's mapping */
+        return sb->s_bdi;       /* every other filesystem type: the superblock's bdi */
+}
 static void bdi_queue_work(struct backing_dev_info *bdi,
                struct wb_writeback_work *work)
 {
@@ -808,7 +816,7 @@ int bdi_writeback_thread(void *data)
                        wb->last_active = jiffies;
                set_current_state(TASK_INTERRUPTIBLE);
-                if (!list_empty(&bdi->work_list)) {
+                if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        continue;
                }
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 69ad053ffd78..cde755cca564 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -276,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
 * Called with fc->lock, unlocks it
 */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
 {
        void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
        req->end = NULL;
@@ -306,8 +306,8 @@ __releases(&fc->lock)
 static void wait_answer_interruptible(struct fuse_conn *fc,
                                      struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
-__acquires(&fc->lock)
+__acquires(fc->lock)
 {
        if (signal_pending(current))
                return;
@@ -325,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
 }
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
-__acquires(&fc->lock)
+__acquires(fc->lock)
 {
        if (!fc->no_interrupt) {
                /* Any signal may interrupt this */
@@ -905,8 +905,8 @@ static int request_pending(struct fuse_conn *fc)
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
-__releases(&fc->lock)
+__releases(fc->lock)
-__acquires(&fc->lock)
+__acquires(fc->lock)
 {
        DECLARE_WAITQUEUE(wait, current);
@@ -934,7 +934,7 @@ __acquires(&fc->lock)
 */
 static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
                               size_t nbytes, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
 {
        struct fuse_in_header ih;
        struct fuse_interrupt_in arg;
@@ -1354,7 +1354,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
        loff_t file_size;
        unsigned int num;
        unsigned int offset;
-        size_t total_len;
+        size_t total_len = 0;
        req = fuse_get_req(fc);
        if (IS_ERR(req))
@@ -1720,8 +1720,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 * This function releases and reacquires fc->lock
 */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
-__releases(&fc->lock)
+__releases(fc->lock)
-__acquires(&fc->lock)
+__acquires(fc->lock)
 {
        while (!list_empty(head)) {
                struct fuse_req *req;
@@ -1744,8 +1744,8 @@ __acquires(&fc->lock)
 * locked).
 */
 static void end_io_requests(struct fuse_conn *fc)
-__releases(&fc->lock)
+__releases(fc->lock)
-__acquires(&fc->lock)
+__acquires(fc->lock)
 {
        while (!list_empty(&fc->io)) {
                struct fuse_req *req =
@@ -1769,6 +1769,16 @@ __acquires(&fc->lock)
        }
 }
+static void end_queued_requests(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+        fc->max_background = UINT_MAX;  /* NOTE(review): presumably lifts the background limit so the flush below queues everything -- confirm against flush_bg_queue() */
+        flush_bg_queue(fc);
+        end_requests(fc, &fc->pending);         /* end everything on the pending list ... */
+        end_requests(fc, &fc->processing);      /* ... and then everything on the processing list */
+}
 /*
 * Abort all requests.
 *
@@ -1795,8 +1805,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
                fc->connected = 0;
                fc->blocked = 0;
                end_io_requests(fc);
-                end_requests(fc, &fc->pending);
+                end_queued_requests(fc);
-                end_requests(fc, &fc->processing);
                wake_up_all(&fc->waitq);
                wake_up_all(&fc->blocked_waitq);
                kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1811,8 +1820,9 @@ int fuse_dev_release(struct inode *inode, struct file *file)
        if (fc) {
                spin_lock(&fc->lock);
                fc->connected = 0;
-                end_requests(fc, &fc->pending);
+                fc->blocked = 0;
-                end_requests(fc, &fc->processing);
+                end_queued_requests(fc);
+                wake_up_all(&fc->blocked_waitq);
                spin_unlock(&fc->lock);
                fuse_conn_put(fc);
        }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 147c1f71bdb9..c8224587123f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1144,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 /* Called under fc->lock, may release and reacquire it */
 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
-__acquires(&fc->lock)
+__acquires(fc->lock)
 {
        struct fuse_inode *fi = get_fuse_inode(req->inode);
        loff_t size = i_size_read(req->inode);
@@ -1183,8 +1183,8 @@ __acquires(&fc->lock)
 * Called with fc->lock
 */
 void fuse_flush_writepages(struct inode *inode)
-__releases(&fc->lock)
+__releases(fc->lock)
-__acquires(&fc->lock)
+__acquires(fc->lock)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9c65170e932e..eb01f3575e10 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -923,7 +923,7 @@ int gfs2_logd(void *data)
                do {
                        prepare_to_wait(&sdp->sd_logd_waitq, &wait,
-                                        TASK_UNINTERRUPTIBLE);
+                                        TASK_INTERRUPTIBLE);
                        if (!gfs2_ail_flush_reqd(sdp) &&
                            !gfs2_jrnl_flush_reqd(sdp) &&
                            !kthread_should_stop())
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index e20ee85955d1..f3f3578393a4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -115,7 +115,7 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
        inode_inc_link_count(dir);
-        inode = minix_new_inode(dir, mode, &err);
+        inode = minix_new_inode(dir, S_IFDIR | mode, &err);
        if (!inode)
                goto out_dir;
diff --git a/fs/namespace.c b/fs/namespace.c
index de402eb6eafb..a72eaabfe8f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1484,13 +1484,30 @@ out_unlock:
 }
 /*
+ * Sanity check the flags to change_mnt_propagation.
+ *
+ * MS_REC is masked off before checking.  Returns the remaining value
+ * when it consists solely of MS_SHARED, MS_PRIVATE, MS_SLAVE or
+ * MS_UNBINDABLE bits and is a power of two; returns 0 when any other
+ * bit is set or when is_power_of_2() rejects the value.  The caller in
+ * this patch, do_change_type(), turns a 0 return into -EINVAL.
+ */
+static int flags_to_propagation_type(int flags)
+{
+        int type = flags & ~MS_REC;
+        /* Fail if any non-propagation flags are set */
+        if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+                return 0;
+        /* Only one propagation flag should be set */
+        if (!is_power_of_2(type))
+                return 0;
+        return type;
+}
+/*
 * recursively change the type of the mountpoint.
 */
 static int do_change_type(struct path *path, int flag)
 {
        struct vfsmount *m, *mnt = path->mnt;
        int recurse = flag & MS_REC;
-        int type = flag & ~MS_REC;
+        int type;
        int err = 0;
        if (!capable(CAP_SYS_ADMIN))
@@ -1499,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
        if (path->dentry != path->mnt->mnt_root)
                return -EINVAL;
+        type = flags_to_propagation_type(flag);
+        if (!type)
+                return -EINVAL;
        down_write(&namespace_sem);
        if (type == MS_SHARED) {
                err = invent_group_ids(mnt, recurse);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 6c2aad49d731..f7e13db613cb 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -63,6 +63,7 @@ config NFS_V3_ACL
 config NFS_V4
        bool "NFS client support for NFS version 4"
        depends on NFS_FS
+        select SUNRPC_GSS
        help
          This option enables support for version 4 of the NFS protocol
          (RFC 3530) in the kernel's NFS client.
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4e7df2adb212..e7340729af89 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -275,7 +275,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
            sin1->sin6_scope_id != sin2->sin6_scope_id)
                return 0;
-        return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
+        return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
 }
 #else   /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
 static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index eb51bd6201da..05bf3c0dc751 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -723,10 +723,6 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
                default:
                        BUG();
        }
-        if (res < 0)
-                dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager"
-                        " - error %d!\n",
-                                __func__, res);
        return res;
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ec3966e4706b..f4cbf0c306c6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -431,7 +431,15 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
                goto out_err;
        error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+        if (unlikely(error == -ESTALE)) {
+                struct dentry *pd_dentry;
+                pd_dentry = dget_parent(dentry);
+                if (pd_dentry != NULL) {
+                        nfs_zap_caches(pd_dentry->d_inode);
+                        dput(pd_dentry);
+                }
+        }
        nfs_free_fattr(res.fattr);
        if (error < 0)
                goto out_err;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 95932f523aef..4264377552e2 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -69,6 +69,7 @@ config NFSD_V4
        depends on NFSD && PROC_FS && EXPERIMENTAL
        select NFSD_V3
        select FS_POSIX_ACL
+        select SUNRPC_GSS
        help
          This option enables support in your system's NFS server for
          version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e7357104cfd..cf0d2ffb3c84 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -440,7 +440,7 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
 static int nfs4_access_to_omode(u32 access)
 {
-        switch (access) {
+        switch (access & NFS4_SHARE_ACCESS_BOTH) {
        case NFS4_SHARE_ACCESS_READ:
                return O_RDONLY;
        case NFS4_SHARE_ACCESS_WRITE:
@@ -2450,14 +2450,13 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 static __be32
 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
 {
-        u32 op_share_access, new_access;
+        u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+        bool new_access;
        __be32 status;
-        set_access(&new_access, stp->st_access_bmap);
+        new_access = !test_bit(op_share_access, &stp->st_access_bmap);
-        new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK;
        if (new_access) {
-                status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access);
+                status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
                if (status)
                        return status;
        }
@@ -2470,7 +2469,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
                return status;
        }
        /* remember the open */
-        op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
        __set_bit(op_share_access, &stp->st_access_bmap);
        __set_bit(open->op_share_deny, &stp->st_deny_bmap);
@@ -2983,7 +2981,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
                                *filpp = find_readable_file(stp->st_file);
                        else
                                *filpp = find_writeable_file(stp->st_file);
-                        BUG_ON(!*filpp); /* assured by check_openmode */
                }
        }
        status = nfs_ok;
@@ -3561,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        struct nfs4_stateowner *open_sop = NULL;
        struct nfs4_stateowner *lock_sop = NULL;
        struct nfs4_stateid *lock_stp;
-        struct file *filp;
+        struct nfs4_file *fp;
+        struct file *filp = NULL;
        struct file_lock file_lock;
        struct file_lock conflock;
        __be32 status = 0;
@@ -3591,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                 * lock stateid.
                 */
                struct nfs4_stateid *open_stp = NULL;
-                struct nfs4_file *fp;
                
                status = nfserr_stale_clientid;
                if (!nfsd4_has_session(cstate) &&
@@ -3634,6 +3631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                if (status)
                        goto out;
                lock_sop = lock->lk_replay_owner;
+                fp = lock_stp->st_file;
        }
        /* lock->lk_replay_owner and lock_stp have been created or found */
@@ -3648,13 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        switch (lock->lk_type) {
                case NFS4_READ_LT:
                case NFS4_READW_LT:
-                        filp = find_readable_file(lock_stp->st_file);
+                        if (find_readable_file(lock_stp->st_file)) {
+                                nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
+                                filp = find_readable_file(lock_stp->st_file);
+                        }
                        file_lock.fl_type = F_RDLCK;
                        cmd = F_SETLK;
                break;
                case NFS4_WRITE_LT:
                case NFS4_WRITEW_LT:
-                        filp = find_writeable_file(lock_stp->st_file);
+                        if (find_writeable_file(lock_stp->st_file)) {
+                                nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
+                                filp = find_writeable_file(lock_stp->st_file);
+                        }
                        file_lock.fl_type = F_WRLCK;
                        cmd = F_SETLK;
                break;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a4206..c16f8d8331b5 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
 static inline void
 fh_unlock(struct svc_fh *fhp)
 {
-        BUG_ON(!fhp->fh_dentry);
        if (fhp->fh_locked) {
                fill_post_wcc(fhp);
                mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 7731a75971dd..322518c88e4b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -363,23 +363,23 @@ struct nfs4_file {
 * at all? */
 static inline struct file *find_writeable_file(struct nfs4_file *f)
 {
-        if (f->fi_fds[O_RDWR])
+        if (f->fi_fds[O_WRONLY])
-                return f->fi_fds[O_RDWR];
+                return f->fi_fds[O_WRONLY];
-        return f->fi_fds[O_WRONLY];
+        return f->fi_fds[O_RDWR];
 }
 static inline struct file *find_readable_file(struct nfs4_file *f)
 {
-        if (f->fi_fds[O_RDWR])
+        if (f->fi_fds[O_RDONLY])
-                return f->fi_fds[O_RDWR];
+                return f->fi_fds[O_RDONLY];
-        return f->fi_fds[O_RDONLY];
+        return f->fi_fds[O_RDWR];
 }
 static inline struct file *find_any_file(struct nfs4_file *f)
 {
        if (f->fi_fds[O_RDWR])
                return f->fi_fds[O_RDWR];
-        else if (f->fi_fds[O_RDWR])
+        else if (f->fi_fds[O_WRONLY])
                return f->fi_fds[O_WRONLY];
        else
                return f->fi_fds[O_RDONLY];
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 96360a83cb91..661a6cf8e826 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2033,15 +2033,17 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-        struct path path = {
-                .mnt    = fhp->fh_export->ex_path.mnt,
-                .dentry = fhp->fh_dentry,
-        };
        __be32 err;
        err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
-        if (!err && vfs_statfs(&path, stat))
+        if (!err) {
-                err = nfserr_io;
+                struct path path = {
+                        .mnt    = fhp->fh_export->ex_path.mnt,
+                        .dentry = fhp->fh_dentry,
+                };
+                if (vfs_statfs(&path, stat))
+                        err = nfserr_io;
+        }
        return err;
 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d97310f07bef..d27715103376 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -446,6 +446,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
        nilfs_mdt_destroy(nilfs->ns_cpfile);
        nilfs_mdt_destroy(nilfs->ns_sufile);
        nilfs_mdt_destroy(nilfs->ns_dat);
+        nilfs_mdt_destroy(nilfs->ns_gc_dat);
 failed:
        nilfs_clear_recovery_info(&ri);
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..b388443c3a09 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
-source "fs/notify/fanotify/Kconfig"
+#source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 756566fe8449..85366c78cc37 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -165,9 +165,6 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
                 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
                 inode_mark, vfsmnt_mark, event_mask, data, data_type);
-        pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n",
-                 __func__, group, vfsmnt_mark, inode_mark, event_mask);
        /* sorry, fanotify only gives a damn about files and dirs */
        if (!S_ISREG(to_tell->i_mode) &&
            !S_ISDIR(to_tell->i_mode))
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 032b837fcd11..5ed8e58d7bfc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -195,6 +195,14 @@ static int prepare_for_access_response(struct fsnotify_group *group,
        re->fd = fd;
        mutex_lock(&group->fanotify_data.access_mutex);
+        if (group->fanotify_data.bypass_perm) {
+                mutex_unlock(&group->fanotify_data.access_mutex);
+                kmem_cache_free(fanotify_response_event_cache, re);
+                event->response = FAN_ALLOW;
+                return 0;
+        }
+                
        list_add_tail(&re->list, &group->fanotify_data.access_list);
        mutex_unlock(&group->fanotify_data.access_mutex);
@@ -364,9 +372,28 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
        struct fsnotify_group *group = file->private_data;
+        struct fanotify_response_event *re, *lre;
        pr_debug("%s: file=%p group=%p\n", __func__, file, group);
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+        mutex_lock(&group->fanotify_data.access_mutex);
+        group->fanotify_data.bypass_perm = true;
+        list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
+                pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
+                         re, re->event);
+                list_del_init(&re->list);
+                re->event->response = FAN_ALLOW;
+                kmem_cache_free(fanotify_response_event_cache, re);
+        }
+        mutex_unlock(&group->fanotify_data.access_mutex);
+        wake_up(&group->fanotify_data.access_waitq);
+#endif
        /* matches the fanotify_init->fsnotify_alloc_group */
        fsnotify_put_group(group);
@@ -614,7 +641,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
                __func__, flags, event_f_flags);
        if (!capable(CAP_SYS_ADMIN))
-                return -EACCES;
+                return -EPERM;
        if (flags & ~FAN_ALL_INIT_FLAGS)
                return -EINVAL;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 3970392b2722..36802420d69a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -148,13 +148,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
                         const unsigned char *file_name,
                         struct fsnotify_event **event)
 {
-        struct fsnotify_group *group = inode_mark->group;
+        struct fsnotify_group *group = NULL;
-        __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+        __u32 inode_test_mask = 0;
-        __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+        __u32 vfsmount_test_mask = 0;
-        pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p"
+        if (unlikely(!inode_mark && !vfsmount_mark)) {
-                 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell,
+                BUG();
-                 mnt, inode_mark, mask, data, data_is, cookie, *event);
+                return 0;
+        }
        /* clear ignored on inode modification */
        if (mask & FS_MODIFY) {
@@ -168,18 +169,29 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
        /* does the inode mark tell us to do something? */
        if (inode_mark) {
+                group = inode_mark->group;
+                inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
                inode_test_mask &= inode_mark->mask;
                inode_test_mask &= ~inode_mark->ignored_mask;
        }
        /* does the vfsmount_mark tell us to do something? */
        if (vfsmount_mark) {
+                vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+                group = vfsmount_mark->group;
                vfsmount_test_mask &= vfsmount_mark->mask;
                vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
                if (inode_mark)
                        vfsmount_test_mask &= ~inode_mark->ignored_mask;
        }
+        pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
+                 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
+                 " data=%p data_is=%d cookie=%d event=%p\n",
+                 __func__, group, to_tell, mnt, mask, inode_mark,
+                 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
+                 data_is, cookie, *event);
        if (!inode_test_mask && !vfsmount_test_mask)
                return 0;
@@ -207,13 +219,12 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
             const unsigned char *file_name, u32 cookie)
 {
-        struct hlist_node *inode_node, *vfsmount_node;
+        struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
        struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
        struct fsnotify_group *inode_group, *vfsmount_group;
        struct fsnotify_event *event = NULL;
        struct vfsmount *mnt;
        int idx, ret = 0;
-        bool used_inode = false, used_vfsmount = false;
        /* global tests shouldn't care about events on child only the specific event */
        __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
@@ -238,57 +249,50 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
            (test_mask & to_tell->i_fsnotify_mask))
                inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
                                              &fsnotify_mark_srcu);
-        else
-                inode_node = NULL;
-        if (mnt) {
+        if (mnt && ((mask & FS_MODIFY) ||
-                if ((mask & FS_MODIFY) ||
+                    (test_mask & mnt->mnt_fsnotify_mask))) {
-                    (test_mask & mnt->mnt_fsnotify_mask))
+                vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
-                        vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
+                                                 &fsnotify_mark_srcu);
-                                                         &fsnotify_mark_srcu);
+                inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
-                else
+                                              &fsnotify_mark_srcu);
-                        vfsmount_node = NULL;
-        } else {
-                mnt = NULL;
-                vfsmount_node = NULL;
        }
        while (inode_node || vfsmount_node) {
+                inode_group = vfsmount_group = NULL;
                if (inode_node) {
                        inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
                                                 struct fsnotify_mark, i.i_list);
                        inode_group = inode_mark->group;
-                } else
+                }
-                        inode_group = (void *)-1;
                if (vfsmount_node) {
                        vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
                                                        struct fsnotify_mark, m.m_list);
                        vfsmount_group = vfsmount_mark->group;
-                } else
+                }
-                        vfsmount_group = (void *)-1;
-                if (inode_group < vfsmount_group) {
+                if (inode_group > vfsmount_group) {
                        /* handle inode */
                        send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
                                      data_is, cookie, file_name, &event);
-                        used_inode = true;
+                        /* we didn't use the vfsmount_mark */
-                } else if (vfsmount_group < inode_group) {
+                        vfsmount_group = NULL;
+                } else if (vfsmount_group > inode_group) {
                        send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
                                      data_is, cookie, file_name, &event);
-                        used_vfsmount = true;
+                        inode_group = NULL;
                } else {
                        send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
                                      mask, data, data_is, cookie, file_name,
                                      &event);
-                        used_vfsmount = true;
-                        used_inode = true;
                }
-                if (used_inode)
+                if (inode_group)
                        inode_node = srcu_dereference(inode_node->next,
                                                      &fsnotify_mark_srcu);
-                if (used_vfsmount)
+                if (vfsmount_group)
                        vfsmount_node = srcu_dereference(vfsmount_node->next,
                                                         &fsnotify_mark_srcu);
        }
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index a76e0aa5cd3f..391915093fe1 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
        }
        inode->i_mode = new_mode;
+        inode->i_ctime = CURRENT_TIME;
        di->i_mode = cpu_to_le16(inode->i_mode);
+        di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+        di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
        ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 215e12ce1d85..592fae5007d1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
        last_page_bytes = PAGE_ALIGN(end);
        index = start >> PAGE_CACHE_SHIFT;
        do {
-                pages[numpages] = grab_cache_page(mapping, index);
+                pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
                if (!pages[numpages]) {
                        ret = -ENOMEM;
                        mlog_errno(ret);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index ec6d12339593..c7ee03c22226 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
        ocfs2_blockcheck_inc_failure(stats);
        mlog(ML_ERROR,
-             "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+             "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
             (unsigned int)check.bc_crc32e, (unsigned int)crc);
        /* Ok, try ECC fixups */
@@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
                goto out;
        }
-        mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+        mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
             (unsigned int)check.bc_crc32e, (unsigned int)crc);
        rc = -EIO;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1361997cf205..cbe2f057cc28 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
                           size_t caller_veclen, u8 target_node, int *status)
 {
-        int ret;
+        int ret = 0;
        struct o2net_msg *msg = NULL;
        size_t veclen, caller_bytes = 0;
        struct kvec *vec = NULL;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f04ebcfffc4a..c49f6de0e7ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
                goto out_commit;
        }
+        cpos = split_hash;
+        ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+                                       data_ac, meta_ac, new_dx_leaves,
+                                       num_dx_leaves);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
        for (i = 0; i < num_dx_leaves; i++) {
                ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
                                              orig_dx_leaves[i],
@@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
                        mlog_errno(ret);
                        goto out_commit;
                }
-        }
-        cpos = split_hash;
+                ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
-        ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+                                              new_dx_leaves[i],
-                                       data_ac, meta_ac, new_dx_leaves,
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
-                                       num_dx_leaves);
+                if (ret) {
-        if (ret) {
+                        mlog_errno(ret);
-                mlog_errno(ret);
+                        goto out_commit;
-                goto out_commit;
+                }
        }
        ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4b6ae2c13b47..765298908f1d 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1030,6 +1030,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
                         struct dlm_lock_resource *res);
 void dlm_clean_master_list(struct dlm_ctxt *dlm,
                           u8 dead_node);
+void dlm_force_free_mles(struct dlm_ctxt *dlm);
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
 int __dlm_lockres_unused(struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 5efdd37dfe48..901ca52bf86b 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
        spin_lock(&dlm->track_lock);
        if (oldres)
                track_list = &oldres->tracking;
-        else
+        else {
                track_list = &dlm->tracking_list;
+                if (list_empty(track_list)) {
+                        dl = NULL;
+                        spin_unlock(&dlm->track_lock);
+                        goto bail;
+                }
+        }
        list_for_each_entry(res, track_list, tracking) {
                if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
        } else
                dl = NULL;
+bail:
        /* passed to seq_show */
        return dl;
 }
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 153abb5abef0..11a5c87fd7f7 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -693,6 +693,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
                dlm_mark_domain_leaving(dlm);
                dlm_leave_domain(dlm);
+                dlm_force_free_mles(dlm);
                dlm_complete_dlm_shutdown(dlm);
        }
        dlm_put(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ffb4c68dafa4..f564b0e5f80d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -3433,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
        wake_up(&res->wq);
        wake_up(&dlm->migration_wq);
 }
+void dlm_force_free_mles(struct dlm_ctxt *dlm)
+{
+        int i;
+        struct hlist_head *bucket;
+        struct dlm_master_list_entry *mle;
+        struct hlist_node *tmp, *list;
+        /*
+         * We notified all other nodes that we are exiting the domain and
+         * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
+         * around we force free them and wake any processes that are waiting
+         * on the mles
+         */
+        spin_lock(&dlm->spinlock);
+        spin_lock(&dlm->master_lock);
+        BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
+        BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+                bucket = dlm_master_hash(dlm, i);
+                hlist_for_each_safe(list, tmp, bucket) {
+                        mle = hlist_entry(list, struct dlm_master_list_entry,
+                                          master_hash_node);
+                        if (mle->type != DLM_MLE_BLOCK) {
+                                mlog(ML_ERROR, "bad mle: %p\n", mle);
+                                dlm_print_one_mle(mle);
+                        }
+                        atomic_set(&mle->woken, 1);
+                        wake_up(&mle->wq);
+                        __dlm_unlink_mle(dlm, mle);
+                        __dlm_mle_detach_hb_events(dlm, mle);
+                        __dlm_put_mle(mle);
+                }
+        }
+        spin_unlock(&dlm->master_lock);
+        spin_unlock(&dlm->spinlock);
+}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d6..1d596d8c4a4a 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
        OI_LS_PARENT,
        OI_LS_RENAME1,
        OI_LS_RENAME2,
+        OI_LS_REFLINK_TARGET,
 };
 int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 81296b4e3646..9a03c151b5ce 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,6 +36,7 @@
 #include <linux/writeback.h>
 #include <linux/falloc.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -190,8 +191,16 @@ static int ocfs2_sync_file(struct file *file, int datasync)
        if (err)
                goto bail;
-        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+                /*
+                 * We still have to flush drive's caches to get data to the
+                 * platter
+                 */
+                if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+                        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+                                           NULL, BLKDEV_IFL_WAIT);
                goto bail;
+        }
        journal = osb->journal->j_journal;
        err = jbd2_journal_force_commit(journal);
@@ -774,7 +783,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
        BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
        BUG_ON(abs_from & (inode->i_blkbits - 1));
-        page = grab_cache_page(mapping, index);
+        page = find_or_create_page(mapping, index, GFP_NOFS);
        if (!page) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2329,7 +2338,7 @@ out_dio:
        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
        if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
-            ((file->f_flags & O_DIRECT) && has_refcount)) {
+            ((file->f_flags & O_DIRECT) && !direct_io)) {
                ret = filemap_fdatawrite_range(file->f_mapping, pos,
                                               pos + count - 1);
                if (ret < 0)
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0492464916b1..eece3e05d9d0 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
                                                     OCFS2_BH_IGNORE_CACHE);
        } else {
                status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
-                if (!status)
+                /*
+                 * If buffer is in jbd, then its checksum may not have been
+                 * computed as yet.
+                 */
+                if (!status && !buffer_jbd(bh))
                        status = ocfs2_validate_inode_block(osb->sb, bh);
        }
        if (status < 0) {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af2b8fe1f139..4c18f4ad93b4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -74,9 +74,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
        /*
         * Another node might have truncated while we were waiting on
         * cluster locks.
+         * We don't check size == 0 before the shift. This is borrowed
+         * from do_generic_file_read.
         */
-        last_index = size >> PAGE_CACHE_SHIFT;
+        last_index = (size - 1) >> PAGE_CACHE_SHIFT;
-        if (page->index > last_index) {
+        if (unlikely(!size || page->index > last_index)) {
                ret = -EINVAL;
                goto out;
        }
@@ -107,7 +109,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
         * because the "write" would invalidate their data.
         */
        if (page->index == last_index)
-                len = size & ~PAGE_CACHE_MASK;
+                len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
        ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
                                       &fsdata, di_bh, page);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f171b51a74f7..a00dda2e4f16 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -472,32 +472,23 @@ leave:
        return status;
 }
-static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+static int __ocfs2_mknod_locked(struct inode *dir,
-                              struct inode *dir,
+                                struct inode *inode,
-                              struct inode *inode,
+                                dev_t dev,
-                              dev_t dev,
+                                struct buffer_head **new_fe_bh,
-                              struct buffer_head **new_fe_bh,
+                                struct buffer_head *parent_fe_bh,
-                              struct buffer_head *parent_fe_bh,
+                                handle_t *handle,
-                              handle_t *handle,
+                                struct ocfs2_alloc_context *inode_ac,
-                              struct ocfs2_alloc_context *inode_ac)
+                                u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
 {
        int status = 0;
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_extent_list *fel;
-        u64 suballoc_loc, fe_blkno = 0;
-        u16 suballoc_bit;
        u16 feat;
        *new_fe_bh = NULL;
-        status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
-                                       inode_ac, &suballoc_loc,
-                                       &suballoc_bit, &fe_blkno);
-        if (status < 0) {
-                mlog_errno(status);
-                goto leave;
-        }
        /* populate as many fields early on as possible - many of
         * these are used by the support functions here and in
         * callers. */
@@ -591,6 +582,34 @@ leave:
        return status;
 }
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+                              struct inode *dir,
+                              struct inode *inode,
+                              dev_t dev,
+                              struct buffer_head **new_fe_bh,
+                              struct buffer_head *parent_fe_bh,
+                              handle_t *handle,
+                              struct ocfs2_alloc_context *inode_ac)
+{
+        int status = 0;
+        u64 suballoc_loc, fe_blkno = 0;
+        u16 suballoc_bit;
+        *new_fe_bh = NULL;
+        status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+                                       inode_ac, &suballoc_loc,
+                                       &suballoc_bit, &fe_blkno);
+        if (status < 0) {
+                mlog_errno(status);
+                return status;
+        }
+        return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+                                    parent_fe_bh, handle, inode_ac,
+                                    fe_blkno, suballoc_loc, suballoc_bit);
+}
 static int ocfs2_mkdir(struct inode *dir,
                       struct dentry *dentry,
                       int mode)
@@ -1852,61 +1871,117 @@ bail:
        return status;
 }
-static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
-                                    struct inode **ret_orphan_dir,
+                                        struct inode **ret_orphan_dir,
-                                    u64 blkno,
+                                        struct buffer_head **ret_orphan_dir_bh)
-                                    char *name,
-                                    struct ocfs2_dir_lookup_result *lookup)
 {
        struct inode *orphan_dir_inode;
        struct buffer_head *orphan_dir_bh = NULL;
-        int status = 0;
+        int ret = 0;
-        status = ocfs2_blkno_stringify(blkno, name);
-        if (status < 0) {
-                mlog_errno(status);
-                return status;
-        }
        orphan_dir_inode = ocfs2_get_system_file_inode(osb,
                                                       ORPHAN_DIR_SYSTEM_INODE,
                                                       osb->slot_num);
        if (!orphan_dir_inode) {
-                status = -ENOENT;
+                ret = -ENOENT;
-                mlog_errno(status);
+                mlog_errno(ret);
-                return status;
+                return ret;
        }
        mutex_lock(&orphan_dir_inode->i_mutex);
-        status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+        ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
-        if (status < 0) {
+        if (ret < 0) {
-                mlog_errno(status);
+                mutex_unlock(&orphan_dir_inode->i_mutex);
-                goto leave;
+                iput(orphan_dir_inode);
+                mlog_errno(ret);
+                return ret;
        }
-        status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+        *ret_orphan_dir = orphan_dir_inode;
-                                              orphan_dir_bh, name,
+        *ret_orphan_dir_bh = orphan_dir_bh;
-                                              OCFS2_ORPHAN_NAMELEN, lookup);
-        if (status < 0) {
-                ocfs2_inode_unlock(orphan_dir_inode, 1);
-                mlog_errno(status);
+        return 0;
-                goto leave;
+}
+static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
+                                      struct buffer_head *orphan_dir_bh,
+                                      u64 blkno,
+                                      char *name,
+                                      struct ocfs2_dir_lookup_result *lookup)
+{
+        int ret;
+        struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+        ret = ocfs2_blkno_stringify(blkno, name);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ret;
+        }
+        ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+                                           orphan_dir_bh, name,
+                                           OCFS2_ORPHAN_NAMELEN, lookup);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ret;
+        }
+        return 0;
+}
+/**
+ * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
+ * insertion of an orphan.
+ * @osb: ocfs2 file system
+ * @ret_orphan_dir: Orphan dir inode - returned locked!
+ * @blkno: Actual block number of the inode to be inserted into orphan dir.
+ * @lookup: dir lookup result, to be passed back into functions like
+ *          ocfs2_orphan_add
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure. 
+ */
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+                                    struct inode **ret_orphan_dir,
+                                    u64 blkno,
+                                    char *name,
+                                    struct ocfs2_dir_lookup_result *lookup)
+{
+        struct inode *orphan_dir_inode = NULL;
+        struct buffer_head *orphan_dir_bh = NULL;
+        int ret = 0;
+        ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
+                                           &orphan_dir_bh);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ret;
+        }
+        ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
+                                         blkno, name, lookup);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
        }
        *ret_orphan_dir = orphan_dir_inode;
-leave:
+out:
-        if (status) {
+        brelse(orphan_dir_bh);
+        if (ret) {
+                ocfs2_inode_unlock(orphan_dir_inode, 1);
                mutex_unlock(&orphan_dir_inode->i_mutex);
                iput(orphan_dir_inode);
        }
-        brelse(orphan_dir_bh);
+        mlog_exit(ret);
+        return ret;
-        mlog_exit(status);
-        return status;
 }
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
@@ -2053,6 +2128,99 @@ leave:
        return status;
 }
+/**
+ * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
+ * allocated file. This is different from the typical 'add to orphan dir'
+ * operation in that the inode does not yet exist. This is a problem because
+ * the orphan dir stringifies the inode block number to come up with its
+ * dirent. Obviously if the inode does not yet exist we have a chicken and egg
+ * problem. This function works around it by calling deeper into the orphan
+ * and suballoc code than other callers. Use this only by necessity.
+ * @dir: The directory which this inode will ultimately wind up under - not the
+ * orphan dir!
+ * @dir_bh: buffer_head for the @dir inode block
+ * @orphan_name: string of length (OCFS2_ORPHAN_NAMELEN + 1). Will be filled
+ * with the string to be used for orphan dirent. Pass back to the orphan dir
+ * code.
+ * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
+ * dir code.
+ * @ret_di_blkno: block number where the new inode will be allocated.
+ * @orphan_insert: Dir insert context to be passed back into orphan dir code.
+ * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
+ *
+ * Returns zero on success, in which case @ret_orphan_dir, @ret_di_blkno and
+ * @ret_inode_ac are populated and the orphan dir is left locked.
+ *
+ * Returns non-zero on failure, after unrolling the orphan dir locking.
+ */
+static int ocfs2_prep_new_orphaned_file(struct inode *dir,
+                                        struct buffer_head *dir_bh,
+                                        char *orphan_name,
+                                        struct inode **ret_orphan_dir,
+                                        u64 *ret_di_blkno,
+                                        struct ocfs2_dir_lookup_result *orphan_insert,
+                                        struct ocfs2_alloc_context **ret_inode_ac)
+{
+        int ret;
+        u64 di_blkno;
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        struct inode *orphan_dir = NULL;
+        struct buffer_head *orphan_dir_bh = NULL;
+        struct ocfs2_alloc_context *inode_ac = NULL;
+        ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ret;
+        }
+        /* reserve an inode spot */
+        ret = ocfs2_reserve_new_inode(osb, &inode_ac);
+        if (ret < 0) {
+                if (ret != -ENOSPC)
+                        mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
+                                       &di_blkno);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
+                                         di_blkno, orphan_name, orphan_insert);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
+        }
+out:
+        if (ret == 0) {
+                *ret_orphan_dir = orphan_dir;
+                *ret_di_blkno = di_blkno;
+                *ret_inode_ac = inode_ac;
+                /*
+                 * orphan_name and orphan_insert are already up to
+                 * date via prepare_orphan_dir
+                 */
+        } else {
+                /* Unroll reserve_new_inode* */
+                if (inode_ac)
+                        ocfs2_free_alloc_context(inode_ac);
+                /* Unroll orphan dir locking */
+                mutex_unlock(&orphan_dir->i_mutex);
+                ocfs2_inode_unlock(orphan_dir, 1);
+                iput(orphan_dir);
+        }
+        brelse(orphan_dir_bh);
+        return ret;
+}
 int ocfs2_create_inode_in_orphan(struct inode *dir,
                                 int mode,
                                 struct inode **new_inode)
@@ -2068,6 +2236,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
        struct buffer_head *new_di_bh = NULL;
        struct ocfs2_alloc_context *inode_ac = NULL;
        struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+        u64 uninitialized_var(di_blkno), suballoc_loc;
+        u16 suballoc_bit;
        status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
        if (status < 0) {
@@ -2076,20 +2246,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
                return status;
        }
-        /*
+        status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
-         * We give the orphan dir the root blkno to fake an orphan name,
+                                              orphan_name, &orphan_dir,
-         * and allocate enough space for our insertion.
+                                              &di_blkno, &orphan_insert, &inode_ac);
-         */
-        status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
-                                          osb->root_blkno,
-                                          orphan_name, &orphan_insert);
-        if (status < 0) {
-                mlog_errno(status);
-                goto leave;
-        }
-        /* reserve an inode spot */
-        status = ocfs2_reserve_new_inode(osb, &inode_ac);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
@@ -2116,17 +2275,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
                goto leave;
        did_quota_inode = 1;
-        inode->i_nlink = 0;
+        status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
-        /* do the real work now. */
+                                              &suballoc_loc,
-        status = ocfs2_mknod_locked(osb, dir, inode,
+                                              &suballoc_bit, di_blkno);
-                                    0, &new_di_bh, parent_di_bh, handle,
-                                    inode_ac);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
+        inode->i_nlink = 0;
+        /* do the real work now. */
+        status = __ocfs2_mknod_locked(dir, inode,
+                                      0, &new_di_bh, parent_di_bh, handle,
+                                      inode_ac, di_blkno, suballoc_loc,
+                                      suballoc_bit);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 33f1c9a8258d..fa31d05e41b7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -235,18 +235,31 @@
 #define OCFS2_HAS_REFCOUNT_FL   (0x0010)
 /* Inode attributes, keep in sync with EXT2 */
-#define OCFS2_SECRM_FL          (0x00000001)    /* Secure deletion */
+#define OCFS2_SECRM_FL                  FS_SECRM_FL     /* Secure deletion */
-#define OCFS2_UNRM_FL           (0x00000002)    /* Undelete */
+#define OCFS2_UNRM_FL                   FS_UNRM_FL      /* Undelete */
-#define OCFS2_COMPR_FL          (0x00000004)    /* Compress file */
+#define OCFS2_COMPR_FL                  FS_COMPR_FL     /* Compress file */
-#define OCFS2_SYNC_FL           (0x00000008)    /* Synchronous updates */
+#define OCFS2_SYNC_FL                   FS_SYNC_FL      /* Synchronous updates */
-#define OCFS2_IMMUTABLE_FL      (0x00000010)    /* Immutable file */
+#define OCFS2_IMMUTABLE_FL              FS_IMMUTABLE_FL /* Immutable file */
-#define OCFS2_APPEND_FL         (0x00000020)    /* writes to file may only append */
+#define OCFS2_APPEND_FL                 FS_APPEND_FL    /* writes to file may only append */
-#define OCFS2_NODUMP_FL         (0x00000040)    /* do not dump file */
+#define OCFS2_NODUMP_FL                 FS_NODUMP_FL    /* do not dump file */
-#define OCFS2_NOATIME_FL        (0x00000080)    /* do not update atime */
+#define OCFS2_NOATIME_FL                FS_NOATIME_FL   /* do not update atime */
-#define OCFS2_DIRSYNC_FL        (0x00010000)    /* dirsync behaviour (directories only) */
+/* Reserved for compression usage... */
+#define OCFS2_DIRTY_FL                  FS_DIRTY_FL
-#define OCFS2_FL_VISIBLE        (0x000100FF)    /* User visible flags */
+#define OCFS2_COMPRBLK_FL               FS_COMPRBLK_FL  /* One or more compressed clusters */
-#define OCFS2_FL_MODIFIABLE     (0x000100FF)    /* User modifiable flags */
+#define OCFS2_NOCOMP_FL                 FS_NOCOMP_FL    /* Don't compress */
+#define OCFS2_ECOMPR_FL                 FS_ECOMPR_FL    /* Compression error */
+/* End compression flags --- maybe not all used */
+#define OCFS2_BTREE_FL                  FS_BTREE_FL     /* btree format dir */
+#define OCFS2_INDEX_FL                  FS_INDEX_FL     /* hash-indexed directory */
+#define OCFS2_IMAGIC_FL                 FS_IMAGIC_FL    /* AFS directory */
+#define OCFS2_JOURNAL_DATA_FL           FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define OCFS2_NOTAIL_FL                 FS_NOTAIL_FL    /* file tail should not be merged */
+#define OCFS2_DIRSYNC_FL                FS_DIRSYNC_FL   /* dirsync behaviour (directories only) */
+#define OCFS2_TOPDIR_FL                 FS_TOPDIR_FL    /* Top of directory hierarchies*/
+#define OCFS2_RESERVED_FL               FS_RESERVED_FL  /* reserved for ext2 lib */
+#define OCFS2_FL_VISIBLE                FS_FL_USER_VISIBLE      /* User visible flags */
+#define OCFS2_FL_MODIFIABLE             FS_FL_USER_MODIFIABLE   /* User modifiable flags */
 /*
 * Extent record flags (e_node.leaf.flags)
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a83..5d241505690b 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
 /*
 * ioctl commands
 */
-#define OCFS2_IOC_GETFLAGS      _IOR('f', 1, long)
+#define OCFS2_IOC_GETFLAGS      FS_IOC_GETFLAGS
-#define OCFS2_IOC_SETFLAGS      _IOW('f', 2, long)
+#define OCFS2_IOC_SETFLAGS      FS_IOC_SETFLAGS
-#define OCFS2_IOC32_GETFLAGS    _IOR('f', 1, int)
+#define OCFS2_IOC32_GETFLAGS    FS_IOC32_GETFLAGS
-#define OCFS2_IOC32_SETFLAGS    _IOW('f', 2, int)
+#define OCFS2_IOC32_SETFLAGS    FS_IOC32_SETFLAGS
 /*
 * Space reservation / allocation / free ioctls and argument structure
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 73a11ccfd4c2..efdd75607406 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2960,7 +2960,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
                if (map_end & (PAGE_CACHE_SIZE - 1))
                        to = map_end & (PAGE_CACHE_SIZE - 1);
-                page = grab_cache_page(mapping, page_index);
+                page = find_or_create_page(mapping, page_index, GFP_NOFS);
                /*
                 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@ -3179,7 +3179,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
                if (map_end > end)
                        map_end = end;
-                page = grab_cache_page(context->inode->i_mapping, page_index);
+                page = find_or_create_page(context->inode->i_mapping,
+                                           page_index, GFP_NOFS);
                BUG_ON(!page);
                wait_on_page_writeback(page);
@@ -4200,8 +4201,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
                goto out;
        }
-        mutex_lock(&new_inode->i_mutex);
+        mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
-        ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
+        ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
+                                      OI_LS_REFLINK_TARGET);
        if (ret) {
                mlog_errno(ret);
                goto out_unlock;
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index d8b6e4259b80..3e78db361bc7 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
                           struct ocfs2_alloc_reservation *resv,
                           int *cstart, int *clen)
 {
-        unsigned int wanted = *clen;
        if (resv == NULL || ocfs2_resmap_disabled(resmap))
                return -ENOSPC;
        spin_lock(&resv_lock);
-        /*
-         * We don't want to over-allocate for temporary
-         * windows. Otherwise, we run the risk of fragmenting the
-         * allocation space.
-         */
-        wanted = ocfs2_resv_window_bits(resmap, resv);
-        if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
-                wanted = *clen;
        if (ocfs2_resv_empty(resv)) {
-                mlog(0, "empty reservation, find new window\n");
+                /*
+                 * We don't want to over-allocate for temporary
+                 * windows. Otherwise, we run the risk of fragmenting the
+                 * allocation space.
+                 */
+                unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
+                if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+                        wanted = *clen;
+                mlog(0, "empty reservation, find new window\n");
                /*
                 * Try to get a window here. If it works, we must fall
                 * through and test the bitmap . This avoids some
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a8e6a95a353f..849c2f0e0a0e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -57,11 +57,28 @@ struct ocfs2_suballoc_result {
        u64             sr_bg_blkno;    /* The bg we allocated from.  Set
                                           to 0 when a block group is
                                           contiguous. */
+        u64             sr_bg_stable_blkno; /*
+                                             * Doesn't change, always
+                                             * set to target block
+                                             * group descriptor
+                                             * block.
+                                             */
        u64             sr_blkno;       /* The first allocated block */
        unsigned int    sr_bit_offset;  /* The bit in the bg */
        unsigned int    sr_bits;        /* How many bits we claimed */
 };
+/*
+ * Return the block group an allocation result refers to, or 0 if the
+ * result has no block recorded (sr_blkno == 0).
+ */
+static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
+{
+        if (!res->sr_blkno)
+                return 0;
+        /*
+         * sr_bg_blkno is 0 for a contiguous block group; in that case
+         * work the group out from the block number and bit offset.
+         */
+        if (!res->sr_bg_blkno)
+                return ocfs2_which_suballoc_group(res->sr_blkno,
+                                                  res->sr_bit_offset);
+        return res->sr_bg_blkno;
+}
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
        brelse(ac->ac_bh);
        ac->ac_bh = NULL;
        ac->ac_resv = NULL;
+        if (ac->ac_find_loc_priv) {
+                kfree(ac->ac_find_loc_priv);
+                ac->ac_find_loc_priv = NULL;
+        }
 }
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -336,7 +357,7 @@ out:
 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
                                          struct ocfs2_group_desc *bg,
                                          struct ocfs2_chain_list *cl,
-                                          u64 p_blkno, u32 clusters)
+                                          u64 p_blkno, unsigned int clusters)
 {
        struct ocfs2_extent_list *el = &bg->bg_list;
        struct ocfs2_extent_rec *rec;
@@ -348,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
        rec->e_blkno = cpu_to_le64(p_blkno);
        rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
                                  le16_to_cpu(cl->cl_bpc));
-        rec->e_leaf_clusters = cpu_to_le32(clusters);
+        rec->e_leaf_clusters = cpu_to_le16(clusters);
        le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
        le16_add_cpu(&bg->bg_free_bits_count,
                     clusters * le16_to_cpu(cl->cl_bpc));
@@ -1678,6 +1699,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
        if (!ret)
                ocfs2_bg_discontig_fix_result(ac, gd, res);
+        /*
+         * sr_bg_blkno might have been changed by
+         * ocfs2_bg_discontig_fix_result
+         */
+        res->sr_bg_stable_blkno = group_bh->b_blocknr;
+        if (ac->ac_find_loc_only)
+                goto out_loc_only;
        ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
                                               res->sr_bits,
                                               le16_to_cpu(gd->bg_chain));
@@ -1691,6 +1721,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
        if (ret < 0)
                mlog_errno(ret);
+out_loc_only:
        *bits_left = le16_to_cpu(gd->bg_free_bits_count);
 out:
@@ -1708,7 +1739,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 {
        int status;
        u16 chain;
-        u32 tmp_used;
        u64 next_group;
        struct inode *alloc_inode = ac->ac_inode;
        struct buffer_head *group_bh = NULL;
@@ -1770,6 +1800,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
        if (!status)
                ocfs2_bg_discontig_fix_result(ac, bg, res);
+        /*
+         * sr_bg_blkno might have been changed by
+         * ocfs2_bg_discontig_fix_result
+         */
+        res->sr_bg_stable_blkno = group_bh->b_blocknr;
        /*
         * Keep track of previous block descriptor read. When
@@ -1796,22 +1831,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
                }
        }
-        /* Ok, claim our bits now: set the info on dinode, chainlist
+        if (ac->ac_find_loc_only)
-         * and then the group */
+                goto out_loc_only;
-        status = ocfs2_journal_access_di(handle,
-                                         INODE_CACHE(alloc_inode),
+        status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
-                                         ac->ac_bh,
+                                                  ac->ac_bh, res->sr_bits,
-                                         OCFS2_JOURNAL_ACCESS_WRITE);
+                                                  chain);
-        if (status < 0) {
+        if (status) {
                mlog_errno(status);
                goto bail;
        }
-        tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
-        fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
-        le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
-        ocfs2_journal_dirty(handle, ac->ac_bh);
        status = ocfs2_block_group_set_bits(handle,
                                            alloc_inode,
                                            bg,
@@ -1826,6 +1856,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
        mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
             (unsigned long long)le64_to_cpu(fe->i_blkno));
+out_loc_only:
        *bits_left = le16_to_cpu(bg->bg_free_bits_count);
 bail:
        brelse(group_bh);
@@ -1845,6 +1876,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
        int status;
        u16 victim, i;
        u16 bits_left = 0;
+        u64 hint = ac->ac_last_group;
        struct ocfs2_chain_list *cl;
        struct ocfs2_dinode *fe;
@@ -1872,7 +1904,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
                goto bail;
        }
-        res->sr_bg_blkno = ac->ac_last_group;
+        res->sr_bg_blkno = hint;
        if (res->sr_bg_blkno) {
                /* Attempt to short-circuit the usual search mechanism
                 * by jumping straight to the most recently used
@@ -1896,8 +1928,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
        status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
                                    res, &bits_left);
-        if (!status)
+        if (!status) {
+                hint = ocfs2_group_from_res(res);
                goto set_hint;
+        }
        if (status < 0 && status != -ENOSPC) {
                mlog_errno(status);
                goto bail;
@@ -1920,8 +1954,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
                ac->ac_chain = i;
                status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
                                            res, &bits_left);
-                if (!status)
+                if (!status) {
+                        hint = ocfs2_group_from_res(res);
                        break;
+                }
                if (status < 0 && status != -ENOSPC) {
                        mlog_errno(status);
                        goto bail;
@@ -1936,7 +1972,7 @@ set_hint:
                if (bits_left < min_bits)
                        ac->ac_last_group = 0;
                else
-                        ac->ac_last_group = res->sr_bg_blkno;
+                        ac->ac_last_group = hint;
        }
 bail:
@@ -2016,6 +2052,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
        OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
 }
+int ocfs2_find_new_inode_loc(struct inode *dir,
+                             struct buffer_head *parent_fe_bh,
+                             struct ocfs2_alloc_context *ac,
+                             u64 *fe_blkno)
+{ /*
+   * Search the inode allocator described by ac for the spot a new inode
+   * would occupy, without claiming it.  On success the search result is
+   * parked in ac->ac_find_loc_priv and the chosen block number is stored
+   * in *fe_blkno.
+   *
+   * Returns 0 on success, otherwise -ENOMEM or the error from
+   * ocfs2_start_trans() / ocfs2_claim_suballoc_bits().  On error the
+   * result is freed and neither ac->ac_find_loc_priv nor *fe_blkno is
+   * written.
+   */
+        int ret;
+        handle_t *handle = NULL;
+        struct ocfs2_suballoc_result *res;
+        BUG_ON(!ac);
+        BUG_ON(ac->ac_bits_given != 0); /* nothing handed out from ac yet */
+        BUG_ON(ac->ac_bits_wanted != 1); /* ac reserved exactly one bit */
+        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); /* inode allocator only */
+        res = kzalloc(sizeof(*res), GFP_NOFS);
+        if (res == NULL) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+        /*
+         * The handle started here is for chain relink. Alternatively,
+         * we could just disable relink for these calls.
+         */
+        handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                handle = NULL; /* so out: does not commit an error pointer */
+                mlog_errno(ret);
+                goto out;
+        }
+        /*
+         * This will instruct ocfs2_claim_suballoc_bits and
+         * ocfs2_search_one_group to search but save actual allocation
+         * for later.
+         */
+        ac->ac_find_loc_only = 1;
+        ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
+        }
+        ac->ac_find_loc_priv = res; /* read back by ocfs2_claim_new_inode_at_loc() */
+        *fe_blkno = res->sr_blkno;
+out:
+        if (handle)
+                ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
+        if (ret)
+                kfree(res); /* also reached with res == NULL after a failed kzalloc */
+        return ret;
+}
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+                                 struct inode *dir,
+                                 struct ocfs2_alloc_context *ac,
+                                 u64 *suballoc_loc,
+                                 u16 *suballoc_bit,
+                                 u64 di_blkno)
+{ /*
+   * Claim the inode location recorded in ac->ac_find_loc_priv: update
+   * the allocator dinode counts and set the bits in the block group,
+   * both under handle.  di_blkno must equal the recorded sr_blkno.
+   *
+   * On success *suballoc_loc and *suballoc_bit receive the recorded
+   * sr_bg_blkno and sr_bit_offset and 0 is returned.  Otherwise the
+   * error of the first failing step is returned and the two outputs
+   * are left untouched.
+   */
+        int ret;
+        u16 chain;
+        struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; /* saved search result */
+        struct buffer_head *bg_bh = NULL;
+        struct ocfs2_group_desc *bg;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+        /*
+         * Since di_blkno is being passed back in, we check for any
+         * inconsistencies which may have happened between
+         * calls. These are code bugs as di_blkno is not expected to
+         * change once returned from ocfs2_find_new_inode_loc()
+         */
+        BUG_ON(res->sr_blkno != di_blkno);
+        ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
+                                          res->sr_bg_stable_blkno, &bg_bh); /* always the descriptor block, unlike sr_bg_blkno */
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+        chain = le16_to_cpu(bg->bg_chain);
+        ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
+                                               ac->ac_bh, res->sr_bits,
+                                               chain);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_block_group_set_bits(handle,
+                                         ac->ac_inode,
+                                         bg,
+                                         bg_bh,
+                                         res->sr_bit_offset,
+                                         res->sr_bits);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
+        }
+        mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
+             (unsigned long long)di_blkno);
+        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+        BUG_ON(res->sr_bits != 1); /* the search asked for a single bit */
+        *suballoc_loc = res->sr_bg_blkno; /* 0 when the block group is contiguous */
+        *suballoc_bit = res->sr_bit_offset;
+        ac->ac_bits_given++;
+        ocfs2_save_inode_ac_group(dir, ac);
+out:
+        brelse(bg_bh);
+        return ret;
+}
 int ocfs2_claim_new_inode(handle_t *handle,
                          struct inode *dir,
                          struct buffer_head *parent_fe_bh,
@@ -2567,7 +2733,8 @@ out:
 * suballoc_bit.
 */
 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
-                                       u16 *suballoc_slot, u16 *suballoc_bit)
+                                       u16 *suballoc_slot, u64 *group_blkno,
+                                       u16 *suballoc_bit)
 {
        int status;
        struct buffer_head *inode_bh = NULL;
@@ -2604,6 +2771,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
                *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
        if (suballoc_bit)
                *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+        if (group_blkno)
+                *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
 bail:
        brelse(inode_bh);
@@ -2621,7 +2790,8 @@ bail:
 */
 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
                                   struct inode *suballoc,
-                                   struct buffer_head *alloc_bh, u64 blkno,
+                                   struct buffer_head *alloc_bh,
+                                   u64 group_blkno, u64 blkno,
                                   u16 bit, int *res)
 {
        struct ocfs2_dinode *alloc_di;
@@ -2642,10 +2812,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
                goto bail;
        }
-        if (alloc_di->i_suballoc_loc)
+        bg_blkno = group_blkno ? group_blkno :
-                bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
+                   ocfs2_which_suballoc_group(blkno, bit);
-        else
-                bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
        status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
                                             &group_bh);
        if (status < 0) {
@@ -2680,6 +2848,7 @@ bail:
 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 {
        int status;
+        u64 group_blkno = 0;
        u16 suballoc_bit = 0, suballoc_slot = 0;
        struct inode *inode_alloc_inode;
        struct buffer_head *alloc_bh = NULL;
@@ -2687,7 +2856,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
        mlog_entry("blkno: %llu", (unsigned long long)blkno);
        status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
-                                             &suballoc_bit);
+                                             &group_blkno, &suballoc_bit);
        if (status < 0) {
                mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
                goto bail;
@@ -2715,7 +2884,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
        }
        status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
-                                         blkno, suballoc_bit, res);
+                                         group_blkno, blkno, suballoc_bit, res);
        if (status < 0)
                mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a017dd3ee7d9..b8afabfeede4 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,9 @@ struct ocfs2_alloc_context {
        u64    ac_max_block;  /* Highest block number to allocate. 0 is
                                 is the same as ~0 - unlimited */
+        int    ac_find_loc_only;  /* hack for reflink operation ordering */
+        struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
        struct ocfs2_alloc_reservation  *ac_resv;
 };
@@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
                          struct ocfs2_alloc_context **meta_ac);
 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
+/*
+ * The following two interfaces are for ocfs2_create_inode_in_orphan().
+ *
+ * ocfs2_find_new_inode_loc() only picks the block for the new inode and
+ * returns it in *fe_blkno, remembering the search result in ac.
+ * ocfs2_claim_new_inode_at_loc() performs the actual allocation later,
+ * under the caller's handle, and must be handed that same block number
+ * back as di_blkno.
+ */
+int ocfs2_find_new_inode_loc(struct inode *dir,
+                             struct buffer_head *parent_fe_bh,
+                             struct ocfs2_alloc_context *ac,
+                             u64 *fe_blkno);
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+                                 struct inode *dir,
+                                 struct ocfs2_alloc_context *ac,
+                                 u64 *suballoc_loc,
+                                 u16 *suballoc_bit,
+                                 u64 di_blkno);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc4..9975457c981f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
        }
        /* Fast symlinks can't be large */
-        len = strlen(target);
+        len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
        link = kzalloc(len + 1, GFP_NOFS);
        if (!link) {
                status = -ENOMEM;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03469f61801..06fa5e77c40e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
        xis.inode_bh = xbs.inode_bh = di_bh;
        di = (struct ocfs2_dinode *)di_bh->b_data;
-        down_read(&oi->ip_xattr_sem);
        ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
                                    buffer_size, &xis);
        if (ret == -ENODATA && di->i_xattr_loc)
                ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
                                            buffer_size, &xbs);
-        up_read(&oi->ip_xattr_sem);
        return ret;
 }
@@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode,
                mlog_errno(ret);
                return ret;
        }
+        down_read(&OCFS2_I(inode)->ip_xattr_sem);
        ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
                                     name, buffer, buffer_size);
+        up_read(&OCFS2_I(inode)->ip_xattr_sem);
        ocfs2_inode_unlock(inode, 0);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1c43e7c8a7b..8e4addaa5424 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2675,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = {
        INF("auxv",       S_IRUSR, proc_pid_auxv),
        ONE("status",     S_IRUGO, proc_pid_status),
        ONE("personality", S_IRUSR, proc_pid_personality),
-        INF("limits",     S_IRUSR, proc_pid_limits),
+        INF("limits",     S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
@@ -3011,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = {
        INF("auxv",      S_IRUSR, proc_pid_auxv),
        ONE("status",    S_IRUGO, proc_pid_status),
        ONE("personality", S_IRUSR, proc_pid_personality),
-        INF("limits",    S_IRUSR, proc_pid_limits),
+        INF("limits",    S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
        REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 180cf5a0bd67..3b8b45660331 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -146,7 +146,7 @@ u64 stable_page_flags(struct page *page)
        u |= kpf_copy_bit(k, KPF_HWPOISON,      PG_hwpoison);
 #endif
-#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
        u |= kpf_copy_bit(k, KPF_UNCACHED,      PG_uncached);
 #endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 439fc1f1c1c4..1dbca4e8cc16 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -224,7 +224,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
        /* We don't show the stack guard page in /proc/maps */
        start = vma->vm_start;
        if (vma->vm_flags & VM_GROWSDOWN)
-                start += PAGE_SIZE;
+                if (!vma_stack_continue(vma->vm_prev, vma->vm_start))
+                        start += PAGE_SIZE;
        seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
                        start,
@@ -362,13 +363,13 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                        mss->referenced += PAGE_SIZE;
                mapcount = page_mapcount(page);
                if (mapcount >= 2) {
-                        if (pte_dirty(ptent))
+                        if (pte_dirty(ptent) || PageDirty(page))
                                mss->shared_dirty += PAGE_SIZE;
                        else
                                mss->shared_clean += PAGE_SIZE;
                        mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
                } else {
-                        if (pte_dirty(ptent))
+                        if (pte_dirty(ptent) || PageDirty(page))
                                mss->private_dirty += PAGE_SIZE;
                        else
                                mss->private_clean += PAGE_SIZE;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 91c817ff02c3..2367fb3f70bc 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 static const struct file_operations proc_vmcore_operations = {
        .read           = read_vmcore,
-        .llseek         = generic_file_llseek,
+        .llseek         = default_llseek,
 };
 static struct vmcore* __init get_new_element(void)
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index f53505de0712..5cbb81e134ac 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
 int reiserfs_unpack(struct inode *inode, struct file *filp)
 {
        int retval = 0;
+        int depth;
        int index;
        struct page *page;
        struct address_space *mapping;
@@ -188,8 +189,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
        /* we need to make sure nobody is changing the file size beneath
         ** us
         */
-        mutex_lock(&inode->i_mutex);
+        reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
-        reiserfs_write_lock(inode->i_sb);
+        depth = reiserfs_write_lock_once(inode->i_sb);
        write_from = inode->i_size & (blocksize - 1);
        /* if we are on a block boundary, we are already unpacked.  */
@@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
      out:
        mutex_unlock(&inode->i_mutex);
-        reiserfs_write_unlock(inode->i_sb);
+        reiserfs_write_unlock_once(inode->i_sb, depth);
        return retval;
 }
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1b27b5688f62..da3fefe91a8f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
        char *p;
        p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
-        if (p)
+        if (!IS_ERR(p))
                memmove(last_sysfs_file, p, strlen(p) + 1);
        /* need attr_sd for attr and ops, its parent for kobj */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 15412fe15c3a..b552f816de15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -852,8 +852,8 @@ xfs_convert_page(
                SetPageUptodate(page);
        if (count) {
-                wbc->nr_to_write--;
+                if (--wbc->nr_to_write <= 0 &&
-                if (wbc->nr_to_write <= 0)
+                    wbc->sync_mode == WB_SYNC_NONE)
                        done = 1;
        }
        xfs_start_page_writeback(page, !page_dirty, count);
@@ -1068,7 +1068,7 @@ xfs_vm_writepage(
         * by themselves.
         */
        if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
-                goto out_fail;
+                goto redirty;
        /*
         * We need a transaction if there are delalloc or unwritten buffers
@@ -1080,7 +1080,7 @@ xfs_vm_writepage(
         */
        xfs_count_page_state(page, &delalloc, &unwritten);
        if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
-                goto out_fail;
+                goto redirty;
        /* Is this page beyond the end of the file? */
        offset = i_size_read(inode);
@@ -1245,12 +1245,15 @@ error:
        if (iohead)
                xfs_cancel_ioend(iohead);
+        if (err == -EAGAIN)
+                goto redirty;
        xfs_aops_discard_page(page);
        ClearPageUptodate(page);
        unlock_page(page);
        return err;
-out_fail:
+redirty:
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
        return 0;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b93ea3342281..1846a0dd7035 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -440,12 +440,7 @@ _xfs_buf_find(
                ASSERT(btp == bp->b_target);
                if (bp->b_file_offset == range_base &&
                    bp->b_buffer_length == range_length) {
-                        /*
-                         * If we look at something, bring it to the
-                         * front of the list for next time.
-                         */
                        atomic_inc(&bp->b_hold);
-                        list_move(&bp->b_hash_list, &hash->bh_list);
                        goto found;
                }
        }
@@ -1431,8 +1426,7 @@ xfs_alloc_bufhash(
 {
        unsigned int            i;
-        btp->bt_hashshift = external ? 3 : 8;   /* 8 or 256 buckets */
+        btp->bt_hashshift = external ? 3 : 12;  /* 8 or 4096 buckets */
-        btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
        btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
                                         sizeof(xfs_bufhash_t));
        for (i = 0; i < (1 << btp->bt_hashshift); i++) {
@@ -1926,7 +1920,8 @@ xfs_buf_init(void)
        if (!xfs_buf_zone)
                goto out;
-        xfslogd_workqueue = create_workqueue("xfslogd");
+        xfslogd_workqueue = alloc_workqueue("xfslogd",
+                                        WQ_RESCUER | WQ_HIGHPRI, 1);
        if (!xfslogd_workqueue)
                goto out_free_buf_zone;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d533d64e2c3e..9d021c73ea52 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,7 +128,6 @@ typedef struct xfs_buftarg {
        size_t                  bt_smask;
        /* per device buffer hash table */
-        uint                    bt_hashmask;
        uint                    bt_hashshift;
        xfs_bufhash_t           *bt_hash;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 237f5ffb2ee8..3b9e626f7cd1 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -785,6 +785,8 @@ xfs_ioc_fsgetxattr(
 {
        struct fsxattr          fa;
+        memset(&fa, 0, sizeof(struct fsxattr));
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        fa.fsx_xflags = xfs_ip2xflags(ip);
        fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
@@ -907,6 +909,13 @@ xfs_ioctl_setattr(
                return XFS_ERROR(EIO);
        /*
+         * Disallow 32bit project ids because on-disk structure
+         * is 16bit only.
+         */
+        if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
+                return XFS_ERROR(EINVAL);
+        /*
         * If disk quotas is on, we make sure that the dquots do exist on disk,
         * before we start any other transactions. Trying to do this later
         * is messy. We don't care to take a readlock to look at the ids
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 68be25dcd301..b1fc2a6bfe83 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -664,7 +664,7 @@ xfs_vn_fiemap(
                                        fieinfo->fi_extents_max + 1;
        bm.bmv_count = min_t(__s32, bm.bmv_count,
                             (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-        bm.bmv_iflags = BMV_IF_PREALLOC;
+        bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
                bm.bmv_iflags |= BMV_IF_ATTRFORK;
        if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 5fa7a30cc3f0..08fd3102128c 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1225,6 +1225,7 @@ xfs_fs_statfs(
        struct xfs_inode        *ip = XFS_I(dentry->d_inode);
        __uint64_t              fakeinos, id;
        xfs_extlen_t            lsize;
+        __int64_t               ffree;
        statp->f_type = XFS_SB_MAGIC;
        statp->f_namelen = MAXNAMELEN - 1;
@@ -1248,7 +1249,11 @@ xfs_fs_statfs(
                statp->f_files = min_t(typeof(statp->f_files),
                                        statp->f_files,
                                        mp->m_maxicount);
-        statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+        /* make sure statp->f_ffree does not underflow */
+        ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+        statp->f_ffree = max_t(__int64_t, ffree, 0);
        spin_unlock(&mp->m_sb_lock);
        if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
@@ -1401,7 +1406,7 @@ xfs_fs_freeze(
        xfs_save_resvblks(mp);
        xfs_quiesce_attr(mp);
-        return -xfs_fs_log_dummy(mp);
+        return -xfs_fs_log_dummy(mp, SYNC_WAIT);
 }
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index dfcbd98d1599..81976ffed7d6 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -34,6 +34,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
+#include "xfs_fsops.h"
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -341,38 +342,6 @@ xfs_sync_attr(
 }
 STATIC int
-xfs_commit_dummy_trans(
-        struct xfs_mount        *mp,
-        uint                    flags)
-{
-        struct xfs_inode        *ip = mp->m_rootip;
-        struct xfs_trans        *tp;
-        int                     error;
-        /*
-         * Put a dummy transaction in the log to tell recovery
-         * that all others are OK.
-         */
-        tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-        error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                return error;
-        }
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip);
-        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        error = xfs_trans_commit(tp, 0);
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        /* the log force ensures this transaction is pushed to disk */
-        xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-        return error;
-}
-STATIC int
 xfs_sync_fsdata(
        struct xfs_mount        *mp)
 {
@@ -432,7 +401,7 @@ xfs_quiesce_data(
        /* mark the log as covered if needed */
        if (xfs_log_need_covered(mp))
-                error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
+                error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
        /* flush data-only devices */
        if (mp->m_rtdev_targp)
@@ -563,7 +532,7 @@ xfs_flush_inodes(
 /*
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle.
+ * filesystem is idle and not frozen.
 */
 STATIC void
 xfs_sync_worker(
@@ -577,8 +546,9 @@ xfs_sync_worker(
                xfs_reclaim_inodes(mp, 0);
                /* dgc: errors ignored here */
                error = xfs_qm_sync(mp, SYNC_TRYLOCK);
-                if (xfs_log_need_covered(mp))
+                if (mp->m_super->s_frozen == SB_UNFROZEN &&
-                        error = xfs_commit_dummy_trans(mp, 0);
+                    xfs_log_need_covered(mp))
+                        error = xfs_fs_log_dummy(mp, 0);
        }
        mp->m_sync_seq++;
        wake_up(&mp->m_wait_single_sync_task);
@@ -698,14 +668,11 @@ xfs_inode_set_reclaim_tag(
        xfs_perag_put(pag);
 }
-void
+STATIC void
-__xfs_inode_clear_reclaim_tag(
+__xfs_inode_clear_reclaim(
-        xfs_mount_t     *mp,
        xfs_perag_t     *pag,
        xfs_inode_t     *ip)
 {
-        radix_tree_tag_clear(&pag->pag_ici_root,
-                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
        pag->pag_ici_reclaimable--;
        if (!pag->pag_ici_reclaimable) {
                /* clear the reclaim tag from the perag radix tree */
@@ -719,6 +686,17 @@ __xfs_inode_clear_reclaim_tag(
        }
 }
+void
+__xfs_inode_clear_reclaim_tag(
+        xfs_mount_t     *mp,
+        xfs_perag_t     *pag,
+        xfs_inode_t     *ip)
+{
+        radix_tree_tag_clear(&pag->pag_ici_root,
+                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+        __xfs_inode_clear_reclaim(pag, ip);
+}
 /*
 * Inodes in different states need to be treated differently, and the return
 * value of xfs_iflush is not sufficient to get this right. The following table
@@ -868,6 +846,7 @@ reclaim:
        if (!radix_tree_delete(&pag->pag_ici_root,
                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
                ASSERT(0);
+        __xfs_inode_clear_reclaim(pag, ip);
        write_unlock(&pag->pag_ici_lock);
        /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 23f14e595c18..f90dadd5a968 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5533,12 +5533,24 @@ xfs_getbmap(
                                        map[i].br_startblock))
                                goto out_free_map;
-                        nexleft--;
                        bmv->bmv_offset =
                                out[cur_ext].bmv_offset +
                                out[cur_ext].bmv_length;
                        bmv->bmv_length =
                                max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+                        /*
+                         * In case we don't want to return the hole,
+                         * don't increase cur_ext so that we can reuse
+                         * it in the next loop.
+                         */
+                        if ((iflags & BMV_IF_NO_HOLES) &&
+                            map[i].br_startblock == HOLESTARTBLOCK) {
+                                memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+                                continue;
+                        }
+                        nexleft--;
                        bmv->bmv_entries++;
                        cur_ext++;
                }
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5f..87c2e9d02288 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
 #define BMV_IF_NO_DMAPI_READ    0x2     /* Do not generate DMAPI read event  */
 #define BMV_IF_PREALLOC         0x4     /* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC         0x8     /* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES         0x10    /* Do not return holes */
 #define BMV_IF_VALID    \
-        (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
+        (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|  \
+         BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
 /*      bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC         0x1     /* segment = unwritten pre-allocation */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dbca5f5c37ba..43b1d5699335 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -604,31 +604,36 @@ out:
        return 0;
 }
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead.
+ */
 int
 xfs_fs_log_dummy(
-        xfs_mount_t     *mp)
+        xfs_mount_t     *mp,
+        int             flags)
 {
        xfs_trans_t     *tp;
-        xfs_inode_t     *ip;
        int             error;
        tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-        error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+        error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+                                        XFS_DEFAULT_LOG_COUNT);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
        }
-        ip = mp->m_rootip;
+        /* log the UUID because it is an unchanging field */
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_mod_sb(tp, XFS_SB_UUID);
+        if (flags & SYNC_WAIT)
-        xfs_trans_ijoin(tp, ip);
+                xfs_trans_set_sync(tp);
-        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+        return xfs_trans_commit(tp, 0);
-        xfs_trans_set_sync(tp);
-        error = xfs_trans_commit(tp, 0);
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        return error;
 }
 int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c9..a786c5212c1e 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
                                xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
 #endif  /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index abf80ae1e95b..5371d2dc360e 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1213,7 +1213,6 @@ xfs_imap_lookup(
        struct xfs_inobt_rec_incore rec;
        struct xfs_btree_cur    *cur;
        struct xfs_buf          *agbp;
-        xfs_agino_t             startino;
        int                     error;
        int                     i;
@@ -1227,13 +1226,13 @@ xfs_imap_lookup(
        }
        /*
-         * derive and lookup the exact inode record for the given agino. If the
+         * Lookup the inode record for the given agino. If the record cannot be
-         * record cannot be found, then it's an invalid inode number and we
+         * found, then it's an invalid inode number and we should abort. Once
-         * should abort.
+         * we have a record, we need to ensure it contains the inode number
+         * we are looking up.
         */
        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-        startino = agino & ~(XFS_IALLOC_INODES(mp) - 1);
+        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
-        error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
        if (!error) {
                if (i)
                        error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1246,6 +1245,11 @@ xfs_imap_lookup(
        if (error)
                return error;
+        /* check that the returned record contains the required inode */
+        if (rec.ir_startino > agino ||
+            rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+                return EINVAL;
        /* for untrusted inodes check it is allocated first */
        if ((flags & XFS_IGET_UNTRUSTED) &&
            (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 68415cb4f23c..34798f391c49 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1914,6 +1914,11 @@ xfs_iunlink_remove(
        return 0;
 }
+/*
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
 STATIC void
 xfs_ifree_cluster(
        xfs_inode_t     *free_ip,
@@ -1945,8 +1950,6 @@ xfs_ifree_cluster(
        }
        for (j = 0; j < nbufs; j++, inum += ninodes) {
-                int     found = 0;
                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
                                         XFS_INO_TO_AGBNO(mp, inum));
@@ -1965,7 +1968,9 @@ xfs_ifree_cluster(
                /*
                 * Walk the inodes already attached to the buffer and mark them
                 * stale. These will all have the flush locks held, so an
-                 * in-memory inode walk can't lock them.
+                 * in-memory inode walk can't lock them. By marking them all
+                 * stale first, we will not attempt to lock them in the loop
+                 * below as the XFS_ISTALE flag will be set.
                 */
                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
                while (lip) {
@@ -1977,11 +1982,11 @@ xfs_ifree_cluster(
                                                        &iip->ili_flush_lsn,
                                                        &iip->ili_item.li_lsn);
                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-                                found++;
                        }
                        lip = lip->li_bio_list;
                }
                /*
                 * For each inode in memory attempt to add it to the inode
                 * buffer and set it up for being staled on buffer IO
@@ -1993,6 +1998,7 @@ xfs_ifree_cluster(
                 * even trying to lock them.
                 */
                for (i = 0; i < ninodes; i++) {
+retry:
                        read_lock(&pag->pag_ici_lock);
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2003,38 +2009,36 @@ xfs_ifree_cluster(
                                continue;
                        }
-                        /* don't try to lock/unlock the current inode */
+                        /*
+                         * Don't try to lock/unlock the current inode, but we
+                         * _cannot_ skip the other inodes that we did not find
+                         * in the list attached to the buffer and are not
+                         * already marked stale. If we can't lock it, back off
+                         * and retry.
+                         */
                        if (ip != free_ip &&
                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
                                read_unlock(&pag->pag_ici_lock);
-                                continue;
+                                delay(1);
+                                goto retry;
                        }
                        read_unlock(&pag->pag_ici_lock);
-                        if (!xfs_iflock_nowait(ip)) {
+                        xfs_iflock(ip);
-                                if (ip != free_ip)
-                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                continue;
-                        }
                        xfs_iflags_set(ip, XFS_ISTALE);
-                        if (xfs_inode_clean(ip)) {
-                                ASSERT(ip != free_ip);
-                                xfs_ifunlock(ip);
-                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                continue;
-                        }
+                        /*
+                         * we don't need to attach clean inodes or those only
+                         * with unlogged changes (which we throw away, anyway).
+                         */
                        iip = ip->i_itemp;
-                        if (!iip) {
+                        if (!iip || xfs_inode_clean(ip)) {
-                                /* inode with unlogged changes only */
                                ASSERT(ip != free_ip);
                                ip->i_update_core = 0;
                                xfs_ifunlock(ip);
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                continue;
                        }
-                        found++;
                        iip->ili_last_fields = iip->ili_format.ilf_fields;
                        iip->ili_format.ilf_fields = 0;
@@ -2049,8 +2053,7 @@ xfs_ifree_cluster(
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                }
-                if (found)
+                xfs_trans_stale_inode_buf(tp, bp);
-                        xfs_trans_stale_inode_buf(tp, bp);
                xfs_trans_binval(tp, bp);
        }
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 430a8fc02c1f..ba8e36e0b4e7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3002,7 +3002,8 @@ _xfs_log_force(
        XFS_STATS_INC(xs_log_force);
-        xlog_cil_push(log, 1);
+        if (log->l_cilp)
+                xlog_cil_force(log);
        spin_lock(&log->l_icloglock);
@@ -3154,7 +3155,7 @@ _xfs_log_force_lsn(
        XFS_STATS_INC(xs_log_force);
        if (log->l_cilp) {
-                lsn = xlog_cil_push_lsn(log, lsn);
+                lsn = xlog_cil_force_lsn(log, lsn);
                if (lsn == NULLCOMMITLSN)
                        return 0;
        }
@@ -3711,7 +3712,7 @@ xfs_log_force_umount(
         * call below.
         */
        if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
-                xlog_cil_push(log, 1);
+                xlog_cil_force(log);
        /*
         * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 31e4ea2d19ac..7e206fc1fa36 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -68,6 +68,7 @@ xlog_cil_init(
        ctx->sequence = 1;
        ctx->cil = cil;
        cil->xc_ctx = ctx;
+        cil->xc_current_sequence = ctx->sequence;
        cil->xc_log = log;
        log->l_cilp = cil;
@@ -269,15 +270,10 @@ xlog_cil_insert(
 static void
 xlog_cil_format_items(
        struct log              *log,
-        struct xfs_log_vec      *log_vector,
+        struct xfs_log_vec      *log_vector)
-        struct xlog_ticket      *ticket,
-        xfs_lsn_t               *start_lsn)
 {
        struct xfs_log_vec *lv;
-        if (start_lsn)
-                *start_lsn = log->l_cilp->xc_ctx->sequence;
        ASSERT(log_vector);
        for (lv = log_vector; lv; lv = lv->lv_next) {
                void    *ptr;
@@ -301,9 +297,24 @@ xlog_cil_format_items(
                        ptr += vec->i_len;
                }
                ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+        }
+}
+static void
+xlog_cil_insert_items(
+        struct log              *log,
+        struct xfs_log_vec      *log_vector,
+        struct xlog_ticket      *ticket,
+        xfs_lsn_t               *start_lsn)
+{
+        struct xfs_log_vec *lv;
+        if (start_lsn)
+                *start_lsn = log->l_cilp->xc_ctx->sequence;
+        ASSERT(log_vector);
+        for (lv = log_vector; lv; lv = lv->lv_next)
                xlog_cil_insert(log, ticket, lv->lv_item, lv);
-        }
 }
 static void
@@ -321,80 +332,6 @@ xlog_cil_free_logvec(
 }
 /*
- * Commit a transaction with the given vector to the Committed Item List.
- *
- * To do this, we need to format the item, pin it in memory if required and
- * account for the space used by the transaction. Once we have done that we
- * need to release the unused reservation for the transaction, attach the
- * transaction to the checkpoint context so we carry the busy extents through
- * to checkpoint completion, and then unlock all the items in the transaction.
- *
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
- * Called with the context lock already held in read mode to lock out
- * background commit, returns without it held once background commits are
- * allowed again.
- */
-int
-xfs_log_commit_cil(
-        struct xfs_mount        *mp,
-        struct xfs_trans        *tp,
-        struct xfs_log_vec      *log_vector,
-        xfs_lsn_t               *commit_lsn,
-        int                     flags)
-{
-        struct log              *log = mp->m_log;
-        int                     log_flags = 0;
-        int                     push = 0;
-        if (flags & XFS_TRANS_RELEASE_LOG_RES)
-                log_flags = XFS_LOG_REL_PERM_RESERV;
-        if (XLOG_FORCED_SHUTDOWN(log)) {
-                xlog_cil_free_logvec(log_vector);
-                return XFS_ERROR(EIO);
-        }
-        /* lock out background commit */
-        down_read(&log->l_cilp->xc_ctx_lock);
-        xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
-        /* check we didn't blow the reservation */
-        if (tp->t_ticket->t_curr_res < 0)
-                xlog_print_tic_res(log->l_mp, tp->t_ticket);
-        /* attach the transaction to the CIL if it has any busy extents */
-        if (!list_empty(&tp->t_busy)) {
-                spin_lock(&log->l_cilp->xc_cil_lock);
-                list_splice_init(&tp->t_busy,
-                                        &log->l_cilp->xc_ctx->busy_extents);
-                spin_unlock(&log->l_cilp->xc_cil_lock);
-        }
-        tp->t_commit_lsn = *commit_lsn;
-        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-        xfs_trans_unreserve_and_mod_sb(tp);
-        /* check for background commit before unlock */
-        if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
-                push = 1;
-        up_read(&log->l_cilp->xc_ctx_lock);
-        /*
-         * We need to push CIL every so often so we don't cache more than we
-         * can fit in the log. The limit really is that a checkpoint can't be
-         * more than half the log (the current checkpoint is not allowed to
-         * overwrite the previous checkpoint), but commit latency and memory
-         * usage limit this to a smaller size in most cases.
-         */
-        if (push)
-                xlog_cil_push(log, 0);
-        return 0;
-}
-/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
@@ -427,13 +364,23 @@ xlog_cil_committed(
 }
 /*
- * Push the Committed Item List to the log. If the push_now flag is not set,
+ * Push the Committed Item List to the log. If @push_seq flag is zero, then it
- * then it is a background flush and so we can chose to ignore it.
+ * is a background flush and so we can choose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allow log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
 */
-int
+STATIC int
 xlog_cil_push(
        struct log              *log,
-        int                     push_now)
+        xfs_lsn_t               push_seq)
 {
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_log_vec      *lv;
@@ -453,12 +400,20 @@ xlog_cil_push(
        if (!cil)
                return 0;
+        ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
        new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
        new_ctx->ticket = xlog_cil_ticket_alloc(log);
-        /* lock out transaction commit, but don't block on background push */
+        /*
+         * Lock out transaction commit, but don't block for background pushes
+         * unless we are well over the CIL space limit. See the definition of
+         * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
+         * used here.
+         */
        if (!down_write_trylock(&cil->xc_ctx_lock)) {
-                if (!push_now)
+                if (!push_seq &&
+                    cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
                        goto out_free_ticket;
                down_write(&cil->xc_ctx_lock);
        }
@@ -469,7 +424,11 @@ xlog_cil_push(
                goto out_skip;
        /* check for spurious background flush */
-        if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+        if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+                goto out_skip;
+        /* check for a previously pushed sequence */
+        if (push_seq && push_seq < cil->xc_ctx->sequence)
                goto out_skip;
        /*
@@ -515,6 +474,13 @@ xlog_cil_push(
        cil->xc_ctx = new_ctx;
        /*
+         * mirror the new sequence into the cil structure so that we can do
+         * unlocked checks against the current sequence in log forces without
+         * risking dereferencing a freed context pointer.
+         */
+        cil->xc_current_sequence = new_ctx->sequence;
+        /*
         * The switch is now done, so we can drop the context lock and move out
         * of a shared context. We can't just go straight to the commit record,
         * though - we need to synchronise with previous and future commits so
@@ -626,6 +592,102 @@ out_abort:
 }
 /*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * The context lock is taken here in read mode, after the items have been
+ * formatted, to lock out background commit. It is dropped again before any
+ * background push is issued and before the function returns.
+ *
+ * Parameters:
+ *   mp          mount whose log (mp->m_log) the transaction is committed to
+ *   tp          transaction being committed; its ticket is handed to
+ *               xfs_log_done(), its busy extents are moved to the current
+ *               checkpoint context and its t_commit_lsn is filled in
+ *   log_vector  log vector to format and insert into the CIL; it is freed
+ *               here instead if the log has already been shut down
+ *   commit_lsn  must not be NULL: it is handed to xlog_cil_insert_items()
+ *               and then dereferenced unconditionally
+ *   flags       XFS_TRANS_RELEASE_LOG_RES selects XFS_LOG_REL_PERM_RESERV
+ *               for the xfs_log_done() call
+ *
+ * Returns XFS_ERROR(EIO) if the log has been forcibly shut down, 0 otherwise.
+ */
+int
+xfs_log_commit_cil(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        struct xfs_log_vec      *log_vector,
+        xfs_lsn_t               *commit_lsn,
+        int                     flags)
+{
+        struct log              *log = mp->m_log;
+        int                     log_flags = 0;
+        int                     push = 0;      /* set when a background push is wanted */
+        if (flags & XFS_TRANS_RELEASE_LOG_RES)
+                log_flags = XFS_LOG_REL_PERM_RESERV;
+        /*
+         * Nothing has been inserted into the CIL yet, so on a shut down log
+         * the vector is simply freed and the commit fails.
+         */
+        if (XLOG_FORCED_SHUTDOWN(log)) {
+                xlog_cil_free_logvec(log_vector);
+                return XFS_ERROR(EIO);
+        }
+        /*
+         * do all the hard work of formatting items (including memory
+         * allocation) outside the CIL context lock. This prevents stalling CIL
+         * pushes when we are low on memory and a transaction commit spends a
+         * lot of time in memory reclaim.
+         */
+        xlog_cil_format_items(log, log_vector);
+        /* lock out background commit */
+        down_read(&log->l_cilp->xc_ctx_lock);
+        xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
+        /* check we didn't blow the reservation */
+        if (tp->t_ticket->t_curr_res < 0)
+                xlog_print_tic_res(log->l_mp, tp->t_ticket);
+        /*
+         * attach the transaction to the CIL if it has any busy extents; the
+         * splice leaves tp->t_busy empty and is done under xc_cil_lock.
+         */
+        if (!list_empty(&tp->t_busy)) {
+                spin_lock(&log->l_cilp->xc_cil_lock);
+                list_splice_init(&tp->t_busy,
+                                        &log->l_cilp->xc_ctx->busy_extents);
+                spin_unlock(&log->l_cilp->xc_cil_lock);
+        }
+        /*
+         * Record the value reported through commit_lsn in the transaction,
+         * then hand the ticket to xfs_log_done() with the flags chosen above.
+         */
+        tp->t_commit_lsn = *commit_lsn;
+        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+        xfs_trans_unreserve_and_mod_sb(tp);
+        /*
+         * Once all the items of the transaction have been copied to the CIL,
+         * the items can be unlocked and freed.
+         *
+         * This needs to be done before we drop the CIL context lock because we
+         * have to update state in the log items and unlock them before they go
+         * to disk. If we don't, then the CIL checkpoint can race with us and
+         * we can run checkpoint completion before we've updated and unlocked
+         * the log items. This affects (at least) processing of stale buffers,
+         * inodes and EFIs.
+         */
+        xfs_trans_free_items(tp, *commit_lsn, 0);
+        /*
+         * check for background commit before unlock: space_used is sampled
+         * while the context lock is still held, but the push itself is only
+         * issued after the lock is dropped because xlog_cil_push() takes the
+         * same lock in write mode.
+         */
+        if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+                push = 1;
+        up_read(&log->l_cilp->xc_ctx_lock);
+        /*
+         * We need to push CIL every so often so we don't cache more than we
+         * can fit in the log. The limit really is that a checkpoint can't be
+         * more than half the log (the current checkpoint is not allowed to
+         * overwrite the previous checkpoint), but commit latency and memory
+         * usage limit this to a smaller size in most cases.
+         *
+         * A push sequence of 0 requests a background push; the return value
+         * of xlog_cil_push() is not checked.
+         */
+        if (push)
+                xlog_cil_push(log, 0);
+        return 0;
+}
+/*
 * Conditionally push the CIL based on the sequence passed in.
 *
 * We only need to push if we haven't already pushed the sequence
@@ -639,39 +701,34 @@ out_abort:
 * commit lsn is there. It'll be empty, so this is broken for now.
 */
 xfs_lsn_t
-xlog_cil_push_lsn(
+xlog_cil_force_lsn(
        struct log      *log,
-        xfs_lsn_t       push_seq)
+        xfs_lsn_t       sequence)
 {
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_cil_ctx      *ctx;
        xfs_lsn_t               commit_lsn = NULLCOMMITLSN;
-restart:
+        ASSERT(sequence <= cil->xc_current_sequence);
-        down_write(&cil->xc_ctx_lock);
-        ASSERT(push_seq <= cil->xc_ctx->sequence);
+        /*
+         * check to see if we need to force out the current context.
-        /* check to see if we need to force out the current context */
+         * xlog_cil_push() handles racing pushes for the same sequence,
-        if (push_seq == cil->xc_ctx->sequence) {
+         * so no need to deal with it here.
-                up_write(&cil->xc_ctx_lock);
+         */
-                xlog_cil_push(log, 1);
+        if (sequence == cil->xc_current_sequence)
-                goto restart;
+                xlog_cil_push(log, sequence);
-        }
        /*
         * See if we can find a previous sequence still committing.
-         * We can drop the flush lock as soon as we have the cil lock
-         * because we are now only comparing contexts protected by
-         * the cil lock.
-         *
         * We need to wait for all previous sequence commits to complete
         * before allowing the force of push_seq to go ahead. Hence block
         * on commits for those as well.
         */
+restart:
        spin_lock(&cil->xc_cil_lock);
-        up_write(&cil->xc_ctx_lock);
        list_for_each_entry(ctx, &cil->xc_committing, committing) {
-                if (ctx->sequence > push_seq)
+                if (ctx->sequence > sequence)
                        continue;
                if (!ctx->commit_lsn) {
                        /*
@@ -681,7 +738,7 @@ restart:
                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
                        goto restart;
                }
-                if (ctx->sequence != push_seq)
+                if (ctx->sequence != sequence)
                        continue;
                /* found it! */
                commit_lsn = ctx->commit_lsn;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965c..edcdfe01617f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,16 +422,17 @@ struct xfs_cil {
        struct rw_semaphore     xc_ctx_lock;
        struct list_head        xc_committing;
        sv_t                    xc_commit_wait;
+        xfs_lsn_t               xc_current_sequence;
 };
 /*
- * The amount of log space we should the CIL to aggregate is difficult to size.
+ * The amount of log space we allow the CIL to aggregate is difficult to size.
- * Whatever we chose we have to make we can get a reservation for the log space
+ * Whatever we choose, we have to make sure we can get a reservation for the
- * effectively, that it is large enough to capture sufficient relogging to
+ * log space effectively, that it is large enough to capture sufficient
- * reduce log buffer IO significantly, but it is not too large for the log or
+ * relogging to reduce log buffer IO significantly, but it is not too large for
- * induces too much latency when writing out through the iclogs. We track both
+ * the log or induces too much latency when writing out through the iclogs. We
- * space consumed and the number of vectors in the checkpoint context, so we
+ * track both space consumed and the number of vectors in the checkpoint
- * need to decide which to use for limiting.
+ * context, so we need to decide which to use for limiting.
 *
 * Every log buffer we write out during a push needs a header reserved, which
 * is at least one sector and more for v2 logs. Hence we need a reservation of
@@ -458,16 +459,21 @@ struct xfs_cil {
 * checkpoint transaction ticket is specific to the checkpoint context, rather
 * than the CIL itself.
 *
- * With dynamic reservations, we can basically make up arbitrary limits for the
+ * With dynamic reservations, we can effectively make up arbitrary limits for
- * checkpoint size so long as they don't violate any other size rules.  Hence
+ * the checkpoint size so long as they don't violate any other size rules.
- * the initial maximum size for the checkpoint transaction will be set to a
+ * Recovery imposes a rule that no transaction exceed half the log, so we are
- * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
+ * limited by that.  Furthermore, the log transaction reservation subsystem
- * right now based on the latency of writing out a large amount of data through
+ * tries to keep 25% of the log free, so we need to keep below that limit or we
- * the circular iclog buffers.
+ * risk running out of free log space to start any new transactions.
+ *
+ * In order to keep background CIL push efficient, we will set a lower
+ * threshold at which background pushing is attempted without blocking current
+ * transaction commits.  A separate, higher bound defines when CIL pushes are
+ * enforced to ensure we stay within our maximum checkpoint size bounds, yet
+ * still give us plenty of space for aggregation on large logs.
 */
+/*
+ * Soft limit: one eighth of the log size. Background pushes are considered
+ * once the current context has used more space than this. The argument is
+ * parenthesized so that any pointer expression can be passed safely.
+ */
+#define XLOG_CIL_SPACE_LIMIT(log)       ((log)->l_logsize >> 3)
-#define XLOG_CIL_SPACE_LIMIT(log)       \
+/*
+ * Hard limit: three sixteenths of the log size. At or above this a
+ * background push blocks on the context lock instead of backing off. The
+ * argument is parenthesized so that any pointer expression can be passed
+ * safely.
+ */
+#define XLOG_CIL_HARD_SPACE_LIMIT(log)  (3 * ((log)->l_logsize >> 4))
-        (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
 /*
 * The reservation head lsn is not made up of a cycle number and block number.
@@ -562,8 +568,16 @@ int	xlog_cil_init(struct log *log);
 void    xlog_cil_init_post_recovery(struct log *log);
 void    xlog_cil_destroy(struct log *log);
-int     xlog_cil_push(struct log *log, int push_now);
+/*
-xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+ * CIL force routines
+ */
+xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+/*
+ * Convenience wrapper around xlog_cil_force_lsn(): passes the CIL's
+ * xc_current_sequence, read without taking any lock, and discards the
+ * returned LSN.
+ */
+static inline void
+xlog_cil_force(struct log *log)
+{
+        struct xfs_cil  *cil = log->l_cilp;
+        xfs_lsn_t       current_seq = cil->xc_current_sequence;
+        xlog_cil_force_lsn(log, current_seq);
+}
 /*
 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdca7416c754..1c47edaea0d2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1167,7 +1167,7 @@ xfs_trans_del_item(
 * Unlock all of the items of a transaction and free all the descriptors
 * of that transaction.
 */
-STATIC void
+void
 xfs_trans_free_items(
        struct xfs_trans        *tp,
        xfs_lsn_t               commit_lsn,
@@ -1653,9 +1653,6 @@ xfs_trans_commit_cil(
                return error;
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-        /* xfs_trans_free_items() unlocks them first */
-        xfs_trans_free_items(tp, *commit_lsn, 0);
        xfs_trans_free(tp);
        return 0;
 }
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index e2d93d8ead7b..62da86c90de5 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,7 +25,8 @@ struct xfs_trans;
 void    xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void    xfs_trans_del_item(struct xfs_log_item *);
+void    xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+                                int flags);
 void    xfs_trans_item_committed(struct xfs_log_item *lip,
                                xfs_lsn_t commit_lsn, int aborted);
 void    xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 66d585c6917c..4c7c7bfb2b2f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2299,15 +2299,22 @@ xfs_alloc_file_space(
                        e = allocatesize_fsb;
                }
+                /*
+                 * The transaction reservation is limited to a 32-bit block
+                 * count, hence we need to limit the number of blocks we are
+                 * trying to reserve to avoid an overflow. We can't allocate
+                 * more than @nimaps extents, and an extent is limited on disk
+                 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+                 */
+                resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
                if (unlikely(rt)) {
-                        resrtextents = qblocks = (uint)(e - s);
+                        resrtextents = qblocks = resblks;
                        resrtextents /= mp->m_sb.sb_rextsize;
                        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
                        quota_flag = XFS_QMOPT_RES_RTBLKS;
                } else {
                        resrtextents = 0;
-                        resblks = qblocks = \
+                        resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
-                                XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
                        quota_flag = XFS_QMOPT_RES_REGBLKS;
                }