94 files changed, 1745 insertions, 1063 deletions
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 9e60fd20171..a7528b91393 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -108,7 +108,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        Node *fmt;
        struct file * interp_file = NULL;
        char iname[BINPRM_BUF_SIZE];
-        char *iname_addr = iname;
+        const char *iname_addr = iname;
        int retval;
        int fd_binary = -1;
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index aca9d55afb2..396a9884591 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -16,7 +16,8 @@
 static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 {
-        char *cp, *i_name, *i_arg;
+        const char *i_arg, *i_name;
+        char *cp;
        struct file *file;
        char interp[BINPRM_BUF_SIZE];
        int retval;
diff --git a/fs/buffer.c b/fs/buffer.c
index 50efa339e05..3e7dca279d1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -770,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
                                spin_unlock(lock);
                                /*
                                 * Ensure any pending I/O completes so that
-                                 * ll_rw_block() actually writes the current
+                                 * write_dirty_buffer() actually writes the
-                                 * contents - it is a noop if I/O is still in
+                                 * current contents - it is a noop if I/O is
-                                 * flight on potentially older contents.
+                                 * still in flight on potentially older
+                                 * contents.
                                 */
-                                ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+                                write_dirty_buffer(bh, WRITE_SYNC_PLUG);
                                /*
                                 * Kick off IO for the previous mapping. Note
@@ -2912,13 +2913,6 @@ int submit_bh(int rw, struct buffer_head * bh)
        BUG_ON(buffer_unwritten(bh));
        /*
-         * Mask in barrier bit for a write (could be either a WRITE or a
-         * WRITE_SYNC
-         */
-        if (buffer_ordered(bh) && (rw & WRITE))
-                rw |= WRITE_BARRIER;
-        /*
         * Only clear out a write error when rewriting
         */
        if (test_set_buffer_req(bh) && (rw & WRITE))
@@ -2956,22 +2950,21 @@ EXPORT_SYMBOL(submit_bh);
 /**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
+ * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
- * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
+ * %READA option is described in the documentation for generic_make_request()
- * are sent to disk. The fourth %READA option is described in the documentation
+ * which ll_rw_block() calls.
- * for generic_make_request() which ll_rw_block() calls.
 *
 * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
+ * BH_Lock state bit), any buffer that appears to be clean when doing a write
- * clean when doing a write request, and any buffer that appears to be
+ * request, and any buffer that appears to be up-to-date when doing read
- * up-to-date when doing read request.  Further it marks as clean buffers that
+ * request.  Further it marks as clean buffers that are processed for
- * are processed for writing (the buffer cache won't assume that they are
+ * writing (the buffer cache won't assume that they are actually clean
- * actually clean until the buffer gets unlocked).
+ * until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to simple completion handler that marks
 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
@@ -2987,20 +2980,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
        for (i = 0; i < nr; i++) {
                struct buffer_head *bh = bhs[i];
-                if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
+                if (!trylock_buffer(bh))
-                        lock_buffer(bh);
-                else if (!trylock_buffer(bh))
                        continue;
+                if (rw == WRITE) {
-                if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
-                    rw == SWRITE_SYNC_PLUG) {
                        if (test_clear_buffer_dirty(bh)) {
                                bh->b_end_io = end_buffer_write_sync;
                                get_bh(bh);
-                                if (rw == SWRITE_SYNC)
+                                submit_bh(WRITE, bh);
-                                        submit_bh(WRITE_SYNC, bh);
-                                else
-                                        submit_bh(WRITE, bh);
                                continue;
                        }
                } else {
@@ -3016,12 +3002,25 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 }
 EXPORT_SYMBOL(ll_rw_block);
+/*
+ * Start writeout of @bh, using the request flags in @rw, if it is dirty.
+ *
+ * The buffer is locked first.  If its dirty bit was not set there is
+ * nothing to write: the lock is dropped again and the function returns.
+ * Otherwise end_buffer_write_sync is installed as the completion
+ * handler, a reference is taken on the buffer and it is submitted.
+ * This function does not wait for the submitted I/O.
+ */
+void write_dirty_buffer(struct buffer_head *bh, int rw)
+{
+        lock_buffer(bh);
+        if (test_clear_buffer_dirty(bh)) {
+                bh->b_end_io = end_buffer_write_sync;
+                get_bh(bh);
+                submit_bh(rw, bh);
+                return;
+        }
+        /* Buffer was clean, so nothing was submitted: unlock it here. */
+        unlock_buffer(bh);
+}
+EXPORT_SYMBOL(write_dirty_buffer);
 /*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 */
-int sync_dirty_buffer(struct buffer_head *bh)
+int __sync_dirty_buffer(struct buffer_head *bh, int rw)
 {
        int ret = 0;
@@ -3030,7 +3029,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);
                bh->b_end_io = end_buffer_write_sync;
-                ret = submit_bh(WRITE_SYNC, bh);
+                ret = submit_bh(rw, bh);
                wait_on_buffer(bh);
                if (buffer_eopnotsupp(bh)) {
                        clear_buffer_eopnotsupp(bh);
@@ -3043,6 +3042,12 @@ int sync_dirty_buffer(struct buffer_head *bh)
        }
        return ret;
 }
+EXPORT_SYMBOL(__sync_dirty_buffer);
+/*
+ * Wrapper that keeps the original sync_dirty_buffer() interface: calls
+ * __sync_dirty_buffer() with WRITE_SYNC as the request type and returns
+ * its result.
+ */
+int sync_dirty_buffer(struct buffer_head *bh)
+{
+        return __sync_dirty_buffer(bh, WRITE_SYNC);
+}
 EXPORT_SYMBOL(sync_dirty_buffer);
 /*
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5598a0d0229..4cfce1ee31f 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
        /* dirty the head */
        spin_lock(&inode->i_lock);
-        if (ci->i_wrbuffer_ref_head == 0)
+        if (ci->i_head_snapc == NULL)
                ci->i_head_snapc = ceph_get_snap_context(snapc);
        ++ci->i_wrbuffer_ref_head;
        if (ci->i_wrbuffer_ref == 0)
@@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page)
        spin_lock_irq(&mapping->tree_lock);
        if (page->mapping) {    /* Race with truncate? */
                WARN_ON_ONCE(!PageUptodate(page));
+                account_page_dirtied(page, page->mapping);
-                if (mapping_cap_account_dirty(mapping)) {
-                        __inc_zone_page_state(page, NR_FILE_DIRTY);
-                        __inc_bdi_stat(mapping->backing_dev_info,
-                                        BDI_RECLAIMABLE);
-                        task_io_account_write(PAGE_CACHE_SIZE);
-                }
                radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
@@ -352,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
                        break;
                }
        }
-        if (!snapc && ci->i_head_snapc) {
+        if (!snapc && ci->i_wrbuffer_ref_head) {
                snapc = ceph_get_snap_context(ci->i_head_snapc);
                dout(" head snapc %p has %d dirty pages\n",
                     snapc, ci->i_wrbuffer_ref_head);
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 582e0b2caf8..a2d002cbdec 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -376,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
                th = get_ticket_handler(ac, service);
-                if (!th) {
+                if (IS_ERR(th)) {
                        *pneed |= service;
                        continue;
                }
@@ -399,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
        struct ceph_x_ticket_handler *th =
                get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+        if (IS_ERR(th))
+                return PTR_ERR(th);
        ceph_x_validate_tickets(ac, &need);
        dout("build_request want %x have %x need %x\n",
@@ -450,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
                        return -ERANGE;
                head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-                BUG_ON(!th);
                ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
                if (ret)
                        return ret;
@@ -505,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
        case CEPHX_GET_PRINCIPAL_SESSION_KEY:
                th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-                BUG_ON(!th);
+                if (IS_ERR(th))
+                        return PTR_ERR(th);
                ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
                                               buf + sizeof(*head), end);
                break;
@@ -563,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
        void *end = p + sizeof(au->reply_buf);
        th = get_ticket_handler(ac, au->service);
-        if (!th)
+        if (IS_ERR(th))
-                return -EIO;  /* hrm! */
+                return PTR_ERR(th);
        ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
        if (ret < 0)
                return ret;
@@ -626,7 +629,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
        struct ceph_x_ticket_handler *th;
        th = get_ticket_handler(ac, peer_type);
-        if (th && !IS_ERR(th))
+        if (!IS_ERR(th))
                remove_ticket_handler(ac, th);
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7bf182b0397..a2069b6680a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1082,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        gid_t gid;
        struct ceph_mds_session *session;
        u64 xattr_version = 0;
+        struct ceph_buffer *xattr_blob = NULL;
        int delayed = 0;
        u64 flush_tid = 0;
        int i;
@@ -1142,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
                for (i = 0; i < CEPH_CAP_BITS; i++)
                        if (flushing & (1 << i))
                                ci->i_cap_flush_tid[i] = flush_tid;
+                follows = ci->i_head_snapc->seq;
+        } else {
+                follows = 0;
        }
        keep = cap->implemented;
@@ -1155,14 +1160,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        mtime = inode->i_mtime;
        atime = inode->i_atime;
        time_warp_seq = ci->i_time_warp_seq;
-        follows = ci->i_snap_realm->cached_context->seq;
        uid = inode->i_uid;
        gid = inode->i_gid;
        mode = inode->i_mode;
-        if (dropping & CEPH_CAP_XATTR_EXCL) {
+        if (flushing & CEPH_CAP_XATTR_EXCL) {
                __ceph_build_xattrs_blob(ci);
-                xattr_version = ci->i_xattrs.version + 1;
+                xattr_blob = ci->i_xattrs.blob;
+                xattr_version = ci->i_xattrs.version;
        }
        spin_unlock(&inode->i_lock);
@@ -1170,9 +1175,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
                op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
                size, max_size, &mtime, &atime, time_warp_seq,
-                uid, gid, mode,
+                uid, gid, mode, xattr_version, xattr_blob,
-                xattr_version,
-                (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
                follows);
        if (ret < 0) {
                dout("error sending cap msg, must requeue %p\n", inode);
@@ -1282,7 +1285,7 @@ retry:
                             &capsnap->mtime, &capsnap->atime,
                             capsnap->time_warp_seq,
                             capsnap->uid, capsnap->gid, capsnap->mode,
-                             0, NULL,
+                             capsnap->xattr_version, capsnap->xattr_blob,
                             capsnap->follows);
                next_follows = capsnap->follows + 1;
@@ -1332,7 +1335,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
             ceph_cap_string(was | mask));
        ci->i_dirty_caps |= mask;
        if (was == 0) {
-                dout(" inode %p now dirty\n", &ci->vfs_inode);
+                if (!ci->i_head_snapc)
+                        ci->i_head_snapc = ceph_get_snap_context(
+                                ci->i_snap_realm->cached_context);
+                dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
+                        ci->i_head_snapc);
                BUG_ON(!list_empty(&ci->i_dirty_item));
                spin_lock(&mdsc->cap_dirty_lock);
                list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -2190,7 +2197,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
        if (ci->i_head_snapc == snapc) {
                ci->i_wrbuffer_ref_head -= nr;
-                if (!ci->i_wrbuffer_ref_head) {
+                if (ci->i_wrbuffer_ref_head == 0 &&
+                    ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+                        BUG_ON(!ci->i_head_snapc);
                        ceph_put_snap_context(ci->i_head_snapc);
                        ci->i_head_snapc = NULL;
                }
@@ -2483,6 +2492,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                        dout(" inode %p now clean\n", inode);
                        BUG_ON(!list_empty(&ci->i_dirty_item));
                        drop = 1;
+                        if (ci->i_wrbuffer_ref_head == 0) {
+                                BUG_ON(!ci->i_head_snapc);
+                                ceph_put_snap_context(ci->i_head_snapc);
+                                ci->i_head_snapc = NULL;
+                        }
                } else {
                        BUG_ON(list_empty(&ci->i_dirty_item));
                }
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 360c4f22718..6fd8b20a861 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -171,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p)
                } else if (req->r_dentry) {
                        path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
                                                    &pathbase, 0);
+                        if (IS_ERR(path))
+                                path = NULL;
                        spin_lock(&req->r_dentry->d_lock);
                        seq_printf(s, " #%llx/%.*s (%s)",
                                   ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -187,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p)
                if (req->r_old_dentry) {
                        path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
                                                    &pathbase, 0);
+                        if (IS_ERR(path))
+                                path = NULL;
                        spin_lock(&req->r_old_dentry->d_lock);
                        seq_printf(s, " #%llx/%.*s (%s)",
                           ceph_ino(req->r_old_dentry->d_parent->d_inode),
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 67bbb41d552..6e4f43ff23e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -46,7 +46,7 @@ int ceph_init_dentry(struct dentry *dentry)
        else
                dentry->d_op = &ceph_snap_dentry_ops;
-        di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+        di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
        if (!di)
                return -ENOMEM;          /* oh well */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5d893d31e39..e7cca414da0 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,6 +677,7 @@ static int fill_inode(struct inode *inode,
                if (ci->i_files == 0 && ci->i_subdirs == 0 &&
                    ceph_snap(inode) == CEPH_NOSNAP &&
                    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+                    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
                    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
                        dout(" marking %p complete (empty)\n", inode);
                        ci->i_ceph_flags |= CEPH_I_COMPLETE;
@@ -1229,11 +1230,11 @@ retry_lookup:
                        in = dn->d_inode;
                } else {
                        in = ceph_get_inode(parent->d_sb, vino);
-                        if (in == NULL) {
+                        if (IS_ERR(in)) {
                                dout("new_inode badness\n");
                                d_delete(dn);
                                dput(dn);
-                                err = -ENOMEM;
+                                err = PTR_ERR(in);
                                goto out;
                        }
                        dn = splice_dentry(dn, in, NULL);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae85af06454..ff4e753aae9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -82,7 +82,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
                length = fl->fl_end - fl->fl_start + 1;
        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                (u64)fl->fl_pid, (u64)fl->fl_nspid,
+                                (u64)fl->fl_pid,
+                                (u64)(unsigned long)fl->fl_nspid,
                                lock_cmd, fl->fl_start,
                                length, wait);
        if (!err) {
@@ -92,7 +93,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
                        /* undo! This should only happen if the kernel detects
                         * local deadlock. */
                        ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                          (u64)fl->fl_pid, (u64)fl->fl_nspid,
+                                          (u64)fl->fl_pid,
+                                          (u64)(unsigned long)fl->fl_nspid,
                                          CEPH_LOCK_UNLOCK, fl->fl_start,
                                          length, 0);
                        dout("got %d on posix_lock_file, undid lock", err);
@@ -132,7 +134,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
                length = fl->fl_end - fl->fl_start + 1;
        err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
-                                file, (u64)fl->fl_pid, (u64)fl->fl_nspid,
+                                file, (u64)fl->fl_pid,
+                                (u64)(unsigned long)fl->fl_nspid,
                                lock_cmd, fl->fl_start,
                                length, wait);
        if (!err) {
@@ -141,7 +144,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
                        ceph_lock_message(CEPH_LOCK_FLOCK,
                                          CEPH_MDS_OP_SETFILELOCK,
                                          file, (u64)fl->fl_pid,
-                                          (u64)fl->fl_nspid,
+                                          (u64)(unsigned long)fl->fl_nspid,
                                          CEPH_LOCK_UNLOCK, fl->fl_start,
                                          length, 0);
                        dout("got %d on flock_lock_file_wait, undid lock", err);
@@ -235,7 +238,8 @@ int lock_to_ceph_filelock(struct file_lock *lock,
        cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
        cephlock->client = cpu_to_le64(0);
        cephlock->pid = cpu_to_le64(lock->fl_pid);
-        cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid);
+        cephlock->pid_namespace =
+                cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
        switch (lock->fl_type) {
        case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a75ddbf9fe3..f091b135178 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -560,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 *
 * Called under mdsc->mutex.
 */
+/*
+ * Helper for __choose_mds(): starting at @dentry, follow d_parent until
+ * reaching either the root or a dentry whose inode is not a snapshot
+ * inode (ceph_snap() == CEPH_NOSNAP), and return that dentry.
+ *
+ * No reference is taken on the returned dentry.  File-local, hence
+ * static.
+ */
+static struct dentry *get_nonsnap_parent(struct dentry *dentry)
+{
+        while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+                dentry = dentry->d_parent;
+        return dentry;
+}
 static int __choose_mds(struct ceph_mds_client *mdsc,
                        struct ceph_mds_request *req)
 {
@@ -590,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        if (req->r_inode) {
                inode = req->r_inode;
        } else if (req->r_dentry) {
-                if (req->r_dentry->d_inode) {
+                struct inode *dir = req->r_dentry->d_parent->d_inode;
+                if (dir->i_sb != mdsc->client->sb) {
+                        /* not this fs! */
+                        inode = req->r_dentry->d_inode;
+                } else if (ceph_snap(dir) != CEPH_NOSNAP) {
+                        /* direct snapped/virtual snapdir requests
+                         * based on parent dir inode */
+                        struct dentry *dn =
+                                get_nonsnap_parent(req->r_dentry->d_parent);
+                        inode = dn->d_inode;
+                        dout("__choose_mds using nonsnap parent %p\n", inode);
+                } else if (req->r_dentry->d_inode) {
+                        /* dentry target */
                        inode = req->r_dentry->d_inode;
                } else {
-                        inode = req->r_dentry->d_parent->d_inode;
+                        /* dir + name */
+                        inode = dir;
                        hash = req->r_dentry->d_name.hash;
                        is_hash = true;
                }
        }
        dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
             (int)hash, mode);
        if (!inode)
@@ -2208,7 +2230,7 @@ static void handle_session(struct ceph_mds_session *session,
                        pr_info("mds%d reconnect denied\n", session->s_mds);
                remove_session_caps(session);
                wake = 1; /* for good measure */
-                complete_all(&mdsc->session_close_waiters);
+                wake_up_all(&mdsc->session_close_wq);
                kick_requests(mdsc, mds);
                break;
@@ -2302,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
                path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
-                        BUG_ON(err);
+                        goto out_dput;
                }
        } else {
                path = NULL;
@@ -2310,7 +2332,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
        }
        err = ceph_pagelist_encode_string(pagelist, path, pathlen);
        if (err)
-                goto out;
+                goto out_free;
        spin_lock(&inode->i_lock);
        cap->seq = 0;        /* reset cap seq */
@@ -2354,8 +2376,9 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
                unlock_kernel();
        }
-out:
+out_free:
        kfree(path);
+out_dput:
        dput(dentry);
        return err;
 }
@@ -2876,7 +2899,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
                return -ENOMEM;
        init_completion(&mdsc->safe_umount_waiters);
-        init_completion(&mdsc->session_close_waiters);
+        init_waitqueue_head(&mdsc->session_close_wq);
        INIT_LIST_HEAD(&mdsc->waiting_for_map);
        mdsc->sessions = NULL;
        mdsc->max_sessions = 0;
@@ -3021,6 +3044,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
        wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
 }
+/*
+ * true if all sessions are closed, or we force unmount
+ *
+ * Used as the wait condition in ceph_mdsc_close_sessions().  Takes
+ * mdsc->mutex itself to scan the session table, so the caller must not
+ * hold it.  File-local, hence static.
+ */
+static bool done_closing_sessions(struct ceph_mds_client *mdsc)
+{
+        bool have_session = false;
+        int i;
+        if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+                return true;
+        mutex_lock(&mdsc->mutex);
+        for (i = 0; i < mdsc->max_sessions; i++) {
+                if (mdsc->sessions[i]) {
+                        /* one live session is enough to answer "no" */
+                        have_session = true;
+                        break;
+                }
+        }
+        mutex_unlock(&mdsc->mutex);
+        return !have_session;
+}
 /*
 * called after sb is ro.
@@ -3029,45 +3069,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
        struct ceph_mds_session *session;
        int i;
-        int n;
        struct ceph_client *client = mdsc->client;
-        unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
+        unsigned long timeout = client->mount_args->mount_timeout * HZ;
        dout("close_sessions\n");
-        mutex_lock(&mdsc->mutex);
        /* close sessions */
-        started = jiffies;
+        mutex_lock(&mdsc->mutex);
-        while (time_before(jiffies, started + timeout)) {
+        for (i = 0; i < mdsc->max_sessions; i++) {
-                dout("closing sessions\n");
+                session = __ceph_lookup_mds_session(mdsc, i);
-                n = 0;
+                if (!session)
-                for (i = 0; i < mdsc->max_sessions; i++) {
+                        continue;
-                        session = __ceph_lookup_mds_session(mdsc, i);
-                        if (!session)
-                                continue;
-                        mutex_unlock(&mdsc->mutex);
-                        mutex_lock(&session->s_mutex);
-                        __close_session(mdsc, session);
-                        mutex_unlock(&session->s_mutex);
-                        ceph_put_mds_session(session);
-                        mutex_lock(&mdsc->mutex);
-                        n++;
-                }
-                if (n == 0)
-                        break;
-                if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
-                        break;
-                dout("waiting for sessions to close\n");
                mutex_unlock(&mdsc->mutex);
-                wait_for_completion_timeout(&mdsc->session_close_waiters,
+                mutex_lock(&session->s_mutex);
-                                            timeout);
+                __close_session(mdsc, session);
+                mutex_unlock(&session->s_mutex);
+                ceph_put_mds_session(session);
                mutex_lock(&mdsc->mutex);
        }
+        mutex_unlock(&mdsc->mutex);
+        dout("waiting for sessions to close\n");
+        wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
+                           timeout);
        /* tear down remaining sessions */
+        mutex_lock(&mdsc->mutex);
        for (i = 0; i < mdsc->max_sessions; i++) {
                if (mdsc->sessions[i]) {
                        session = get_session(mdsc->sessions[i]);
@@ -3080,9 +3107,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
                        mutex_lock(&mdsc->mutex);
                }
        }
        WARN_ON(!list_empty(&mdsc->cap_delay_list));
        mutex_unlock(&mdsc->mutex);
        ceph_cleanup_empty_realms(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ab7e89f5e34..c98267ce6d2 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -234,7 +234,8 @@ struct ceph_mds_client {
        struct mutex            mutex;         /* all nested structures */
        struct ceph_mdsmap      *mdsmap;
-        struct completion       safe_umount_waiters, session_close_waiters;
+        struct completion       safe_umount_waiters;
+        wait_queue_head_t       session_close_wq;
        struct list_head        waiting_for_map;
        struct ceph_mds_session **sessions;    /* NULL for mds if no session */
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index bed6391e52c..dfced1dacbc 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -661,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc,
        reqhead->reassert_version = req->r_reassert_version;
        req->r_stamp = jiffies;
-        list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
+        list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
        ceph_msg_get(req->r_request); /* send consumes a ref */
        ceph_con_send(&req->r_osd->o_con, req->r_request);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index c0b26b6badb..4868b9dcac5 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -435,7 +435,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
        struct inode *inode = &ci->vfs_inode;
        struct ceph_cap_snap *capsnap;
-        int used;
+        int used, dirty;
        capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
        if (!capsnap) {
@@ -445,6 +445,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
        spin_lock(&inode->i_lock);
        used = __ceph_caps_used(ci);
+        dirty = __ceph_caps_dirty(ci);
        if (__ceph_have_pending_cap_snap(ci)) {
                /* there is no point in queuing multiple "pending" cap_snaps,
                   as no new writes are allowed to start when pending, so any
@@ -452,11 +453,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
                   cap_snap.  lucky us. */
                dout("queue_cap_snap %p already pending\n", inode);
                kfree(capsnap);
-        } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+        } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
+                   (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+                             CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
                struct ceph_snap_context *snapc = ci->i_head_snapc;
+                dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
+                     capsnap, snapc);
                igrab(inode);
+                
                atomic_set(&capsnap->nref, 1);
                capsnap->ci = ci;
                INIT_LIST_HEAD(&capsnap->ci_item);
@@ -464,15 +469,21 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
                capsnap->follows = snapc->seq - 1;
                capsnap->issued = __ceph_caps_issued(ci, NULL);
-                capsnap->dirty = __ceph_caps_dirty(ci);
+                capsnap->dirty = dirty;
                capsnap->mode = inode->i_mode;
                capsnap->uid = inode->i_uid;
                capsnap->gid = inode->i_gid;
-                /* fixme? */
+                if (dirty & CEPH_CAP_XATTR_EXCL) {
-                capsnap->xattr_blob = NULL;
+                        __ceph_build_xattrs_blob(ci);
-                capsnap->xattr_len = 0;
+                        capsnap->xattr_blob =
+                                ceph_buffer_get(ci->i_xattrs.blob);
+                        capsnap->xattr_version = ci->i_xattrs.version;
+                } else {
+                        capsnap->xattr_blob = NULL;
+                        capsnap->xattr_version = 0;
+                }
                /* dirty page count moved from _head to this cap_snap;
                   all subsequent writes page dirties occur _after_ this
@@ -480,7 +491,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
                capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
                ci->i_wrbuffer_ref_head = 0;
                capsnap->context = snapc;
-                ci->i_head_snapc = NULL;
+                ci->i_head_snapc =
+                        ceph_get_snap_context(ci->i_snap_realm->cached_context);
+                dout(" new snapc is %p\n", ci->i_head_snapc);
                list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
                if (used & CEPH_CAP_FILE_WR) {
@@ -539,6 +552,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
        return 1;  /* caller may want to ceph_flush_snaps */
 }
+/*
+ * Queue cap_snaps for snap writeback for this realm and its children.
+ * Called under snap_rwsem, so realm topology won't change.
+ *
+ * Walks realm->inodes_with_caps and calls ceph_queue_cap_snap() on every
+ * inode that igrab() manages to pin, then recurses into realm->children.
+ * inodes_with_caps_lock is dropped around each ceph_queue_cap_snap() call
+ * and re-taken before the list walk continues.  The reference on the
+ * inode just processed is kept until the next iteration (or the end of
+ * the walk) and is only dropped while the lock is not held.
+ */
+static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+{
+        struct ceph_inode_info *ci;
+        struct inode *lastinode = NULL; /* inode pinned by the previous pass */
+        struct ceph_snap_realm *child;
+        dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+        spin_lock(&realm->inodes_with_caps_lock);
+        list_for_each_entry(ci, &realm->inodes_with_caps,
+                            i_snap_realm_item) {
+                struct inode *inode = igrab(&ci->vfs_inode);
+                if (!inode)
+                        continue; /* could not pin this inode; skip it */
+                spin_unlock(&realm->inodes_with_caps_lock);
+                if (lastinode)
+                        iput(lastinode); /* previous inode, lock not held */
+                lastinode = inode;
+                ceph_queue_cap_snap(ci);
+                spin_lock(&realm->inodes_with_caps_lock);
+        }
+        spin_unlock(&realm->inodes_with_caps_lock);
+        if (lastinode)
+                iput(lastinode); /* reference from the final iteration */
+        dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
+        list_for_each_entry(child, &realm->children, child_item)
+                queue_realm_cap_snaps(child);
+        dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+}
 /*
 * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
@@ -589,29 +637,8 @@ more:
                 *
                 * ...unless it's a snap deletion!
                 */
-                if (!deletion) {
+                if (!deletion)
-                        struct ceph_inode_info *ci;
+                        queue_realm_cap_snaps(realm);
-                        struct inode *lastinode = NULL;
-                        spin_lock(&realm->inodes_with_caps_lock);
-                        list_for_each_entry(ci, &realm->inodes_with_caps,
-                                            i_snap_realm_item) {
-                                struct inode *inode = igrab(&ci->vfs_inode);
-                                if (!inode)
-                                        continue;
-                                spin_unlock(&realm->inodes_with_caps_lock);
-                                if (lastinode)
-                                        iput(lastinode);
-                                lastinode = inode;
-                                ceph_queue_cap_snap(ci);
-                                spin_lock(&realm->inodes_with_caps_lock);
-                        }
-                        spin_unlock(&realm->inodes_with_caps_lock);
-                        if (lastinode)
-                                iput(lastinode);
-                        dout("update_snap_trace cap_snaps queued\n");
-                }
        } else {
                dout("update_snap_trace %llx %p seq %lld unchanged\n",
                     realm->ino, realm, realm->seq);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2482d696f0d..c33897ae572 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -216,8 +216,7 @@ struct ceph_cap_snap {
        uid_t uid;
        gid_t gid;
-        void *xattr_blob;
+        struct ceph_buffer *xattr_blob;
-        int xattr_len;
        u64 xattr_version;
        u64 size;
@@ -229,8 +228,11 @@ struct ceph_cap_snap {
 static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
 {
-        if (atomic_dec_and_test(&capsnap->nref))
+        if (atomic_dec_and_test(&capsnap->nref)) { /* last reference dropped */
+                if (capsnap->xattr_blob) /* NULL when no xattr blob was captured */
+                        ceph_buffer_put(capsnap->xattr_blob);
                kfree(capsnap);
+        }
 }
 /*
@@ -342,7 +344,8 @@ struct ceph_inode_info {
        unsigned i_cap_exporting_issued;
        struct ceph_cap_reservation i_cap_migration_resv;
        struct list_head i_cap_snaps;   /* snapped state pending flush to mds */
-        struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 */
+        struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 or
+                                                    dirty|flushing caps */
        unsigned i_snap_caps;           /* cap bits for snapped files */
        int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 097a2654c00..9578af610b7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -485,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
                ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
                ci->i_xattrs.prealloc_blob = NULL;
                ci->i_xattrs.dirty = false;
+                ci->i_xattrs.version++;
        }
 }
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb..0da1debd499 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,8 @@ config CIFS
        tristate "CIFS support (advanced network filesystem, SMBFS successor)"
        depends on INET
        select NLS
+        select CRYPTO_MD5
+        select CRYPTO_ARC4
        help
          This is the client VFS module for the Common Internet File System
          (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index cfd1ce34e0b..21f0fbd8698 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -597,13 +597,13 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                                if (compare_oid(oid, oidlen, MSKRB5_OID,
                                                MSKRB5_OID_LEN))
                                        server->sec_mskerberos = true;
-                                else if (compare_oid(oid, oidlen, KRB5U2U_OID,
+                                if (compare_oid(oid, oidlen, KRB5U2U_OID,
                                                     KRB5U2U_OID_LEN))
                                        server->sec_kerberosu2u = true;
-                                else if (compare_oid(oid, oidlen, KRB5_OID,
+                                if (compare_oid(oid, oidlen, KRB5_OID,
                                                     KRB5_OID_LEN))
                                        server->sec_kerberos = true;
-                                else if (compare_oid(oid, oidlen, NTLMSSP_OID,
+                                if (compare_oid(oid, oidlen, NTLMSSP_OID,
                                                     NTLMSSP_OID_LEN))
                                        server->sec_ntlmssp = true;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6..7fe6b52df50 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
 *     This is a compressed table of upper and lower case conversion.
 *
 */
+#ifndef _CIFS_UNICODE_H
+#define _CIFS_UNICODE_H
 #include <asm/byteorder.h>
 #include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
 #endif                          /* UNIUPR_NOUPPER */
 #ifndef UNIUPR_NOLOWER
-extern signed char UniLowerTable[512];
+extern signed char CifsUniLowerTable[512];
-extern struct UniCaseRange UniLowerRange[];
+extern const struct UniCaseRange CifsUniLowerRange[];
 #endif                          /* UNIUPR_NOLOWER */
 #ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
 * UniTolower:  Convert a unicode character to lower case
 */
 static inline wchar_t
-UniTolower(wchar_t uc)
+UniTolower(register wchar_t uc)
 {
-        register struct UniCaseRange *rp;
+        register const struct UniCaseRange *rp;
-        if (uc < sizeof(UniLowerTable)) {
+        if (uc < sizeof(CifsUniLowerTable)) {
                /* Latin characters */
-                return uc + UniLowerTable[uc];  /* Use base tables */
+                return uc + CifsUniLowerTable[uc];      /* Use base tables */
        } else {
-                rp = UniLowerRange;     /* Use range tables */
+                rp = CifsUniLowerRange; /* Use range tables */
                while (rp->start) {
                        if (uc < rp->start)     /* Before start of range */
                                return uc;      /* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
 }
 #endif
+#endif /* _CIFS_UNICODE_H */
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e51..0ac7c5a8633 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
 /*
 * Latin lower case
 */
-static signed char CifsUniLowerTable[512] = {
+signed char CifsUniLowerTable[512] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
 /*
 * Lower Case Range
 */
-static const struct UniCaseRange CifsUniLowerRange[] = {
+const struct UniCaseRange CifsUniLowerRange[] = { /* declared extern in cifs_unicode.h */
-        0x0380, 0x03ab, UniCaseRangeL0380,
+        {0x0380, 0x03ab, UniCaseRangeL0380},
-        0x0400, 0x042f, UniCaseRangeL0400,
+        {0x0400, 0x042f, UniCaseRangeL0400},
-        0x0490, 0x04cb, UniCaseRangeL0490,
+        {0x0490, 0x04cb, UniCaseRangeL0490},
-        0x1e00, 0x1ff7, UniCaseRangeL1e00,
+        {0x1e00, 0x1ff7, UniCaseRangeL1e00},
-        0xff20, 0xff3a, UniCaseRangeLff20,
+        {0xff20, 0xff3a, UniCaseRangeLff20},
-        0, 0, 0
+        {0}     /* zeroed entry ends the table: UniTolower() loops while rp->start != 0 */
 };
 #endif
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 847628dfdc4..709f2296bdb 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
 #include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
+#include "ntlmssp.h"
 #include <linux/ctype.h>
 #include <linux/random.h>
@@ -42,21 +43,43 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
                       unsigned char *p24);
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
-                                    const struct mac_key *key, char *signature)
+                        struct TCP_Server_Info *server, char *signature)
 {
-        struct  MD5Context context;
+        int rc;
-        if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
+        if (cifs_pdu == NULL || server == NULL || signature == NULL)
                return -EINVAL;
-        cifs_MD5_init(&context);
+        if (!server->ntlmssp.sdescmd5) {
-        cifs_MD5_update(&context, (char *)&key->data, key->len);
+                cERROR(1,
-        cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+                        "cifs_calculate_signature: can't generate signature\n");
+                return -1;
+        }
-        cifs_MD5_final(signature, &context);
+        rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash);
-        return 0;
+        if (rc) {
+                cERROR(1, "cifs_calculate_signature: oould not init md5\n");
+                return rc;
+        }
+        if (server->secType == RawNTLMSSP)
+                crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+                        server->session_key.data.ntlmv2.key,
+                        CIFS_NTLMV2_SESSKEY_SIZE);
+        else
+                crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+                        (char *)&server->session_key.data,
+                        server->session_key.len);
+        crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+                        cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+        rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature);
+        return rc;
 }
 int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
                  __u32 *pexpected_response_sequence_number)
 {
@@ -78,8 +101,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
        server->sequence_number++;
        spin_unlock(&GlobalMid_Lock);
-        rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key,
+        rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
-                                      smb_signature);
        if (rc)
                memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
        else
@@ -89,21 +111,39 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 }
 static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
-                                const struct mac_key *key, char *signature)
+                        struct TCP_Server_Info *server, char *signature)
 {
-        struct  MD5Context context;
        int i;
+        int rc;
-        if ((iov == NULL) || (signature == NULL) || (key == NULL))
+        if (iov == NULL || server == NULL || signature == NULL)
                return -EINVAL;
-        cifs_MD5_init(&context);
+        if (!server->ntlmssp.sdescmd5) {
-        cifs_MD5_update(&context, (char *)&key->data, key->len);
+                cERROR(1, "cifs_calc_signature2: can't generate signature\n");
+                return -1;
+        }
+        rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash);
+        if (rc) {
+                cERROR(1, "cifs_calc_signature2: oould not init md5\n");
+                return rc;
+        }
+        if (server->secType == RawNTLMSSP)
+                crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+                        server->session_key.data.ntlmv2.key,
+                        CIFS_NTLMV2_SESSKEY_SIZE);
+        else
+                crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+                        (char *)&server->session_key.data,
+                        server->session_key.len);
        for (i = 0; i < n_vec; i++) {
                if (iov[i].iov_len == 0)
                        continue;
                if (iov[i].iov_base == NULL) {
-                        cERROR(1, "null iovec entry");
+                        cERROR(1, "cifs_calc_signature2: null iovec entry");
                        return -EIO;
                }
                /* The first entry includes a length field (which does not get
@@ -111,18 +151,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
                if (i == 0) {
                        if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
                                break; /* nothing to sign or corrupt header */
-                        cifs_MD5_update(&context, iov[0].iov_base+4,
+                        crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
-                                  iov[0].iov_len-4);
+                                iov[i].iov_base + 4, iov[i].iov_len - 4);
                } else
-                        cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
+                        crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+                                iov[i].iov_base, iov[i].iov_len);
        }
-        cifs_MD5_final(signature, &context);
+        rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature);
-        return 0;
+        return rc;
 }
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
                   __u32 *pexpected_response_sequence_number)
 {
@@ -145,8 +185,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
        server->sequence_number++;
        spin_unlock(&GlobalMid_Lock);
-        rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key,
+        rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
-                                      smb_signature);
        if (rc)
                memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
        else
@@ -156,14 +195,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 }
 int cifs_verify_signature(struct smb_hdr *cifs_pdu,
-                          const struct mac_key *mac_key,
+                          struct TCP_Server_Info *server,
                          __u32 expected_sequence_number)
 {
-        unsigned int rc;
+        int rc;
        char server_response_sig[8];
        char what_we_think_sig_should_be[20];
-        if ((cifs_pdu == NULL) || (mac_key == NULL))
+        if (cifs_pdu == NULL || server == NULL)
                return -EINVAL;
        if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +231,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
                                        cpu_to_le32(expected_sequence_number);
        cifs_pdu->Signature.Sequence.Reserved = 0;
-        rc = cifs_calculate_signature(cifs_pdu, mac_key,
+        rc = cifs_calculate_signature(cifs_pdu, server,
                what_we_think_sig_should_be);
        if (rc)
@@ -209,7 +248,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 }
 /* We fill in key by putting in 40 byte array which was allocated by caller */
-int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+int cifs_calculate_session_key(struct session_key *key, const char *rn,
                           const char *password)
 {
        char temp_key[16];
@@ -223,63 +262,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
        return 0;
 }
-int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
-                               const struct nls_table *nls_info)
-{
-        char temp_hash[16];
-        struct HMACMD5Context ctx;
-        char *ucase_buf;
-        __le16 *unicode_buf;
-        unsigned int i, user_name_len, dom_name_len;
-        if (ses == NULL)
-                return -EINVAL;
-        E_md4hash(ses->password, temp_hash);
-        hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
-        user_name_len = strlen(ses->userName);
-        if (user_name_len > MAX_USERNAME_SIZE)
-                return -EINVAL;
-        if (ses->domainName == NULL)
-                return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
-        dom_name_len = strlen(ses->domainName);
-        if (dom_name_len > MAX_USERNAME_SIZE)
-                return -EINVAL;
-        ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
-        if (ucase_buf == NULL)
-                return -ENOMEM;
-        unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
-        if (unicode_buf == NULL) {
-                kfree(ucase_buf);
-                return -ENOMEM;
-        }
-        for (i = 0; i < user_name_len; i++)
-                ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
-        ucase_buf[i] = 0;
-        user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
-                                      MAX_USERNAME_SIZE*2, nls_info);
-        unicode_buf[user_name_len] = 0;
-        user_name_len++;
-        for (i = 0; i < dom_name_len; i++)
-                ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
-        ucase_buf[i] = 0;
-        dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
-                                     MAX_USERNAME_SIZE*2, nls_info);
-        unicode_buf[user_name_len + dom_name_len] = 0;
-        hmac_md5_update((const unsigned char *) unicode_buf,
-                (user_name_len+dom_name_len)*2, &ctx);
-        hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
-        kfree(ucase_buf);
-        kfree(unicode_buf);
-        return 0;
-}
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
                        char *lnm_session_key)
@@ -324,38 +306,52 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 {
        int rc = 0;
        int len;
-        char nt_hash[16];
+        char nt_hash[CIFS_NTHASH_SIZE];
-        struct HMACMD5Context *pctxt;
        wchar_t *user;
        wchar_t *domain;
+        wchar_t *server;
-        pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
+        if (!ses->server->ntlmssp.sdeschmacmd5) {
+                cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
-        if (pctxt == NULL)
+                return -1;
-                return -ENOMEM;
+        }
        /* calculate md4 hash of password */
        E_md4hash(ses->password, nt_hash);
-        /* convert Domainname to unicode and uppercase */
+        crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, nt_hash,
-        hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+                                CIFS_NTHASH_SIZE);
+        rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash);
+        if (rc) {
+                cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
+                return rc;
+        }
        /* convert ses->userName to unicode and uppercase */
        len = strlen(ses->userName);
        user = kmalloc(2 + (len * 2), GFP_KERNEL);
-        if (user == NULL)
+        if (user == NULL) {
+                cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
+                rc = -ENOMEM;
                goto calc_exit_2;
+        }
        len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
        UniStrupr(user);
-        hmac_md5_update((char *)user, 2*len, pctxt);
+        crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+                                (char *)user, 2 * len);
        /* convert ses->domainName to unicode and uppercase */
        if (ses->domainName) {
                len = strlen(ses->domainName);
                domain = kmalloc(2 + (len * 2), GFP_KERNEL);
-                if (domain == NULL)
+                if (domain == NULL) {
+                        cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
+                        rc = -ENOMEM;
                        goto calc_exit_1;
+                }
                len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
                                        nls_cp);
                /* the following line was removed since it didn't work well
@@ -363,65 +359,292 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
                   Maybe converting the domain name earlier makes sense */
                /* UniStrupr(domain); */
-                hmac_md5_update((char *)domain, 2*len, pctxt);
+                crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+                                        (char *)domain, 2 * len);
                kfree(domain);
+        } else if (ses->serverName) {
+                len = strlen(ses->serverName);
+                server = kmalloc(2 + (len * 2), GFP_KERNEL);
+                if (server == NULL) {
+                        cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
+                        rc = -ENOMEM;
+                        goto calc_exit_1;
+                }
+                len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+                                        nls_cp);
+                /* as for the domain name above, the server name is hashed
+                   as converted and is deliberately not upper-cased here.
+                   Maybe converting the name earlier makes sense */
+                /* UniStrupr(server); */
+                crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+                                        (char *)server, 2 * len);
+                kfree(server);
        }
+        rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash,
+                                        ses->server->ntlmv2_hash);
 calc_exit_1:
        kfree(user);
 calc_exit_2:
        /* BB FIXME what about bytes 24 through 40 of the signing key?
           compare with the NTLM example */
-        hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
-        kfree(pctxt);
        return rc;
 }
-void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
-                      const struct nls_table *nls_cp)
+/*
+ * Fill in ses->domainName from the target info blob of the challenge.
+ *
+ * When ses->server->tiblob is set, it is walked as a sequence of
+ * (type, length, value) attributes until a zero type is found; the value
+ * of the first NTLMSSP_AV_NB_DOMAIN_NAME attribute is converted with the
+ * default NLS table into a newly allocated ses->domainName.  The type and
+ * length fields are little-endian on the wire and are converted before
+ * use, and the walk never reads past ses->server->tilen bytes: a
+ * truncated or malformed blob simply ends the search.
+ *
+ * When there is no blob, a minimal one made of two zeroed
+ * struct ntlmssp2_name entries is allocated, with the first entry's type
+ * set to NTLMSSP_DOMAIN_TYPE.
+ *
+ * Returns 0 on success (domainName may still be NULL) or -ENOMEM.
+ */
+static int
+find_domain_name(struct cifsSesInfo *ses)
+{
+        unsigned int attrsize;
+        unsigned int type;
+        const unsigned int hdrsize = 2 * sizeof(__le16); /* type + length */
+        unsigned char *blobptr;
+        unsigned char *blobend;
+        struct ntlmssp2_name *attrptr;
+        struct nls_table *nls_cp;
+        if (!ses->server->tiblob) {
+                ses->server->tilen = 2 * sizeof(struct ntlmssp2_name);
+                ses->server->tiblob = kmalloc(ses->server->tilen, GFP_KERNEL);
+                if (!ses->server->tiblob) {
+                        ses->server->tilen = 0;
+                        cERROR(1, "Challenge target info allocation failure");
+                        return -ENOMEM;
+                }
+                memset(ses->server->tiblob, 0x0, ses->server->tilen);
+                attrptr = (struct ntlmssp2_name *) ses->server->tiblob;
+                attrptr->type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
+                return 0;
+        }
+        blobptr = ses->server->tiblob;
+        blobend = blobptr + ses->server->tilen;
+        /* stop as soon as a complete attribute header no longer fits */
+        while ((unsigned int)(blobend - blobptr) >= hdrsize) {
+                attrptr = (struct ntlmssp2_name *) blobptr;
+                type = le16_to_cpu(attrptr->type);
+                if (type == 0)
+                        break; /* terminating entry */
+                attrsize = le16_to_cpu(attrptr->length);
+                blobptr += hdrsize; /* advance attr type and size */
+                if (attrsize > (unsigned int)(blobend - blobptr))
+                        break; /* value would run past the blob */
+                if (type == NTLMSSP_AV_NB_DOMAIN_NAME && !ses->domainName) {
+                        ses->domainName = kmalloc(attrsize + 1, GFP_KERNEL);
+                        if (!ses->domainName)
+                                return -ENOMEM;
+                        /* drop the table reference once converted */
+                        nls_cp = load_nls_default();
+                        cifs_from_ucs2(ses->domainName, (__le16 *)blobptr,
+                                attrsize, attrsize, nls_cp, false);
+                        unload_nls(nls_cp);
+                        break; /* nothing else is looked up */
+                }
+                blobptr += attrsize; /* advance attr  value */
+        }
+        return 0;
+}
+/*
+ * Compute the NTLMv2 response hash in place.
+ *
+ * The server challenge (server->cryptKey) is copied into
+ * @v2_session_response at offset CIFS_SERVER_CHALLENGE_SIZE; HMAC-MD5
+ * keyed with server->ntlmv2_hash is then run over the buffer from that
+ * offset to the end of struct ntlmv2_resp, followed by the target info
+ * blob when server->tilen is non-zero.  The digest is written to the
+ * start of @v2_session_response.
+ *
+ * Returns 0 on success, -1 when the hmac(md5) descriptor has not been
+ * allocated, or the error reported by the crypto layer.
+ */
+static int
+CalcNTLMv2_response(const struct TCP_Server_Info *server,
+                         char *v2_session_response)
 {
        int rc;
+        struct shash_desc *hmac;
+        if (!server->ntlmssp.sdeschmacmd5) {
+                cERROR(1, "CalcNTLMv2_response: can't generate ntlmv2 response\n");
+                return -1;
+        }
+        hmac = &server->ntlmssp.sdeschmacmd5->shash;
+        rc = crypto_shash_setkey(server->ntlmssp.hmacmd5, server->ntlmv2_hash,
+                CIFS_HMAC_MD5_HASH_SIZE);
+        if (rc) {
+                cERROR(1, "CalcNTLMv2_response: could not set hmacmd5 key");
+                return rc;
+        }
+        rc = crypto_shash_init(hmac);
+        if (rc) {
+                cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+                return rc;
+        }
+        memcpy(v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
+                server->cryptKey, CIFS_SERVER_CHALLENGE_SIZE);
+        rc = crypto_shash_update(hmac,
+                v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
+                sizeof(struct ntlmv2_resp) - CIFS_SERVER_CHALLENGE_SIZE);
+        if (rc)
+                return rc;
+        if (server->tilen) {
+                rc = crypto_shash_update(hmac, server->tiblob, server->tilen);
+                if (rc)
+                        return rc;
+        }
+        rc = crypto_shash_final(hmac, v2_session_response);
+        return rc;
+}
+/*
+ * Build the NTLMv2 response in @resp_buf (laid out as struct ntlmv2_resp)
+ * and derive the NTLMv2 session key from it.
+ *
+ * Steps: fill in the fixed blob fields and a random client challenge;
+ * look up the domain name via find_domain_name() when none is set;
+ * compute the ntlmv2 hash; compute the response; finally HMAC-MD5 the
+ * first CIFS_HMAC_MD5_HASH_SIZE bytes of the response, keyed with the
+ * ntlmv2 hash, into session_key.data.ntlmv2.key.
+ *
+ * Returns 0 on success or a non-zero error.  The stored copy of the
+ * response and the session key length are only updated on success.
+ */
+int
+setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
+                      const struct nls_table *nls_cp)
+{
+        int rc = 0;
+        struct shash_desc *hmac;
        struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
-        struct HMACMD5Context context;
        buf->blob_signature = cpu_to_le32(0x00000101);
        buf->reserved = 0;
        buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
        get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
        buf->reserved2 = 0;
-        buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
-        buf->names[0].length = 0;
-        buf->names[1].type = 0;
-        buf->names[1].length = 0;
+        if (!ses->domainName) {
+                rc = find_domain_name(ses);
+                if (rc) {
+                        cERROR(1, "could not get domain/server name rc %d", rc);
+                        return rc;
+                }
+        }
        /* calculate buf->ntlmv2_hash */
        rc = calc_ntlmv2_hash(ses, nls_cp);
-        if (rc)
+        if (rc) {
                cERROR(1, "could not get v2 hash rc %d", rc);
-        CalcNTLMv2_response(ses, resp_buf);
-        /* now calculate the MAC key for NTLMv2 */
-        hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
-        hmac_md5_update(resp_buf, 16, &context);
-        hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context);
-        memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf,
-               sizeof(struct ntlmv2_resp));
-        ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp);
+                return rc;
+        }
+        rc = CalcNTLMv2_response(ses->server, resp_buf);
+        if (rc) {
+                cERROR(1, "could not get v2 response rc %d", rc);
+                return rc;
+        }
+        /* now calculate the session key for NTLMv2 */
+        if (!ses->server->ntlmssp.sdeschmacmd5) {
+                cERROR(1, "setup_ntlmv2_rsp: can't generate session key\n");
+                return -1;
+        }
+        hmac = &ses->server->ntlmssp.sdeschmacmd5->shash;
+        rc = crypto_shash_setkey(ses->server->ntlmssp.hmacmd5,
+                        ses->server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+        if (rc) {
+                cERROR(1, "setup_ntlmv2_rsp: could not set hmacmd5 key\n");
+                return rc;
+        }
+        rc = crypto_shash_init(hmac);
+        if (rc) {
+                cERROR(1, "setup_ntlmv2_rsp: could not init hmacmd5\n");
+                return rc;
+        }
+        rc = crypto_shash_update(hmac, resp_buf, CIFS_HMAC_MD5_HASH_SIZE);
+        if (rc)
+                return rc;
+        rc = crypto_shash_final(hmac,
+                ses->server->session_key.data.ntlmv2.key);
+        if (rc)
+                return rc; /* do not publish a half-built session key */
+        memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf,
+                        sizeof(struct ntlmv2_resp));
+        ses->server->session_key.len = 16 + sizeof(struct ntlmv2_resp);
+        return rc;
 }
-void CalcNTLMv2_response(const struct cifsSesInfo *ses,
-                         char *v2_session_response)
+/*
+ * Generate a random secondary session key, arc4-encrypt it with the
+ * current ntlmv2 session key into server->ntlmssp.ciphertext, and then
+ * install the random key as the ntlmv2 session key.
+ *
+ * Returns 0 on success or a negative errno.  On any failure the existing
+ * session key is left untouched.  Earlier revisions returned 0 even when
+ * the encryption failed and a positive 1 when the cipher could not be
+ * allocated.
+ */
+int
+calc_seckey(struct TCP_Server_Info *server)
 {
-        struct HMACMD5Context context;
-        /* rest of v2 struct already generated */
-        memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
-        hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
-        hmac_md5_update(v2_session_response+8,
-                        sizeof(struct ntlmv2_resp) - 8, &context);
+        int rc;
+        unsigned char sec_key[CIFS_NTLMV2_SESSKEY_SIZE];
+        struct crypto_blkcipher *tfm_arc4;
+        struct scatterlist sgin, sgout;
+        struct blkcipher_desc desc;
+        get_random_bytes(sec_key, CIFS_NTLMV2_SESSKEY_SIZE);
+        tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)",
+                                                0, CRYPTO_ALG_ASYNC);
+        if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+                cERROR(1, "could not allocate master crypto API arc4\n");
+                return tfm_arc4 ? PTR_ERR(tfm_arc4) : -ENOMEM;
+        }
+        desc.tfm = tfm_arc4;
+        desc.flags = 0; /* do not hand stack garbage to the cipher */
+        rc = crypto_blkcipher_setkey(tfm_arc4,
+                server->session_key.data.ntlmv2.key, CIFS_CPHTXT_SIZE);
+        if (rc) {
+                cERROR(1, "calc_seckey: could not set arc4 key rc %d", rc);
+                goto calc_seckey_ret;
+        }
+        sg_init_one(&sgin, sec_key, CIFS_CPHTXT_SIZE);
+        sg_init_one(&sgout, server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+        rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+        if (rc) {
+                cERROR(1, "calc_seckey: could not encrypt session key rc %d",
+                        rc);
+                goto calc_seckey_ret;
+        }
+        /* the random secondary key becomes the session key */
+        memcpy(server->session_key.data.ntlmv2.key,
+                        sec_key, CIFS_NTLMV2_SESSKEY_SIZE);
+calc_seckey_ret:
+        crypto_free_blkcipher(tfm_arc4);
+        return rc;
+}
-        hmac_md5_final(v2_session_response, &context);
+void
-/*      cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
+cifs_crypto_shash_release(struct TCP_Server_Info *server)
+{
+        if (server->ntlmssp.md5) /* "md5" transform from cifs_crypto_shash_allocate() */
+                crypto_free_shash(server->ntlmssp.md5);
+        if (server->ntlmssp.hmacmd5) /* "hmac(md5)" transform */
+                crypto_free_shash(server->ntlmssp.hmacmd5);
+        kfree(server->ntlmssp.sdeschmacmd5); /* descriptor buffers; pointers are
+                                              * not reset here */
+        kfree(server->ntlmssp.sdescmd5);
+}
+/*
+ * Allocate the "hmac(md5)" and "md5" hash transforms plus one descriptor
+ * buffer for each, and bind every descriptor to its transform.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything that
+ * was allocated here is freed and every server->ntlmssp pointer is reset
+ * to NULL, so a later cifs_crypto_shash_release() cannot free a stale or
+ * ERR_PTR value.  Earlier revisions returned a positive 1 for transform
+ * allocation failures and left the pointers dangling.
+ */
+int
+cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+        int rc;
+        unsigned int size;
+        server->ntlmssp.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+        if (!server->ntlmssp.hmacmd5 ||
+                        IS_ERR(server->ntlmssp.hmacmd5)) {
+                cERROR(1, "could not allocate crypto hmacmd5\n");
+                rc = server->ntlmssp.hmacmd5 ?
+                        PTR_ERR(server->ntlmssp.hmacmd5) : -ENOMEM;
+                server->ntlmssp.hmacmd5 = NULL;
+                return rc;
+        }
+        server->ntlmssp.md5 = crypto_alloc_shash("md5", 0, 0);
+        if (!server->ntlmssp.md5 || IS_ERR(server->ntlmssp.md5)) {
+                cERROR(1, "could not allocate crypto md5\n");
+                rc = server->ntlmssp.md5 ?
+                        PTR_ERR(server->ntlmssp.md5) : -ENOMEM;
+                server->ntlmssp.md5 = NULL;
+                goto cifs_crypto_shash_allocate_ret1;
+        }
+        /* descriptor = shash_desc header + transform-specific state */
+        size = sizeof(struct shash_desc) +
+                        crypto_shash_descsize(server->ntlmssp.hmacmd5);
+        server->ntlmssp.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+        if (!server->ntlmssp.sdeschmacmd5) {
+                cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
+                rc = -ENOMEM;
+                goto cifs_crypto_shash_allocate_ret2;
+        }
+        server->ntlmssp.sdeschmacmd5->shash.tfm = server->ntlmssp.hmacmd5;
+        server->ntlmssp.sdeschmacmd5->shash.flags = 0x0;
+        size = sizeof(struct shash_desc) +
+                        crypto_shash_descsize(server->ntlmssp.md5);
+        server->ntlmssp.sdescmd5 = kmalloc(size, GFP_KERNEL);
+        if (!server->ntlmssp.sdescmd5) {
+                cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
+                rc = -ENOMEM;
+                goto cifs_crypto_shash_allocate_ret3;
+        }
+        server->ntlmssp.sdescmd5->shash.tfm = server->ntlmssp.md5;
+        server->ntlmssp.sdescmd5->shash.flags = 0x0;
+        return 0;
+cifs_crypto_shash_allocate_ret3:
+        kfree(server->ntlmssp.sdeschmacmd5);
+        server->ntlmssp.sdeschmacmd5 = NULL;
+cifs_crypto_shash_allocate_ret2:
+        crypto_free_shash(server->ntlmssp.md5);
+        server->ntlmssp.md5 = NULL;
+cifs_crypto_shash_allocate_ret1:
+        crypto_free_shash(server->ntlmssp.hmacmd5);
+        server->ntlmssp.hmacmd5 = NULL;
+        return rc;
 }
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac..c9d0cfc086e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
 #include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
+#include <crypto/internal/hash.h>
+#include <linux/scatterlist.h>
 /*
 * The sizes of various internal tables and strings
 */
@@ -97,7 +100,7 @@ enum protocolEnum {
        /* Netbios frames protocol not supported at this time */
 };
-struct mac_key {
+struct session_key {
        unsigned int len;
        union {
                char ntlm[CIFS_SESS_KEY_SIZE + 16];
@@ -120,6 +123,21 @@ struct cifs_cred {
        struct cifs_ace *aces;
 };
+struct sdesc {
+        struct shash_desc shash; /* tfm and flags are filled in right after allocation */
+        char ctx[]; /* trailing room: allocations add crypto_shash_descsize(tfm) bytes */
+};
+struct ntlmssp_auth {
+        __u32 client_flags; /* NOTE(review): not written in the hunks visible here - confirm use */
+        __u32 server_flags; /* NegotiateFlags of the server challenge, in CPU byte order */
+        unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* copied into the auth blob SessionKey field */
+        struct crypto_shash *hmacmd5; /* "hmac(md5)" transform from crypto_alloc_shash() */
+        struct crypto_shash *md5; /* "md5" transform from crypto_alloc_shash() */
+        struct sdesc *sdeschmacmd5; /* descriptor whose shash.tfm is hmacmd5 */
+        struct sdesc *sdescmd5; /* descriptor whose shash.tfm is md5 */
+};
 /*
 *****************************************************************
 * Except the CIFS PDUs themselves all the
@@ -182,11 +200,14 @@ struct TCP_Server_Info {
        /* 16th byte of RFC1001 workstation name is always null */
        char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
        __u32 sequence_number; /* needed for CIFS PDU signature */
-        struct mac_key mac_signing_key;
+        struct session_key session_key;
        char ntlmv2_hash[16];
        unsigned long lstrp; /* when we got last response from this server */
        u16 dialect; /* dialect index that server chose */
        /* extended security flavors that server supports */
+        unsigned int tilen; /* length of the target info blob */
+        unsigned char *tiblob; /* target info blob in challenge response */
+        struct ntlmssp_auth ntlmssp; /* various keys, ciphers, flags */
        bool    sec_kerberos;           /* supports plain Kerberos */
        bool    sec_mskerberos;         /* supports legacy MS Kerberos */
        bool    sec_kerberosu2u;        /* supports U2U Kerberos */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db1..320e0fd0ba7 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -134,6 +134,12 @@
 * Size of the session key (crypto key encrypted with the password
 */
 #define CIFS_SESS_KEY_SIZE (24)
+#define CIFS_CLIENT_CHALLENGE_SIZE (8)
+#define CIFS_SERVER_CHALLENGE_SIZE (8)
+#define CIFS_HMAC_MD5_HASH_SIZE (16)
+#define CIFS_CPHTXT_SIZE (16)
+#define CIFS_NTLMV2_SESSKEY_SIZE (16)
+#define CIFS_NTHASH_SIZE (16)
 /*
 * Maximum user name length
@@ -663,7 +669,6 @@ struct ntlmv2_resp {
        __le64  time;
        __u64  client_chal; /* random */
        __u32  reserved2;
-        struct ntlmssp2_name names[2];
        /* array of name entries could follow ending in minimum 4 byte struct */
 } __attribute__((packed));
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1f545081408..1378d913384 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -361,15 +361,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
                          __u32 *);
 extern int cifs_verify_signature(struct smb_hdr *,
-                                 const struct mac_key *mac_key,
+                                 struct TCP_Server_Info *server,
                                __u32 expected_sequence_number);
-extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+extern int cifs_calculate_session_key(struct session_key *key, const char *rn,
                                 const char *pass);
-extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *,
+extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
-                        const struct nls_table *);
-extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
-extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
                             const struct nls_table *);
+extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
+extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
+extern int calc_seckey(struct TCP_Server_Info *);
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(const char *password, const char *cryptkey,
                                bool encrypt, char *lnm_session_key);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd3..4bda920d1f7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -604,11 +604,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                        else
                                rc = -EINVAL;
-                        if (server->sec_kerberos || server->sec_mskerberos)
+                        if (server->secType == Kerberos) {
-                                server->secType = Kerberos;
+                                if (!server->sec_kerberos &&
-                        else if (server->sec_ntlmssp)
+                                                !server->sec_mskerberos)
-                                server->secType = RawNTLMSSP;
+                                        rc = -EOPNOTSUPP;
-                        else
+                        } else if (server->secType == RawNTLMSSP) {
+                                if (!server->sec_ntlmssp)
+                                        rc = -EOPNOTSUPP;
+                        } else
                                rc = -EOPNOTSUPP;
                }
        } else
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 95c2ea67edf..ec0ea4a43bd 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1673,7 +1673,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
                                    MAX_USERNAME_SIZE))
                                continue;
                        if (strlen(vol->username) != 0 &&
-                            strncmp(ses->password, vol->password,
+                            ses->password != NULL &&
+                            strncmp(ses->password,
+                                    vol->password ? vol->password : "",
                                    MAX_PASSWORD_SIZE))
                                continue;
                }
@@ -1706,6 +1708,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
                CIFSSMBLogoff(xid, ses);
                _FreeXid(xid);
        }
+        cifs_crypto_shash_release(server);
        sesInfoFree(ses);
        cifs_put_tcp_session(server);
 }
@@ -1785,13 +1788,23 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
        ses->linux_uid = volume_info->linux_uid;
        ses->overrideSecFlg = volume_info->secFlg;
+        rc = cifs_crypto_shash_allocate(server);
+        if (rc) {
+                cERROR(1, "could not setup hash structures rc %d", rc);
+                goto get_ses_fail;
+        }
+        server->tilen = 0;
+        server->tiblob = NULL;
        mutex_lock(&ses->session_mutex);
        rc = cifs_negotiate_protocol(xid, ses);
        if (!rc)
                rc = cifs_setup_session(xid, ses, volume_info->local_nls);
        mutex_unlock(&ses->session_mutex);
-        if (rc)
+        if (rc) {
+                cifs_crypto_shash_release(ses->server);
                goto get_ses_fail;
+        }
        /* success, put it on the list */
        write_lock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 578d88c5b46..f9ed0751cc1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto cifs_create_out;
-                return rc;
        }
        if (oplockEnabled)
@@ -365,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
        if (buf == NULL) {
-                kfree(full_path);
+                rc = -ENOMEM;
-                FreeXid(xid);
+                goto cifs_create_out;
-                return -ENOMEM;
        }
        /*
@@ -496,6 +494,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
        struct cifsTconInfo *pTcon;
        char *full_path = NULL;
        struct inode *newinode = NULL;
+        int oplock = 0;
+        u16 fileHandle;
+        FILE_ALL_INFO *buf = NULL;
+        unsigned int bytes_written;
+        struct win_dev *pdev;
        if (!old_valid_dev(device_number))
                return -EINVAL;
@@ -506,9 +509,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
        pTcon = cifs_sb->tcon;
        full_path = build_path_from_dentry(direntry);
-        if (full_path == NULL)
+        if (full_path == NULL) {
                rc = -ENOMEM;
-        else if (pTcon->unix_ext) {
+                goto mknod_out;
+        }
+        if (pTcon->unix_ext) {
                struct cifs_unix_set_info_args args = {
                        .mode   = mode & ~current_umask(),
                        .ctime  = NO_CHANGE_64,
@@ -527,87 +533,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
                                            cifs_sb->local_nls,
                                            cifs_sb->mnt_cifs_flags &
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+                if (rc)
+                        goto mknod_out;
-                if (!rc) {
+                rc = cifs_get_inode_info_unix(&newinode, full_path,
-                        rc = cifs_get_inode_info_unix(&newinode, full_path,
                                                inode->i_sb, xid);
-                        if (pTcon->nocase)
+                if (pTcon->nocase)
-                                direntry->d_op = &cifs_ci_dentry_ops;
+                        direntry->d_op = &cifs_ci_dentry_ops;
-                        else
+                else
-                                direntry->d_op = &cifs_dentry_ops;
+                        direntry->d_op = &cifs_dentry_ops;
-                        if (rc == 0)
-                                d_instantiate(direntry, newinode);
-                }
-        } else {
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-                        int oplock = 0;
-                        u16 fileHandle;
-                        FILE_ALL_INFO *buf;
-                        cFYI(1, "sfu compat create special file");
+                if (rc == 0)
+                        d_instantiate(direntry, newinode);
+                goto mknod_out;
+        }
-                        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
-                        if (buf == NULL) {
+                goto mknod_out;
-                                kfree(full_path);
-                                rc = -ENOMEM;
-                                FreeXid(xid);
-                                return rc;
-                        }
-                        rc = CIFSSMBOpen(xid, pTcon, full_path,
-                                         FILE_CREATE, /* fail if exists */
+        cFYI(1, "sfu compat create special file");
-                                         GENERIC_WRITE /* BB would
-                                          WRITE_OWNER | WRITE_DAC be better? */,
+        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-                                         /* Create a file and set the
+        if (buf == NULL) {
-                                            file attribute to SYSTEM */
+                kfree(full_path);
-                                         CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
+                rc = -ENOMEM;
-                                         &fileHandle, &oplock, buf,
+                FreeXid(xid);
-                                         cifs_sb->local_nls,
+                return rc;
-                                         cifs_sb->mnt_cifs_flags &
-                                            CIFS_MOUNT_MAP_SPECIAL_CHR);
-                        /* BB FIXME - add handling for backlevel servers
-                           which need legacy open and check for all
-                           calls to SMBOpen for fallback to SMBLeagcyOpen */
-                        if (!rc) {
-                                /* BB Do not bother to decode buf since no
-                                   local inode yet to put timestamps in,
-                                   but we can reuse it safely */
-                                unsigned int bytes_written;
-                                struct win_dev *pdev;
-                                pdev = (struct win_dev *)buf;
-                                if (S_ISCHR(mode)) {
-                                        memcpy(pdev->type, "IntxCHR", 8);
-                                        pdev->major =
-                                              cpu_to_le64(MAJOR(device_number));
-                                        pdev->minor =
-                                              cpu_to_le64(MINOR(device_number));
-                                        rc = CIFSSMBWrite(xid, pTcon,
-                                                fileHandle,
-                                                sizeof(struct win_dev),
-                                                0, &bytes_written, (char *)pdev,
-                                                NULL, 0);
-                                } else if (S_ISBLK(mode)) {
-                                        memcpy(pdev->type, "IntxBLK", 8);
-                                        pdev->major =
-                                              cpu_to_le64(MAJOR(device_number));
-                                        pdev->minor =
-                                              cpu_to_le64(MINOR(device_number));
-                                        rc = CIFSSMBWrite(xid, pTcon,
-                                                fileHandle,
-                                                sizeof(struct win_dev),
-                                                0, &bytes_written, (char *)pdev,
-                                                NULL, 0);
-                                } /* else if(S_ISFIFO */
-                                CIFSSMBClose(xid, pTcon, fileHandle);
-                                d_drop(direntry);
-                        }
-                        kfree(buf);
-                        /* add code here to set EAs */
-                }
        }
+        /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
+        rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
+                         GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
+                         &fileHandle, &oplock, buf, cifs_sb->local_nls,
+                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if (rc)
+                goto mknod_out;
+        /* BB Do not bother to decode buf since no local inode yet to put
+         * timestamps in, but we can reuse it safely */
+        pdev = (struct win_dev *)buf;
+        if (S_ISCHR(mode)) {
+                memcpy(pdev->type, "IntxCHR", 8);
+                pdev->major =
+                      cpu_to_le64(MAJOR(device_number));
+                pdev->minor =
+                      cpu_to_le64(MINOR(device_number));
+                rc = CIFSSMBWrite(xid, pTcon,
+                        fileHandle,
+                        sizeof(struct win_dev),
+                        0, &bytes_written, (char *)pdev,
+                        NULL, 0);
+        } else if (S_ISBLK(mode)) {
+                memcpy(pdev->type, "IntxBLK", 8);
+                pdev->major =
+                      cpu_to_le64(MAJOR(device_number));
+                pdev->minor =
+                      cpu_to_le64(MINOR(device_number));
+                rc = CIFSSMBWrite(xid, pTcon,
+                        fileHandle,
+                        sizeof(struct win_dev),
+                        0, &bytes_written, (char *)pdev,
+                        NULL, 0);
+        } /* else if (S_ISFIFO) */
+        CIFSSMBClose(xid, pTcon, fileHandle);
+        d_drop(direntry);
+        /* FIXME: add code here to set EAs */
+mknod_out:
        kfree(full_path);
+        kfree(buf);
        FreeXid(xid);
        return rc;
 }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index db11fdef0e9..de748c652d1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -242,8 +242,7 @@ int cifs_open(struct inode *inode, struct file *file)
        full_path = build_path_from_dentry(file->f_path.dentry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto out;
-                return rc;
        }
        cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4bc47e5b5f2..86a164f08a7 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -834,7 +834,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
                                                xid, NULL);
        if (!inode)
-                return ERR_PTR(-ENOMEM);
+                return ERR_PTR(rc);
 #ifdef CONFIG_CIFS_FSCACHE
        /* populate tcon->resource_id */
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e7531..1db0f0746a5 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,19 @@
 #define NTLMSSP_NEGOTIATE_KEY_XCH   0x40000000
 #define NTLMSSP_NEGOTIATE_56        0x80000000
+/* Define AV Pair Field IDs */
+#define NTLMSSP_AV_EOL                  0
+#define NTLMSSP_AV_NB_COMPUTER_NAME     1
+#define NTLMSSP_AV_NB_DOMAIN_NAME       2
+#define NTLMSSP_AV_DNS_COMPUTER_NAME    3
+#define NTLMSSP_AV_DNS_DOMAIN_NAME      4
+#define NTLMSSP_AV_DNS_TREE_NAME        5
+#define NTLMSSP_AV_FLAGS                6
+#define NTLMSSP_AV_TIMESTAMP            7
+#define NTLMSSP_AV_RESTRICTION          8
+#define NTLMSSP_AV_TARGET_NAME          9
+#define NTLMSSP_AV_CHANNEL_BINDINGS     10
 /* Although typedefs are not commonly used for structure definitions */
 /* in the Linux kernel, in this particular case they are useful      */
 /* to more closely match the standards document for NTLMSSP from     */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5d..795095f4eac 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -383,6 +383,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
                                    struct cifsSesInfo *ses)
 {
+        unsigned int tioffset; /* challenge message target info area */
+        unsigned int tilen; /* challenge message target info area length */
        CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
        if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -405,6 +408,27 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
        /* BB spec says that if AvId field of MsvAvTimestamp is populated then
                we must set the MIC field of the AUTHENTICATE_MESSAGE */
+        ses->server->ntlmssp.server_flags = le32_to_cpu(pblob->NegotiateFlags);
+        /* wire fields are little endian: BufferOffset is 32 bits, Length 16 */
+        tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
+        tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
+        /* both values are server supplied; never copy from outside the blob */
+        if (tioffset > blob_len || tilen > blob_len - tioffset) {
+                cERROR(1, "tioffset + tilen too high %u + %u",
+                        tioffset, tilen);
+                return -EINVAL;
+        }
+        ses->server->tilen = tilen;
+        if (tilen) {
+                ses->server->tiblob = kmalloc(tilen, GFP_KERNEL);
+                if (!ses->server->tiblob) {
+                        cERROR(1, "Challenge target info allocation failure");
+                        return -ENOMEM;
+                }
+                memcpy(ses->server->tiblob,  bcc_ptr + tioffset, tilen);
+        }
        return 0;
 }
@@ -425,12 +442,13 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
        /* BB is NTLMV2 session security format easier to use here? */
        flags = NTLMSSP_NEGOTIATE_56 |  NTLMSSP_REQUEST_TARGET |
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-                NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+                NTLMSSP_NEGOTIATE_NTLM;
        if (ses->server->secMode &
-           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
-                flags |= NTLMSSP_NEGOTIATE_SIGN;
+                flags |= NTLMSSP_NEGOTIATE_SIGN |
-        if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
+                        NTLMSSP_NEGOTIATE_KEY_XCH |
-                flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+                        NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+        }
        sec_blob->NegotiateFlags |= cpu_to_le32(flags);
@@ -451,10 +469,12 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                                   struct cifsSesInfo *ses,
                                   const struct nls_table *nls_cp, bool first)
 {
+        int rc;
+        unsigned int size;
        AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
        __u32 flags;
        unsigned char *tmp;
-        char ntlm_session_key[CIFS_SESS_KEY_SIZE];
+        struct ntlmv2_resp ntlmv2_response = {};
        memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
        sec_blob->MessageType = NtLmAuthenticate;
@@ -477,19 +497,25 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        sec_blob->LmChallengeResponse.Length = 0;
        sec_blob->LmChallengeResponse.MaximumLength = 0;
-        /* calculate session key,  BB what about adding similar ntlmv2 path? */
-        SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
-        if (first)
-                cifs_calculate_mac_key(&ses->server->mac_signing_key,
-                                       ntlm_session_key, ses->password);
-        memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
        sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
-        sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE);
+        rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp);
-        sec_blob->NtChallengeResponse.MaximumLength =
+        if (rc) {
-                                cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                cERROR(1, "error rc: %d during ntlmssp ntlmv2 setup", rc);
+                goto setup_ntlmv2_ret;
+        }
+        size =  sizeof(struct ntlmv2_resp);
+        memcpy(tmp, (char *)&ntlmv2_response, size);
+        tmp += size;
+        if (ses->server->tilen > 0) {
+                memcpy(tmp, ses->server->tiblob, ses->server->tilen);
+                tmp += ses->server->tilen;
+        } else
+                ses->server->tilen = 0;
-        tmp += CIFS_SESS_KEY_SIZE;
+        sec_blob->NtChallengeResponse.Length = cpu_to_le16(size +
+                                ses->server->tilen);
+        sec_blob->NtChallengeResponse.MaximumLength =
+                cpu_to_le16(size + ses->server->tilen);
        if (ses->domainName == NULL) {
                sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +527,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
                                    MAX_USERNAME_SIZE, nls_cp);
                len *= 2; /* unicode is 2 bytes each */
-                len += 2; /* trailing null */
                sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
                sec_blob->DomainName.Length = cpu_to_le16(len);
                sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +543,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
                                    MAX_USERNAME_SIZE, nls_cp);
                len *= 2; /* unicode is 2 bytes each */
-                len += 2; /* trailing null */
                sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
                sec_blob->UserName.Length = cpu_to_le16(len);
                sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,9 +554,26 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        sec_blob->WorkstationName.MaximumLength = 0;
        tmp += 2;
-        sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+        if ((ses->server->ntlmssp.server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
-        sec_blob->SessionKey.Length = 0;
+                        !calc_seckey(ses->server)) {
-        sec_blob->SessionKey.MaximumLength = 0;
+                memcpy(tmp, ses->server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+                sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+                sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
+                sec_blob->SessionKey.MaximumLength =
+                        cpu_to_le16(CIFS_CPHTXT_SIZE);
+                tmp += CIFS_CPHTXT_SIZE;
+        } else {
+                sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+                sec_blob->SessionKey.Length = 0;
+                sec_blob->SessionKey.MaximumLength = 0;
+        }
+        ses->server->sequence_number = 0;
+setup_ntlmv2_ret:
+        if (ses->server->tilen > 0)
+                kfree(ses->server->tiblob);
        return tmp - pbuffer;
 }
@@ -546,15 +587,14 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
        return;
 }
-static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
+static int setup_ntlmssp_auth_req(char *ntlmsspblob,
                                  struct cifsSesInfo *ses,
                                  const struct nls_table *nls, bool first_time)
 {
        int bloblen;
-        bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
+        bloblen = build_ntlmssp_auth_blob(ntlmsspblob, ses, nls,
                                          first_time);
-        pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
        return bloblen;
 }
@@ -690,7 +730,7 @@ ssetup_ntlmssp_authenticate:
                if (first_time) /* should this be moved into common code
                                  with similar ntlmv2 path? */
-                        cifs_calculate_mac_key(&ses->server->mac_signing_key,
+                        cifs_calculate_session_key(&ses->server->session_key,
                                ntlm_session_key, ses->password);
                /* copy session key */
@@ -729,12 +769,21 @@ ssetup_ntlmssp_authenticate:
                        cpu_to_le16(sizeof(struct ntlmv2_resp));
                /* calculate session key */
-                setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+                rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+                if (rc) {
+                        kfree(v2_sess_key);
+                        goto ssetup_exit;
+                }
                /* FIXME: calculate MAC key */
                memcpy(bcc_ptr, (char *)v2_sess_key,
                       sizeof(struct ntlmv2_resp));
                bcc_ptr += sizeof(struct ntlmv2_resp);
                kfree(v2_sess_key);
+                if (ses->server->tilen > 0) {
+                        memcpy(bcc_ptr, ses->server->tiblob,
+                                ses->server->tilen);
+                        bcc_ptr += ses->server->tilen;
+                }
                if (ses->capabilities & CAP_UNICODE) {
                        if (iov[0].iov_len % 2) {
                                *bcc_ptr = 0;
@@ -765,15 +814,15 @@ ssetup_ntlmssp_authenticate:
                }
                /* bail out if key is too long */
                if (msg->sesskey_len >
-                    sizeof(ses->server->mac_signing_key.data.krb5)) {
+                    sizeof(ses->server->session_key.data.krb5)) {
                        cERROR(1, "Kerberos signing key too long (%u bytes)",
                                msg->sesskey_len);
                        rc = -EOVERFLOW;
                        goto ssetup_exit;
                }
                if (first_time) {
-                        ses->server->mac_signing_key.len = msg->sesskey_len;
+                        ses->server->session_key.len = msg->sesskey_len;
-                        memcpy(ses->server->mac_signing_key.data.krb5,
+                        memcpy(ses->server->session_key.data.krb5,
                                msg->data, msg->sesskey_len);
                }
                pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
@@ -815,12 +864,28 @@ ssetup_ntlmssp_authenticate:
                        if (phase == NtLmNegotiate) {
                                setup_ntlmssp_neg_req(pSMB, ses);
                                iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+                                iov[1].iov_base = &pSMB->req.SecurityBlob[0];
                        } else if (phase == NtLmAuthenticate) {
                                int blob_len;
-                                blob_len = setup_ntlmssp_auth_req(pSMB, ses,
+                                char *ntlmsspblob;
-                                                                  nls_cp,
-                                                                  first_time);
+                                ntlmsspblob = kmalloc(5 *
+                                        sizeof(struct _AUTHENTICATE_MESSAGE),
+                                        GFP_KERNEL);
+                                if (!ntlmsspblob) {
+                                        cERROR(1, "Can't allocate NTLMSSP");
+                                        rc = -ENOMEM;
+                                        goto ssetup_exit;
+                                }
+                                blob_len = setup_ntlmssp_auth_req(ntlmsspblob,
+                                                                ses,
+                                                                nls_cp,
+                                                                first_time);
                                iov[1].iov_len = blob_len;
+                                iov[1].iov_base = ntlmsspblob;
+                                pSMB->req.SecurityBlobLength =
+                                        cpu_to_le16(blob_len);
                                /* Make sure that we tell the server that we
                                   are using the uid that it just gave us back
                                   on the response (challenge) */
@@ -830,7 +895,6 @@ ssetup_ntlmssp_authenticate:
                                rc = -ENOSYS;
                                goto ssetup_exit;
                        }
-                        iov[1].iov_base = &pSMB->req.SecurityBlob[0];
                        /* unicode strings must be word aligned */
                        if ((iov[0].iov_len + iov[1].iov_len) % 2) {
                                *bcc_ptr = 0;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d697..e0588cdf4cc 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
                    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
                                             SECMODE_SIGN_ENABLED))) {
                        rc = cifs_verify_signature(midQ->resp_buf,
-                                                &ses->server->mac_signing_key,
+                                                ses->server,
                                                midQ->sequence_number+1);
                        if (rc) {
                                cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
                    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
                                             SECMODE_SIGN_ENABLED))) {
                        rc = cifs_verify_signature(out_buf,
-                                                &ses->server->mac_signing_key,
+                                                ses->server,
                                                midQ->sequence_number+1);
                        if (rc) {
                                cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
            (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
                                     SECMODE_SIGN_ENABLED))) {
                rc = cifs_verify_signature(out_buf,
-                                           &ses->server->mac_signing_key,
+                                           ses->server,
                                           midQ->sequence_number+1);
                if (rc) {
                        cERROR(1, "Unexpected SMB signature");
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a53b130b366..1e7a33028d3 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -80,7 +80,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
                }
        } else {
                inode = iget_locked(sb, CRAMINO(cramfs_inode));
-                if (inode) {
+                if (inode && (inode->i_state & I_NEW)) {
                        setup_inode(inode, cramfs_inode);
                        unlock_new_inode(inode);
                }
diff --git a/fs/dcache.c b/fs/dcache.c
index 4d13bf50b7b..83293be4814 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1332,31 +1332,13 @@ EXPORT_SYMBOL(d_add_ci);
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
 *
- * Searches the children of the parent dentry for the name in question. If
+ * d_lookup searches the children of the parent dentry for the name in
- * the dentry is found its reference count is incremented and the dentry
+ * question. If the dentry is found its reference count is incremented and the
- * is returned. The caller must use dput to free the entry when it has
+ * dentry is returned. The caller must use dput to free the entry when it has
- * finished using it. %NULL is returned on failure.
+ * finished using it. %NULL is returned if the dentry does not exist.
- *
- * __d_lookup is dcache_lock free. The hash list is protected using RCU.
- * Memory barriers are used while updating and doing lockless traversal. 
- * To avoid races with d_move while rename is happening, d_lock is used.
- *
- * Overflows in memcmp(), while d_move, are avoided by keeping the length
- * and name pointer in one structure pointed by d_qstr.
- *
- * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
- * lookup is going on.
- *
- * The dentry unused LRU is not updated even if lookup finds the required dentry
- * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
- * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
- * acquisition.
- *
- * d_lookup() is protected against the concurrent renames in some unrelated
- * directory using the seqlockt_t rename_lock.
 */
 struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 {
        struct dentry * dentry = NULL;
@@ -1372,6 +1354,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 }
 EXPORT_SYMBOL(d_lookup);
+/*
+ * __d_lookup - search for a dentry (racy)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup is like d_lookup, however it may (rarely) return a
+ * false-negative result due to unrelated rename activity.
+ *
+ * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
+ * however it must be used carefully, e.g. with a following d_lookup in
+ * the case of failure.
+ *
+ * __d_lookup callers must be commented.
+ */
 struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 {
        unsigned int len = name->len;
@@ -1382,6 +1379,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
        struct hlist_node *node;
        struct dentry *dentry;
+        /*
+         * The hash list is protected using RCU.
+         *
+         * Take d_lock when comparing a candidate dentry, to avoid races
+         * with d_move().
+         *
+         * It is possible that concurrent renames can mess up our list
+         * walk here and result in missing our dentry, resulting in the
+         * false-negative result. d_lookup() protects against concurrent
+         * renames using rename_lock seqlock.
+         *
+         * See Documentation/vfs/dcache-locking.txt for more details.
+         */
        rcu_read_lock();
        
        hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
@@ -1396,8 +1406,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
                /*
                 * Recheck the dentry after taking the lock - d_move may have
-                 * changed things.  Don't bother checking the hash because we're
+                 * changed things. Don't bother checking the hash because
-                 * about to compare the whole name anyway.
+                 * we're about to compare the whole name anyway.
                 */
                if (dentry->d_parent != parent)
                        goto next;
@@ -1925,7 +1935,7 @@ static int prepend_path(const struct path *path, struct path *root,
        bool slash = false;
        int error = 0;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        while (dentry != root->dentry || vfsmnt != root->mnt) {
                struct dentry * parent;
@@ -1954,7 +1964,7 @@ out:
        if (!error && !slash)
                error = prepend(buffer, buflen, "/", 1);
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        return error;
 global_root:
@@ -2292,11 +2302,12 @@ int path_is_under(struct path *path1, struct path *path2)
        struct vfsmount *mnt = path1->mnt;
        struct dentry *dentry = path1->dentry;
        int res;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        if (mnt != path2->mnt) {
                for (;;) {
                        if (mnt->mnt_parent == mnt) {
-                                spin_unlock(&vfsmount_lock);
+                                br_read_unlock(vfsmount_lock);
                                return 0;
                        }
                        if (mnt->mnt_parent == path2->mnt)
@@ -2306,7 +2317,7 @@ int path_is_under(struct path *path1, struct path *path2)
                dentry = mnt->mnt_mountpoint;
        }
        res = is_subdir(dentry, path2->dentry);
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        return res;
 }
 EXPORT_SYMBOL(path_is_under);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index a2e3b562e65..cbadc1bee6e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
 static struct list_head key_tfm_list;
 struct mutex key_tfm_list_mutex;
-int ecryptfs_init_crypto(void)
+int __init ecryptfs_init_crypto(void)
 {
        mutex_init(&key_tfm_list_mutex);
        INIT_LIST_HEAD(&key_tfm_list);
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
                                (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
                                 + encoded_name_no_prefix_size);
                        (*encoded_name)[(*encoded_name_size)] = '\0';
-                        (*encoded_name_size)++;
                } else {
                        rc = -EOPNOTSUPP;
                }
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 6c55113e722..3fbc9420338 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -349,7 +349,7 @@ out:
 /**
 * ecryptfs_new_lower_dentry
- * @ename: The name of the new dentry.
+ * @name: The name of the new dentry.
 * @lower_dir_dentry: Parent directory of the new dentry.
 * @nd: nameidata from last lookup.
 *
@@ -386,20 +386,19 @@ ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
 * ecryptfs_lookup_one_lower
 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
 * @lower_dir_dentry: lower parent directory
+ * @name: lower file name
 *
 * Get the lower dentry from vfs. If lower dentry does not exist yet,
 * create it.
 */
 static struct dentry *
 ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
-                          struct dentry *lower_dir_dentry)
+                          struct dentry *lower_dir_dentry, struct qstr *name)
 {
        struct nameidata nd;
        struct vfsmount *lower_mnt;
-        struct qstr *name;
        int err;
-        name = &ecryptfs_dentry->d_name;
        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
                                    ecryptfs_dentry->d_parent));
        err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
@@ -434,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        size_t encrypted_and_encoded_name_size;
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
        struct dentry *lower_dir_dentry, *lower_dentry;
+        struct qstr lower_name;
        int rc = 0;
        ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -444,9 +444,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                goto out_d_drop;
        }
        lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
+        lower_name.name = ecryptfs_dentry->d_name.name;
+        lower_name.len = ecryptfs_dentry->d_name.len;
+        lower_name.hash = ecryptfs_dentry->d_name.hash;
+        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+                                                    &lower_name);
+                if (rc < 0)
+                        goto out_d_drop;
+        }
        lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-                                                 lower_dir_dentry);
+                                                 lower_dir_dentry, &lower_name);
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
@@ -471,8 +479,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                       "filename; rc = [%d]\n", __func__, rc);
                goto out_d_drop;
        }
+        lower_name.name = encrypted_and_encoded_name;
+        lower_name.len = encrypted_and_encoded_name_size;
+        lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
+        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+                                                    &lower_name);
+                if (rc < 0)
+                        goto out_d_drop;
+        }
        lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-                                                 lower_dir_dentry);
+                                                 lower_dir_dentry, &lower_name);
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506e..73811cfa2ea 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
        if (!s) {
                printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
                       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+                rc = -ENOMEM;
                goto out;
        }
        s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
        if (!s) {
                printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
                       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+                rc = -ENOMEM;
                goto out;
        }
        s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aaf..0851ab6980f 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
        return 0;
 }
-int ecryptfs_init_kthread(void)
+int __init ecryptfs_init_kthread(void)
 {
        int rc = 0;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index bcb68c0cb1f..ab224809051 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -473,7 +473,7 @@ sleep:
        return rc;
 }
-int ecryptfs_init_messaging(void)
+int __init ecryptfs_init_messaging(void)
 {
        int i;
        int rc = 0;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd..00208c3d7e9 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = {
 *
 * Returns zero on success; non-zero otherwise
 */
-int ecryptfs_init_ecryptfs_miscdev(void)
+int __init ecryptfs_init_ecryptfs_miscdev(void)
 {
        int rc;
diff --git a/fs/exec.c b/fs/exec.c
index 7761837e450..2d945528274 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -361,13 +361,13 @@ err:
 /*
 * count() counts the number of strings in array ARGV.
 */
-static int count(char __user * __user * argv, int max)
+static int count(const char __user * const __user * argv, int max)
 {
        int i = 0;
        if (argv != NULL) {
                for (;;) {
-                        char __user * p;
+                        const char __user * p;
                        if (get_user(p, argv))
                                return -EFAULT;
@@ -387,7 +387,7 @@ static int count(char __user * __user * argv, int max)
 * processes's memory to the new process's stack.  The call to get_user_pages()
 * ensures the destination page is created and not swapped out.
 */
-static int copy_strings(int argc, char __user * __user * argv,
+static int copy_strings(int argc, const char __user *const __user *argv,
                        struct linux_binprm *bprm)
 {
        struct page *kmapped_page = NULL;
@@ -396,7 +396,7 @@ static int copy_strings(int argc, char __user * __user * argv,
        int ret;
        while (argc-- > 0) {
-                char __user *str;
+                const char __user *str;
                int len;
                unsigned long pos;
@@ -470,12 +470,13 @@ out:
 /*
 * Like copy_strings, but get argv and its values from kernel memory.
 */
-int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
+int copy_strings_kernel(int argc, const char *const *argv,
+                        struct linux_binprm *bprm)
 {
        int r;
        mm_segment_t oldfs = get_fs();
        set_fs(KERNEL_DS);
-        r = copy_strings(argc, (char __user * __user *)argv, bprm);
+        r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
        set_fs(oldfs);
        return r;
 }
@@ -997,7 +998,7 @@ EXPORT_SYMBOL(flush_old_exec);
 void setup_new_exec(struct linux_binprm * bprm)
 {
        int i, ch;
-        char * name;
+        const char *name;
        char tcomm[sizeof(current->comm)];
        arch_pick_mmap_layout(current->mm);
@@ -1117,7 +1118,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
        bprm->unsafe = tracehook_unsafe_exec(p);
        n_fs = 1;
-        write_lock(&p->fs->lock);
+        spin_lock(&p->fs->lock);
        rcu_read_lock();
        for (t = next_thread(p); t != p; t = next_thread(t)) {
                if (t->fs == p->fs)
@@ -1134,7 +1135,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
                        res = 1;
                }
        }
-        write_unlock(&p->fs->lock);
+        spin_unlock(&p->fs->lock);
        return res;
 }
@@ -1316,9 +1317,9 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
 * sys_execve() executes a new program.
 */
-int do_execve(char * filename,
+int do_execve(const char * filename,
-        char __user *__user *argv,
+        const char __user *const __user *argv,
-        char __user *__user *envp,
+        const char __user *const __user *envp,
        struct pt_regs * regs)
 {
        struct linux_binprm *bprm;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1fa23f6ffba..1736f235638 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -250,7 +250,9 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
        int i, err = 0;
-        ll_rw_block(SWRITE, nr_bhs, bhs);
+        for (i = 0; i < nr_bhs; i++)
+                write_dirty_buffer(bhs[i], WRITE);
        for (i = 0; i < nr_bhs; i++) {
                wait_on_buffer(bhs[i]);
                if (buffer_eopnotsupp(bhs[i])) {
diff --git a/fs/file_table.c b/fs/file_table.c
index edecd36fed9..a04bdd81c11 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,9 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/lglock.h>
 #include <linux/percpu_counter.h>
+#include <linux/percpu.h>
 #include <linux/ima.h>
 #include <asm/atomic.h>
@@ -32,8 +34,8 @@ struct files_stat_struct files_stat = {
        .max_files = NR_FILE
 };
-/* public. Not pretty! */
+DECLARE_LGLOCK(files_lglock);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+DEFINE_LGLOCK(files_lglock);
 /* SLAB cache for file structures */
 static struct kmem_cache *filp_cachep __read_mostly;
@@ -249,7 +251,7 @@ static void __fput(struct file *file)
                cdev_put(inode->i_cdev);
        fops_put(file->f_op);
        put_pid(file->f_owner.pid);
-        file_kill(file);
+        file_sb_list_del(file);
        if (file->f_mode & FMODE_WRITE)
                drop_file_write_access(file);
        file->f_path.dentry = NULL;
@@ -328,41 +330,107 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
        return file;
 }
 void put_filp(struct file *file)
 {
        if (atomic_long_dec_and_test(&file->f_count)) {
                security_file_free(file);
-                file_kill(file);
+                file_sb_list_del(file);
                file_free(file);
        }
 }
-void file_move(struct file *file, struct list_head *list)
+static inline int file_list_cpu(struct file *file)
 {
-        if (!list)
+#ifdef CONFIG_SMP
-                return;
+        return file->f_sb_list_cpu;
-        file_list_lock();
+#else
-        list_move(&file->f_u.fu_list, list);
+        return smp_processor_id();
-        file_list_unlock();
+#endif
+}
+/* helper for file_sb_list_add to reduce ifdefs */
+static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
+{
+        struct list_head *list;
+#ifdef CONFIG_SMP
+        int cpu;
+        cpu = smp_processor_id();
+        file->f_sb_list_cpu = cpu;
+        list = per_cpu_ptr(sb->s_files, cpu);
+#else
+        list = &sb->s_files;
+#endif
+        list_add(&file->f_u.fu_list, list);
 }
-void file_kill(struct file *file)
+/**
+ * file_sb_list_add - add a file to the sb's file list
+ * @file: file to add
+ * @sb: sb to add it to
+ *
+ * Use this function to associate a file with the superblock of the inode it
+ * refers to.
+ */
+void file_sb_list_add(struct file *file, struct super_block *sb)
+{
+        lg_local_lock(files_lglock);
+        __file_sb_list_add(file, sb);
+        lg_local_unlock(files_lglock);
+}
+/**
+ * file_sb_list_del - remove a file from the sb's file list
+ * @file: file to remove
+ * @sb: sb to remove it from
+ *
+ * Use this function to remove a file from its superblock.
+ */
+void file_sb_list_del(struct file *file)
 {
        if (!list_empty(&file->f_u.fu_list)) {
-                file_list_lock();
+                lg_local_lock_cpu(files_lglock, file_list_cpu(file));
                list_del_init(&file->f_u.fu_list);
-                file_list_unlock();
+                lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
        }
 }
+#ifdef CONFIG_SMP
+/*
+ * These macros iterate all files on all CPUs for a given superblock.
+ * files_lglock must be held globally.
+ */
+#define do_file_list_for_each_entry(__sb, __file)               \
+{                                                               \
+        int i;                                                  \
+        for_each_possible_cpu(i) {                              \
+                struct list_head *list;                         \
+                list = per_cpu_ptr((__sb)->s_files, i);         \
+                list_for_each_entry((__file), list, f_u.fu_list)
+#define while_file_list_for_each_entry                          \
+        }                                                       \
+}
+#else /* !CONFIG_SMP: walk the superblock's single s_files list */
+#define do_file_list_for_each_entry(__sb, __file)               \
+{                                                               \
+        struct list_head *list;                                 \
+        list = &(__sb)->s_files;                                \
+        list_for_each_entry((__file), list, f_u.fu_list)
+#define while_file_list_for_each_entry                          \
+}
+#endif /* CONFIG_SMP */
 int fs_may_remount_ro(struct super_block *sb)
 {
        struct file *file;
        /* Check that no files are currently opened for writing. */
-        file_list_lock();
+        lg_global_lock(files_lglock);
-        list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
+        do_file_list_for_each_entry(sb, file) {
                struct inode *inode = file->f_path.dentry->d_inode;
                /* File with pending delete? */
@@ -372,11 +440,11 @@ int fs_may_remount_ro(struct super_block *sb)
                /* Writeable file? */
                if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
                        goto too_bad;
-        }
+        } while_file_list_for_each_entry;
-        file_list_unlock();
+        lg_global_unlock(files_lglock);
        return 1; /* Tis' cool bro. */
 too_bad:
-        file_list_unlock();
+        lg_global_unlock(files_lglock);
        return 0;
 }
@@ -392,8 +460,8 @@ void mark_files_ro(struct super_block *sb)
        struct file *f;
 retry:
-        file_list_lock();
+        lg_global_lock(files_lglock);
-        list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+        do_file_list_for_each_entry(sb, f) {
                struct vfsmount *mnt;
                if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
                       continue;
@@ -408,16 +476,13 @@ retry:
                        continue;
                file_release_write(f);
                mnt = mntget(f->f_path.mnt);
-                file_list_unlock();
+                /* This can sleep, so we can't hold the spinlock. */
-                /*
+                lg_global_unlock(files_lglock);
-                 * This can sleep, so we can't hold
-                 * the file_list_lock() spinlock.
-                 */
                mnt_drop_write(mnt);
                mntput(mnt);
                goto retry;
-        }
+        } while_file_list_for_each_entry;
-        file_list_unlock();
+        lg_global_unlock(files_lglock);
 }
 void __init files_init(unsigned long mempages)
@@ -437,5 +502,6 @@ void __init files_init(unsigned long mempages)
        if (files_stat.max_files < NR_FILE)
                files_stat.max_files = NR_FILE;
        files_defer_init();
+        lg_lock_init(files_lglock);
        percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 1ee40eb9a2c..ed45a9cf5f3 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 {
        struct path old_root;
-        write_lock(&fs->lock);
+        spin_lock(&fs->lock);
        old_root = fs->root;
        fs->root = *path;
        path_get(path);
-        write_unlock(&fs->lock);
+        spin_unlock(&fs->lock);
        if (old_root.dentry)
                path_put(&old_root);
 }
@@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 {
        struct path old_pwd;
-        write_lock(&fs->lock);
+        spin_lock(&fs->lock);
        old_pwd = fs->pwd;
        fs->pwd = *path;
        path_get(path);
-        write_unlock(&fs->lock);
+        spin_unlock(&fs->lock);
        if (old_pwd.dentry)
                path_put(&old_pwd);
@@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
                task_lock(p);
                fs = p->fs;
                if (fs) {
-                        write_lock(&fs->lock);
+                        spin_lock(&fs->lock);
                        if (fs->root.dentry == old_root->dentry
                            && fs->root.mnt == old_root->mnt) {
                                path_get(new_root);
@@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
                                fs->pwd = *new_root;
                                count++;
                        }
-                        write_unlock(&fs->lock);
+                        spin_unlock(&fs->lock);
                }
                task_unlock(p);
        } while_each_thread(g, p);
@@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk)
        if (fs) {
                int kill;
                task_lock(tsk);
-                write_lock(&fs->lock);
+                spin_lock(&fs->lock);
                tsk->fs = NULL;
                kill = !--fs->users;
-                write_unlock(&fs->lock);
+                spin_unlock(&fs->lock);
                task_unlock(tsk);
                if (kill)
                        free_fs_struct(fs);
@@ -104,7 +104,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
        if (fs) {
                fs->users = 1;
                fs->in_exec = 0;
-                rwlock_init(&fs->lock);
+                spin_lock_init(&fs->lock);
                fs->umask = old->umask;
                get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
        }
@@ -121,10 +121,10 @@ int unshare_fs_struct(void)
                return -ENOMEM;
        task_lock(current);
-        write_lock(&fs->lock);
+        spin_lock(&fs->lock);
        kill = !--fs->users;
        current->fs = new_fs;
-        write_unlock(&fs->lock);
+        spin_unlock(&fs->lock);
        task_unlock(current);
        if (kill)
@@ -143,7 +143,7 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
        .users          = 1,
-        .lock           = __RW_LOCK_UNLOCKED(init_fs.lock),
+        .lock           = __SPIN_LOCK_UNLOCKED(init_fs.lock),
        .umask          = 0022,
 };
@@ -156,14 +156,14 @@ void daemonize_fs_struct(void)
                task_lock(current);
-                write_lock(&init_fs.lock);
+                spin_lock(&init_fs.lock);
                init_fs.users++;
-                write_unlock(&init_fs.lock);
+                spin_unlock(&init_fs.lock);
-                write_lock(&fs->lock);
+                spin_lock(&fs->lock);
                current->fs = &init_fs;
                kill = !--fs->users;
-                write_unlock(&fs->lock);
+                spin_unlock(&fs->lock);
                task_unlock(current);
                if (kill)
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 99800e56415..6bc9e3a5a69 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
                        if (error < 0)
                                goto failed;
                        inode->i_mode = mode;
+                        inode->i_ctime = CURRENT_TIME;
                        if (error == 0) {
                                posix_acl_release(acl);
                                acl = NULL;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index dd1e55535a4..f7dc9b5f9ef 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -104,7 +104,7 @@ static char *__dentry_name(struct dentry *dentry, char *name)
                __putname(name);
                return NULL;
        }
-        strncpy(name, root, PATH_MAX);
+        strlcpy(name, root, PATH_MAX);
        if (len > p - name) {
                __putname(name);
                return NULL;
@@ -876,7 +876,7 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
                char *path = dentry_name(dentry);
                int err = -ENOMEM;
                if (path) {
-                        int err = hostfs_do_readlink(path, link, PATH_MAX);
+                        err = hostfs_do_readlink(path, link, PATH_MAX);
                        if (err == PATH_MAX)
                                err = -E2BIG;
                        __putname(path);
diff --git a/fs/internal.h b/fs/internal.h
index 6b706bc60a6..a6910e91cee 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
 * 2 of the License, or (at your option) any later version.
 */
+#include <linux/lglock.h>
 struct super_block;
 struct linux_binprm;
 struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 extern void __init mnt_init(void);
-extern spinlock_t vfsmount_lock;
+DECLARE_BRLOCK(vfsmount_lock);
 /*
 * fs_struct.c
@@ -80,6 +83,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
 /*
 * file_table.c
 */
+extern void file_sb_list_add(struct file *f, struct super_block *sb);
+extern void file_sb_list_del(struct file *f);
 extern void mark_files_ro(struct super_block *);
 extern struct file *get_empty_filp(void);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index b0435dd0654..05a38b9c4c0 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
        int i;
-        ll_rw_block(SWRITE, *batch_count, bhs);
+        for (i = 0; i < *batch_count; i++)
+                write_dirty_buffer(bhs[i], WRITE);
        for (i = 0; i < *batch_count; i++) {
                struct buffer_head *bh = bhs[i];
                clear_buffer_jwrite(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 28a9ddaa0c4..95d8c11c929 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal,
        struct buffer_head *bh;
        journal_header_t *header;
        int ret;
-        int barrier_done = 0;
        if (is_journal_aborted(journal))
                return 0;
@@ -137,34 +136,36 @@ static int journal_write_commit_record(journal_t *journal,
        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
        if (journal->j_flags & JFS_BARRIER) {
-                set_buffer_ordered(bh);
+                ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
-                barrier_done = 1;
-        }
-        ret = sync_dirty_buffer(bh);
-        if (barrier_done)
-                clear_buffer_ordered(bh);
-        /* is it possible for another commit to fail at roughly
-         * the same time as this one?  If so, we don't want to
-         * trust the barrier flag in the super, but instead want
-         * to remember if we sent a barrier request
-         */
-        if (ret == -EOPNOTSUPP && barrier_done) {
-                char b[BDEVNAME_SIZE];
-                printk(KERN_WARNING
+                /*
-                        "JBD: barrier-based sync failed on %s - "
+                 * Is it possible for another commit to fail at roughly
-                        "disabling barriers\n",
+                 * the same time as this one?  If so, we don't want to
-                        bdevname(journal->j_dev, b));
+                 * trust the barrier flag in the super, but instead want
-                spin_lock(&journal->j_state_lock);
+                 * to remember if we sent a barrier request
-                journal->j_flags &= ~JFS_BARRIER;
+                 */
-                spin_unlock(&journal->j_state_lock);
+                if (ret == -EOPNOTSUPP) {
+                        char b[BDEVNAME_SIZE];
-                /* And try again, without the barrier */
+                        printk(KERN_WARNING
-                set_buffer_uptodate(bh);
+                                "JBD: barrier-based sync failed on %s - "
-                set_buffer_dirty(bh);
+                                "disabling barriers\n",
+                                bdevname(journal->j_dev, b));
+                        spin_lock(&journal->j_state_lock);
+                        journal->j_flags &= ~JFS_BARRIER;
+                        spin_unlock(&journal->j_state_lock);
+                        /* And try again, without the barrier */
+                        set_buffer_uptodate(bh);
+                        set_buffer_dirty(bh);
+                        ret = sync_dirty_buffer(bh);
+                }
+        } else {
                ret = sync_dirty_buffer(bh);
        }
        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f19ce94693d..2c4b1f109da 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1024,7 +1024,7 @@ void journal_update_superblock(journal_t *journal, int wait)
        if (wait)
                sync_dirty_buffer(bh);
        else
-                ll_rw_block(SWRITE, 1, &bh);
+                write_dirty_buffer(bh, WRITE);
 out:
        /* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad717328343..d29018307e2 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
        set_buffer_jwrite(bh);
        BUFFER_TRACE(bh, "write");
        set_buffer_dirty(bh);
-        ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+        write_dirty_buffer(bh, write_op);
 }
 #endif
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1c23a0f4e8a..5247e7ffdcb 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count)
 {
        int i;
-        ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
+        for (i = 0; i < *batch_count; i++)
+                write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE);
        for (i = 0; i < *batch_count; i++) {
                struct buffer_head *bh = journal->j_chkpt_bhs[i];
                clear_buffer_jwrite(bh);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f52e5e8049f..7c068c189d8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -101,7 +101,6 @@ static int journal_submit_commit_record(journal_t *journal,
        struct commit_header *tmp;
        struct buffer_head *bh;
        int ret;
-        int barrier_done = 0;
        struct timespec now = current_kernel_time();
        if (is_journal_aborted(journal))
@@ -136,30 +135,22 @@ static int journal_submit_commit_record(journal_t *journal,
        if (journal->j_flags & JBD2_BARRIER &&
            !JBD2_HAS_INCOMPAT_FEATURE(journal,
                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-                set_buffer_ordered(bh);
+                ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
-                barrier_done = 1;
+                if (ret == -EOPNOTSUPP) {
-        }
+                        printk(KERN_WARNING
-        ret = submit_bh(WRITE_SYNC_PLUG, bh);
+                               "JBD2: Disabling barriers on %s, "
-        if (barrier_done)
+                               "not supported by device\n", journal->j_devname);
-                clear_buffer_ordered(bh);
+                        write_lock(&journal->j_state_lock);
+                        journal->j_flags &= ~JBD2_BARRIER;
-        /* is it possible for another commit to fail at roughly
+                        write_unlock(&journal->j_state_lock);
-         * the same time as this one?  If so, we don't want to
-         * trust the barrier flag in the super, but instead want
-         * to remember if we sent a barrier request
-         */
-        if (ret == -EOPNOTSUPP && barrier_done) {
-                printk(KERN_WARNING
-                       "JBD2: Disabling barriers on %s, "
-                       "not supported by device\n", journal->j_devname);
-                write_lock(&journal->j_state_lock);
-                journal->j_flags &= ~JBD2_BARRIER;
-                write_unlock(&journal->j_state_lock);
-                /* And try again, without the barrier */
+                        /* And try again, without the barrier */
-                lock_buffer(bh);
+                        lock_buffer(bh);
-                set_buffer_uptodate(bh);
+                        set_buffer_uptodate(bh);
-                clear_buffer_dirty(bh);
+                        clear_buffer_dirty(bh);
+                        ret = submit_bh(WRITE_SYNC_PLUG, bh);
+                }
+        } else {
                ret = submit_bh(WRITE_SYNC_PLUG, bh);
        }
        *cbh = bh;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ad5866aaf0f..0e8014ea6b9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1124,7 +1124,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
                        set_buffer_uptodate(bh);
                }
        } else
-                ll_rw_block(SWRITE, 1, &bh);
+                write_dirty_buffer(bh, WRITE);
 out:
        /* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a360b06af2e..9ad321fd63f 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
        set_buffer_jwrite(bh);
        BUFFER_TRACE(bh, "write");
        set_buffer_dirty(bh);
-        ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+        write_dirty_buffer(bh, write_op);
 }
 #endif
diff --git a/fs/mbcache.c b/fs/mbcache.c
index cf4e6cdfd15..93444747237 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -80,6 +80,7 @@ struct mb_cache {
        struct list_head                c_cache_list;
        const char                      *c_name;
        atomic_t                        c_entry_count;
+        int                             c_max_entries;
        int                             c_bucket_bits;
        struct kmem_cache               *c_entry_cache;
        struct list_head                *c_block_hash;
@@ -243,6 +244,12 @@ mb_cache_create(const char *name, int bucket_bits)
        if (!cache->c_entry_cache)
                goto fail2;
+        /*
+         * Set an upper limit on the number of cache entries so that the hash
+         * chains won't grow too long.
+         */
+        cache->c_max_entries = bucket_count << 4;
        spin_lock(&mb_cache_spinlock);
        list_add(&cache->c_cache_list, &mb_cache_list);
        spin_unlock(&mb_cache_spinlock);
@@ -333,7 +340,6 @@ mb_cache_destroy(struct mb_cache *cache)
        kfree(cache);
 }
 /*
 * mb_cache_entry_alloc()
 *
@@ -345,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache)
 struct mb_cache_entry *
 mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
 {
-        struct mb_cache_entry *ce;
+        struct mb_cache_entry *ce = NULL;
-        ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+        if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
-        if (ce) {
+                spin_lock(&mb_cache_spinlock);
+                if (!list_empty(&mb_cache_lru_list)) {
+                        ce = list_entry(mb_cache_lru_list.next,
+                                        struct mb_cache_entry, e_lru_list);
+                        list_del_init(&ce->e_lru_list);
+                        __mb_cache_entry_unhash(ce);
+                }
+                spin_unlock(&mb_cache_spinlock);
+        }
+        if (!ce) {
+                ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+                if (!ce)
+                        return NULL;
                atomic_inc(&cache->c_entry_count);
                INIT_LIST_HEAD(&ce->e_lru_list);
                INIT_LIST_HEAD(&ce->e_block_list);
                ce->e_cache = cache;
-                ce->e_used = 1 + MB_CACHE_WRITER;
                ce->e_queued = 0;
        }
+        ce->e_used = 1 + MB_CACHE_WRITER;
        return ce;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 17ea76bf2fb..24896e83356 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -595,15 +595,16 @@ int follow_up(struct path *path)
 {
        struct vfsmount *parent;
        struct dentry *mountpoint;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        parent = path->mnt->mnt_parent;
        if (parent == path->mnt) {
-                spin_unlock(&vfsmount_lock);
+                br_read_unlock(vfsmount_lock);
                return 0;
        }
        mntget(parent);
        mountpoint = dget(path->mnt->mnt_mountpoint);
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        dput(path->dentry);
        path->dentry = mountpoint;
        mntput(path->mnt);
@@ -686,6 +687,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 }
 /*
+ * Allocate a dentry with name and parent, and perform a parent
+ * directory ->lookup on it. Returns the new dentry, or ERR_PTR
+ * on error. parent->d_inode->i_mutex must be held. d_lookup must
+ * have verified that no child exists while under i_mutex.
+ */
+static struct dentry *d_alloc_and_lookup(struct dentry *parent,
+                                struct qstr *name, struct nameidata *nd)
+{
+        struct inode *inode = parent->d_inode;
+        struct dentry *dentry;
+        struct dentry *old;
+        /* Don't create child dentry for a dead directory. */
+        if (unlikely(IS_DEADDIR(inode)))
+                return ERR_PTR(-ENOENT);
+        dentry = d_alloc(parent, name);
+        if (unlikely(!dentry))
+                return ERR_PTR(-ENOMEM);
+        old = inode->i_op->lookup(inode, dentry, nd);
+        if (unlikely(old)) {
+                dput(dentry);
+                dentry = old;
+        }
+        return dentry;
+}
+/*
 *  It's more convoluted than I'd like it to be, but... it's still fairly
 *  small and for now I'd prefer to have fast path as straight as possible.
 *  It _is_ time-critical.
@@ -706,9 +736,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
                        return err;
        }
+        /*
+         * Rename seqlock is not required here because in the off chance
+         * of a false negative due to a concurrent rename, we're going to
+         * do the non-racy lookup, below.
+         */
        dentry = __d_lookup(nd->path.dentry, name);
        if (!dentry)
                goto need_lookup;
+found:
        if (dentry->d_op && dentry->d_op->d_revalidate)
                goto need_revalidate;
 done:
@@ -724,56 +760,28 @@ need_lookup:
        mutex_lock(&dir->i_mutex);
        /*
         * First re-do the cached lookup just in case it was created
-         * while we waited for the directory semaphore..
+         * while we waited for the directory semaphore, or the first
+         * lookup failed due to an unrelated rename.
         *
-         * FIXME! This could use version numbering or similar to
+         * This could use version numbering or similar to avoid unnecessary
-         * avoid unnecessary cache lookups.
+         * cache lookups, but then we'd have to do the first lookup in the
-         *
+         * non-racy way. However in the common case here, everything should
-         * The "dcache_lock" is purely to protect the RCU list walker
+         * be hot in cache, so would it be a big win?
-         * from concurrent renames at this point (we mustn't get false
-         * negatives from the RCU list walk here, unlike the optimistic
-         * fast walk).
-         *
-         * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
         */
        dentry = d_lookup(parent, name);
-        if (!dentry) {
+        if (likely(!dentry)) {
-                struct dentry *new;
+                dentry = d_alloc_and_lookup(parent, name, nd);
-                /* Don't create child dentry for a dead directory. */
-                dentry = ERR_PTR(-ENOENT);
-                if (IS_DEADDIR(dir))
-                        goto out_unlock;
-                new = d_alloc(parent, name);
-                dentry = ERR_PTR(-ENOMEM);
-                if (new) {
-                        dentry = dir->i_op->lookup(dir, new, nd);
-                        if (dentry)
-                                dput(new);
-                        else
-                                dentry = new;
-                }
-out_unlock:
                mutex_unlock(&dir->i_mutex);
                if (IS_ERR(dentry))
                        goto fail;
                goto done;
        }
        /*
         * Uhhuh! Nasty case: the cache was re-populated while
         * we waited on the semaphore. Need to revalidate.
         */
        mutex_unlock(&dir->i_mutex);
-        if (dentry->d_op && dentry->d_op->d_revalidate) {
+        goto found;
-                dentry = do_revalidate(dentry, nd);
-                if (!dentry)
-                        dentry = ERR_PTR(-ENOENT);
-        }
-        if (IS_ERR(dentry))
-                goto fail;
-        goto done;
 need_revalidate:
        dentry = do_revalidate(dentry, nd);
@@ -1130,35 +1138,18 @@ static struct dentry *__lookup_hash(struct qstr *name,
                        goto out;
        }
-        dentry = __d_lookup(base, name);
+        /*
+         * Don't bother with __d_lookup: callers are for creat as
-        /* lockess __d_lookup may fail due to concurrent d_move()
+         * well as unlink, so a lot of the time it would cost
-         * in some unrelated directory, so try with d_lookup
+         * a double lookup.
         */
-        if (!dentry)
+        dentry = d_lookup(base, name);
-                dentry = d_lookup(base, name);
        if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
                dentry = do_revalidate(dentry, nd);
-        if (!dentry) {
+        if (!dentry)
-                struct dentry *new;
+                dentry = d_alloc_and_lookup(base, name, nd);
-                /* Don't create child dentry for a dead directory. */
-                dentry = ERR_PTR(-ENOENT);
-                if (IS_DEADDIR(inode))
-                        goto out;
-                new = d_alloc(base, name);
-                dentry = ERR_PTR(-ENOMEM);
-                if (!new)
-                        goto out;
-                dentry = inode->i_op->lookup(inode, new, nd);
-                if (!dentry)
-                        dentry = new;
-                else
-                        dput(new);
-        }
 out:
        return dentry;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 2e10cb19c5b..de402eb6eaf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -38,12 +40,10 @@
 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
 #define HASH_SIZE (1UL << HASH_SHIFT)
-/* spinlock for vfsmount related operations, inplace of dcache_lock */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
 static int mnt_id_start = 0;
 static int mnt_group_start = 1;
@@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem;
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, i.e. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+DEFINE_BRLOCK(vfsmount_lock);
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
        unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
-/* allocation is serialized by namespace_sem */
+/*
+ * allocation is serialized by namespace_sem, but we need the spinlock to
+ * serialize with freeing.
+ */
 static int mnt_alloc_id(struct vfsmount *mnt)
 {
        int res;
 retry:
        ida_pre_get(&mnt_id_ida, GFP_KERNEL);
-        spin_lock(&vfsmount_lock);
+        spin_lock(&mnt_id_lock);
        res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
        if (!res)
                mnt_id_start = mnt->mnt_id + 1;
-        spin_unlock(&vfsmount_lock);
+        spin_unlock(&mnt_id_lock);
        if (res == -EAGAIN)
                goto retry;
@@ -86,11 +99,11 @@ retry:
 static void mnt_free_id(struct vfsmount *mnt)
 {
        int id = mnt->mnt_id;
-        spin_lock(&vfsmount_lock);
+        spin_lock(&mnt_id_lock);
        ida_remove(&mnt_id_ida, id);
        if (mnt_id_start > id)
                mnt_id_start = id;
-        spin_unlock(&vfsmount_lock);
+        spin_unlock(&mnt_id_lock);
 }
 /*
@@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
        int ret = 0;
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        mnt->mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
         */
        smp_wmb();
        mnt->mnt_flags &= ~MNT_WRITE_HOLD;
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        return ret;
 }
 static void __mnt_unmake_readonly(struct vfsmount *mnt)
 {
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        mnt->mnt_flags &= ~MNT_READONLY;
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
 }
 void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
 /*
 * find the first or last mount at @dentry on vfsmount @mnt depending on
 * @dir. If @dir is set return the first mount else return the last mount.
+ * vfsmount_lock must be held for read or write.
 */
 struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
                              int dir)
@@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 struct vfsmount *lookup_mnt(struct path *path)
 {
        struct vfsmount *child_mnt;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
                mntget(child_mnt);
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        return child_mnt;
 }
@@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
        return mnt->mnt_ns == current->nsproxy->mnt_ns;
 }
+/*
+ * vfsmount lock must be held for write
+ */
 static void touch_mnt_namespace(struct mnt_namespace *ns)
 {
        if (ns) {
@@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
        }
 }
+/*
+ * vfsmount lock must be held for write
+ */
 static void __touch_mnt_namespace(struct mnt_namespace *ns)
 {
        if (ns && ns->event != event) {
@@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
        }
 }
+/*
+ * vfsmount lock must be held for write
+ */
 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 {
        old_path->dentry = mnt->mnt_mountpoint;
@@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
        old_path->dentry->d_mounted--;
 }
+/*
+ * vfsmount lock must be held for write
+ */
 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
                        struct vfsmount *child_mnt)
 {
@@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
        dentry->d_mounted++;
 }
+/*
+ * vfsmount lock must be held for write
+ */
 static void attach_mnt(struct vfsmount *mnt, struct path *path)
 {
        mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
 }
 /*
- * the caller must hold vfsmount_lock
+ * vfsmount lock must be held for write
 */
 static void commit_tree(struct vfsmount *mnt)
 {
@@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
 void mntput_no_expire(struct vfsmount *mnt)
 {
 repeat:
-        if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
+        if (atomic_add_unless(&mnt->mnt_count, -1, 1))
-                if (likely(!mnt->mnt_pinned)) {
+                return;
-                        spin_unlock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
-                        __mntput(mnt);
+        if (!atomic_dec_and_test(&mnt->mnt_count)) {
-                        return;
+                br_write_unlock(vfsmount_lock);
-                }
+                return;
-                atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+        }
-                mnt->mnt_pinned = 0;
+        if (likely(!mnt->mnt_pinned)) {
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
-                acct_auto_close_mnt(mnt);
+                __mntput(mnt);
-                goto repeat;
+                return;
        }
+        atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+        mnt->mnt_pinned = 0;
+        br_write_unlock(vfsmount_lock);
+        acct_auto_close_mnt(mnt);
+        goto repeat;
 }
 EXPORT_SYMBOL(mntput_no_expire);
 void mnt_pin(struct vfsmount *mnt)
 {
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        mnt->mnt_pinned++;
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_pin);
 void mnt_unpin(struct vfsmount *mnt)
 {
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        if (mnt->mnt_pinned) {
                atomic_inc(&mnt->mnt_count);
                mnt->mnt_pinned--;
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_unpin);
@@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
        struct mnt_namespace *ns = p->ns;
        int res = 0;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        if (p->event != ns->event) {
                p->event = ns->event;
                res = 1;
        }
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        return res;
 }
@@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
        int minimum_refs = 0;
        struct vfsmount *p;
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                actual_refs += atomic_read(&p->mnt_count);
                minimum_refs += 2;
        }
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        if (actual_refs > minimum_refs)
                return 0;
@@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
 {
        int ret = 1;
        down_read(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_read_lock(vfsmount_lock);
        if (propagate_mount_busy(mnt, 2))
                ret = 0;
-        spin_unlock(&vfsmount_lock);
+        br_read_unlock(vfsmount_lock);
        up_read(&namespace_sem);
        return ret;
 }
@@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head)
                if (mnt->mnt_parent != mnt) {
                        struct dentry *dentry;
                        struct vfsmount *m;
-                        spin_lock(&vfsmount_lock);
+                        br_write_lock(vfsmount_lock);
                        dentry = mnt->mnt_mountpoint;
                        m = mnt->mnt_parent;
                        mnt->mnt_mountpoint = mnt->mnt_root;
                        mnt->mnt_parent = mnt;
                        m->mnt_ghosts--;
-                        spin_unlock(&vfsmount_lock);
+                        br_write_unlock(vfsmount_lock);
                        dput(dentry);
                        mntput(m);
                }
@@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head)
        }
 }
+/*
+ * vfsmount lock must be held for write
+ * namespace_sem must be held for write
+ */
 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
        struct vfsmount *p;
@@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
        }
        down_write(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        event++;
        if (!(flags & MNT_DETACH))
@@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
                        umount_tree(mnt, 1, &umount_list);
                retval = 0;
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        return retval;
@@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
                        q = clone_mnt(p, p->mnt_root, flag);
                        if (!q)
                                goto Enomem;
-                        spin_lock(&vfsmount_lock);
+                        br_write_lock(vfsmount_lock);
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, &path);
-                        spin_unlock(&vfsmount_lock);
+                        br_write_unlock(vfsmount_lock);
                }
        }
        return res;
 Enomem:
        if (res) {
                LIST_HEAD(umount_list);
-                spin_lock(&vfsmount_lock);
+                br_write_lock(vfsmount_lock);
                umount_tree(res, 0, &umount_list);
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
                release_mounts(&umount_list);
        }
        return NULL;
@@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
 {
        LIST_HEAD(umount_list);
        down_write(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        umount_tree(mnt, 0, &umount_list);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
 }
@@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
        if (err)
                goto out_cleanup_ids;
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
                list_del_init(&child->mnt_hash);
                commit_tree(child);
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        return 0;
 out_cleanup_ids:
@@ -1466,10 +1506,10 @@ static int do_change_type(struct path *path, int flag)
                        goto out_unlock;
        }
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                change_mnt_propagation(m, type);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
 out_unlock:
        up_write(&namespace_sem);
@@ -1513,9 +1553,10 @@ static int do_loopback(struct path *path, char *old_name,
        err = graft_tree(mnt, path);
        if (err) {
                LIST_HEAD(umount_list);
-                spin_lock(&vfsmount_lock);
+                br_write_lock(vfsmount_lock);
                umount_tree(mnt, 0, &umount_list);
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
                release_mounts(&umount_list);
        }
@@ -1568,16 +1609,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
        else
                err = do_remount_sb(sb, flags, data, 0);
        if (!err) {
-                spin_lock(&vfsmount_lock);
+                br_write_lock(vfsmount_lock);
                mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
                path->mnt->mnt_flags = mnt_flags;
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
        }
        up_write(&sb->s_umount);
        if (!err) {
-                spin_lock(&vfsmount_lock);
+                br_write_lock(vfsmount_lock);
                touch_mnt_namespace(path->mnt->mnt_ns);
-                spin_unlock(&vfsmount_lock);
+                br_write_unlock(vfsmount_lock);
        }
        return err;
 }
@@ -1754,7 +1795,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                return;
        down_write(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
@@ -1773,7 +1814,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                touch_mnt_namespace(mnt->mnt_ns);
                umount_tree(mnt, 1, &umounts);
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umounts);
@@ -1830,6 +1871,8 @@ resume:
 /*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
+ *
+ * vfsmount_lock must be held for write
 */
 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 {
@@ -2048,9 +2091,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
                kfree(new_ns);
                return ERR_PTR(-ENOMEM);
        }
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2244,7 +2287,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                goto out2; /* not attached */
        /* make sure we can reach put_old from new_root */
        tmp = old.mnt;
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        if (tmp != new.mnt) {
                for (;;) {
                        if (tmp->mnt_parent == tmp)
@@ -2264,7 +2307,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        /* mount new_root on / */
        attach_mnt(new.mnt, &root_parent);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        chroot_fs_refs(&root, &new);
        error = 0;
        path_put(&root_parent);
@@ -2279,7 +2322,7 @@ out1:
 out0:
        return error;
 out3:
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        goto out2;
 }
@@ -2326,6 +2369,8 @@ void __init mnt_init(void)
        for (u = 0; u < HASH_SIZE; u++)
                INIT_LIST_HEAD(&mount_hashtable[u]);
+        br_lock_init(vfsmount_lock);
        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2344,9 +2389,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
        if (!atomic_dec_and_test(&ns->count))
                return;
        down_write(&namespace_sem);
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        umount_tree(ns->root, 0, &umount_list);
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        kfree(ns);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 26a510a7be0..6c2aad49d73 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -63,7 +63,6 @@ config NFS_V3_ACL
 config NFS_V4
        bool "NFS client support for NFS version 4"
        depends on NFS_FS
-        select RPCSEC_GSS_KRB5
        help
          This option enables support for version 4 of the NFS protocol
          (RFC 3530) in the kernel's NFS client.
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 29539ceeb74..e257172d438 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -140,6 +140,13 @@ nfs_opendir(struct inode *inode, struct file *filp)
        /* Call generic open code in order to cache credentials */
        res = nfs_open(inode, filp);
+        if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
+                /* This is a mountpoint, so d_revalidate will never
+                 * have been called, so we need to refresh the
+                 * inode (for close-open consistency) ourselves.
+                 */
+                __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+        }
        return res;
 }
@@ -1103,7 +1110,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
        if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
                goto no_open_dput;
        /* We can't create new files, or truncate existing ones here */
-        openflags &= ~(O_CREAT|O_TRUNC);
+        openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
        /*
         * Note: we're not holding inode->i_mutex and so may be racing with
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2d141a74ae8..eb51bd6201d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -323,7 +323,7 @@ nfs_file_fsync(struct file *file, int datasync)
        have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
        if (have_error)
                ret = xchg(&ctx->error, 0);
-        if (!ret)
+        if (!ret && status < 0)
                ret = status;
        return ret;
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7ffbb98ddec..089da5b5d20 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2036,7 +2036,8 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
        struct rpc_cred *cred;
        struct nfs4_state *state;
        struct dentry *res;
-        fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+        int open_flags = nd->intent.open.flags;
+        fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
        if (nd->flags & LOOKUP_CREATE) {
                attr.ia_mode = nd->intent.open.create_mode;
@@ -2044,8 +2045,9 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
                if (!IS_POSIXACL(dir))
                        attr.ia_mode &= ~current_umask();
        } else {
+                open_flags &= ~O_EXCL;
                attr.ia_valid = 0;
-                BUG_ON(nd->intent.open.flags & O_CREAT);
+                BUG_ON(open_flags & O_CREAT);
        }
        cred = rpc_lookup_cred();
@@ -2054,7 +2056,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
        parent = dentry->d_parent;
        /* Protect against concurrent sillydeletes */
        nfs_block_sillyrename(parent);
-        state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
+        state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred);
        put_rpccred(cred);
        if (IS_ERR(state)) {
                if (PTR_ERR(state) == -ENOENT) {
@@ -2273,8 +2275,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
 out:
        if (page)
                __free_page(page);
-        if (locations)
+        kfree(locations);
-                kfree(locations);
        return status;
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ee26316ad1f..ec3966e4706 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -655,6 +655,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
        if (nfss->options & NFS_OPTION_FSCACHE)
                seq_printf(m, ",fsc");
+        if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
+                if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+                        seq_printf(m, ",lookupcache=none");
+                else
+                        seq_printf(m, ",lookupcache=pos");
+        }
 }
 /*
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 503b9da159a..95932f523ae 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -69,7 +69,6 @@ config NFSD_V4
        depends on NFSD && PROC_FS && EXPERIMENTAL
        select NFSD_V3
        select FS_POSIX_ACL
-        select RPCSEC_GSS_KRB5
        help
          This option enables support in your system's NFS server for
          version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e7357104cf..3dfef062396 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2450,14 +2450,13 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 static __be32
 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
 {
-        u32 op_share_access, new_access;
+        u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+        bool new_access;
        __be32 status;
-        set_access(&new_access, stp->st_access_bmap);
+        new_access = !test_bit(op_share_access, &stp->st_access_bmap);
-        new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK;
        if (new_access) {
-                status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access);
+                status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
                if (status)
                        return status;
        }
@@ -2470,7 +2469,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
                return status;
        }
        /* remember the open */
-        op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
        __set_bit(op_share_access, &stp->st_access_bmap);
        __set_bit(open->op_share_deny, &stp->st_deny_bmap);
@@ -2983,7 +2981,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
                                *filpp = find_readable_file(stp->st_file);
                        else
                                *filpp = find_writeable_file(stp->st_file);
-                        BUG_ON(!*filpp); /* assured by check_openmode */
                }
        }
        status = nfs_ok;
@@ -3561,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        struct nfs4_stateowner *open_sop = NULL;
        struct nfs4_stateowner *lock_sop = NULL;
        struct nfs4_stateid *lock_stp;
-        struct file *filp;
+        struct nfs4_file *fp;
+        struct file *filp = NULL;
        struct file_lock file_lock;
        struct file_lock conflock;
        __be32 status = 0;
@@ -3591,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                 * lock stateid.
                 */
                struct nfs4_stateid *open_stp = NULL;
-                struct nfs4_file *fp;
                
                status = nfserr_stale_clientid;
                if (!nfsd4_has_session(cstate) &&
@@ -3634,6 +3631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                if (status)
                        goto out;
                lock_sop = lock->lk_replay_owner;
+                fp = lock_stp->st_file;
        }
        /* lock->lk_replay_owner and lock_stp have been created or found */
@@ -3648,13 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        switch (lock->lk_type) {
                case NFS4_READ_LT:
                case NFS4_READW_LT:
-                        filp = find_readable_file(lock_stp->st_file);
+                        if (find_readable_file(lock_stp->st_file)) {
+                                nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
+                                filp = find_readable_file(lock_stp->st_file);
+                        }
                        file_lock.fl_type = F_RDLCK;
                        cmd = F_SETLK;
                break;
                case NFS4_WRITE_LT:
                case NFS4_WRITEW_LT:
-                        filp = find_writeable_file(lock_stp->st_file);
+                        if (find_writeable_file(lock_stp->st_file)) {
+                                nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
+                                filp = find_writeable_file(lock_stp->st_file);
+                        }
                        file_lock.fl_type = F_WRLCK;
                        cmd = F_SETLK;
                break;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 7731a75971d..322518c88e4 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -363,23 +363,23 @@ struct nfs4_file {
 * at all? */
 static inline struct file *find_writeable_file(struct nfs4_file *f)
 {
-        if (f->fi_fds[O_RDWR])
+        if (f->fi_fds[O_WRONLY])
-                return f->fi_fds[O_RDWR];
+                return f->fi_fds[O_WRONLY];
-        return f->fi_fds[O_WRONLY];
+        return f->fi_fds[O_RDWR];
 }
 static inline struct file *find_readable_file(struct nfs4_file *f)
 {
-        if (f->fi_fds[O_RDWR])
+        if (f->fi_fds[O_RDONLY])
-                return f->fi_fds[O_RDWR];
+                return f->fi_fds[O_RDONLY];
-        return f->fi_fds[O_RDONLY];
+        return f->fi_fds[O_RDWR];
 }
 static inline struct file *find_any_file(struct nfs4_file *f)
 {
        if (f->fi_fds[O_RDWR])
                return f->fi_fds[O_RDWR];
-        else if (f->fi_fds[O_RDWR])
+        else if (f->fi_fds[O_WRONLY])
                return f->fi_fds[O_WRONLY];
        else
                return f->fi_fds[O_RDONLY];
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 96360a83cb9..661a6cf8e82 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2033,15 +2033,17 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-        struct path path = {
-                .mnt    = fhp->fh_export->ex_path.mnt,
-                .dentry = fhp->fh_dentry,
-        };
        __be32 err;
        err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
-        if (!err && vfs_statfs(&path, stat))
+        if (!err) {
-                err = nfserr_io;
+                struct path path = {
+                        .mnt    = fhp->fh_export->ex_path.mnt,
+                        .dentry = fhp->fh_dentry,
+                };
+                if (vfs_statfs(&path, stat))
+                        err = nfserr_io;
+        }
        return err;
 }
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1fa86b9df73..922263393c7 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -175,24 +175,24 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
        int err;
-        int barrier_done = 0;
-        if (nilfs_test_opt(sbi, BARRIER)) {
-                set_buffer_ordered(nilfs->ns_sbh[0]);
-                barrier_done = 1;
-        }
 retry:
        set_buffer_dirty(nilfs->ns_sbh[0]);
-        err = sync_dirty_buffer(nilfs->ns_sbh[0]);
-        if (err == -EOPNOTSUPP && barrier_done) {
+        if (nilfs_test_opt(sbi, BARRIER)) {
-                nilfs_warning(sbi->s_super, __func__,
+                err = __sync_dirty_buffer(nilfs->ns_sbh[0],
-                              "barrier-based sync failed. "
+                                          WRITE_SYNC | WRITE_BARRIER);
-                              "disabling barriers\n");
+                if (err == -EOPNOTSUPP) {
-                nilfs_clear_opt(sbi, BARRIER);
+                        nilfs_warning(sbi->s_super, __func__,
-                barrier_done = 0;
+                                      "barrier-based sync failed. "
-                clear_buffer_ordered(nilfs->ns_sbh[0]);
+                                      "disabling barriers\n");
-                goto retry;
+                        nilfs_clear_opt(sbi, BARRIER);
+                        goto retry;
+                }
+        } else {
+                err = sync_dirty_buffer(nilfs->ns_sbh[0]);
        }
        if (unlikely(err)) {
                printk(KERN_ERR
                       "NILFS: unable to write superblock (err=%d)\n", err);
@@ -400,9 +400,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
        list_add(&sbi->s_list, &nilfs->ns_supers);
        up_write(&nilfs->ns_super_sem);
+        err = -ENOMEM;
        sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
        if (!sbi->s_ifile)
-                return -ENOMEM;
+                goto delist;
        down_read(&nilfs->ns_segctor_sem);
        err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -433,6 +434,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
        nilfs_mdt_destroy(sbi->s_ifile);
        sbi->s_ifile = NULL;
+ delist:
        down_write(&nilfs->ns_super_sem);
        list_del_init(&sbi->s_list);
        up_write(&nilfs->ns_super_sem);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 37de1f062d8..4317f177ea7 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -608,11 +608,11 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
                return -EINVAL;
        }
-        if (swp) {
+        if (!valid[!swp])
                printk(KERN_WARNING "NILFS warning: broken superblock. "
                       "using spare superblock.\n");
+        if (swp)
                nilfs_swap_super_block(nilfs);
-        }
        nilfs->ns_sbwcount = 0;
        nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
@@ -775,6 +775,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
                                                   start * sects_per_block,
                                                   nblocks * sects_per_block,
                                                   GFP_NOFS,
+                                                   BLKDEV_IFL_WAIT |
                                                   BLKDEV_IFL_BARRIER);
                        if (ret < 0)
                                return ret;
@@ -785,7 +786,8 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
                ret = blkdev_issue_discard(nilfs->ns_bdev,
                                           start * sects_per_block,
                                           nblocks * sects_per_block,
-                                           GFP_NOFS, BLKDEV_IFL_BARRIER);
+                                           GFP_NOFS,
+                                          BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
        return ret;
 }
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 756566fe844..85366c78cc3 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -165,9 +165,6 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
                 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
                 inode_mark, vfsmnt_mark, event_mask, data, data_type);
-        pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n",
-                 __func__, group, vfsmnt_mark, inode_mark, event_mask);
        /* sorry, fanotify only gives a damn about files and dirs */
        if (!S_ISREG(to_tell->i_mode) &&
            !S_ISDIR(to_tell->i_mode))
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 032b837fcd1..5ed8e58d7bf 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -195,6 +195,14 @@ static int prepare_for_access_response(struct fsnotify_group *group,
        re->fd = fd;
        mutex_lock(&group->fanotify_data.access_mutex);
+        if (group->fanotify_data.bypass_perm) {
+                mutex_unlock(&group->fanotify_data.access_mutex);
+                kmem_cache_free(fanotify_response_event_cache, re);
+                event->response = FAN_ALLOW;
+                return 0;
+        }
+                
        list_add_tail(&re->list, &group->fanotify_data.access_list);
        mutex_unlock(&group->fanotify_data.access_mutex);
@@ -364,9 +372,28 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
        struct fsnotify_group *group = file->private_data;
+        struct fanotify_response_event *re, *lre;
        pr_debug("%s: file=%p group=%p\n", __func__, file, group);
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+        mutex_lock(&group->fanotify_data.access_mutex);
+        group->fanotify_data.bypass_perm = true;
+        list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
+                pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
+                         re, re->event);
+                list_del_init(&re->list);
+                re->event->response = FAN_ALLOW;
+                kmem_cache_free(fanotify_response_event_cache, re);
+        }
+        mutex_unlock(&group->fanotify_data.access_mutex);
+        wake_up(&group->fanotify_data.access_waitq);
+#endif
        /* matches the fanotify_init->fsnotify_alloc_group */
        fsnotify_put_group(group);
@@ -614,7 +641,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
                __func__, flags, event_f_flags);
        if (!capable(CAP_SYS_ADMIN))
-                return -EACCES;
+                return -EPERM;
        if (flags & ~FAN_ALL_INIT_FLAGS)
                return -EINVAL;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 3970392b272..36802420d69 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -148,13 +148,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
                         const unsigned char *file_name,
                         struct fsnotify_event **event)
 {
-        struct fsnotify_group *group = inode_mark->group;
+        struct fsnotify_group *group = NULL;
-        __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+        __u32 inode_test_mask = 0;
-        __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+        __u32 vfsmount_test_mask = 0;
-        pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p"
+        if (unlikely(!inode_mark && !vfsmount_mark)) {
-                 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell,
+                BUG();
-                 mnt, inode_mark, mask, data, data_is, cookie, *event);
+                return 0;
+        }
        /* clear ignored on inode modification */
        if (mask & FS_MODIFY) {
@@ -168,18 +169,29 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
        /* does the inode mark tell us to do something? */
        if (inode_mark) {
+                group = inode_mark->group;
+                inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
                inode_test_mask &= inode_mark->mask;
                inode_test_mask &= ~inode_mark->ignored_mask;
        }
        /* does the vfsmount_mark tell us to do something? */
        if (vfsmount_mark) {
+                vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+                group = vfsmount_mark->group;
                vfsmount_test_mask &= vfsmount_mark->mask;
                vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
                if (inode_mark)
                        vfsmount_test_mask &= ~inode_mark->ignored_mask;
        }
+        pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
+                 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
+                 " data=%p data_is=%d cookie=%d event=%p\n",
+                 __func__, group, to_tell, mnt, mask, inode_mark,
+                 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
+                 data_is, cookie, *event);
        if (!inode_test_mask && !vfsmount_test_mask)
                return 0;
@@ -207,13 +219,12 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
             const unsigned char *file_name, u32 cookie)
 {
-        struct hlist_node *inode_node, *vfsmount_node;
+        struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
        struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
        struct fsnotify_group *inode_group, *vfsmount_group;
        struct fsnotify_event *event = NULL;
        struct vfsmount *mnt;
        int idx, ret = 0;
-        bool used_inode = false, used_vfsmount = false;
        /* global tests shouldn't care about events on child only the specific event */
        __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
@@ -238,57 +249,50 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
            (test_mask & to_tell->i_fsnotify_mask))
                inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
                                              &fsnotify_mark_srcu);
-        else
-                inode_node = NULL;
-        if (mnt) {
+        if (mnt && ((mask & FS_MODIFY) ||
-                if ((mask & FS_MODIFY) ||
+                    (test_mask & mnt->mnt_fsnotify_mask))) {
-                    (test_mask & mnt->mnt_fsnotify_mask))
+                vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
-                        vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
+                                                 &fsnotify_mark_srcu);
-                                                         &fsnotify_mark_srcu);
+                inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
-                else
+                                              &fsnotify_mark_srcu);
-                        vfsmount_node = NULL;
-        } else {
-                mnt = NULL;
-                vfsmount_node = NULL;
        }
        while (inode_node || vfsmount_node) {
+                inode_group = vfsmount_group = NULL;
                if (inode_node) {
                        inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
                                                 struct fsnotify_mark, i.i_list);
                        inode_group = inode_mark->group;
-                } else
+                }
-                        inode_group = (void *)-1;
                if (vfsmount_node) {
                        vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
                                                        struct fsnotify_mark, m.m_list);
                        vfsmount_group = vfsmount_mark->group;
-                } else
+                }
-                        vfsmount_group = (void *)-1;
-                if (inode_group < vfsmount_group) {
+                if (inode_group > vfsmount_group) {
                        /* handle inode */
                        send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
                                      data_is, cookie, file_name, &event);
-                        used_inode = true;
+                        /* we didn't use the vfsmount_mark */
-                } else if (vfsmount_group < inode_group) {
+                        vfsmount_group = NULL;
+                } else if (vfsmount_group > inode_group) {
                        send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
                                      data_is, cookie, file_name, &event);
-                        used_vfsmount = true;
+                        inode_group = NULL;
                } else {
                        send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
                                      mask, data, data_is, cookie, file_name,
                                      &event);
-                        used_vfsmount = true;
-                        used_inode = true;
                }
-                if (used_inode)
+                if (inode_group)
                        inode_node = srcu_dereference(inode_node->next,
                                                      &fsnotify_mark_srcu);
-                if (used_vfsmount)
+                if (vfsmount_group)
                        vfsmount_node = srcu_dereference(vfsmount_node->next,
                                                         &fsnotify_mark_srcu);
        }
diff --git a/fs/open.c b/fs/open.c
index 630715f9f73..d74e1983e8d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -675,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
        f->f_path.mnt = mnt;
        f->f_pos = 0;
        f->f_op = fops_get(inode->i_fop);
-        file_move(f, &inode->i_sb->s_files);
+        file_sb_list_add(f, inode->i_sb);
        error = security_dentry_open(f, cred);
        if (error)
@@ -721,7 +721,7 @@ cleanup_all:
                        mnt_drop_write(mnt);
                }
        }
-        file_kill(f);
+        file_sb_list_del(f);
        f->f_path.dentry = NULL;
        f->f_path.mnt = NULL;
 cleanup_file:
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a8314..8066b8dd748 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
        return 0;
 }
+/*
+ * vfsmount lock must be held for write
+ */
 void change_mnt_propagation(struct vfsmount *mnt, int type)
 {
        if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
                prev_src_mnt  = child;
        }
 out:
-        spin_lock(&vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        while (!list_empty(&tmp_list)) {
                child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
                umount_tree(child, 0, &umount_list);
        }
-        spin_unlock(&vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        release_mounts(&umount_list);
        return ret;
 }
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
 * other mounts its parent propagates to.
 * Check if any of these mounts that **do not have submounts**
 * have more references than 'refcnt'. If so return busy.
+ *
+ * vfsmount lock must be held for read or write
 */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
 * collect all mounts that receive propagation from the mount in @list,
 * and return these additional mounts in the same list.
 * @list: the list of mounts to be unmounted.
+ *
+ * vfsmount lock must be held for write
 */
 int propagate_umount(struct list_head *list)
 {
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ae35413dcbe..caa758377d6 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -83,6 +83,7 @@ void reiserfs_evict_inode(struct inode *inode)
        dquot_drop(inode);
        inode->i_blocks = 0;
        reiserfs_write_unlock_once(inode->i_sb, depth);
+        return;
 no_delete:
        end_writeback(inode);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1ec952b1f03..812e2c05aa2 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2311,7 +2311,7 @@ static int journal_read_transaction(struct super_block *sb,
        /* flush out the real blocks */
        for (i = 0; i < get_desc_trans_len(desc); i++) {
                set_buffer_dirty(real_blocks[i]);
-                ll_rw_block(SWRITE, 1, real_blocks + i);
+                write_dirty_buffer(real_blocks[i], WRITE);
        }
        for (i = 0; i < get_desc_trans_len(desc); i++) {
                wait_on_buffer(real_blocks[i]);
diff --git a/fs/super.c b/fs/super.c
index 9674ab2c871..8819e3a7ff2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
                        s = NULL;
                        goto out;
                }
+#ifdef CONFIG_SMP
+                s->s_files = alloc_percpu(struct list_head);
+                if (!s->s_files) {
+                        security_sb_free(s);
+                        kfree(s);
+                        s = NULL;
+                        goto out;
+                } else {
+                        int i;
+                        for_each_possible_cpu(i)
+                                INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+                }
+#else
                INIT_LIST_HEAD(&s->s_files);
+#endif
                INIT_LIST_HEAD(&s->s_instances);
                INIT_HLIST_HEAD(&s->s_anon);
                INIT_LIST_HEAD(&s->s_inodes);
@@ -108,6 +123,9 @@ out:
 */
 static inline void destroy_super(struct super_block *s)
 {
+#ifdef CONFIG_SMP
+        free_percpu(s->s_files);
+#endif
        security_sb_free(s);
        kfree(s->s_subtype);
        kfree(s->s_options);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 048484fb10d..46f7a807bbc 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -114,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
        
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        sb->s_dirt = 1;
        
        unlock_super (sb);
@@ -207,10 +205,8 @@ do_more:
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        if (overflow) {
                fragment += count;
@@ -558,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
        
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        sb->s_dirt = 1;
        UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
@@ -680,10 +674,8 @@ cg_found:
 succed:
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        sb->s_dirt = 1;
        result += cgno * uspi->s_fpg;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 428017e018f..2eabf04af3d 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -113,10 +113,8 @@ void ufs_free_inode (struct inode * inode)
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        
        sb->s_dirt = 1;
        unlock_super (sb);
@@ -156,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
        fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
        ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer(UCPI_UBH(ucpi));
-        }
        UFSD("EXIT\n");
 }
@@ -290,10 +286,8 @@ cg_found:
        }
        ubh_mark_buffer_dirty (USPI_UBH(uspi));
        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        if (sb->s_flags & MS_SYNCHRONOUS) {
+        if (sb->s_flags & MS_SYNCHRONOUS)
-                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+                ubh_sync_block(UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH(ucpi));
-        }
        sb->s_dirt = 1;
        inode->i_ino = cg * uspi->s_ipg + bit;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 34d5cb13532..a58f9155fc9 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -243,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
                ubh_bforget(ind_ubh);
                ind_ubh = NULL;
        }
-        if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
+        if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
-                ubh_ll_rw_block(SWRITE, ind_ubh);
+                ubh_sync_block(ind_ubh);
-                ubh_wait_on_buffer (ind_ubh);
-        }
        ubh_brelse (ind_ubh);
        
        UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -307,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
                ubh_bforget(dind_bh);
                dind_bh = NULL;
        }
-        if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
+        if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
-                ubh_ll_rw_block(SWRITE, dind_bh);
+                ubh_sync_block(dind_bh);
-                ubh_wait_on_buffer (dind_bh);
-        }
        ubh_brelse (dind_bh);
        
        UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -367,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode)
                ubh_bforget(tind_bh);
                tind_bh = NULL;
        }
-        if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
+        if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
-                ubh_ll_rw_block(SWRITE, tind_bh);
+                ubh_sync_block(tind_bh);
-                ubh_wait_on_buffer (tind_bh);
-        }
        ubh_brelse (tind_bh);
        
        UFSD("EXIT: ino %lu\n", inode->i_ino);
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 85a7fc9e4a4..d2c36d53fe6 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
        }
 }
-void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
+void ubh_sync_block(struct ufs_buffer_head *ubh)
 {
-        if (!ubh)
+        if (ubh) {
-                return;
+                unsigned i;
-        ll_rw_block(rw, ubh->count, ubh->bh);
+                for (i = 0; i < ubh->count; i++)
-}
+                        write_dirty_buffer(ubh->bh[i], WRITE);
-void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
+                for (i = 0; i < ubh->count; i++)
-{
+                        wait_on_buffer(ubh->bh[i]);
-        unsigned i;
+        }
-        if (!ubh)
-                return;
-        for ( i = 0; i < ubh->count; i++ )
-                wait_on_buffer (ubh->bh[i]);
 }
 void ubh_bforget (struct ufs_buffer_head * ubh)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 0466036912f..9f8775ce381 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -269,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
 extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
 extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
 extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
-extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
+extern void ubh_sync_block(struct ufs_buffer_head *);
-extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
 extern void ubh_bforget (struct ufs_buffer_head *);
 extern int  ubh_buffer_dirty (struct ufs_buffer_head *);
 #define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 15412fe15c3..b552f816de1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -852,8 +852,8 @@ xfs_convert_page(
                SetPageUptodate(page);
        if (count) {
-                wbc->nr_to_write--;
+                if (--wbc->nr_to_write <= 0 &&
-                if (wbc->nr_to_write <= 0)
+                    wbc->sync_mode == WB_SYNC_NONE)
                        done = 1;
        }
        xfs_start_page_writeback(page, !page_dirty, count);
@@ -1068,7 +1068,7 @@ xfs_vm_writepage(
         * by themselves.
         */
        if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
-                goto out_fail;
+                goto redirty;
        /*
         * We need a transaction if there are delalloc or unwritten buffers
@@ -1080,7 +1080,7 @@ xfs_vm_writepage(
         */
        xfs_count_page_state(page, &delalloc, &unwritten);
        if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
-                goto out_fail;
+                goto redirty;
        /* Is this page beyond the end of the file? */
        offset = i_size_read(inode);
@@ -1245,12 +1245,15 @@ error:
        if (iohead)
                xfs_cancel_ioend(iohead);
+        if (err == -EAGAIN)
+                goto redirty;
        xfs_aops_discard_page(page);
        ClearPageUptodate(page);
        unlock_page(page);
        return err;
-out_fail:
+redirty:
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
        return 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 15c35b62ff1..a4e07974955 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1226,6 +1226,7 @@ xfs_fs_statfs(
        struct xfs_inode        *ip = XFS_I(dentry->d_inode);
        __uint64_t              fakeinos, id;
        xfs_extlen_t            lsize;
+        __int64_t               ffree;
        statp->f_type = XFS_SB_MAGIC;
        statp->f_namelen = MAXNAMELEN - 1;
@@ -1249,7 +1250,11 @@ xfs_fs_statfs(
                statp->f_files = min_t(typeof(statp->f_files),
                                        statp->f_files,
                                        mp->m_maxicount);
-        statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+        /* make sure statp->f_ffree does not underflow */
+        ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+        statp->f_ffree = max_t(__int64_t, ffree, 0);
        spin_unlock(&mp->m_sb_lock);
        if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
@@ -1402,7 +1407,7 @@ xfs_fs_freeze(
        xfs_save_resvblks(mp);
        xfs_quiesce_attr(mp);
-        return -xfs_fs_log_dummy(mp);
+        return -xfs_fs_log_dummy(mp, SYNC_WAIT);
 }
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index dfcbd98d159..d59c4a65d49 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -34,6 +34,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
+#include "xfs_fsops.h"
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -341,38 +342,6 @@ xfs_sync_attr(
 }
 STATIC int
-xfs_commit_dummy_trans(
-        struct xfs_mount        *mp,
-        uint                    flags)
-{
-        struct xfs_inode        *ip = mp->m_rootip;
-        struct xfs_trans        *tp;
-        int                     error;
-        /*
-         * Put a dummy transaction in the log to tell recovery
-         * that all others are OK.
-         */
-        tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-        error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                return error;
-        }
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip);
-        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        error = xfs_trans_commit(tp, 0);
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        /* the log force ensures this transaction is pushed to disk */
-        xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-        return error;
-}
-STATIC int
 xfs_sync_fsdata(
        struct xfs_mount        *mp)
 {
@@ -432,7 +401,7 @@ xfs_quiesce_data(
        /* mark the log as covered if needed */
        if (xfs_log_need_covered(mp))
-                error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
+                error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
        /* flush data-only devices */
        if (mp->m_rtdev_targp)
@@ -563,7 +532,7 @@ xfs_flush_inodes(
 /*
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle.
+ * filesystem is idle and not frozen.
 */
 STATIC void
 xfs_sync_worker(
@@ -577,8 +546,9 @@ xfs_sync_worker(
                xfs_reclaim_inodes(mp, 0);
                /* dgc: errors ignored here */
                error = xfs_qm_sync(mp, SYNC_TRYLOCK);
-                if (xfs_log_need_covered(mp))
+                if (mp->m_super->s_frozen == SB_UNFROZEN &&
-                        error = xfs_commit_dummy_trans(mp, 0);
+                    xfs_log_need_covered(mp))
+                        error = xfs_fs_log_dummy(mp, 0);
        }
        mp->m_sync_seq++;
        wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dbca5f5c37b..43b1d569933 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -604,31 +604,36 @@ out:
        return 0;
 }
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead.
+ */
 int
 xfs_fs_log_dummy(
-        xfs_mount_t     *mp)
+        xfs_mount_t     *mp,
+        int             flags)
 {
        xfs_trans_t     *tp;
-        xfs_inode_t     *ip;
        int             error;
        tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-        error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+        error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+                                        XFS_DEFAULT_LOG_COUNT);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
        }
-        ip = mp->m_rootip;
+        /* log the UUID because it is an unchanging field */
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_mod_sb(tp, XFS_SB_UUID);
+        if (flags & SYNC_WAIT)
-        xfs_trans_ijoin(tp, ip);
+                xfs_trans_set_sync(tp);
-        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+        return xfs_trans_commit(tp, 0);
-        xfs_trans_set_sync(tp);
-        error = xfs_trans_commit(tp, 0);
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        return error;
 }
 int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c..a786c5212c1 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
                                xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
 #endif  /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index abf80ae1e95..5371d2dc360 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1213,7 +1213,6 @@ xfs_imap_lookup(
        struct xfs_inobt_rec_incore rec;
        struct xfs_btree_cur    *cur;
        struct xfs_buf          *agbp;
-        xfs_agino_t             startino;
        int                     error;
        int                     i;
@@ -1227,13 +1226,13 @@ xfs_imap_lookup(
        }
        /*
-         * derive and lookup the exact inode record for the given agino. If the
+         * Lookup the inode record for the given agino. If the record cannot be
-         * record cannot be found, then it's an invalid inode number and we
+         * found, then it's an invalid inode number and we should abort. Once
-         * should abort.
+         * we have a record, we need to ensure it contains the inode number
+         * we are looking up.
         */
        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-        startino = agino & ~(XFS_IALLOC_INODES(mp) - 1);
+        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
-        error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
        if (!error) {
                if (i)
                        error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1246,6 +1245,11 @@ xfs_imap_lookup(
        if (error)
                return error;
+        /* check that the returned record contains the required inode */
+        if (rec.ir_startino > agino ||
+            rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+                return EINVAL;
        /* for untrusted inodes check it is allocated first */
        if ((flags & XFS_IGET_UNTRUSTED) &&
            (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 68415cb4f23..34798f391c4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1914,6 +1914,11 @@ xfs_iunlink_remove(
        return 0;
 }
+/*
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
 STATIC void
 xfs_ifree_cluster(
        xfs_inode_t     *free_ip,
@@ -1945,8 +1950,6 @@ xfs_ifree_cluster(
        }
        for (j = 0; j < nbufs; j++, inum += ninodes) {
-                int     found = 0;
                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
                                         XFS_INO_TO_AGBNO(mp, inum));
@@ -1965,7 +1968,9 @@ xfs_ifree_cluster(
                /*
                 * Walk the inodes already attached to the buffer and mark them
                 * stale. These will all have the flush locks held, so an
-                 * in-memory inode walk can't lock them.
+                 * in-memory inode walk can't lock them. By marking them all
+                 * stale first, we will not attempt to lock them in the loop
+                 * below as the XFS_ISTALE flag will be set.
                 */
                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
                while (lip) {
@@ -1977,11 +1982,11 @@ xfs_ifree_cluster(
                                                        &iip->ili_flush_lsn,
                                                        &iip->ili_item.li_lsn);
                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-                                found++;
                        }
                        lip = lip->li_bio_list;
                }
                /*
                 * For each inode in memory attempt to add it to the inode
                 * buffer and set it up for being staled on buffer IO
@@ -1993,6 +1998,7 @@ xfs_ifree_cluster(
                 * even trying to lock them.
                 */
                for (i = 0; i < ninodes; i++) {
+retry:
                        read_lock(&pag->pag_ici_lock);
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2003,38 +2009,36 @@ xfs_ifree_cluster(
                                continue;
                        }
-                        /* don't try to lock/unlock the current inode */
+                        /*
+                         * Don't try to lock/unlock the current inode, but we
+                         * _cannot_ skip the other inodes that we did not find
+                         * in the list attached to the buffer and are not
+                         * already marked stale. If we can't lock it, back off
+                         * and retry.
+                         */
                        if (ip != free_ip &&
                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
                                read_unlock(&pag->pag_ici_lock);
-                                continue;
+                                delay(1);
+                                goto retry;
                        }
                        read_unlock(&pag->pag_ici_lock);
-                        if (!xfs_iflock_nowait(ip)) {
+                        xfs_iflock(ip);
-                                if (ip != free_ip)
-                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                continue;
-                        }
                        xfs_iflags_set(ip, XFS_ISTALE);
-                        if (xfs_inode_clean(ip)) {
-                                ASSERT(ip != free_ip);
-                                xfs_ifunlock(ip);
-                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                continue;
-                        }
+                        /*
+                         * we don't need to attach clean inodes or those only
+                         * with unlogged changes (which we throw away, anyway).
+                         */
                        iip = ip->i_itemp;
-                        if (!iip) {
+                        if (!iip || xfs_inode_clean(ip)) {
-                                /* inode with unlogged changes only */
                                ASSERT(ip != free_ip);
                                ip->i_update_core = 0;
                                xfs_ifunlock(ip);
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                continue;
                        }
-                        found++;
                        iip->ili_last_fields = iip->ili_format.ilf_fields;
                        iip->ili_format.ilf_fields = 0;
@@ -2049,8 +2053,7 @@ xfs_ifree_cluster(
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                }
-                if (found)
+                xfs_trans_stale_inode_buf(tp, bp);
-                        xfs_trans_stale_inode_buf(tp, bp);
                xfs_trans_binval(tp, bp);
        }
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 925d572bf0f..33f718f92a4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3015,7 +3015,8 @@ _xfs_log_force(
        XFS_STATS_INC(xs_log_force);
-        xlog_cil_push(log, 1);
+        if (log->l_cilp)
+                xlog_cil_force(log);
        spin_lock(&log->l_icloglock);
@@ -3167,7 +3168,7 @@ _xfs_log_force_lsn(
        XFS_STATS_INC(xs_log_force);
        if (log->l_cilp) {
-                lsn = xlog_cil_push_lsn(log, lsn);
+                lsn = xlog_cil_force_lsn(log, lsn);
                if (lsn == NULLCOMMITLSN)
                        return 0;
        }
@@ -3724,7 +3725,7 @@ xfs_log_force_umount(
         * call below.
         */
        if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
-                xlog_cil_push(log, 1);
+                xlog_cil_force(log);
        /*
         * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 31e4ea2d19a..ed575fb4b49 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -68,6 +68,7 @@ xlog_cil_init(
        ctx->sequence = 1;
        ctx->cil = cil;
        cil->xc_ctx = ctx;
+        cil->xc_current_sequence = ctx->sequence;
        cil->xc_log = log;
        log->l_cilp = cil;
@@ -269,15 +270,10 @@ xlog_cil_insert(
 static void
 xlog_cil_format_items(
        struct log              *log,
-        struct xfs_log_vec      *log_vector,
+        struct xfs_log_vec      *log_vector)
-        struct xlog_ticket      *ticket,
-        xfs_lsn_t               *start_lsn)
 {
        struct xfs_log_vec *lv;
-        if (start_lsn)
-                *start_lsn = log->l_cilp->xc_ctx->sequence;
        ASSERT(log_vector);
        for (lv = log_vector; lv; lv = lv->lv_next) {
                void    *ptr;
@@ -301,9 +297,24 @@ xlog_cil_format_items(
                        ptr += vec->i_len;
                }
                ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+        }
+}
+static void
+xlog_cil_insert_items(
+        struct log              *log,
+        struct xfs_log_vec      *log_vector,
+        struct xlog_ticket      *ticket,
+        xfs_lsn_t               *start_lsn)
+{
+        struct xfs_log_vec *lv;
+        if (start_lsn)
+                *start_lsn = log->l_cilp->xc_ctx->sequence;
+        ASSERT(log_vector);
+        for (lv = log_vector; lv; lv = lv->lv_next)
                xlog_cil_insert(log, ticket, lv->lv_item, lv);
-        }
 }
 static void
@@ -321,80 +332,6 @@ xlog_cil_free_logvec(
 }
 /*
- * Commit a transaction with the given vector to the Committed Item List.
- *
- * To do this, we need to format the item, pin it in memory if required and
- * account for the space used by the transaction. Once we have done that we
- * need to release the unused reservation for the transaction, attach the
- * transaction to the checkpoint context so we carry the busy extents through
- * to checkpoint completion, and then unlock all the items in the transaction.
- *
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
- * Called with the context lock already held in read mode to lock out
- * background commit, returns without it held once background commits are
- * allowed again.
- */
-int
-xfs_log_commit_cil(
-        struct xfs_mount        *mp,
-        struct xfs_trans        *tp,
-        struct xfs_log_vec      *log_vector,
-        xfs_lsn_t               *commit_lsn,
-        int                     flags)
-{
-        struct log              *log = mp->m_log;
-        int                     log_flags = 0;
-        int                     push = 0;
-        if (flags & XFS_TRANS_RELEASE_LOG_RES)
-                log_flags = XFS_LOG_REL_PERM_RESERV;
-        if (XLOG_FORCED_SHUTDOWN(log)) {
-                xlog_cil_free_logvec(log_vector);
-                return XFS_ERROR(EIO);
-        }
-        /* lock out background commit */
-        down_read(&log->l_cilp->xc_ctx_lock);
-        xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
-        /* check we didn't blow the reservation */
-        if (tp->t_ticket->t_curr_res < 0)
-                xlog_print_tic_res(log->l_mp, tp->t_ticket);
-        /* attach the transaction to the CIL if it has any busy extents */
-        if (!list_empty(&tp->t_busy)) {
-                spin_lock(&log->l_cilp->xc_cil_lock);
-                list_splice_init(&tp->t_busy,
-                                        &log->l_cilp->xc_ctx->busy_extents);
-                spin_unlock(&log->l_cilp->xc_cil_lock);
-        }
-        tp->t_commit_lsn = *commit_lsn;
-        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-        xfs_trans_unreserve_and_mod_sb(tp);
-        /* check for background commit before unlock */
-        if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
-                push = 1;
-        up_read(&log->l_cilp->xc_ctx_lock);
-        /*
-         * We need to push CIL every so often so we don't cache more than we
-         * can fit in the log. The limit really is that a checkpoint can't be
-         * more than half the log (the current checkpoint is not allowed to
-         * overwrite the previous checkpoint), but commit latency and memory
-         * usage limit this to a smaller size in most cases.
-         */
-        if (push)
-                xlog_cil_push(log, 0);
-        return 0;
-}
-/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
@@ -427,13 +364,23 @@ xlog_cil_committed(
 }
 /*
- * Push the Committed Item List to the log. If the push_now flag is not set,
+ * Push the Committed Item List to the log. If @push_seq flag is zero, then it
- * then it is a background flush and so we can chose to ignore it.
+ * is a background flush and so we can choose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allow log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
 */
-int
+STATIC int
 xlog_cil_push(
        struct log              *log,
-        int                     push_now)
+        xfs_lsn_t               push_seq)
 {
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_log_vec      *lv;
@@ -453,12 +400,14 @@ xlog_cil_push(
        if (!cil)
                return 0;
+        ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
        new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
        new_ctx->ticket = xlog_cil_ticket_alloc(log);
        /* lock out transaction commit, but don't block on background push */
        if (!down_write_trylock(&cil->xc_ctx_lock)) {
-                if (!push_now)
+                if (!push_seq)
                        goto out_free_ticket;
                down_write(&cil->xc_ctx_lock);
        }
@@ -469,7 +418,11 @@ xlog_cil_push(
                goto out_skip;
        /* check for spurious background flush */
-        if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+        if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+                goto out_skip;
+        /* check for a previously pushed sequence */
+        if (push_seq < cil->xc_ctx->sequence)
                goto out_skip;
        /*
@@ -515,6 +468,13 @@ xlog_cil_push(
        cil->xc_ctx = new_ctx;
        /*
+         * mirror the new sequence into the cil structure so that we can do
+         * unlocked checks against the current sequence in log forces without
+         * risking dereferencing a freed context pointer.
+         */
+        cil->xc_current_sequence = new_ctx->sequence;
+        /*
         * The switch is now done, so we can drop the context lock and move out
         * of a shared context. We can't just go straight to the commit record,
         * though - we need to synchronise with previous and future commits so
@@ -626,6 +586,102 @@ out_abort:
 }
 /*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * The CIL context lock is taken here in read mode to lock out background
+ * commit while the items are inserted and unlocked, and it is dropped again
+ * before any background push is triggered and before returning.
+ */
+int
+xfs_log_commit_cil(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        struct xfs_log_vec      *log_vector,
+        xfs_lsn_t               *commit_lsn,
+        int                     flags)
+{
+        struct log              *log = mp->m_log;
+        int                     log_flags = 0;
+        int                     push = 0;
+        if (flags & XFS_TRANS_RELEASE_LOG_RES)
+                log_flags = XFS_LOG_REL_PERM_RESERV;
+        if (XLOG_FORCED_SHUTDOWN(log)) {
+                xlog_cil_free_logvec(log_vector);
+                return XFS_ERROR(EIO);
+        }
+        /*
+         * do all the hard work of formatting items (including memory
+         * allocation) outside the CIL context lock. This prevents stalling CIL
+         * pushes when we are low on memory and a transaction commit spends a
+         * lot of time in memory reclaim.
+         */
+        xlog_cil_format_items(log, log_vector);
+        /* lock out background commit */
+        down_read(&log->l_cilp->xc_ctx_lock);
+        xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
+        /* check we didn't blow the reservation */
+        if (tp->t_ticket->t_curr_res < 0)
+                xlog_print_tic_res(log->l_mp, tp->t_ticket);
+        /* attach the transaction to the CIL if it has any busy extents */
+        if (!list_empty(&tp->t_busy)) {
+                spin_lock(&log->l_cilp->xc_cil_lock);
+                list_splice_init(&tp->t_busy,
+                                        &log->l_cilp->xc_ctx->busy_extents);
+                spin_unlock(&log->l_cilp->xc_cil_lock);
+        }
+        tp->t_commit_lsn = *commit_lsn;
+        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+        xfs_trans_unreserve_and_mod_sb(tp);
+        /*
+         * Once all the items of the transaction have been copied to the CIL,
+         * the items can be unlocked and freed.
+         *
+         * This needs to be done before we drop the CIL context lock because we
+         * have to update state in the log items and unlock them before they go
+         * to disk. If we don't, then the CIL checkpoint can race with us and
+         * we can run checkpoint completion before we've updated and unlocked
+         * the log items. This affects (at least) processing of stale buffers,
+         * inodes and EFIs.
+         */
+        xfs_trans_free_items(tp, *commit_lsn, 0);
+        /* check for background commit before unlock */
+        if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+                push = 1;
+        up_read(&log->l_cilp->xc_ctx_lock);
+        /*
+         * We need to push CIL every so often so we don't cache more than we
+         * can fit in the log. The limit really is that a checkpoint can't be
+         * more than half the log (the current checkpoint is not allowed to
+         * overwrite the previous checkpoint), but commit latency and memory
+         * usage limit this to a smaller size in most cases.
+         */
+        if (push)
+                xlog_cil_push(log, 0);
+        return 0;
+}
+/*
 * Conditionally push the CIL based on the sequence passed in.
 *
 * We only need to push if we haven't already pushed the sequence
@@ -639,39 +695,34 @@ out_abort:
 * commit lsn is there. It'll be empty, so this is broken for now.
 */
 xfs_lsn_t
-xlog_cil_push_lsn(
+xlog_cil_force_lsn(
        struct log      *log,
-        xfs_lsn_t       push_seq)
+        xfs_lsn_t       sequence)
 {
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_cil_ctx      *ctx;
        xfs_lsn_t               commit_lsn = NULLCOMMITLSN;
-restart:
+        ASSERT(sequence <= cil->xc_current_sequence);
-        down_write(&cil->xc_ctx_lock);
-        ASSERT(push_seq <= cil->xc_ctx->sequence);
+        /*
+         * check to see if we need to force out the current context.
-        /* check to see if we need to force out the current context */
+         * xlog_cil_push() handles racing pushes for the same sequence,
-        if (push_seq == cil->xc_ctx->sequence) {
+         * so no need to deal with it here.
-                up_write(&cil->xc_ctx_lock);
+         */
-                xlog_cil_push(log, 1);
+        if (sequence == cil->xc_current_sequence)
-                goto restart;
+                xlog_cil_push(log, sequence);
-        }
        /*
         * See if we can find a previous sequence still committing.
-         * We can drop the flush lock as soon as we have the cil lock
-         * because we are now only comparing contexts protected by
-         * the cil lock.
-         *
         * We need to wait for all previous sequence commits to complete
         * before allowing the force of push_seq to go ahead. Hence block
         * on commits for those as well.
         */
+restart:
        spin_lock(&cil->xc_cil_lock);
-        up_write(&cil->xc_ctx_lock);
        list_for_each_entry(ctx, &cil->xc_committing, committing) {
-                if (ctx->sequence > push_seq)
+                if (ctx->sequence > sequence)
                        continue;
                if (!ctx->commit_lsn) {
                        /*
@@ -681,7 +732,7 @@ restart:
                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
                        goto restart;
                }
-                if (ctx->sequence != push_seq)
+                if (ctx->sequence != sequence)
                        continue;
                /* found it! */
                commit_lsn = ctx->commit_lsn;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965..ced52b98b32 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,6 +422,7 @@ struct xfs_cil {
        struct rw_semaphore     xc_ctx_lock;
        struct list_head        xc_committing;
        sv_t                    xc_commit_wait;
+        xfs_lsn_t               xc_current_sequence;
 };
 /*
@@ -562,8 +563,16 @@ int	xlog_cil_init(struct log *log);
 void    xlog_cil_init_post_recovery(struct log *log);
 void    xlog_cil_destroy(struct log *log);
-int     xlog_cil_push(struct log *log, int push_now);
+/*
-xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+ * CIL force routines
+ */
+xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+static inline void
+xlog_cil_force(struct log *log)
+{
+        xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+}
 /*
 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdca7416c75..1c47edaea0d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1167,7 +1167,7 @@ xfs_trans_del_item(
 * Unlock all of the items of a transaction and free all the descriptors
 * of that transaction.
 */
-STATIC void
+void
 xfs_trans_free_items(
        struct xfs_trans        *tp,
        xfs_lsn_t               commit_lsn,
@@ -1653,9 +1653,6 @@ xfs_trans_commit_cil(
                return error;
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-        /* xfs_trans_free_items() unlocks them first */
-        xfs_trans_free_items(tp, *commit_lsn, 0);
        xfs_trans_free(tp);
        return 0;
 }
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index e2d93d8ead7..62da86c90de 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,7 +25,8 @@ struct xfs_trans;
 void    xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void    xfs_trans_del_item(struct xfs_log_item *);
+void    xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+                                int flags);
 void    xfs_trans_item_committed(struct xfs_log_item *lip,
                                xfs_lsn_t commit_lsn, int aborted);
 void    xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);